@@ -176,7 +176,7 @@ ModelState::Create(TRITONBACKEND_Model* triton_model, ModelState** state)
 
 ModelState::ModelState(TRITONBACKEND_Model* triton_model)
     : BackendModel(triton_model), enable_optimized_execution_(true),
-      enable_inference_mode_(false), enable_cache_cleaning_(false),
+      enable_inference_mode_(true), enable_cache_cleaning_(false),
       enable_weight_sharing_(false), enable_tensor_fuser_pair_({false, true}),
       enable_jit_profiling_pair_({false, true}),
       enable_jit_executor_pair_({false, true}),
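For context on the default change above: enable_inference_mode_ typically gates a c10::InferenceMode guard around the scripted forward pass. A minimal sketch follows, assuming an illustrative RunForward helper and Module handle that are not part of this backend's code.

#include <utility>
#include <vector>

#include <c10/core/InferenceMode.h>
#include <torch/script.h>

// Illustrative only: run a TorchScript forward pass, optionally under
// inference mode so autograd bookkeeping is skipped for the call.
c10::IValue
RunForward(
    torch::jit::Module& model, std::vector<c10::IValue> inputs,
    bool enable_inference_mode)
{
  // With the new default above, this guard is active unless the model config
  // explicitly turns inference mode off.
  c10::InferenceMode guard(enable_inference_mode);
  return model.forward(std::move(inputs));
}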
@@ -1312,12 +1312,12 @@ ModelInstanceState::Execute(
       torch::jit::overrideCanFuseOnCPU(false);
       torch::jit::overrideCanFuseOnGPU(false);
       torch::jit::setTensorExprFuserEnabled(false);
-      torch::jit::fuser::cuda::setEnabled(true);
+      torch::jit::fuser::cuda::setEnabled(true);
     } else {
       torch::jit::overrideCanFuseOnCPU(true);
       torch::jit::overrideCanFuseOnGPU(true);
       torch::jit::setTensorExprFuserEnabled(true);
-      torch::jit::fuser::cuda::setEnabled(false);
+      torch::jit::fuser::cuda::setEnabled(false);
     }
   }
 
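The hunk above is a whitespace-only change, but it is the spot where the backend selects a fusion backend: NVFuser and the TensorExpr/legacy fusers are treated as mutually exclusive. A standalone sketch of that selection is below; SelectFuser is an illustrative name and the header paths are assumed from the PyTorch source layout this backend builds against.

// Header paths assumed; each one declares the corresponding toggle.
#include <torch/csrc/jit/codegen/cuda/interface.h>   // fuser::cuda::setEnabled
#include <torch/csrc/jit/codegen/fuser/interface.h>  // overrideCanFuseOnCPU/GPU
#include <torch/csrc/jit/passes/tensorexpr_fuser.h>  // setTensorExprFuserEnabled

// Pick exactly one fusion backend for TorchScript execution.
void SelectFuser(bool use_nvfuser)
{
  if (use_nvfuser) {
    // Route fusion through NVFuser; turn the other fusers off.
    torch::jit::overrideCanFuseOnCPU(false);
    torch::jit::overrideCanFuseOnGPU(false);
    torch::jit::setTensorExprFuserEnabled(false);
    torch::jit::fuser::cuda::setEnabled(true);
  } else {
    // Fall back to the TensorExpr/legacy fusers; disable NVFuser.
    torch::jit::overrideCanFuseOnCPU(true);
    torch::jit::overrideCanFuseOnGPU(true);
    torch::jit::setTensorExprFuserEnabled(true);
    torch::jit::fuser::cuda::setEnabled(false);
  }
}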
@@ -1761,9 +1761,9 @@ ModelInstanceState::SetInputTensors(
 
         batchn_shape[0] += GetElementCount(input_shape, input_dims_count);
       }
-    }
-    else {
-      batchn_shape = std::vector<int64_t>(input_shape, input_shape + input_dims_count);
+    } else {
+      batchn_shape =
+          std::vector<int64_t>(input_shape, input_shape + input_dims_count);
       if (supports_batching_) {
         batchn_shape[0] = total_batch_size;
       }
@@ -1887,9 +1887,11 @@ ModelInstanceState::ReadOutputTensors(
 
       // Output tensors may not reside on the same device as model
       torch::Device tensor_device = output_flat.device();
-      const auto memory_type = (tensor_device.type() == torch::kCPU) ? TRITONSERVER_MEMORY_CPU
-                                                                     : TRITONSERVER_MEMORY_GPU;
-      const auto memory_id = (tensor_device.type() == torch::kCPU) ? 0 : tensor_device.index();
+      const auto memory_type = (tensor_device.type() == torch::kCPU)
+                                   ? TRITONSERVER_MEMORY_CPU
+                                   : TRITONSERVER_MEMORY_GPU;
+      const auto memory_id =
+          (tensor_device.type() == torch::kCPU) ? 0 : tensor_device.index();
 
       // Batch output doesn't support string data type yet, as it is not trivial
       // to parse string output
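The reformatted lines above encode a small mapping: the torch::Device of an output tensor determines the TRITONSERVER memory type and id reported for its buffer. A minimal sketch of that mapping as a standalone helper, with an illustrative name and an assumed tritonserver.h include path:

#include <cstdint>
#include <utility>

#include <torch/script.h>

#include "triton/core/tritonserver.h"  // include path assumed

// Map a tensor's device to the Triton memory type/id pair used when
// registering its output buffer with the responder.
std::pair<TRITONSERVER_MemoryType, int64_t>
MemoryForDevice(const torch::Device& device)
{
  // CPU tensors always report memory id 0; GPU tensors keep their CUDA index.
  if (device.type() == torch::kCPU) {
    return {TRITONSERVER_MEMORY_CPU, 0};
  }
  return {TRITONSERVER_MEMORY_GPU, device.index()};
}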
@@ -1906,16 +1908,16 @@ ModelInstanceState::ReadOutputTensors(
           return TRITONSERVER_ErrorNew(
               TRITONSERVER_ERROR_INVALID_ARG,
               (std::string("output '") + name +
-               "' is a scalar which is not supported.")
+               "' is a scalar which is not supported.")
                   .c_str());
         }
 
         responder.ProcessTensor(
-            name, output_dtype, batchn_shape, output_buffer,
-            memory_type, memory_id);
+            name, output_dtype, batchn_shape, output_buffer, memory_type,
+            memory_id);
       } else {
         responder.ProcessBatchOutput(
-            name, *batch_output, output_buffer, memory_type, memory_id);
+            name, *batch_output, output_buffer, memory_type, memory_id);
       }
     } else if (output_tensors[op_index].isList()) {
       // Custom handling for string/bytes tensor...