
Commit 1d94c6a

Fix up
1 parent ea76722 commit 1d94c6a

File tree

1 file changed: +134 -52 lines changed

src/onnxruntime.cc

Lines changed: 134 additions & 52 deletions
@@ -1,4 +1,4 @@
-// Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -419,6 +419,7 @@ ModelState::LoadModel(
 #ifdef TRITON_ENABLE_GPU
   if ((instance_group_kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) ||
       (instance_group_kind == TRITONSERVER_INSTANCEGROUPKIND_AUTO)) {
+    std::map<std::string, std::string> cuda_options_map;
     triton::common::TritonJson::Value optimization;
     if (model_config_.Find("optimization", &optimization)) {
       triton::common::TritonJson::Value eas;
@@ -673,8 +674,13 @@ ModelState::LoadModel(
                   key = "trt_ep_context_embed_mode";
                   value = value_string;
                 } else {
-                  key = param_key;
-                  params.MemberAsString(param_key.c_str(), &value);
+                  return TRITONSERVER_ErrorNew(
+                      TRITONSERVER_ERROR_INVALID_ARG,
+                      std::string(
+                          "unknown parameter '" + param_key +
+                          "' is provided for TensorRT Execution "
+                          "Accelerator")
+                          .c_str());
                 }
                 if (!key.empty() && !value.empty()) {
                   keys.push_back(key);
@@ -687,25 +693,9 @@ ModelState::LoadModel(
                 c_keys.push_back(keys[i].c_str());
                 c_values.push_back(values[i].c_str());
               }
-              auto status = ort_api->UpdateTensorRTProviderOptions(
+              RETURN_IF_ORT_ERROR(ort_api->UpdateTensorRTProviderOptions(
                   rel_trt_options.get(), c_keys.data(), c_values.data(),
-                  keys.size());
-              if (status != nullptr) {
-                OrtAllocator* allocator;
-                char* options;
-                RETURN_IF_ORT_ERROR(
-                    ort_api->GetAllocatorWithDefaultOptions(&allocator));
-                RETURN_IF_ORT_ERROR(
-                    ort_api->GetTensorRTProviderOptionsAsString(
-                        rel_trt_options.get(), allocator, &options));
-                return TRITONSERVER_ErrorNew(
-                    TRITONSERVER_ERROR_INVALID_ARG,
-                    (std::string("unknown parameters in config following "
-                                 "options are supported for TensorRT "
-                                 "Execution Provider: ") +
-                     std::string(options))
-                        .c_str());
-              }
+                  keys.size()));
             }
           }
 
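Note: with the two hunks above, an unrecognized key under the TensorRT Execution Accelerator's parameters now fails model load with an explicit error instead of being forwarded, and the UpdateTensorRTProviderOptions call goes through RETURN_IF_ORT_ERROR rather than hand-rolled status handling. A minimal config.pbtxt sketch of the accelerator entry this validation applies to; trt_ep_context_embed_mode appears in the diff context above, while precision_mode is an assumed key shown for illustration only:

optimization {
  execution_accelerators {
    gpu_execution_accelerator : [
      {
        name : "tensorrt"
        parameters { key: "precision_mode" value: "FP16" }          # assumed key, illustrative
        parameters { key: "trt_ep_context_embed_mode" value: "1" }  # key seen in the diff context
      }
    ]
  }
}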
@@ -722,11 +712,41 @@ ModelState::LoadModel(
             continue;
           }
 #endif  // TRITON_ENABLE_ONNXRUNTIME_TENSORRT
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_INVALID_ARG,
-              (std::string("unknown Execution Accelerator '") + name +
-               "' is requested")
-                  .c_str());
+
+          if (name == "cuda") {
+            // Parse CUDA EP configurations
+            triton::common::TritonJson::Value params;
+            if (ea.Find("parameters", &params)) {
+              std::vector<std::string> param_keys;
+              RETURN_IF_ERROR(params.Members(&param_keys));
+              for (const auto& param_key : param_keys) {
+                std::string value_string, key, value;
+                // Special handling for boolean values
+                if (param_key == "do_copy_in_default_stream" ||
+                    param_key == "use_ep_level_unified_stream") {
+                  RETURN_IF_ERROR(params.MemberAsString(
+                      param_key.c_str(), &value_string));
+                  bool bool_value;
+                  RETURN_IF_ERROR(ParseBoolValue(value_string, &bool_value));
+                  key = param_key;
+                  value = value_string;
+                } else {
+                  key = param_key;
+                  RETURN_IF_ERROR(
+                      params.MemberAsString(param_key.c_str(), &value));
+                }
+                if (!key.empty() && !value.empty()) {
+                  cuda_options_map[key] = value;
+                }
+              }
+            }
+          } else {
+            return TRITONSERVER_ErrorNew(
+                TRITONSERVER_ERROR_INVALID_ARG,
+                (std::string("unknown Execution Accelerator '") + name +
+                 "' is requested")
+                    .c_str());
+          }
         }
       }
     }
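Note: the branch added here accepts a "cuda" entry alongside "tensorrt", folding its parameters into cuda_options_map for the UpdateCUDAProviderOptions call later in LoadModel. Only do_copy_in_default_stream and use_ep_level_unified_stream are validated as booleans; every other key is handed to ORT untouched. A hedged config.pbtxt sketch of such an entry (the cudnn_conv_algo_search value is an assumed pass-through example, which ORT would reject if invalid):

optimization {
  execution_accelerators {
    gpu_execution_accelerator : [
      {
        name : "cuda"
        parameters { key: "do_copy_in_default_stream" value: "1" }
        parameters { key: "use_ep_level_unified_stream" value: "1" }
        parameters { key: "cudnn_conv_algo_search" value: "HEURISTIC" }  # assumed, passed through as-is
      }
    ]
  }
}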
@@ -740,55 +760,117 @@ ModelState::LoadModel(
     std::unique_ptr<
         OrtCUDAProviderOptionsV2, decltype(ort_api->ReleaseCUDAProviderOptions)>
         rel_cuda_options(cuda_options, ort_api->ReleaseCUDAProviderOptions);
-    std::map<std::string, std::string> options;
-    options["device_id"] = std::to_string(instance_group_device_id);
+    cuda_options_map["device_id"] = std::to_string(instance_group_device_id);
+    cuda_options_map["has_user_compute_stream"] = stream != nullptr ? "1" : "0";
+    RETURN_IF_ORT_ERROR(ort_api->UpdateCUDAProviderOptionsWithValue(
+        rel_cuda_options.get(), "default_memory_arena_cfg", nullptr));
     {
-      // Parse CUDA EP configurations
+      // Parse CUDA EP configurations directly from the parameters field.
+      // This is deprecated with adding support for CUDA EP in the
+      // gpu_execution_accelerator field. Keeping this for backward
+      // compatibility.
       triton::common::TritonJson::Value params;
       if (model_config_.Find("parameters", &params)) {
-        std::vector<std::string> members;
-        RETURN_IF_ERROR(params.Members(&members));
-        for (auto& m : members) {
-          const auto [it_value, success] = options.insert({m, ""});
-          if (success) {
-            params.MemberAsString(m.c_str(), &it_value->second);
+        triton::common::TritonJson::Value json_value;
+        if (params.Find("cudnn_conv_algo_search", &json_value)) {
+          int cudnn_conv_algo_search = 0;
+          RETURN_IF_ERROR(TryParseModelStringParameter(
+              params, "cudnn_conv_algo_search", &cudnn_conv_algo_search, 0));
+          std::string string_value;
+          switch (cudnn_conv_algo_search) {
+            case 0:
+              string_value = "EXHAUSTIVE";
+              break;
+            case 1:
+              string_value = "HEURISTIC";
+              break;
+            case 2:
+              string_value = "DEFAULT";
+              break;
+            default:
+              return TRITONSERVER_ErrorNew(
+                  TRITONSERVER_ERROR_INVALID_ARG,
+                  (std::string("unsupported cudnn_conv_algo_search value '") +
+                   std::to_string(cudnn_conv_algo_search) + "' is requested")
+                      .c_str());
           }
+          cuda_options_map["cudnn_conv_algo_search"] = string_value;
+        } else {
+          cuda_options_map["cudnn_conv_algo_search"] = "EXHAUSTIVE";
+        }
+
+        if (params.Find("gpu_mem_limit", &json_value)) {
+          std::string string_value;
+          RETURN_IF_ERROR(
+              json_value.MemberAsString("string_value", &string_value));
+          cuda_options_map["gpu_mem_limit"] = string_value;
+        } else {
+          cuda_options_map["gpu_mem_limit"] =
+              std::to_string(std::numeric_limits<size_t>::max());
+        }
+
+        if (params.Find("arena_extend_strategy", &json_value)) {
+          int arena_extend_strategy = 0;
+          RETURN_IF_ERROR(TryParseModelStringParameter(
+              params, "arena_extend_strategy", &arena_extend_strategy, 0));
+          std::string string_value;
+          switch (arena_extend_strategy) {
+            case 0:
+              string_value = "kNextPowerOfTwo";
+              break;
+            case 1:
+              string_value = "kSameAsRequested";
+              break;
+            default:
+              return TRITONSERVER_ErrorNew(
+                  TRITONSERVER_ERROR_INVALID_ARG,
+                  (std::string("unsupported arena_extend_strategy value '") +
+                   std::to_string(arena_extend_strategy) + "' is requested")
+                      .c_str());
+          }
+          cuda_options_map["arena_extend_strategy"] = string_value;
+        } else {
+          cuda_options_map["arena_extend_strategy"] = "kNextPowerOfTwo";
+        }
+
+        if (params.Find("do_copy_in_default_stream", &json_value)) {
+          std::string string_value;
+          RETURN_IF_ERROR(
+              json_value.MemberAsString("string_value", &string_value));
+          cuda_options_map["do_copy_in_default_stream"] = string_value;
+        } else {
+          cuda_options_map["do_copy_in_default_stream"] = "1";
         }
       }
     }
 
     std::vector<const char*> option_names, option_values;
-    for (const auto& [key, value] : options) {
+    for (const auto& [key, value] : cuda_options_map) {
       option_names.push_back(key.c_str());
       option_values.push_back(value.c_str());
     }
-    auto status = ort_api->UpdateCUDAProviderOptions(
+
+    RETURN_IF_ORT_ERROR(ort_api->UpdateCUDAProviderOptions(
         rel_cuda_options.get(), option_names.data(), option_values.data(),
-        option_values.size());
-    if (status != nullptr) {
-      OrtAllocator* allocator;
-      char* options;
-      RETURN_IF_ORT_ERROR(ort_api->GetAllocatorWithDefaultOptions(&allocator));
-      RETURN_IF_ORT_ERROR(ort_api->GetCUDAProviderOptionsAsString(
-          rel_cuda_options.get(), allocator, &options));
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INVALID_ARG,
-          (std::string("unknown parameters in config following options are "
-                       "supported for CUDA Execution Provider: ") +
-           std::string(options))
-              .c_str());
-    }
+        option_values.size()));
 
     if (stream != nullptr) {
       RETURN_IF_ORT_ERROR(ort_api->UpdateCUDAProviderOptionsWithValue(
          rel_cuda_options.get(), "user_compute_stream", stream));
     }
     RETURN_IF_ORT_ERROR(ort_api->SessionOptionsAppendExecutionProvider_CUDA_V2(
         soptions, cuda_options));
+
+    OrtAllocator* allocator;
+    char* options;
+    RETURN_IF_ORT_ERROR(ort_api->GetAllocatorWithDefaultOptions(&allocator));
+    RETURN_IF_ORT_ERROR(ort_api->GetCUDAProviderOptionsAsString(
+        rel_cuda_options.get(), allocator, &options));
     LOG_MESSAGE(
         TRITONSERVER_LOG_VERBOSE,
         (std::string("CUDA Execution Accelerator is set for '") + Name() +
-         "' on device " + std::to_string(instance_group_device_id))
+         "' on device " + std::to_string(instance_group_device_id) +
+         std::string(" with options: ") + std::string(options))
             .c_str());
   }
 #endif  // TRITON_ENABLE_GPU
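Note: the deprecated top-level parameters path retained above maps integer encodings onto the ORT option strings (cudnn_conv_algo_search: 0 -> EXHAUSTIVE, 1 -> HEURISTIC, 2 -> DEFAULT; arena_extend_strategy: 0 -> kNextPowerOfTwo, 1 -> kSameAsRequested), while gpu_mem_limit and do_copy_in_default_stream are taken verbatim. A minimal sketch of that legacy form, assuming the usual string_value wrapping of Triton model-config parameters:

parameters { key: "cudnn_conv_algo_search" value: { string_value: "1" } }      # mapped to "HEURISTIC"
parameters { key: "gpu_mem_limit" value: { string_value: "4294967296" } }      # bytes, taken verbatim
parameters { key: "arena_extend_strategy" value: { string_value: "1" } }       # mapped to "kSameAsRequested"
parameters { key: "do_copy_in_default_stream" value: { string_value: "1" } }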
