-// Copyright 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// Copyright 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Redistribution and use in source and binary forms, with or without
 // modification, are permitted provided that the following conditions
@@ -419,6 +419,7 @@ ModelState::LoadModel(
 #ifdef TRITON_ENABLE_GPU
   if ((instance_group_kind == TRITONSERVER_INSTANCEGROUPKIND_GPU) ||
       (instance_group_kind == TRITONSERVER_INSTANCEGROUPKIND_AUTO)) {
+    std::map<std::string, std::string> cuda_options_map;
     triton::common::TritonJson::Value optimization;
     if (model_config_.Find("optimization", &optimization)) {
       triton::common::TritonJson::Value eas;
@@ -673,8 +674,13 @@ ModelState::LoadModel(
                 key = "trt_ep_context_embed_mode";
                 value = value_string;
               } else {
-                key = param_key;
-                params.MemberAsString(param_key.c_str(), &value);
+                return TRITONSERVER_ErrorNew(
+                    TRITONSERVER_ERROR_INVALID_ARG,
+                    std::string(
+                        "unknown parameter '" + param_key +
+                        "' is provided for TensorRT Execution "
+                        "Accelerator")
+                        .c_str());
               }
               if (!key.empty() && !value.empty()) {
                 keys.push_back(key);
@@ -687,25 +693,9 @@ ModelState::LoadModel(
             c_keys.push_back(keys[i].c_str());
             c_values.push_back(values[i].c_str());
           }
-          auto status = ort_api->UpdateTensorRTProviderOptions(
+          RETURN_IF_ORT_ERROR(ort_api->UpdateTensorRTProviderOptions(
               rel_trt_options.get(), c_keys.data(), c_values.data(),
-              keys.size());
-          if (status != nullptr) {
-            OrtAllocator* allocator;
-            char* options;
-            RETURN_IF_ORT_ERROR(
-                ort_api->GetAllocatorWithDefaultOptions(&allocator));
-            RETURN_IF_ORT_ERROR(
-                ort_api->GetTensorRTProviderOptionsAsString(
-                    rel_trt_options.get(), allocator, &options));
-            return TRITONSERVER_ErrorNew(
-                TRITONSERVER_ERROR_INVALID_ARG,
-                (std::string("unknown parameters in config following "
-                             "options are supported for TensorRT "
-                             "Execution Provider: ") +
-                 std::string(options))
-                    .c_str());
-          }
+              keys.size()));
         }
       }

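(Note, not part of the diff: the change above folds the hand-rolled OrtStatus check into the backend's existing RETURN_IF_ORT_ERROR macro. A minimal sketch of that error-propagation pattern, assuming the usual OrtApi status calls; the backend's real macro definition may differ, e.g. in how it maps the ORT error code:)

    // Hypothetical sketch only; the backend's actual macro may also
    // translate the OrtStatus error code, not just the message.
    #define RETURN_IF_ORT_ERROR(S)                               \
      do {                                                       \
        OrtStatus* status__ = (S);                               \
        if (status__ != nullptr) {                               \
          TRITONSERVER_Error* err__ = TRITONSERVER_ErrorNew(     \
              TRITONSERVER_ERROR_INTERNAL,                       \
              ort_api->GetErrorMessage(status__));               \
          ort_api->ReleaseStatus(status__);                      \
          return err__;                                          \
        }                                                        \
      } while (false)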
@@ -722,11 +712,41 @@ ModelState::LoadModel(
             continue;
           }
 #endif  // TRITON_ENABLE_ONNXRUNTIME_TENSORRT
-          return TRITONSERVER_ErrorNew(
-              TRITONSERVER_ERROR_INVALID_ARG,
-              (std::string("unknown Execution Accelerator '") + name +
-               "' is requested")
-                  .c_str());
+
+          if (name == "cuda") {
+            // Parse CUDA EP configurations
+            triton::common::TritonJson::Value params;
+            if (ea.Find("parameters", &params)) {
+              std::vector<std::string> param_keys;
+              RETURN_IF_ERROR(params.Members(&param_keys));
+              for (const auto& param_key : param_keys) {
+                std::string value_string, key, value;
+                // Special handling for boolean values
+                if (param_key == "do_copy_in_default_stream" ||
+                    param_key == "use_ep_level_unified_stream") {
+                  RETURN_IF_ERROR(params.MemberAsString(
+                      param_key.c_str(), &value_string));
+                  bool bool_value;
+                  RETURN_IF_ERROR(ParseBoolValue(value_string, &bool_value));
+                  key = param_key;
+                  value = value_string;
+                } else {
+                  key = param_key;
+                  RETURN_IF_ERROR(
+                      params.MemberAsString(param_key.c_str(), &value));
+                }
+                if (!key.empty() && !value.empty()) {
+                  cuda_options_map[key] = value;
+                }
+              }
+            }
+          } else {
+            return TRITONSERVER_ErrorNew(
+                TRITONSERVER_ERROR_INVALID_ARG,
+                (std::string("unknown Execution Accelerator '") + name +
+                 "' is requested")
+                    .c_str());
+          }
         }
       }
     }
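(Usage sketch, not part of the diff: with the block above, the CUDA EP now accepts per-accelerator parameters from the model's gpu_execution_accelerator field. The parameter names come from the parsing code; the surrounding config.pbtxt layout is the standard Triton optimization block, and the exact set of accepted values is defined by ONNX Runtime's CUDA EP, so treat the values below as illustrative:)

    optimization {
      execution_accelerators {
        gpu_execution_accelerator : [
          {
            name : "cuda"
            parameters { key: "cudnn_conv_algo_search" value: "HEURISTIC" }
            parameters { key: "do_copy_in_default_stream" value: "1" }
          }
        ]
      }
    }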
@@ -740,55 +760,117 @@ ModelState::LoadModel(
     std::unique_ptr<
         OrtCUDAProviderOptionsV2, decltype(ort_api->ReleaseCUDAProviderOptions)>
         rel_cuda_options(cuda_options, ort_api->ReleaseCUDAProviderOptions);
-    std::map<std::string, std::string> options;
-    options["device_id"] = std::to_string(instance_group_device_id);
+    cuda_options_map["device_id"] = std::to_string(instance_group_device_id);
+    cuda_options_map["has_user_compute_stream"] = stream != nullptr ? "1" : "0";
+    RETURN_IF_ORT_ERROR(ort_api->UpdateCUDAProviderOptionsWithValue(
+        rel_cuda_options.get(), "default_memory_arena_cfg", nullptr));
     {
-      // Parse CUDA EP configurations
+      // Parse CUDA EP configurations directly from the parameters field.
+      // This is deprecated with adding support for CUDA EP in the
+      // gpu_execution_accelerator field. Keeping this for backward
+      // compatibility.
       triton::common::TritonJson::Value params;
       if (model_config_.Find("parameters", &params)) {
-        std::vector<std::string> members;
-        RETURN_IF_ERROR(params.Members(&members));
-        for (auto& m : members) {
-          const auto [it_value, success] = options.insert({m, ""});
-          if (success) {
-            params.MemberAsString(m.c_str(), &it_value->second);
+        triton::common::TritonJson::Value json_value;
+        if (params.Find("cudnn_conv_algo_search", &json_value)) {
+          int cudnn_conv_algo_search = 0;
+          RETURN_IF_ERROR(TryParseModelStringParameter(
+              params, "cudnn_conv_algo_search", &cudnn_conv_algo_search, 0));
+          std::string string_value;
+          switch (cudnn_conv_algo_search) {
+            case 0:
+              string_value = "EXHAUSTIVE";
+              break;
+            case 1:
+              string_value = "HEURISTIC";
+              break;
+            case 2:
+              string_value = "DEFAULT";
+              break;
+            default:
+              return TRITONSERVER_ErrorNew(
+                  TRITONSERVER_ERROR_INVALID_ARG,
+                  (std::string("unsupported cudnn_conv_algo_search value '") +
+                   std::to_string(cudnn_conv_algo_search) + "' is requested")
+                      .c_str());
           }
+          cuda_options_map["cudnn_conv_algo_search"] = string_value;
+        } else {
+          cuda_options_map["cudnn_conv_algo_search"] = "EXHAUSTIVE";
+        }
+
+        if (params.Find("gpu_mem_limit", &json_value)) {
+          std::string string_value;
+          RETURN_IF_ERROR(
+              json_value.MemberAsString("string_value", &string_value));
+          cuda_options_map["gpu_mem_limit"] = string_value;
+        } else {
+          cuda_options_map["gpu_mem_limit"] =
+              std::to_string(std::numeric_limits<size_t>::max());
+        }
+
+        if (params.Find("arena_extend_strategy", &json_value)) {
+          int arena_extend_strategy = 0;
+          RETURN_IF_ERROR(TryParseModelStringParameter(
+              params, "arena_extend_strategy", &arena_extend_strategy, 0));
+          std::string string_value;
+          switch (arena_extend_strategy) {
+            case 0:
+              string_value = "kNextPowerOfTwo";
+              break;
+            case 1:
+              string_value = "kSameAsRequested";
+              break;
+            default:
+              return TRITONSERVER_ErrorNew(
+                  TRITONSERVER_ERROR_INVALID_ARG,
+                  (std::string("unsupported arena_extend_strategy value '") +
+                   std::to_string(arena_extend_strategy) + "' is requested")
+                      .c_str());
+          }
+          cuda_options_map["arena_extend_strategy"] = string_value;
+        } else {
+          cuda_options_map["arena_extend_strategy"] = "kNextPowerOfTwo";
+        }
+
+        if (params.Find("do_copy_in_default_stream", &json_value)) {
+          std::string string_value;
+          RETURN_IF_ERROR(
+              json_value.MemberAsString("string_value", &string_value));
+          cuda_options_map["do_copy_in_default_stream"] = string_value;
+        } else {
+          cuda_options_map["do_copy_in_default_stream"] = "1";
         }
       }
     }

     std::vector<const char*> option_names, option_values;
-    for (const auto& [key, value] : options) {
+    for (const auto& [key, value] : cuda_options_map) {
       option_names.push_back(key.c_str());
       option_values.push_back(value.c_str());
     }
-    auto status = ort_api->UpdateCUDAProviderOptions(
+
+    RETURN_IF_ORT_ERROR(ort_api->UpdateCUDAProviderOptions(
         rel_cuda_options.get(), option_names.data(), option_values.data(),
-        option_values.size());
-    if (status != nullptr) {
-      OrtAllocator* allocator;
-      char* options;
-      RETURN_IF_ORT_ERROR(ort_api->GetAllocatorWithDefaultOptions(&allocator));
-      RETURN_IF_ORT_ERROR(ort_api->GetCUDAProviderOptionsAsString(
-          rel_cuda_options.get(), allocator, &options));
-      return TRITONSERVER_ErrorNew(
-          TRITONSERVER_ERROR_INVALID_ARG,
-          (std::string("unknown parameters in config following options are "
-                       "supported for CUDA Execution Provider: ") +
-           std::string(options))
-              .c_str());
-    }
+        option_values.size()));

     if (stream != nullptr) {
       RETURN_IF_ORT_ERROR(ort_api->UpdateCUDAProviderOptionsWithValue(
           rel_cuda_options.get(), "user_compute_stream", stream));
     }
     RETURN_IF_ORT_ERROR(ort_api->SessionOptionsAppendExecutionProvider_CUDA_V2(
         soptions, cuda_options));
+
+    OrtAllocator* allocator;
+    char* options;
+    RETURN_IF_ORT_ERROR(ort_api->GetAllocatorWithDefaultOptions(&allocator));
+    RETURN_IF_ORT_ERROR(ort_api->GetCUDAProviderOptionsAsString(
+        rel_cuda_options.get(), allocator, &options));
     LOG_MESSAGE(
         TRITONSERVER_LOG_VERBOSE,
         (std::string("CUDA Execution Accelerator is set for '") + Name() +
-         "' on device " + std::to_string(instance_group_device_id))
+         "' on device " + std::to_string(instance_group_device_id) +
+         std::string(" with options: ") + std::string(options))
             .c_str());
   }
 #endif  // TRITON_ENABLE_GPU
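(Usage sketch, not part of the diff: the backward-compatible path above still reads CUDA EP options from the model's top-level parameters field. There, cudnn_conv_algo_search and arena_extend_strategy take the integer codes mapped by the switches in the code (0 = EXHAUSTIVE / kNextPowerOfTwo, 1 = HEURISTIC / kSameAsRequested, 2 = DEFAULT), and gpu_mem_limit is a byte count. Values below are illustrative:)

    parameters { key: "cudnn_conv_algo_search" value: { string_value: "1" } }
    parameters { key: "gpu_mem_limit" value: { string_value: "4294967296" } }
    parameters { key: "arena_extend_strategy" value: { string_value: "0" } }
    parameters { key: "do_copy_in_default_stream" value: { string_value: "1" } }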