@@ -88,9 +88,12 @@ def __init__(
88
88
)
89
89
90
90
91
def _determine_engine_for_model(model_type: str, tensor_parallel_degree: int):
    """Pick the engine-specific model class for a model.

    Args:
        model_type (str): The model architecture identifier; it is checked for
            membership in ``defaults.DEEPSPEED_RECOMMENDED_ARCHITECTURES``.
        tensor_parallel_degree (int): The requested tensor parallel degree.
            A falsy value (``None`` or ``0``) is treated as "not requested".

    Returns:
        type: ``DeepSpeedModel`` when ``tensor_parallel_degree`` is greater than 1
            or ``model_type`` is a DeepSpeed-recommended architecture; otherwise
            ``HuggingFaceAccelerateModel``.
    """
    # The tensor-parallel check comes first and short-circuits, so the
    # architecture lookup only runs when tensor parallelism was not requested.
    wants_tensor_parallelism = bool(tensor_parallel_degree) and tensor_parallel_degree > 1
    if wants_tensor_parallelism or model_type in defaults.DEEPSPEED_RECOMMENDED_ARCHITECTURES:
        return DeepSpeedModel
    return HuggingFaceAccelerateModel
@@ -126,6 +129,7 @@ class DJLLargeModel(FrameworkModel):
126
129
def __new__ (
127
130
cls ,
128
131
uncompressed_model_data : str ,
132
+ tensor_parallel_degree : int = None ,
129
133
* args ,
130
134
** kwargs ,
131
135
):
@@ -143,14 +147,16 @@ def __new__(
143
147
144
148
model_type = json .loads (s3 .S3Downloader .read_file (config_file )).get ("model_type" )
145
149
cls_to_create = (
146
- cls if cls is not DJLLargeModel else _determine_engine_for_model_type (model_type )
150
+ cls
151
+ if cls is not DJLLargeModel
152
+ else _determine_engine_for_model (model_type , tensor_parallel_degree )
147
153
)
148
154
return super (DJLLargeModel , cls ).__new__ (cls_to_create )
149
155
150
156
def __init__ (
151
157
self ,
152
158
uncompressed_model_data : str ,
153
- role : str ,
159
+ role : str = None ,
154
160
djl_version : str = None ,
155
161
task : str = None ,
156
162
data_type : str = "fp32" ,
@@ -190,9 +196,10 @@ def __init__(
190
196
than or equal to the number of gpus available on the instance. Defaults to None.
191
197
If not provided, no tensor parallel sharding is done. If the provided value is
192
198
greater than 1, DeepSpeed will be used as the backend.
193
- data_parallel_degree (int): The number of copies of the model to instantiate. It should be
199
+ data_parallel_degree (int): The number of replicas of the model to instantiate. It should be
194
200
less than or equal to the number of gpus available on the instance. Defaults to None.
195
- If not provided, all available gpus will be used.
201
+ If not provided, all available gpus will be used. If tensor_parallel_degree is set,
202
+ data_parallel_degree will be computed by DJL Serving based on the number of available GPUs.
196
203
min_workers (int): The minimum number of worker processes. DJL Serving will auto detect
197
204
the minimum workers if not specified. Defaults to None.
198
205
max_workers (int): The maximum number of worker processes. DJL Serving will auto detect
@@ -536,8 +543,6 @@ def generate_serving_properties(self, serving_properties={}) -> Dict[str, str]:
536
543
serving_properties ["engine" ] = self .engine .value [0 ]
537
544
serving_properties ["option.entryPoint" ] = self .engine .value [1 ]
538
545
serving_properties ["option.s3url" ] = self .uncompressed_model_data
539
- if self .tensor_parallel_degree :
540
- serving_properties ["option.tensor_parallel_degree" ] = self .tensor_parallel_degree
541
546
if self .entry_point :
542
547
serving_properties ["option.entryPoint" ] = self .entry_point
543
548
if self .task :
@@ -667,6 +672,8 @@ def generate_serving_properties(self, serving_properties={}) -> Dict[str, str]:
667
672
dict: The model server configuration to use when deploying this model to SageMaker.
668
673
"""
669
674
serving_properties = super (DeepSpeedModel , self ).generate_serving_properties ()
675
+ if self .tensor_parallel_degree :
676
+ serving_properties ["option.tensor_parallel_degree" ] = self .tensor_parallel_degree
670
677
if self .max_tokens :
671
678
serving_properties ["option.max_tokens" ] = self .max_tokens
672
679
if self .low_cpu_mem_usage :
@@ -681,8 +688,6 @@ def generate_serving_properties(self, serving_properties={}) -> Dict[str, str]:
681
688
serving_properties ["option.triangular_masking" ] = self .triangular_masking
682
689
if self .return_tuple :
683
690
serving_properties ["option.return_tuple" ] = self .return_tuple
684
- if self .deepspeed_checkpoint_file :
685
- serving_properties ["option.checkpoint" ] = self .deepspeed_checkpoint_file
686
691
687
692
return serving_properties
688
693
@@ -759,7 +764,11 @@ def generate_serving_properties(self, serving_properties={}) -> Dict[str, str]:
759
764
dict: The model server configuration to use when deploying this model to SageMaker.
760
765
"""
761
766
serving_properties = super (HuggingFaceAccelerateModel , self ).generate_serving_properties ()
767
+ if self .data_parallel_degree :
768
+ serving_properties ["option.tensor_parallel_degree" ] = self .data_parallel_degree
762
769
if self .device_id :
770
+ if self .data_parallel_degree > 1 :
771
+ raise ValueError ("device_id cannot be set when data_parallel_degree is > 1" )
763
772
serving_properties ["option.device_id" ] = self .device_id
764
773
if self .device_map :
765
774
serving_properties ["option.device_map" ] = self .device_map
0 commit comments