@@ -88,12 +88,9 @@ def __init__(
88
88
)
89
89
90
90
91
- def _determine_engine_for_model (model_type : str , tensor_parallel_degree : int ):
91
+ def _determine_engine_for_model (model_type : str ):
92
92
"""Placeholder docstring"""
93
93
94
- if tensor_parallel_degree and tensor_parallel_degree > 1 :
95
- return DeepSpeedModel
96
-
97
94
if model_type in defaults .DEEPSPEED_RECOMMENDED_ARCHITECTURES :
98
95
return DeepSpeedModel
99
96
return HuggingFaceAccelerateModel
@@ -129,7 +126,6 @@ class DJLLargeModel(FrameworkModel):
129
126
def __new__ (
130
127
cls ,
131
128
uncompressed_model_data : str ,
132
- tensor_parallel_degree : int = None ,
133
129
* args ,
134
130
** kwargs ,
135
131
):
@@ -149,7 +145,7 @@ def __new__(
149
145
cls_to_create = (
150
146
cls
151
147
if cls is not DJLLargeModel
152
- else _determine_engine_for_model (model_type , tensor_parallel_degree )
148
+ else _determine_engine_for_model (model_type )
153
149
)
154
150
return super (DJLLargeModel , cls ).__new__ (cls_to_create )
155
151
@@ -160,8 +156,7 @@ def __init__(
160
156
djl_version : str = None ,
161
157
task : str = None ,
162
158
data_type : str = "fp32" ,
163
- tensor_parallel_degree : int = None ,
164
- data_parallel_degree : int = None ,
159
+ number_of_partitions : int = None ,
165
160
min_workers : int = None ,
166
161
max_workers : int = None ,
167
162
job_queue_size : int = None ,
@@ -192,16 +187,10 @@ def __init__(
192
187
If not provided, the task will be inferred from the model architecture by DJL.
193
188
data_type (str): The data type to use for loading your model. Accepted values are "fp32",
194
189
"fp16", "bf16", "int8". Defaults to "fp32".
195
- tensor_parallel_degree (int): The number of tensor parallel shards to use. It should be less
196
- than or equal to the number of gpus available on the instance. Defaults to None.
197
- If not provided, no tensor parallel sharding is done. If the provided value is
198
- greater than 1, DeepSpeed will be used as the backend.
199
- data_parallel_degree (int): The number of replicas of the model to instantiate. It should be
200
- less than or equal to the number of gpus available on the instance. Defaults to None.
201
- If not provided, all available gpus will be used. If tensor_parallel_degree is set,
202
- data_parallel_degree will be computed by DJL Serving based on the number of available GPUs.
203
- min_workers (int): The minimum number of worker processes. DJL Serving will auto detect
204
- the minimum workers if not specified. Defaults to None.
190
+ number_of_partitions (int): The number of GPUs to partition the model across. The partitioning
191
+ strategy is determined by the selected backend. If DeepSpeed is selected, this is
192
+ tensor parallelism. If HuggingFace Accelerate is selected, this is a naive sharding strategy
193
+ that splits the model layers across the available resources. Defaults to None.
205
194
max_workers (int): The maximum number of worker processes. DJL Serving will auto detect
206
195
the maximum workers if not specified. Defaults to None.
207
196
job_queue_size (int): The request job queue size. Defaults to None. If not specified,
@@ -235,7 +224,8 @@ def __init__(
235
224
:class:`~sagemaker.djl_inference.HuggingFaceAccelerateModel` based on our framework recommendation for the model type.
236
225
237
226
If you want to use a specific framework to deploy your model with, we recommend instantiating that specific
238
- model class directly.
227
+ model class directly. The available framework-specific classes are :class:`~sagemaker.djl_inference.DeepSpeedModel` or
228
+ :class:`~sagemaker.djl_inference.HuggingFaceAccelerateModel`.
239
229
"""
240
230
if kwargs .get ("model_data" ):
241
231
logger .warning (
@@ -250,8 +240,7 @@ def __init__(
250
240
self .djl_version = djl_version
251
241
self .task = task
252
242
self .data_type = data_type
253
- self .tensor_parallel_degree = tensor_parallel_degree
254
- self .data_parallel_degree = data_parallel_degree
243
+ self .number_of_partitions = number_of_partitions
255
244
self .min_workers = min_workers
256
245
self .max_workers = max_workers
257
246
self .job_queue_size = job_queue_size
@@ -273,7 +262,7 @@ def package_for_edge(
273
262
):
274
263
"""Not implemented.
275
264
276
- The class doesn't SageMaker edge.
265
+ The class doesn't support SageMaker edge.
277
266
278
267
Raises:
279
268
NotImplementedError
@@ -543,6 +532,8 @@ def generate_serving_properties(self, serving_properties={}) -> Dict[str, str]:
543
532
serving_properties ["engine" ] = self .engine .value [0 ]
544
533
serving_properties ["option.entryPoint" ] = self .engine .value [1 ]
545
534
serving_properties ["option.s3url" ] = self .uncompressed_model_data
535
+ if self .number_of_partitions :
536
+ serving_properties ["option.tensor_parallel_degree" ] = self .number_of_partitions
546
537
if self .entry_point :
547
538
serving_properties ["option.entryPoint" ] = self .entry_point
548
539
if self .task :
@@ -620,7 +611,7 @@ def __init__(
620
611
artifacts. The model artifacts are expected to be in HuggingFace pre-trained model
621
612
format (i.e. model should be loadable from the huggingface transformers from_pretrained
622
613
method).
623
- tensor_parallel_degree (int): The number of gpus to shard a single instance of the model across.
614
+ tensor_parallel_degree (int): The number of gpus to shard a single instance of the model across via tensor parallelism.
624
615
This should be set to greater than 1 if the size of the model is larger than the memory available
625
616
on a single GPU on the instance. Defaults to None. If not set, no tensor parallel sharding is done.
626
617
max_tokens (int): The maximum number of tokens (input + output tokens) the DeepSpeed engine
@@ -646,12 +637,13 @@ def __init__(
646
637
"""
647
638
super (DeepSpeedModel , self ).__init__ (
648
639
uncompressed_model_data ,
649
- tensor_parallel_degree = tensor_parallel_degree ,
650
640
** kwargs ,
651
641
)
652
- if self .data_parallel_degree is not None :
653
- logger .warn ("data_parallel_degree is not used by DeepSpeedModels. The data_parallel_degree will be auto"
654
- "computed by DJL based on the tensor_parallel_degree, min_workers, and max_workers" )
642
+ if self .number_of_partitions and tensor_parallel_degree :
643
+ logger .warn ("Both number_of_partitions and tensor_parallel_degree have been set for DeepSpeedModel. "
644
+ "These mean the same thing for DeepSpeedModel. Please only set tensor_parallel_degree. "
645
+ "number_of_partitions will be ignored" )
646
+ self .number_of_partitions = tensor_parallel_degree
655
647
self .engine = DJLEngine .DEEPSPEED
656
648
self .max_tokens = max_tokens
657
649
self .low_cpu_mem_usage = low_cpu_mem_usage
@@ -675,14 +667,12 @@ def generate_serving_properties(self, serving_properties={}) -> Dict[str, str]:
675
667
dict: The model server configuration to use when deploying this model to SageMaker.
676
668
"""
677
669
serving_properties = super (DeepSpeedModel , self ).generate_serving_properties ()
678
- if self .tensor_parallel_degree :
679
- serving_properties ["option.tensor_parallel_degree" ] = self .tensor_parallel_degree
680
670
if self .max_tokens :
681
671
serving_properties ["option.max_tokens" ] = self .max_tokens
682
672
if self .low_cpu_mem_usage :
683
673
serving_properties ["option.low_cpu_mem_usage" ] = self .low_cpu_mem_usage
684
674
if self .enable_cuda_graph :
685
- if self .tensor_parallel_degree > 1 :
675
+ if self .number_of_partitions > 1 :
686
676
raise ValueError (
687
677
"enable_cuda_graph is not supported when tensor_parallel_degree > 1"
688
678
)
@@ -703,7 +693,6 @@ class HuggingFaceAccelerateModel(DJLLargeModel):
703
693
def __init__ (
704
694
self ,
705
695
uncompressed_model_data : str ,
706
- data_parallel_degree : int = None ,
707
696
device_id : int = None ,
708
697
device_map : Union [str , Dict [str , str ]] = None ,
709
698
load_in_8bit : bool = False ,
@@ -742,12 +731,8 @@ def __init__(
742
731
"""
743
732
super (HuggingFaceAccelerateModel , self ).__init__ (
744
733
uncompressed_model_data ,
745
- data_parallel_degree = data_parallel_degree ,
746
734
** kwargs ,
747
735
)
748
- if self .tensor_parallel_degree is not None :
749
- logger .warn ("tensor_parallel_degree is not used by HuggingFaceAccelerateModels. "
750
- "Please use data_parallel_degree instead" )
751
736
self .engine = DJLEngine .HUGGINGFACE_ACCELERATE
752
737
self .device_id = device_id
753
738
self .device_map = device_map
@@ -770,11 +755,9 @@ def generate_serving_properties(self, serving_properties={}) -> Dict[str, str]:
770
755
dict: The model server configuration to use when deploying this model to SageMaker.
771
756
"""
772
757
serving_properties = super (HuggingFaceAccelerateModel , self ).generate_serving_properties ()
773
- if self .data_parallel_degree :
774
- serving_properties ["option.tensor_parallel_degree" ] = self .data_parallel_degree
775
758
if self .device_id :
776
- if self .data_parallel_degree > 1 :
777
- raise ValueError ("device_id cannot be set when data_parallel_degree is > 1" )
759
+ if self .number_of_partitions > 1 :
760
+ raise ValueError ("device_id cannot be set when number_of_partitions is > 1" )
778
761
serving_properties ["option.device_id" ] = self .device_id
779
762
if self .device_map :
780
763
serving_properties ["option.device_map" ] = self .device_map
0 commit comments