Skip to content

Commit f17f059

Browse files
committed
Refactor api with respect to model sharding strategies
1 parent ba0401f commit f17f059

File tree

2 files changed: +23 additions, −40 deletions

src/sagemaker/djl_inference/defaults.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
"bloom",
1818
"opt",
1919
"gpt_neox",
20-
# "gptj",
20+
"gptj",
2121
"gpt_neo",
2222
"gpt2",
2323
"xlm-roberta",

src/sagemaker/djl_inference/model.py

Lines changed: 22 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -88,12 +88,9 @@ def __init__(
8888
)
8989

9090

91-
def _determine_engine_for_model(model_type: str, tensor_parallel_degree: int):
91+
def _determine_engine_for_model(model_type: str):
9292
"""Placeholder docstring"""
9393

94-
if tensor_parallel_degree and tensor_parallel_degree > 1:
95-
return DeepSpeedModel
96-
9794
if model_type in defaults.DEEPSPEED_RECOMMENDED_ARCHITECTURES:
9895
return DeepSpeedModel
9996
return HuggingFaceAccelerateModel
@@ -129,7 +126,6 @@ class DJLLargeModel(FrameworkModel):
129126
def __new__(
130127
cls,
131128
uncompressed_model_data: str,
132-
tensor_parallel_degree: int = None,
133129
*args,
134130
**kwargs,
135131
):
@@ -149,7 +145,7 @@ def __new__(
149145
cls_to_create = (
150146
cls
151147
if cls is not DJLLargeModel
152-
else _determine_engine_for_model(model_type, tensor_parallel_degree)
148+
else _determine_engine_for_model(model_type)
153149
)
154150
return super(DJLLargeModel, cls).__new__(cls_to_create)
155151

@@ -160,8 +156,7 @@ def __init__(
160156
djl_version: str = None,
161157
task: str = None,
162158
data_type: str = "fp32",
163-
tensor_parallel_degree: int = None,
164-
data_parallel_degree: int = None,
159+
number_of_partitions: int = None,
165160
min_workers: int = None,
166161
max_workers: int = None,
167162
job_queue_size: int = None,
@@ -192,16 +187,10 @@ def __init__(
192187
If not provided, the task will be inferred from the model architecture by DJL.
193188
data_type (str): The data type to use for loading your model. Accepted values are "fp32",
194189
"fp16", "bf16", "int8". Defaults to "fp32".
195-
tensor_parallel_degree (int): The number of tensor parallel shards to use. It should be less
196-
than or equal to the number of gpus available on the instance. Defaults to None.
197-
If not provided, no tensor parallel sharding is done. If the provided value is
198-
greater than 1, DeepSpeed will be used as the backend.
199-
data_parallel_degree (int): The number of replicas of the model to instantiate. It should be
200-
less than or equal to the number of gpus available on the instance. Defaults to None.
201-
If not provided, all available gpus will be used. If tensor_parallel_degree is set,
202-
data_parallel_degree will be computed by DJL Serving based on the number of available GPUs.
203-
min_workers (int): The minimum number of worker processes. DJL Serving will auto detect
204-
the minimum workers if not specified. Defaults to None.
190+
number_of_partitions (int): The number of GPUs to partition the model across. The partitioning
191+
strategy is determined by the selected backend. If DeepSpeed is selected, this is
192+
tensor parallelism. If HuggingFace Accelerate is selected, this is a naive sharding strategy
193+
that splits the model layers across the available resources.
205194
max_workers (int): The maximum number of worker processes. DJL Serving will auto detect
206195
the maximum workers if not specified. Defaults to None.
207196
job_queue_size (int): The request job queue size. Defaults to None. If not specified,
@@ -235,7 +224,8 @@ def __init__(
235224
:class:`~sagemaker.djl_inference.HuggingFaceAccelerateModel` based on our framework recommendation for the model type.
236225
237226
If you want to use a specific framework to deploy your model with, we recommend instantiating that specific
238-
model class directly.
227+
model class directly. The available framework specific classes are :class:`~sagemaker.djl_inference.DeepSpeedModel` or
228+
:class:`~sagemaker.djl_inference.HuggingFaceAccelerateModel`
239229
"""
240230
if kwargs.get("model_data"):
241231
logger.warning(
@@ -250,8 +240,7 @@ def __init__(
250240
self.djl_version = djl_version
251241
self.task = task
252242
self.data_type = data_type
253-
self.tensor_parallel_degree = tensor_parallel_degree
254-
self.data_parallel_degree = data_parallel_degree
243+
self.number_of_partitions = number_of_partitions
255244
self.min_workers = min_workers
256245
self.max_workers = max_workers
257246
self.job_queue_size = job_queue_size
@@ -273,7 +262,7 @@ def package_for_edge(
273262
):
274263
"""Not implemented.
275264
276-
The class doesn't SageMaker edge.
265+
The class doesn't support SageMaker edge.
277266
278267
Raises:
279268
NotImplementedError
@@ -543,6 +532,8 @@ def generate_serving_properties(self, serving_properties={}) -> Dict[str, str]:
543532
serving_properties["engine"] = self.engine.value[0]
544533
serving_properties["option.entryPoint"] = self.engine.value[1]
545534
serving_properties["option.s3url"] = self.uncompressed_model_data
535+
if self.number_of_partitions:
536+
serving_properties["option.tensor_parallel_degree"] = self.number_of_partitions
546537
if self.entry_point:
547538
serving_properties["option.entryPoint"] = self.entry_point
548539
if self.task:
@@ -620,7 +611,7 @@ def __init__(
620611
artifacts. The model artifacts are expected to be in HuggingFace pre-trained model
621612
format (i.e. model should be loadable from the huggingface transformers from_pretrained
622613
method).
623-
tensor_parallel_degree (int): The number of gpus to shard a single instance of the model across.
614+
tensor_parallel_degree (int): The number of gpus to shard a single instance of the model across via tensor_parallelism.
624615
This should be set to greater than 1 if the size of the model is larger than the memory available
625616
on a single GPU on the instance. Defaults to None. If not set, no tensor parallel sharding is done.
626617
max_tokens (int): The maximum number of tokens (input + output tokens) the DeepSpeed engine
@@ -646,12 +637,13 @@ def __init__(
646637
"""
647638
super(DeepSpeedModel, self).__init__(
648639
uncompressed_model_data,
649-
tensor_parallel_degree=tensor_parallel_degree,
650640
**kwargs,
651641
)
652-
if self.data_parallel_degree is not None:
653-
logger.warn("data_parallel_degree is not used by DeepSpeedModels. The data_parallel_degree will be auto"
654-
"computed by DJL based on the tensor_parallel_degree, min_workers, and max_workers")
642+
if self.number_of_partitions and tensor_parallel_degree:
643+
logger.warn("Both number_of_partitions and tensor_parallelism_degree have been set for DeepSpeedModel."
644+
"These mean the same thing for DeepSpeedModel. Please only set tensor_parallel_degree."
645+
"number_of_partitions will be ignored")
646+
self.number_of_partitions = tensor_parallel_degree
655647
self.engine = DJLEngine.DEEPSPEED
656648
self.max_tokens = max_tokens
657649
self.low_cpu_mem_usage = low_cpu_mem_usage
@@ -675,14 +667,12 @@ def generate_serving_properties(self, serving_properties={}) -> Dict[str, str]:
675667
dict: The model server configuration to use when deploying this model to SageMaker.
676668
"""
677669
serving_properties = super(DeepSpeedModel, self).generate_serving_properties()
678-
if self.tensor_parallel_degree:
679-
serving_properties["option.tensor_parallel_degree"] = self.tensor_parallel_degree
680670
if self.max_tokens:
681671
serving_properties["option.max_tokens"] = self.max_tokens
682672
if self.low_cpu_mem_usage:
683673
serving_properties["option.low_cpu_mem_usage"] = self.low_cpu_mem_usage
684674
if self.enable_cuda_graph:
685-
if self.tensor_parallel_degree > 1:
675+
if self.number_of_partitions > 1:
686676
raise ValueError(
687677
"enable_cuda_graph is not supported when tensor_parallel_degree > 1"
688678
)
@@ -703,7 +693,6 @@ class HuggingFaceAccelerateModel(DJLLargeModel):
703693
def __init__(
704694
self,
705695
uncompressed_model_data: str,
706-
data_parallel_degree: int = None,
707696
device_id: int = None,
708697
device_map: Union[str, Dict[str, str]] = None,
709698
load_in_8bit: bool = False,
@@ -742,12 +731,8 @@ def __init__(
742731
"""
743732
super(HuggingFaceAccelerateModel, self).__init__(
744733
uncompressed_model_data,
745-
data_parallel_degree=data_parallel_degree,
746734
**kwargs,
747735
)
748-
if self.tensor_parallel_degree is not None:
749-
logger.warn("tensor_parallel_degree is not used by HuggingFaceAccelerateModels. "
750-
"Please use data_parallel_degree instead")
751736
self.engine = DJLEngine.HUGGINGFACE_ACCELERATE
752737
self.device_id = device_id
753738
self.device_map = device_map
@@ -770,11 +755,9 @@ def generate_serving_properties(self, serving_properties={}) -> Dict[str, str]:
770755
dict: The model server configuration to use when deploying this model to SageMaker.
771756
"""
772757
serving_properties = super(HuggingFaceAccelerateModel, self).generate_serving_properties()
773-
if self.data_parallel_degree:
774-
serving_properties["option.tensor_parallel_degree"] = self.data_parallel_degree
775758
if self.device_id:
776-
if self.data_parallel_degree > 1:
777-
raise ValueError("device_id cannot be set when data_parallel_degree is > 1")
759+
if self.number_of_partitions > 1:
760+
raise ValueError("device_id cannot be set when number_of_partitions is > 1")
778761
serving_properties["option.device_id"] = self.device_id
779762
if self.device_map:
780763
serving_properties["option.device_map"] = self.device_map

Comments (0) — this commit has no comments.