@@ -88,12 +88,9 @@ def __init__(
88
88
)
89
89
90
90
91
- def _determine_engine_for_model (model_type : str , tensor_parallel_degree : int ):
91
+ def _determine_engine_for_model (model_type : str ):
92
92
"""Placeholder docstring"""
93
93
94
- if tensor_parallel_degree and tensor_parallel_degree > 1 :
95
- return DeepSpeedModel
96
-
97
94
if model_type in defaults .DEEPSPEED_RECOMMENDED_ARCHITECTURES :
98
95
return DeepSpeedModel
99
96
return HuggingFaceAccelerateModel
@@ -129,7 +126,6 @@ class DJLLargeModel(FrameworkModel):
129
126
def __new__ (
130
127
cls ,
131
128
uncompressed_model_data : str ,
132
- tensor_parallel_degree : int = None ,
133
129
* args ,
134
130
** kwargs ,
135
131
):
@@ -149,7 +145,7 @@ def __new__(
149
145
cls_to_create = (
150
146
cls
151
147
if cls is not DJLLargeModel
152
- else _determine_engine_for_model (model_type , tensor_parallel_degree )
148
+ else _determine_engine_for_model (model_type )
153
149
)
154
150
return super (DJLLargeModel , cls ).__new__ (cls_to_create )
155
151
@@ -160,8 +156,7 @@ def __init__(
160
156
djl_version : str = None ,
161
157
task : str = None ,
162
158
data_type : str = "fp32" ,
163
- tensor_parallel_degree : int = None ,
164
- data_parallel_degree : int = None ,
159
+ number_of_partitions : int = None ,
165
160
min_workers : int = None ,
166
161
max_workers : int = None ,
167
162
job_queue_size : int = None ,
@@ -192,16 +187,10 @@ def __init__(
192
187
If not provided, the task will be inferred from the model architecture by DJL.
193
188
data_type (str): The data type to use for loading your model. Accepted values are "fp32",
194
189
"fp16", "bf16", "int8". Defaults to "fp32".
195
- tensor_parallel_degree (int): The number of tensor parallel shards to use. It should be less
196
- than or equal to the number of gpus available on the instance. Defaults to None.
197
- If not provided, no tensor parallel sharding is done. If the provided value is
198
- greater than 1, DeepSpeed will be used as the backend.
199
- data_parallel_degree (int): The number of replicas of the model to instantiate. It should be
200
- less than or equal to the number of gpus available on the instance. Defaults to None.
201
- If not provided, all available gpus will be used. If tensor_parallel_degree is set,
202
- data_parallel_degree will be computed by DJL Serving based on the number of available GPUs.
203
- min_workers (int): The minimum number of worker processes. DJL Serving will auto detect
204
- the minimum workers if not specified. Defaults to None.
190
+ number_of_partitions (int): The number of GPUs to partition the model across. The partitioning
191
+ strategy is determined by the selected backend. If DeepSpeed is selected, this is
192
+ tensor parallelism. If HuggingFace Accelerate is selected, this is a naive sharding strategy
193
+ that splits the model layers across the available resources. Defaults to None.
205
194
max_workers (int): The maximum number of worker processes. DJL Serving will auto detect
206
195
the maximum workers if not specified. Defaults to None.
207
196
job_queue_size (int): The request job queue size. Defaults to None. If not specified,
@@ -235,7 +224,8 @@ def __init__(
235
224
:class:`~sagemaker.djl_inference.HuggingFaceAccelerateModel` based on our framework recommendation for the model type.
236
225
237
226
If you want to use a specific framework to deploy your model with, we recommend instantiating that specific
238
- model class directly.
227
+ model class directly. The available framework-specific classes are :class:`~sagemaker.djl_inference.DeepSpeedModel` or
228
+ :class:`~sagemaker.djl_inference.HuggingFaceAccelerateModel`.
239
229
"""
240
230
if kwargs .get ("model_data" ):
241
231
logger .warning (
@@ -250,8 +240,7 @@ def __init__(
250
240
self .djl_version = djl_version
251
241
self .task = task
252
242
self .data_type = data_type
253
- self .tensor_parallel_degree = tensor_parallel_degree
254
- self .data_parallel_degree = data_parallel_degree
243
+ self .number_of_partitions = number_of_partitions
255
244
self .min_workers = min_workers
256
245
self .max_workers = max_workers
257
246
self .job_queue_size = job_queue_size
@@ -273,7 +262,7 @@ def package_for_edge(
273
262
):
274
263
"""Not implemented.
275
264
276
- The class doesn't SageMaker edge.
265
+ The class doesn't support SageMaker edge.
277
266
278
267
Raises:
279
268
NotImplementedError
@@ -543,6 +532,8 @@ def generate_serving_properties(self, serving_properties={}) -> Dict[str, str]:
543
532
serving_properties ["engine" ] = self .engine .value [0 ]
544
533
serving_properties ["option.entryPoint" ] = self .engine .value [1 ]
545
534
serving_properties ["option.s3url" ] = self .uncompressed_model_data
535
+ if self .number_of_partitions :
536
+ serving_properties ["option.tensor_parallel_degree" ] = self .number_of_partitions
546
537
if self .entry_point :
547
538
serving_properties ["option.entryPoint" ] = self .entry_point
548
539
if self .task :
@@ -620,7 +611,7 @@ def __init__(
620
611
artifacts. The model artifacts are expected to be in HuggingFace pre-trained model
621
612
format (i.e. model should be loadable from the huggingface transformers from_pretrained
622
613
method).
623
- tensor_parallel_degree (int): The number of gpus to shard a single instance of the model across.
614
+ tensor_parallel_degree (int): The number of gpus to shard a single instance of the model across via tensor parallelism.
624
615
This should be set to greater than 1 if the size of the model is larger than the memory available
625
616
on a single GPU on the instance. Defaults to None. If not set, no tensor parallel sharding is done.
626
617
max_tokens (int): The maximum number of tokens (input + output tokens) the DeepSpeed engine
@@ -646,12 +637,13 @@ def __init__(
646
637
"""
647
638
super (DeepSpeedModel , self ).__init__ (
648
639
uncompressed_model_data ,
649
- tensor_parallel_degree = tensor_parallel_degree ,
650
640
** kwargs ,
651
641
)
652
- if self .data_parallel_degree is not None :
653
- logger .warn ("data_parallel_degree is not used by DeepSpeedModels. The data_parallel_degree will be auto"
654
- "computed by DJL based on the tensor_parallel_degree, min_workers, and max_workers" )
642
+ if self .number_of_partitions and tensor_parallel_degree :
643
+ logger .warn ("Both number_of_partitions and tensor_parallel_degree have been set for DeepSpeedModel. "
644
+ "These mean the same thing for DeepSpeedModel. Please only set tensor_parallel_degree. "
645
+ "number_of_partitions will be ignored" )
646
+ self .number_of_partitions = tensor_parallel_degree
655
647
self .engine = DJLEngine .DEEPSPEED
656
648
self .max_tokens = max_tokens
657
649
self .low_cpu_mem_usage = low_cpu_mem_usage
@@ -675,14 +667,12 @@ def generate_serving_properties(self, serving_properties={}) -> Dict[str, str]:
675
667
dict: The model server configuration to use when deploying this model to SageMaker.
676
668
"""
677
669
serving_properties = super (DeepSpeedModel , self ).generate_serving_properties ()
678
- if self .tensor_parallel_degree :
679
- serving_properties ["option.tensor_parallel_degree" ] = self .tensor_parallel_degree
680
670
if self .max_tokens :
681
671
serving_properties ["option.max_tokens" ] = self .max_tokens
682
672
if self .low_cpu_mem_usage :
683
673
serving_properties ["option.low_cpu_mem_usage" ] = self .low_cpu_mem_usage
684
674
if self .enable_cuda_graph :
685
- if self .tensor_parallel_degree > 1 :
675
+ if self .number_of_partitions > 1 :
686
676
raise ValueError (
687
677
"enable_cuda_graph is not supported when tensor_parallel_degree > 1"
688
678
)
@@ -703,7 +693,6 @@ class HuggingFaceAccelerateModel(DJLLargeModel):
703
693
def __init__ (
704
694
self ,
705
695
uncompressed_model_data : str ,
706
- data_parallel_degree : int = None ,
707
696
device_id : int = None ,
708
697
device_map : Union [str , Dict [str , str ]] = None ,
709
698
load_in_8bit : bool = False ,
@@ -742,12 +731,8 @@ def __init__(
742
731
"""
743
732
super (HuggingFaceAccelerateModel , self ).__init__ (
744
733
uncompressed_model_data ,
745
- data_parallel_degree = data_parallel_degree ,
746
734
** kwargs ,
747
735
)
748
- if self .tensor_parallel_degree is not None :
749
- logger .warn ("tensor_parallel_degree is not used by HuggingFaceAccelerateModels. "
750
- "Please use data_parallel_degree instead" )
751
736
self .engine = DJLEngine .HUGGINGFACE_ACCELERATE
752
737
self .device_id = device_id
753
738
self .device_map = device_map
@@ -770,11 +755,9 @@ def generate_serving_properties(self, serving_properties={}) -> Dict[str, str]:
770
755
dict: The model server configuration to use when deploying this model to SageMaker.
771
756
"""
772
757
serving_properties = super (HuggingFaceAccelerateModel , self ).generate_serving_properties ()
773
- if self .data_parallel_degree :
774
- serving_properties ["option.tensor_parallel_degree" ] = self .data_parallel_degree
775
758
if self .device_id :
776
- if self .data_parallel_degree > 1 :
777
- raise ValueError ("device_id cannot be set when data_parallel_degree is > 1" )
759
+ if self .number_of_partitions > 1 :
760
+ raise ValueError ("device_id cannot be set when number_of_partitions is > 1" )
778
761
serving_properties ["option.device_id" ] = self .device_id
779
762
if self .device_map :
780
763
serving_properties ["option.device_map" ] = self .device_map
0 commit comments