[change] change data_type argument to dtype to keep consistent with D… #3832

Merged: 1 commit, May 4, 2023
doc/frameworks/djl/using_djl.rst (10 changes: 5 additions, 5 deletions)
@@ -31,7 +31,7 @@ You can either deploy your model using DeepSpeed or HuggingFace Accelerate, or l
djl_model = DJLModel(
"s3://my_bucket/my_saved_model_artifacts/", # This can also be a HuggingFace Hub model id
"my_sagemaker_role",
data_type="fp16",
dtype="fp16",
task="text-generation",
number_of_partitions=2 # number of gpus to partition the model across
)
@@ -48,7 +48,7 @@ If you want to use a specific backend, then you can create an instance of the co
deepspeed_model = DeepSpeedModel(
"s3://my_bucket/my_saved_model_artifacts/", # This can also be a HuggingFace Hub model id
"my_sagemaker_role",
data_type="bf16",
dtype="bf16",
task="text-generation",
tensor_parallel_degree=2, # number of gpus to partition the model across using tensor parallelism
)
@@ -58,7 +58,7 @@ If you want to use a specific backend, then you can create an instance of the co
hf_accelerate_model = HuggingFaceAccelerateModel(
"s3://my_bucket/my_saved_model_artifacts/", # This can also be a HuggingFace Hub model id
"my_sagemaker_role",
data_type="fp16",
dtype="fp16",
task="text-generation",
number_of_partitions=2, # number of gpus to partition the model across
)
@@ -109,7 +109,7 @@ For example, you can deploy the EleutherAI gpt-j-6B model like this:
model = DJLModel(
"EleutherAI/gpt-j-6B",
"my_sagemaker_role",
data_type="fp16",
dtype="fp16",
number_of_partitions=2
)

@@ -142,7 +142,7 @@ You would then pass "s3://my_bucket/gpt-j-6B" as ``model_id`` to the ``DJLModel`
model = DJLModel(
"s3://my_bucket/gpt-j-6B",
"my_sagemaker_role",
data_type="fp16",
dtype="fp16",
number_of_partitions=2
)

src/sagemaker/djl_inference/model.py (31 changes: 19 additions, 12 deletions)
@@ -233,7 +233,7 @@ def __init__(
role: str,
djl_version: Optional[str] = None,
task: Optional[str] = None,
-        data_type: str = "fp32",
+        dtype: str = "fp32",
number_of_partitions: Optional[int] = None,
min_workers: Optional[int] = None,
max_workers: Optional[int] = None,
@@ -264,7 +264,7 @@ def __init__(
task (str): The HuggingFace/NLP task you want to launch this model for. Defaults to
None.
If not provided, the task will be inferred from the model architecture by DJL.
-            data_type (str): The data type to use for loading your model. Accepted values are
+            dtype (str): The data type to use for loading your model. Accepted values are
"fp32", "fp16", "bf16", "int8". Defaults to "fp32".
number_of_partitions (int): The number of GPUs to partition the model across. The
partitioning strategy is determined by the selected backend. If DeepSpeed is
@@ -322,13 +322,20 @@ def __init__(
"You only need to set model_id and ensure it points to uncompressed model "
"artifacts in s3, or a valid HuggingFace Hub model_id."
)
+        data_type = kwargs.pop("data_type", None)
+        if data_type:
+            logger.warning(
+                "data_type is being deprecated in favor of dtype. Please migrate use of data_type"
+                " to dtype. Support for data_type will be removed in a future release"
+            )
+        dtype = dtype or data_type
super(DJLModel, self).__init__(
None, image_uri, role, entry_point, predictor_cls=predictor_cls, **kwargs
)
self.model_id = model_id
self.djl_version = djl_version
self.task = task
-        self.data_type = data_type
+        self.dtype = dtype
self.number_of_partitions = number_of_partitions
self.min_workers = min_workers
self.max_workers = max_workers
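
A minimal sketch of the migration path introduced above, reusing the placeholder bucket and role names from the documentation (constructing a DJLModel also requires reachable model artifacts): the old keyword now arrives through **kwargs, where it is popped and the deprecation warning is logged.

from sagemaker.djl_inference import DJLModel

# Deprecated spelling: still accepted for now, but logs the warning above.
legacy_model = DJLModel(
    "s3://my_bucket/my_saved_model_artifacts/",  # placeholder artifact location
    "my_sagemaker_role",                         # placeholder IAM role
    data_type="fp16",
)

# Preferred spelling going forward.
model = DJLModel(
    "s3://my_bucket/my_saved_model_artifacts/",  # placeholder artifact location
    "my_sagemaker_role",                         # placeholder IAM role
    dtype="fp16",
)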
@@ -372,7 +379,7 @@ def transformer(self, **_):
"DJLModels do not currently support Batch Transform inference jobs"
)

-    def right_size(self, checkpoint_data_type: str):
+    def right_size(self, **_):
"""Not implemented.

DJLModels do not support SageMaker Inference Recommendation Jobs.
@@ -573,8 +580,8 @@ def generate_serving_properties(self, serving_properties=None) -> Dict[str, str]
serving_properties["option.entryPoint"] = self.entry_point
if self.task:
serving_properties["option.task"] = self.task
-        if self.data_type:
-            serving_properties["option.dtype"] = self.data_type
+        if self.dtype:
+            serving_properties["option.dtype"] = self.dtype
if self.min_workers:
serving_properties["minWorkers"] = self.min_workers
if self.max_workers:
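
Only the Python attribute was renamed; the serialized key is unchanged, so existing serving.properties files keep working. A hedged sketch, reusing the placeholder model from the earlier example:

serving_properties = model.generate_serving_properties()
# The key name is untouched by this PR; only the attribute feeding it changed.
assert serving_properties["option.dtype"] == "fp16"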
@@ -779,7 +786,7 @@ def __init__(
None.
load_in_8bit (bool): Whether to load the model in int8 precision using bits and bytes
quantization. This is only supported for select model architectures.
-            Defaults to False. If ``data_type`` is int8, then this is set to True.
+            Defaults to False. If ``dtype`` is int8, then this is set to True.
low_cpu_mem_usage (bool): Whether to limit CPU memory usage to 1x model size during
model loading. This is an experimental feature in HuggingFace. This is useful when
loading multiple instances of your model in parallel. Defaults to False.
@@ -832,19 +839,19 @@ def generate_serving_properties(self, serving_properties=None) -> Dict[str, str]
if self.device_map:
serving_properties["option.device_map"] = self.device_map
if self.load_in_8bit:
-            if self.data_type != "int8":
-                raise ValueError("Set data_type='int8' to use load_in_8bit")
+            if self.dtype != "int8":
+                raise ValueError("Set dtype='int8' to use load_in_8bit")
serving_properties["option.load_in_8bit"] = self.load_in_8bit
-        if self.data_type == "int8":
+        if self.dtype == "int8":
serving_properties["option.load_in_8bit"] = True
if self.low_cpu_mem_usage:
serving_properties["option.low_cpu_mem_usage"] = self.low_cpu_mem_usage
# This is a workaround due to a bug in our built in handler for huggingface
# TODO: This needs to be fixed when new dlc is published
if (
serving_properties["option.entryPoint"] == "djl_python.huggingface"
-            and self.data_type
-            and self.data_type != "auto"
+            and self.dtype
+            and self.dtype != "auto"
):
serving_properties["option.dtype"] = "auto"
serving_properties.pop("option.load_in_8bit", None)
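
A hedged sketch of the renamed validation above, mirroring the unit test further down (placeholder names, and a configured sagemaker_session may be required in practice):

import pytest
from sagemaker.djl_inference import HuggingFaceAccelerateModel

model = HuggingFaceAccelerateModel(
    "s3://my_bucket/my_saved_model_artifacts/",  # placeholder
    "my_sagemaker_role",                         # placeholder
    dtype="fp16",        # any value other than "int8"
    load_in_8bit=True,   # conflicts with the dtype above
)
with pytest.raises(ValueError, match="Set dtype='int8' to use load_in_8bit"):
    model.generate_serving_properties()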
tests/unit/test_djl_inference.py (12 changes: 6 additions, 6 deletions)
@@ -351,12 +351,12 @@ def test_generate_huggingface_serving_properties_invalid_configurations(
VALID_UNCOMPRESSED_MODEL_DATA,
ROLE,
sagemaker_session=sagemaker_session,
data_type="fp16",
dtype="fp16",
load_in_8bit=True,
)
with pytest.raises(ValueError) as invalid_config:
_ = model.generate_serving_properties()
-    assert str(invalid_config.value).startswith("Set data_type='int8' to use load_in_8bit")
+    assert str(invalid_config.value).startswith("Set dtype='int8' to use load_in_8bit")

model = HuggingFaceAccelerateModel(
VALID_UNCOMPRESSED_MODEL_DATA,
@@ -391,7 +391,7 @@ def test_generate_serving_properties_with_valid_configurations(
min_workers=1,
max_workers=3,
job_queue_size=4,
data_type="fp16",
dtype="fp16",
parallel_loading=True,
model_loading_timeout=120,
prediction_timeout=4,
@@ -429,7 +429,7 @@ def test_generate_serving_properties_with_valid_configurations(
sagemaker_session=sagemaker_session,
tensor_parallel_degree=1,
task="text-generation",
data_type="bf16",
dtype="bf16",
max_tokens=2048,
low_cpu_mem_usage=True,
enable_cuda_graph=True,
@@ -459,7 +459,7 @@ def test_generate_serving_properties_with_valid_configurations(
number_of_partitions=1,
device_id=4,
device_map="balanced",
data_type="fp32",
dtype="fp32",
low_cpu_mem_usage=False,
)
serving_properties = model.generate_serving_properties()
@@ -513,7 +513,7 @@ def test_deploy_model_no_local_code(
ROLE,
sagemaker_session=sagemaker_session,
number_of_partitions=4,
data_type="fp16",
dtype="fp16",
container_log_level=logging.DEBUG,
env=ENV,
)
Expand Down