[change] change data_type argument to dtype to keep consistent with D… #3832

Merged: 1 commit, May 4, 2023
doc/frameworks/djl/using_djl.rst (10 changes: 5 additions, 5 deletions)
@@ -31,7 +31,7 @@ You can either deploy your model using DeepSpeed or HuggingFace Accelerate, or l
djl_model = DJLModel(
"s3://my_bucket/my_saved_model_artifacts/", # This can also be a HuggingFace Hub model id
"my_sagemaker_role",
data_type="fp16",
dtype="fp16",
task="text-generation",
number_of_partitions=2 # number of gpus to partition the model across
)
@@ -48,7 +48,7 @@ If you want to use a specific backend, then you can create an instance of the co
deepspeed_model = DeepSpeedModel(
"s3://my_bucket/my_saved_model_artifacts/", # This can also be a HuggingFace Hub model id
"my_sagemaker_role",
data_type="bf16",
dtype="bf16",
task="text-generation",
tensor_parallel_degree=2, # number of gpus to partition the model across using tensor parallelism
)
@@ -58,7 +58,7 @@ If you want to use a specific backend, then you can create an instance of the co
hf_accelerate_model = HuggingFaceAccelerateModel(
"s3://my_bucket/my_saved_model_artifacts/", # This can also be a HuggingFace Hub model id
"my_sagemaker_role",
data_type="fp16",
dtype="fp16",
task="text-generation",
number_of_partitions=2, # number of gpus to partition the model across
)
@@ -109,7 +109,7 @@ For example, you can deploy the EleutherAI gpt-j-6B model like this:
model = DJLModel(
"EleutherAI/gpt-j-6B",
"my_sagemaker_role",
data_type="fp16",
dtype="fp16",
number_of_partitions=2
)

@@ -142,7 +142,7 @@ You would then pass "s3://my_bucket/gpt-j-6B" as ``model_id`` to the ``DJLModel`
model = DJLModel(
"s3://my_bucket/gpt-j-6B",
"my_sagemaker_role",
data_type="fp16",
dtype="fp16",
number_of_partitions=2
)

src/sagemaker/djl_inference/model.py (31 changes: 19 additions, 12 deletions)
@@ -233,7 +233,7 @@ def __init__(
role: str,
djl_version: Optional[str] = None,
task: Optional[str] = None,
-        data_type: str = "fp32",
+        dtype: str = "fp32",
number_of_partitions: Optional[int] = None,
min_workers: Optional[int] = None,
max_workers: Optional[int] = None,
@@ -264,7 +264,7 @@ def __init__(
task (str): The HuggingFace/NLP task you want to launch this model for. Defaults to
None.
If not provided, the task will be inferred from the model architecture by DJL.
-            data_type (str): The data type to use for loading your model. Accepted values are
+            dtype (str): The data type to use for loading your model. Accepted values are
"fp32", "fp16", "bf16", "int8". Defaults to "fp32".
number_of_partitions (int): The number of GPUs to partition the model across. The
partitioning strategy is determined by the selected backend. If DeepSpeed is
@@ -322,13 +322,20 @@ def __init__(
"You only need to set model_id and ensure it points to uncompressed model "
"artifacts in s3, or a valid HuggingFace Hub model_id."
)
+        data_type = kwargs.pop("data_type", None)
+        if data_type:
+            logger.warning(
+                "data_type is being deprecated in favor of dtype. Please migrate use of data_type"
+                " to dtype. Support for data_type will be removed in a future release"
+            )
+        dtype = dtype or data_type
super(DJLModel, self).__init__(
None, image_uri, role, entry_point, predictor_cls=predictor_cls, **kwargs
)
self.model_id = model_id
self.djl_version = djl_version
self.task = task
-        self.data_type = data_type
+        self.dtype = dtype
self.number_of_partitions = number_of_partitions
self.min_workers = min_workers
self.max_workers = max_workers
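
A minimal sketch of the migration path introduced above, reusing the placeholder bucket and role names from the documentation (constructing a DJLModel also requires reachable model artifacts): the old keyword now arrives through **kwargs, where it is popped and the deprecation warning is logged.

from sagemaker.djl_inference import DJLModel

# Deprecated spelling: still accepted for now, but logs the warning above.
legacy_model = DJLModel(
    "s3://my_bucket/my_saved_model_artifacts/",  # placeholder artifact location
    "my_sagemaker_role",                         # placeholder IAM role
    data_type="fp16",
)

# Preferred spelling going forward.
model = DJLModel(
    "s3://my_bucket/my_saved_model_artifacts/",  # placeholder artifact location
    "my_sagemaker_role",                         # placeholder IAM role
    dtype="fp16",
)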
@@ -372,7 +379,7 @@ def transformer(self, **_):
"DJLModels do not currently support Batch Transform inference jobs"
)

-    def right_size(self, checkpoint_data_type: str):
+    def right_size(self, **_):
"""Not implemented.

DJLModels do not support SageMaker Inference Recommendation Jobs.
@@ -573,8 +580,8 @@ def generate_serving_properties(self, serving_properties=None) -> Dict[str, str]
serving_properties["option.entryPoint"] = self.entry_point
if self.task:
serving_properties["option.task"] = self.task
-        if self.data_type:
-            serving_properties["option.dtype"] = self.data_type
+        if self.dtype:
+            serving_properties["option.dtype"] = self.dtype
if self.min_workers:
serving_properties["minWorkers"] = self.min_workers
if self.max_workers:
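
Only the Python attribute was renamed; the serialized key is unchanged, so existing serving.properties files keep working. A hedged sketch, reusing the placeholder model from the earlier example:

serving_properties = model.generate_serving_properties()
# The key name is untouched by this PR; only the attribute feeding it changed.
assert serving_properties["option.dtype"] == "fp16"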
@@ -779,7 +786,7 @@ def __init__(
None.
load_in_8bit (bool): Whether to load the model in int8 precision using bits and bytes
quantization. This is only supported for select model architectures.
-            Defaults to False. If ``data_type`` is int8, then this is set to True.
+            Defaults to False. If ``dtype`` is int8, then this is set to True.
low_cpu_mem_usage (bool): Whether to limit CPU memory usage to 1x model size during
model loading. This is an experimental feature in HuggingFace. This is useful when
loading multiple instances of your model in parallel. Defaults to False.
@@ -832,19 +839,19 @@ def generate_serving_properties(self, serving_properties=None) -> Dict[str, str]
if self.device_map:
serving_properties["option.device_map"] = self.device_map
if self.load_in_8bit:
-            if self.data_type != "int8":
-                raise ValueError("Set data_type='int8' to use load_in_8bit")
+            if self.dtype != "int8":
+                raise ValueError("Set dtype='int8' to use load_in_8bit")
serving_properties["option.load_in_8bit"] = self.load_in_8bit
-        if self.data_type == "int8":
+        if self.dtype == "int8":
serving_properties["option.load_in_8bit"] = True
if self.low_cpu_mem_usage:
serving_properties["option.low_cpu_mem_usage"] = self.low_cpu_mem_usage
# This is a workaround due to a bug in our built in handler for huggingface
# TODO: This needs to be fixed when new dlc is published
if (
serving_properties["option.entryPoint"] == "djl_python.huggingface"
-            and self.data_type
-            and self.data_type != "auto"
+            and self.dtype
+            and self.dtype != "auto"
):
serving_properties["option.dtype"] = "auto"
serving_properties.pop("option.load_in_8bit", None)
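
A hedged sketch of the renamed validation above, mirroring the unit test further down (placeholder names, and a configured sagemaker_session may be required in practice):

import pytest
from sagemaker.djl_inference import HuggingFaceAccelerateModel

model = HuggingFaceAccelerateModel(
    "s3://my_bucket/my_saved_model_artifacts/",  # placeholder
    "my_sagemaker_role",                         # placeholder
    dtype="fp16",        # any value other than "int8"
    load_in_8bit=True,   # conflicts with the dtype above
)
with pytest.raises(ValueError, match="Set dtype='int8' to use load_in_8bit"):
    model.generate_serving_properties()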
tests/unit/test_djl_inference.py (12 changes: 6 additions, 6 deletions)
@@ -351,12 +351,12 @@ def test_generate_huggingface_serving_properties_invalid_configurations(
VALID_UNCOMPRESSED_MODEL_DATA,
ROLE,
sagemaker_session=sagemaker_session,
data_type="fp16",
dtype="fp16",
load_in_8bit=True,
)
with pytest.raises(ValueError) as invalid_config:
_ = model.generate_serving_properties()
-    assert str(invalid_config.value).startswith("Set data_type='int8' to use load_in_8bit")
+    assert str(invalid_config.value).startswith("Set dtype='int8' to use load_in_8bit")

model = HuggingFaceAccelerateModel(
VALID_UNCOMPRESSED_MODEL_DATA,
@@ -391,7 +391,7 @@ def test_generate_serving_properties_with_valid_configurations(
min_workers=1,
max_workers=3,
job_queue_size=4,
data_type="fp16",
dtype="fp16",
parallel_loading=True,
model_loading_timeout=120,
prediction_timeout=4,
@@ -429,7 +429,7 @@ def test_generate_serving_properties_with_valid_configurations(
sagemaker_session=sagemaker_session,
tensor_parallel_degree=1,
task="text-generation",
data_type="bf16",
dtype="bf16",
max_tokens=2048,
low_cpu_mem_usage=True,
enable_cuda_graph=True,
@@ -459,7 +459,7 @@ def test_generate_serving_properties_with_valid_configurations(
number_of_partitions=1,
device_id=4,
device_map="balanced",
data_type="fp32",
dtype="fp32",
low_cpu_mem_usage=False,
)
serving_properties = model.generate_serving_properties()
@@ -513,7 +513,7 @@ def test_deploy_model_no_local_code(
ROLE,
sagemaker_session=sagemaker_session,
number_of_partitions=4,
data_type="fp16",
dtype="fp16",
container_log_level=logging.DEBUG,
env=ENV,
)
Expand Down