Skip to content

feat: jumpstart instance specific hyperparameters #4180

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 10 additions & 6 deletions src/sagemaker/hyperparameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ def retrieve_default(
region: Optional[str] = None,
model_id: Optional[str] = None,
model_version: Optional[str] = None,
instance_type: Optional[str] = None,
include_container_hyperparameters: bool = False,
tolerate_vulnerable_model: bool = False,
tolerate_deprecated_model: bool = False,
Expand All @@ -45,6 +46,8 @@ def retrieve_default(
retrieve the default hyperparameters. (Default: None).
model_version (str): The version of the model for which to retrieve the
default hyperparameters. (Default: None).
instance_type (str): An instance type to optionally supply in order to get hyperparameters
specific for the instance type.
include_container_hyperparameters (bool): ``True`` if the container hyperparameters
should be returned. Container hyperparameters are not used to tune
the specific algorithm. They are used by SageMaker Training jobs to set up
Expand Down Expand Up @@ -75,12 +78,13 @@ def retrieve_default(
)

return artifacts._retrieve_default_hyperparameters(
model_id,
model_version,
region,
include_container_hyperparameters,
tolerate_vulnerable_model,
tolerate_deprecated_model,
model_id=model_id,
model_version=model_version,
instance_type=instance_type,
region=region,
include_container_hyperparameters=include_container_hyperparameters,
tolerate_vulnerable_model=tolerate_vulnerable_model,
tolerate_deprecated_model=tolerate_deprecated_model,
sagemaker_session=sagemaker_session,
)

Expand Down
18 changes: 18 additions & 0 deletions src/sagemaker/jumpstart/artifacts/hyperparameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ def _retrieve_default_hyperparameters(
tolerate_vulnerable_model: bool = False,
tolerate_deprecated_model: bool = False,
sagemaker_session: Session = DEFAULT_JUMPSTART_SAGEMAKER_SESSION,
instance_type: Optional[str] = None,
):
"""Retrieves the training hyperparameters for the model matching the given arguments.

Expand Down Expand Up @@ -63,6 +64,8 @@ def _retrieve_default_hyperparameters(
object, used for SageMaker interactions. If not
specified, one is created using the default AWS configuration
chain. (Default: sagemaker.jumpstart.constants.DEFAULT_JUMPSTART_SAGEMAKER_SESSION).
instance_type (str): An instance type to optionally supply in order to get hyperparameters
specific for the instance type.
Returns:
dict: the hyperparameters to use for the model.
"""
Expand All @@ -86,4 +89,19 @@ def _retrieve_default_hyperparameters(
include_container_hyperparameters and hyperparameter.scope == VariableScope.CONTAINER
) or hyperparameter.scope == VariableScope.ALGORITHM:
default_hyperparameters[hyperparameter.name] = str(hyperparameter.default)

instance_specific_hyperparameters = (
model_specs.training_instance_type_variants.get_instance_specific_hyperparameters(
instance_type
)
if instance_type
and getattr(model_specs, "training_instance_type_variants", None) is not None
else []
)

for instance_specific_hyperparameter in instance_specific_hyperparameters:
default_hyperparameters[instance_specific_hyperparameter.name] = str(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would we ever want to remove a hyperparameter for a specific instance? If so, overriding the defaults may not work.

In the generated metadata, shouldn't the instance-specific hyperparameter dict contain all the fields rather than only the overridden ones?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good point. However, to keep consistency with the environment variable variants, I think it should be additive.
In the situation you describe, the instance family or default metadata should be scoped down so that no hyperparameters need to be removed. In other words, we can construct any arbitrary hyperparameters for instances using this metadata design.

instance_specific_hyperparameter.default
)

return default_hyperparameters
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit: rename to just hyperparameters for clarity?

1 change: 1 addition & 0 deletions src/sagemaker/jumpstart/factory/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,7 @@ def _add_hyperparameters_to_kwargs(
tolerate_deprecated_model=kwargs.tolerate_deprecated_model,
tolerate_vulnerable_model=kwargs.tolerate_vulnerable_model,
sagemaker_session=kwargs.sagemaker_session,
instance_type=kwargs.instance_type,
)

for key, value in default_hyperparameters.items():
Expand Down
2 changes: 1 addition & 1 deletion src/sagemaker/jumpstart/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ def _is_valid_model_id_hook():

super(JumpStartModel, self).__init__(**model_init_kwargs.to_kwargs_dict())

def retrieve_all_examples(self) -> Optional[List[JumpStartSerializablePayload]]:
def retrieve_all_example_payloads(self) -> Optional[List[JumpStartSerializablePayload]]:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this change backward incompatible? If so, please provide an alias (i.e., have retrieve_all_examples return the result of retrieve_all_example_payloads).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure, this was a mistake from a previous PR. I'd vote to keep as is, the risk of breaking customers is very low for this new feature. We can also deprecate the old method signature

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, please make the method name consistent with the top-level utility. You can choose to remove the old method; I agree that the risk of breaking customers is low, but in that case get the Python SDK team to sign off on that point. Alternatively, use an alias for the old method and add a deprecation warning.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let me keep this as is for now. We can consider changing it later. This is a low priority issue

"""Returns all example payloads associated with the model.

Raises:
Expand Down
44 changes: 44 additions & 0 deletions src/sagemaker/jumpstart/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -402,6 +402,50 @@ def to_json(self) -> Dict[str, Any]:
json_obj = {att: getattr(self, att) for att in self.__slots__ if hasattr(self, att)}
return json_obj

def get_instance_specific_hyperparameters(
    self, instance_type: str
) -> List[JumpStartHyperparameter]:
    """Collects the hyperparameters declared for an instance type and its family.

    Entries declared under the exact instance type are listed first. Entries declared
    under the instance type family are appended only when no exact-type entry shares
    their name. Returns an empty list when ``self.variants`` is None or when neither
    the instance type nor its family declares any hyperparameters.
    """

    if self.variants is None:
        return []

    def _load(variant_key):
        # Missing variant / "properties" / "hyperparameters" keys all fall back to empty.
        variant = self.variants.get(variant_key, {})
        raw_entries = variant.get("properties", {}).get("hyperparameters", [])
        return [JumpStartHyperparameter(entry) for entry in raw_entries]

    exact_type_hyperparameters = _load(instance_type)

    family = get_instance_type_family(instance_type)
    if family in {"", None}:
        # No usable family name, so there is nothing to look up at the family level.
        family_hyperparameters = []
    else:
        family_hyperparameters = _load(family)

    taken_names = {entry.name for entry in exact_type_hyperparameters}

    merged = deepcopy(exact_type_hyperparameters)
    merged.extend(
        entry for entry in family_hyperparameters if entry.name not in taken_names
    )
    return merged

def get_instance_specific_environment_variables(self, instance_type: str) -> Dict[str, str]:
"""Returns instance specific environment variables.

Expand Down
73 changes: 72 additions & 1 deletion tests/unit/sagemaker/hyperparameters/jumpstart/test_default.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@

from sagemaker import hyperparameters

from tests.unit.sagemaker.jumpstart.utils import get_spec_from_base_spec
from tests.unit.sagemaker.jumpstart.utils import get_spec_from_base_spec, get_special_model_spec


mock_client = boto3.client("s3")
Expand Down Expand Up @@ -116,3 +116,74 @@ def test_jumpstart_default_hyperparameters(patched_get_model_specs):
hyperparameters.retrieve_default(
model_id=model_id,
)


@patch("sagemaker.jumpstart.accessors.JumpStartModelsAccessor.get_model_specs")
def test_jumpstart_sdk_hyperparameters_instance_type_overrides(patched_get_model_specs):
    """Checks that ``instance_type`` changes the hyperparameters returned by retrieve_default.

    Three cases are covered: an instance type that only adds hyperparameters to the
    defaults, an instance type whose values also replace defaults, and an unrecognized
    instance type that yields the defaults unchanged.
    """

    patched_get_model_specs.side_effect = get_special_model_spec

    model_id = "variant-model"
    region = "us-west-2"

    # assert that we can add hyperparameters to default
    # NOTE(review): the extra keys presumably come from the "p2" instance-family
    # variant of the mocked spec -- confirm against the test constants.
    retrieved_hyperparameters = hyperparameters.retrieve_default(
        region=region,
        model_id=model_id,
        model_version="*",
        sagemaker_session=mock_session,
        instance_type="ml.p2.48xlarge",
    )
    assert retrieved_hyperparameters == {
        "adam-learning-rate": "0.05",
        "batch-size": "4",
        "epochs": "3",
        "num_bag_sets": "5",
        "num_stack_levels": "6",
        "refit_full": "False",
        "sagemaker_container_log_level": "20",
        "sagemaker_program": "transfer_learning.py",
        "sagemaker_submit_directory": "/opt/ml/input/data/code/sourcedir.tar.gz",
        "save_space": "False",
        "set_best_to_refit_full": "False",
        "verbosity": "2",
    }

    # assert that we can override default hyperparameters (instance family + instance type
    # specific); e.g. "batch-size" is "1" here instead of the default "4"
    retrieved_hyperparameters = hyperparameters.retrieve_default(
        region=region,
        model_id=model_id,
        model_version="*",
        sagemaker_session=mock_session,
        instance_type="ml.p2.12xlarge",
    )
    assert retrieved_hyperparameters == {
        "adam-learning-rate": "0.05",
        "batch-size": "1",
        "epochs": "3",
        "num_bag_sets": "1",
        "num_stack_levels": "0",
        "refit_full": "False",
        "eval_metric": "auto",
        "num_bag_folds": "0",
        "presets": "medium_quality",
        "auto_stack": "False",
        "sagemaker_container_log_level": "20",
        "sagemaker_program": "transfer_learning.py",
        "sagemaker_submit_directory": "/opt/ml/input/data/code/sourcedir.tar.gz",
        "save_space": "False",
        "set_best_to_refit_full": "False",
        "verbosity": "2",
    }

    # assert that we can return default hyperparameters for unrecognized instance
    retrieved_hyperparameters = hyperparameters.retrieve_default(
        region=region,
        model_id=model_id,
        model_version="*",
        sagemaker_session=mock_session,
        instance_type="ml.p9999.48xlarge",
    )

    assert retrieved_hyperparameters == {
        "epochs": "3",
        "adam-learning-rate": "0.05",
        "batch-size": "4",
    }
136 changes: 135 additions & 1 deletion tests/unit/sagemaker/jumpstart/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,7 +214,141 @@
"framework_version": "1.5.0",
"py_version": "py3",
},
"training_instance_type_variants": None,
"training_instance_type_variants": {
"variants": {
"ml.p2.12xlarge": {
"properties": {
"environment_variables": {"TENSOR_PARALLEL_DEGREE": "4"},
"hyperparameters": [
{
"name": "eval_metric",
"type": "text",
"default": "auto",
"scope": "algorithm",
},
{
"name": "presets",
"type": "text",
"default": "medium_quality",
"options": [
"best_quality",
"high_quality",
"good_quality",
"medium_quality",
"optimize_for_deployment",
"interpretable",
],
"scope": "algorithm",
},
{
"name": "auto_stack",
"type": "text",
"default": "False",
"options": ["True", "False"],
"scope": "algorithm",
},
{
"name": "num_bag_folds",
"type": "text",
"default": "0",
"options": ["0", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
"scope": "algorithm",
},
{
"name": "num_bag_sets",
"type": "int",
"default": 1,
"min": 1,
"scope": "algorithm",
},
{
"name": "batch-size",
"type": "int",
"default": 1,
"min": 1,
"scope": "algorithm",
},
{
"name": "num_stack_levels",
"type": "int",
"default": 0,
"min": 0,
"max": 3,
"scope": "algorithm",
},
],
}
},
"p2": {
"properties": {
"hyperparameters": [
{
"name": "num_bag_sets",
"type": "int",
"default": 5,
"min": 5,
"scope": "algorithm",
},
{
"name": "num_stack_levels",
"type": "int",
"default": 6,
"min": 7,
"max": 3,
"scope": "algorithm",
},
{
"name": "refit_full",
"type": "text",
"default": "False",
"options": ["True", "False"],
"scope": "algorithm",
},
{
"name": "set_best_to_refit_full",
"type": "text",
"default": "False",
"options": ["True", "False"],
"scope": "algorithm",
},
{
"name": "save_space",
"type": "text",
"default": "False",
"options": ["True", "False"],
"scope": "algorithm",
},
{
"name": "verbosity",
"type": "int",
"default": 2,
"min": 0,
"max": 4,
"scope": "algorithm",
},
{
"name": "sagemaker_submit_directory",
"type": "text",
"default": "/opt/ml/input/data/code/sourcedir.tar.gz",
"scope": "container",
},
{
"name": "sagemaker_program",
"type": "text",
"default": "transfer_learning.py",
"scope": "container",
},
{
"name": "sagemaker_container_log_level",
"type": "text",
"default": "20",
"scope": "container",
},
]
}
},
}
},
"hosting_artifact_key": "pytorch-infer/infer-pytorch-ic-mobilenet-v2.tar.gz",
"training_artifact_key": "pytorch-training/train-pytorch-ic-mobilenet-v2.tar.gz",
"hosting_script_key": "source-directory-tarballs/pytorch/inference/ic/v1.0.0/sourcedir.tar.gz",
Expand Down
Loading