Skip to content

fix: jumpstart amt tracking #3077

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 15 commits into from
May 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 43 additions & 0 deletions src/sagemaker/tuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
from sagemaker.estimator import Framework
from sagemaker.inputs import TrainingInput
from sagemaker.job import _Job
from sagemaker.jumpstart.utils import add_jumpstart_tags, get_jumpstart_base_name_if_jumpstart_model
from sagemaker.parameter import (
CategoricalParameter,
ContinuousParameter,
def _prepare_for_tuning(self, job_name=None, include_cls_metadata=False):
    """Prepare the tuner instance for tuning (fit).

    Resolves the tuning job name, prepares the static hyperparameters,
    and copies Estimator/JumpStart tags onto the tuning job.

    Args:
        job_name (str): Optional explicit name for the tuning job.
        include_cls_metadata (bool): Forwarded to
            ``_prepare_static_hyperparameters_for_tuning``.
    """
    self._prepare_job_name_for_tuning(job_name=job_name)
    self._prepare_static_hyperparameters_for_tuning(include_cls_metadata=include_cls_metadata)
    self._prepare_tags_for_tuning()

def _get_model_uri(
self,
estimator,
):
"""Return the model artifact URI used by the Estimator instance.

This attribute can live in multiple places, and accessing the attribute can
raise a TypeError, which needs to be handled.
"""
try:
return getattr(estimator, "model_data", None)
except TypeError:
return getattr(estimator, "model_uri", None)

def _prepare_tags_for_tuning(self):
    """Add tags to tuning job (from Estimator and JumpStart tags).

    Merges the attached Estimator's tags into ``self.tags`` (without
    duplicates), then lets ``add_jumpstart_tags`` append JumpStart URI
    tags derived from the estimator's source dir and model URI.
    """

    # Add tags from the Estimator. For multi-estimator tuners pick the
    # first estimator by sorted dict key — same selection rule used by
    # _prepare_job_name_for_tuning.
    estimator = self.estimator or self.estimator_dict[sorted(self.estimator_dict.keys())[0]]
    estimator_tags = getattr(estimator, "tags", []) or []

    # Only materialize an empty tag list when there is something to copy;
    # otherwise self.tags stays None. (len(...) > 0 kept deliberately
    # explicit to signal a list, per the PR author's stated preference.)
    if self.tags is None and len(estimator_tags) > 0:
        self.tags = []

    # Merge estimator tags into the tuner's tags without duplicates.
    for tag in estimator_tags:
        if tag not in self.tags:
            self.tags.append(tag)

    # Append JumpStart-specific tags when the training script or model
    # artifact lives in a JumpStart bucket.
    self.tags = add_jumpstart_tags(
        tags=self.tags,
        training_script_uri=getattr(estimator, "source_dir", None),
        training_model_uri=self._get_model_uri(estimator),
    )

def _prepare_job_name_for_tuning(self, job_name=None):
"""Set current job name before starting tuning."""
Expand All @@ -331,6 +368,12 @@ def _prepare_job_name_for_tuning(self, job_name=None):
self.estimator or self.estimator_dict[sorted(self.estimator_dict.keys())[0]]
)
base_name = base_name_from_image(estimator.training_image_uri())

jumpstart_base_name = get_jumpstart_base_name_if_jumpstart_model(
getattr(estimator, "source_dir", None),
self._get_model_uri(estimator),
)
base_name = jumpstart_base_name or base_name
self._current_job_name = name_from_base(
base_name, max_length=self.TUNING_JOB_NAME_MAX_LENGTH, short=True
)
Expand Down
256 changes: 256 additions & 0 deletions tests/unit/test_tuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
from sagemaker import Predictor, TrainingInput, utils
from sagemaker.amazon.amazon_estimator import RecordSet
from sagemaker.estimator import Framework
from sagemaker.fw_utils import UploadedCode
from sagemaker.jumpstart.constants import JUMPSTART_BUCKET_NAME_SET, JUMPSTART_RESOURCE_BASE_NAME
from sagemaker.jumpstart.enums import JumpStartTag
from sagemaker.mxnet import MXNet
from sagemaker.parameter import ParameterRange
from sagemaker.tuner import (
Expand Down Expand Up @@ -1518,3 +1521,256 @@ def _convert_tuning_job_details(job_details, estimator_name):
job_details_copy["TrainingJobDefinitions"] = [training_details]

return job_details_copy


@patch("time.time", return_value=510006209.073025)
@patch("sagemaker.estimator.tar_and_upload_dir")
@patch("sagemaker.model.Model._upload_code")
def test_tags_prefixes_jumpstart_models(
patched_upload_code, patched_tar_and_upload_dir, sagemaker_session
):

jumpstart_source_dir = f"s3://{list(JUMPSTART_BUCKET_NAME_SET)[0]}/source_dirs/source.tar.gz"
jumpstart_source_dir_2 = f"s3://{list(JUMPSTART_BUCKET_NAME_SET)[1]}/source_dirs/source.tar.gz"
jumpstart_source_dir_3 = f"s3://{list(JUMPSTART_BUCKET_NAME_SET)[2]}/source_dirs/source.tar.gz"

estimator_tag = {"Key": "estimator-tag-key", "Value": "estimator-tag-value"}
hp_tag = {"Key": "hp-tuner-tag-key", "Value": "hp-tuner-estimator-tag-value"}
training_model_uri_tag = {
"Key": JumpStartTag.TRAINING_MODEL_URI.value,
"Value": jumpstart_source_dir_2,
}
training_script_uri_tag = {
"Key": JumpStartTag.TRAINING_SCRIPT_URI.value,
"Value": jumpstart_source_dir,
}
inference_script_uri_tag = {
"Key": JumpStartTag.INFERENCE_SCRIPT_URI.value,
"Value": jumpstart_source_dir_3,
}

patched_tar_and_upload_dir.return_value = UploadedCode(
s3_prefix="s3://%s/%s" % ("bucket", "key"), script_name="script_name"
)
sagemaker_session.boto_region_name = REGION

sagemaker_session.sagemaker_client.describe_training_job.return_value = {
"AlgorithmSpecification": {
"TrainingInputMode": "File",
"TrainingImage": "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-other:1.0.4",
},
"HyperParameters": {
"sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
"checkpoint_path": '"s3://other/1508872349"',
"sagemaker_program": '"iris-dnn-classifier.py"',
"sagemaker_container_log_level": '"logging.INFO"',
"sagemaker_job_name": '"neo"',
"training_steps": "100",
},
"RoleArn": "arn:aws:iam::366:role/SageMakerRole",
"ResourceConfig": {
"VolumeSizeInGB": 30,
"InstanceCount": 1,
"InstanceType": "ml.c4.xlarge",
},
"EnableNetworkIsolation": False,
"StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
"TrainingJobName": "neo",
"TrainingJobStatus": "Completed",
"TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
"OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
"TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
"EnableInterContainerTrafficEncryption": False,
"ModelArtifacts": {"S3ModelArtifacts": "blah"},
}

sagemaker_session.sagemaker_client.list_tags.return_value = {
"Tags": [
estimator_tag,
hp_tag,
training_model_uri_tag,
training_script_uri_tag,
]
}

instance_type = "ml.p2.xlarge"
instance_count = 1

training_data_uri = "s3://bucket/mydata"

image_uri = "fake-image-uri"

generic_estimator = Estimator(
entry_point="transfer_learning.py",
role=ROLE,
region=REGION,
sagemaker_session=sagemaker_session,
instance_count=instance_count,
instance_type=instance_type,
source_dir=jumpstart_source_dir,
image_uri=image_uri,
model_uri=jumpstart_source_dir_2,
tags=[estimator_tag],
)

hp_tuner = HyperparameterTuner(
generic_estimator,
OBJECTIVE_METRIC_NAME,
HYPERPARAMETER_RANGES,
tags=[hp_tag],
)

hp_tuner.fit({"training": training_data_uri})

assert [
hp_tag,
estimator_tag,
training_model_uri_tag,
training_script_uri_tag,
] == sagemaker_session.create_tuning_job.call_args_list[0][1]["tags"]

assert sagemaker_session.create_tuning_job.call_args_list[0][1]["job_name"].startswith(
JUMPSTART_RESOURCE_BASE_NAME
)

hp_tuner.deploy(
initial_instance_count=INSTANCE_COUNT,
instance_type=INSTANCE_TYPE,
image_uri=image_uri,
source_dir=jumpstart_source_dir_3,
entry_point="inference.py",
role=ROLE,
)

assert sagemaker_session.create_model.call_args_list[0][1]["name"].startswith(
JUMPSTART_RESOURCE_BASE_NAME
)

assert sagemaker_session.endpoint_from_production_variants.call_args_list[0].startswith(
JUMPSTART_RESOURCE_BASE_NAME
)

assert sagemaker_session.create_model.call_args_list[0][1]["tags"] == [
training_model_uri_tag,
training_script_uri_tag,
inference_script_uri_tag,
]

assert sagemaker_session.endpoint_from_production_variants.call_args_list[0][1]["tags"] == [
training_model_uri_tag,
training_script_uri_tag,
inference_script_uri_tag,
]


@patch("time.time", return_value=510006209.073025)
@patch("sagemaker.estimator.tar_and_upload_dir")
@patch("sagemaker.model.Model._upload_code")
def test_no_tags_prefixes_non_jumpstart_models(
patched_upload_code, patched_tar_and_upload_dir, sagemaker_session
):

non_jumpstart_source_dir = "s3://blah1/source_dirs/source.tar.gz"
non_jumpstart_source_dir_2 = "s3://blah2/source_dirs/source.tar.gz"
non_jumpstart_source_dir_3 = "s3://blah3/source_dirs/source.tar.gz"

estimator_tag = {"Key": "estimator-tag-key", "Value": "estimator-tag-value"}
hp_tag = {"Key": "hp-tuner-tag-key", "Value": "hp-tuner-estimator-tag-value"}

patched_tar_and_upload_dir.return_value = UploadedCode(
s3_prefix="s3://%s/%s" % ("bucket", "key"), script_name="script_name"
)
sagemaker_session.boto_region_name = REGION

sagemaker_session.sagemaker_client.describe_training_job.return_value = {
"AlgorithmSpecification": {
"TrainingInputMode": "File",
"TrainingImage": "1.dkr.ecr.us-west-2.amazonaws.com/sagemaker-other:1.0.4",
},
"HyperParameters": {
"sagemaker_submit_directory": '"s3://some/sourcedir.tar.gz"',
"checkpoint_path": '"s3://other/1508872349"',
"sagemaker_program": '"iris-dnn-classifier.py"',
"sagemaker_container_log_level": '"logging.INFO"',
"sagemaker_job_name": '"neo"',
"training_steps": "100",
},
"RoleArn": "arn:aws:iam::366:role/SageMakerRole",
"ResourceConfig": {
"VolumeSizeInGB": 30,
"InstanceCount": 1,
"InstanceType": "ml.c4.xlarge",
},
"EnableNetworkIsolation": False,
"StoppingCondition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
"TrainingJobName": "neo",
"TrainingJobStatus": "Completed",
"TrainingJobArn": "arn:aws:sagemaker:us-west-2:336:training-job/neo",
"OutputDataConfig": {"KmsKeyId": "", "S3OutputPath": "s3://place/output/neo"},
"TrainingJobOutput": {"S3TrainingJobOutput": "s3://here/output.tar.gz"},
"EnableInterContainerTrafficEncryption": False,
"ModelArtifacts": {"S3ModelArtifacts": "blah"},
}

sagemaker_session.sagemaker_client.list_tags.return_value = {"Tags": []}

sagemaker_session.sagemaker_client.describe_hyper_parameter_tuning_job.return_value = {
"BestTrainingJob": {"TrainingJobName": "some-name"}
}
instance_type = "ml.p2.xlarge"
instance_count = 1

training_data_uri = "s3://bucket/mydata"

image_uri = "fake-image-uri"

generic_estimator = Estimator(
entry_point="transfer_learning.py",
role=ROLE,
region=REGION,
sagemaker_session=sagemaker_session,
instance_count=instance_count,
instance_type=instance_type,
source_dir=non_jumpstart_source_dir,
image_uri=image_uri,
model_uri=non_jumpstart_source_dir_2,
tags=[estimator_tag],
)

hp_tuner = HyperparameterTuner(
generic_estimator,
OBJECTIVE_METRIC_NAME,
HYPERPARAMETER_RANGES,
tags=[hp_tag],
)

hp_tuner.fit({"training": training_data_uri})

assert [hp_tag, estimator_tag] == sagemaker_session.create_tuning_job.call_args_list[0][1][
"tags"
]

assert not sagemaker_session.create_tuning_job.call_args_list[0][1]["job_name"].startswith(
JUMPSTART_RESOURCE_BASE_NAME
)

hp_tuner.deploy(
initial_instance_count=INSTANCE_COUNT,
instance_type=INSTANCE_TYPE,
image_uri=image_uri,
source_dir=non_jumpstart_source_dir_3,
entry_point="inference.py",
role=ROLE,
)

assert not sagemaker_session.create_model.call_args_list[0][1]["name"].startswith(
JUMPSTART_RESOURCE_BASE_NAME
)

assert not sagemaker_session.endpoint_from_production_variants.call_args_list[0][1][
"name"
].startswith(JUMPSTART_RESOURCE_BASE_NAME)

assert sagemaker_session.create_model.call_args_list[0][1]["tags"] == []

assert sagemaker_session.endpoint_from_production_variants.call_args_list[0][1]["tags"] == []