Skip to content

fix: set use_spot_instances and max_wait in init params from job #1872

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Sep 9, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions src/sagemaker/estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -871,6 +871,12 @@ class constructor
init_params["model_uri"] = channel["DataSource"]["S3DataSource"]["S3Uri"]
break

if job_details.get("EnableManagedSpotTraining", False):
init_params["use_spot_instances"] = True
max_wait = job_details.get("StoppingCondition", {}).get("MaxWaitTimeInSeconds")
if max_wait:
init_params["max_wait"] = max_wait

return init_params

def transformer(
Expand Down
21 changes: 19 additions & 2 deletions tests/unit/test_estimator.py
Original file line number Diff line number Diff line change
Expand Up @@ -2125,7 +2125,6 @@ def test_generic_deploy_accelerator_type(sagemaker_session):
e.deploy(INSTANCE_COUNT, INSTANCE_TYPE, accelerator_type=ACCELERATOR_TYPE)

args = e.sagemaker_session.endpoint_from_production_variants.call_args[1]
print(args)
assert args["name"].startswith(IMAGE_URI)
assert args["production_variants"][0]["AcceleratorType"] == ACCELERATOR_TYPE
assert args["production_variants"][0]["InitialInstanceCount"] == INSTANCE_COUNT
Expand Down Expand Up @@ -2182,7 +2181,6 @@ def test_local_mode(session_class, local_session_class):
session_class.return_value = session

e = Estimator(IMAGE_URI, ROLE, INSTANCE_COUNT, "local")
print(e.sagemaker_session.local_mode)
assert e.sagemaker_session.local_mode is True

e2 = Estimator(IMAGE_URI, ROLE, INSTANCE_COUNT, "local_gpu")
Expand Down Expand Up @@ -2248,6 +2246,25 @@ def test_prepare_init_params_from_job_description_with_algorithm_training_job():
)


def test_prepare_init_params_from_job_description_with_spot_training():
    """Check that spot-training fields of a job description become init params.

    The job description is a shallow copy of ``RETURNED_JOB_DESCRIPTION`` with
    managed spot training switched on and a stopping condition carrying both a
    maximum runtime and a maximum wait time. The resulting init params must
    keep the base values (role, instance count) and expose the spot flag and
    both time limits.
    """
    # Work on a copy so the shared module-level fixture is not mutated;
    # only top-level keys are replaced, so a shallow copy is sufficient.
    spot_job_details = RETURNED_JOB_DESCRIPTION.copy()
    spot_job_details.update(
        {
            "EnableManagedSpotTraining": True,
            "StoppingCondition": {
                "MaxRuntimeInSeconds": 86400,
                "MaxWaitTimeInSeconds": 87000,
            },
        }
    )

    params = EstimatorBase._prepare_init_params_from_job_description(
        job_details=spot_job_details
    )

    # Values taken over from the base job description.
    assert params["role"] == "arn:aws:iam::366:role/SageMakerRole"
    assert params["instance_count"] == 1
    # Values derived from the spot-training additions made above.
    assert params["use_spot_instances"]
    assert params["max_run"] == 86400
    assert params["max_wait"] == 87000


def test_prepare_init_params_from_job_description_with_invalid_training_job():

invalid_job_description = RETURNED_JOB_DESCRIPTION.copy()
Expand Down