feature: support RetryStrategy for training jobs

Shikha Panghal · Shikha Panghal · commit 206c8077d1e9 · 2021-04-29T17:02:42.000-07:00
diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py
@@ -124,6 +124,7 @@ def __init__(
         profiler_config=None,
         disable_profiler=False,
         environment=None,
+        max_retry_attempts=None,
         **kwargs,
     ):
         """Initialize an ``EstimatorBase`` instance.
@@ -269,6 +270,11 @@ def __init__(
                 will be disabled (default: ``False``).
             environment (dict[str, str]) : Environment variables to be set for
                 use during training job (default: ``None``)
+            max_retry_attempts (int): The number of times to move a job to the STARTING status.
+                You can specify between 1 and 30 attempts. If the value is greater than one,
+                the job is retried on InternalServerFailure for the same number of attempts.
+                You can cap the total duration for your job by setting ``max_wait`` and ``max_run``
+                (default: ``None``)
 
         """
         instance_count = renamed_kwargs(
@@ -357,6 +363,8 @@ def __init__(
 
         self.environment = environment
 
+        self.max_retry_attempts = max_retry_attempts
+
         if not _region_supports_profiler(self.sagemaker_session.boto_region_name):
             self.disable_profiler = True
 
@@ -1114,6 +1122,13 @@ def _prepare_init_params_from_job_description(cls, job_details, model_channel_na
             if max_wait:
                 init_params["max_wait"] = max_wait
 
+        if job_details.get("RetryStrategy", False):
+            init_params["max_retry_attempts"] = job_details.get("RetryStrategy", {}).get("MaximumRetryAttempts")
+            max_wait = job_details.get("StoppingCondition", {}).get("MaxWaitTimeInSeconds")
+            if max_wait:
+                init_params["max_wait"] = max_wait
+
+
         return init_params
 
     def transformer(
@@ -1489,6 +1504,11 @@ def _get_train_args(cls, estimator, inputs, experiment_config):
         if estimator.enable_network_isolation():
             train_args["enable_network_isolation"] = True
 
+        if estimator.max_retry_attempts is not None:
+            train_args["retry_strategy"] = {"MaximumRetryAttempts": estimator.max_retry_attempts}
+        else:
+            train_args["retry_strategy"] = None
+
         if estimator.encrypt_inter_container_traffic:
             train_args["encrypt_inter_container_traffic"] = True
 
@@ -1522,6 +1542,7 @@ def _get_train_args(cls, estimator, inputs, experiment_config):
 
         return train_args
 
+
     @classmethod
     def _add_spot_checkpoint_args(cls, local_mode, estimator, train_args):
         """Placeholder docstring"""
@@ -1666,6 +1687,7 @@ def __init__(
         profiler_config=None,
         disable_profiler=False,
         environment=None,
+        max_retry_attempts=None,
         **kwargs,
     ):
         """Initialize an ``Estimator`` instance.
@@ -1816,6 +1838,11 @@ def __init__(
                 will be disabled (default: ``False``).
             environment (dict[str, str]) : Environment variables to be set for
                 use during training job (default: ``None``)
+            max_retry_attempts (int): The number of times to move a job to the STARTING status.
+                You can specify between 1 and 30 attempts. If the value is greater than one,
+                the job is retried on InternalServerFailure for the same number of attempts.
+                You can cap the total duration for your job by setting ``max_wait`` and ``max_run``
+                (default: ``None``)
         """
         self.image_uri = image_uri
         self.hyperparam_dict = hyperparameters.copy() if hyperparameters else {}
@@ -1850,6 +1877,7 @@ def __init__(
             profiler_config=profiler_config,
             disable_profiler=disable_profiler,
             environment=environment,
+            max_retry_attempts=max_retry_attempts,
             **kwargs,
         )
 
diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py
@@ -457,6 +457,7 @@ def train(  # noqa: C901
         profiler_rule_configs=None,
         profiler_config=None,
         environment=None,
+        retry_strategy=None,
     ):
         """Create an Amazon SageMaker training job.
 
@@ -529,6 +530,9 @@ def train(  # noqa: C901
                 with SageMaker Profiler. (default: ``None``).
             environment (dict[str, str]) : Environment variables to be set for
                 use during training job (default: ``None``)
+            retry_strategy (dict): Defines RetryStrategy for InternalServerFailures.
+                * max_retry_attempts (int): Number of times a job should be retried.
+                The key in RetryStrategy is 'MaximumRetryAttempts'.
 
         Returns:
             str: ARN of the training job, if it is created.
@@ -561,6 +565,7 @@ def train(  # noqa: C901
             profiler_rule_configs=profiler_rule_configs,
             profiler_config=profiler_config,
             environment=environment,
+            retry_strategy=retry_strategy,
         )
         LOGGER.info("Creating training-job with name: %s", job_name)
         LOGGER.debug("train request: %s", json.dumps(train_request, indent=4))
@@ -594,6 +599,7 @@ def _get_train_request(  # noqa: C901
         profiler_rule_configs=None,
         profiler_config=None,
         environment=None,
+        retry_strategy=None,
     ):
         """Constructs a request compatible for creating an Amazon SageMaker training job.
 
@@ -665,6 +671,9 @@ def _get_train_request(  # noqa: C901
                 SageMaker Profiler. (default: ``None``).
             environment (dict[str, str]) : Environment variables to be set for
                 use during training job (default: ``None``)
+            retry_strategy (dict): Defines RetryStrategy for InternalServerFailures.
+                * max_retry_attempts (int): Number of times a job should be retried.
+                The key in RetryStrategy is 'MaximumRetryAttempts'.
 
         Returns:
             Dict: a training request dict
@@ -749,6 +758,9 @@ def _get_train_request(  # noqa: C901
         if profiler_config is not None:
             train_request["ProfilerConfig"] = profiler_config
 
+        if retry_strategy is not None:
+            train_request["RetryStrategy"] = retry_strategy
+
         return train_request
 
     def update_training_job(
diff --git a/tests/integ/test_tf.py b/tests/integ/test_tf.py
@@ -61,6 +61,8 @@ def test_mnist_with_checkpoint_config(
         checkpoint_s3_uri=checkpoint_s3_uri,
         checkpoint_local_path=checkpoint_local_path,
         environment=ENV_INPUT,
+        max_wait=24 * 60 * 60,
+        max_retry_attempts=2,
     )
     inputs = estimator.sagemaker_session.upload_data(
         path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/mnist"
@@ -89,8 +91,19 @@ def test_mnist_with_checkpoint_config(
             "Environment"
         ]
     )
+
+    expected_retry_strategy = {
+        "MaximumRetryAttempts": 2,
+    }
+    actual_retry_strategy = (
+        sagemaker_session.sagemaker_client.describe_training_job(TrainingJobName=training_job_name)[
+            "RetryStrategy"
+        ]
+    )
     assert actual_training_checkpoint_config == expected_training_checkpoint_config
     assert actual_training_environment_variable_config == ENV_INPUT
+    assert actual_retry_strategy == expected_retry_strategy
+
 
 
 def test_server_side_encryption(sagemaker_session, tf_full_version, tf_full_py_version):
diff --git a/tests/unit/sagemaker/huggingface/test_estimator.py b/tests/unit/sagemaker/huggingface/test_estimator.py
@@ -150,6 +150,7 @@ def _create_train_job(version, base_framework_version):
         "vpc_config": None,
         "metric_definitions": None,
         "environment": None,
+        "retry_strategy": None,
         "experiment_config": None,
         "debugger_hook_config": {
             "CollectionConfigurations": [],
diff --git a/tests/unit/sagemaker/tensorflow/test_estimator.py b/tests/unit/sagemaker/tensorflow/test_estimator.py
@@ -127,6 +127,7 @@ def _create_train_job(tf_version, horovod=False, ps=False, py_version="py2", smd
         },
         "hyperparameters": _hyperparameters(horovod, smdataparallel),
         "stop_condition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
+        "retry_strategy": None,
         "tags": None,
         "vpc_config": None,
         "metric_definitions": None,
diff --git a/tests/unit/test_chainer.py b/tests/unit/test_chainer.py
@@ -140,6 +140,7 @@ def _create_train_job(version, py_version):
             "sagemaker_region": '"us-west-2"',
         },
         "stop_condition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
+        "retry_strategy": None,
         "tags": None,
         "vpc_config": None,
         "metric_definitions": None,
diff --git a/tests/unit/test_estimator.py b/tests/unit/test_estimator.py
@@ -245,6 +245,7 @@ def test_framework_all_init_args(sagemaker_session):
         enable_sagemaker_metrics=True,
         enable_network_isolation=True,
         environment=ENV_INPUT,
+        max_retry_attempts=2,
     )
     _TrainingJob.start_new(f, "s3://mydata", None)
     sagemaker_session.train.assert_called_once()
@@ -269,6 +270,7 @@ def test_framework_all_init_args(sagemaker_session):
         "output_config": {"KmsKeyId": "outputkms", "S3OutputPath": "outputpath"},
         "vpc_config": {"Subnets": ["123", "456"], "SecurityGroupIds": ["789", "012"]},
         "stop_condition": {"MaxRuntimeInSeconds": 456},
+        "retry_strategy": {"MaximumRetryAttempts": 2},
         "role": sagemaker_session.expand_role(),
         "job_name": None,
         "resource_config": {
@@ -1029,7 +1031,6 @@ def test_training_job_with_rule_job_summary(sagemaker_session, training_job_desc
         },
     ]
 
-
 def test_framework_with_spot_and_checkpoints(sagemaker_session):
     f = DummyFramework(
         "my_script.py",
@@ -1092,6 +1093,7 @@ def test_framework_with_spot_and_checkpoints(sagemaker_session):
         "checkpoint_local_path": "/tmp/checkpoints",
         "environment": None,
         "experiment_config": None,
+        "retry_strategy": None,
     }
 
 
@@ -2392,6 +2394,7 @@ def test_unsupported_type_in_dict():
         "VolumeSizeInGB": 30,
     },
     "stop_condition": {"MaxRuntimeInSeconds": 86400},
+    "retry_strategy": None,
     "tags": None,
     "vpc_config": None,
     "metric_definitions": None,
@@ -2703,6 +2706,24 @@ def test_add_environment_variables_to_train_args(sagemaker_session):
     assert args["environment"] == ENV_INPUT
 
 
+def test_add_retry_strategy_to_train_args(sagemaker_session):
+    e = Estimator(
+        IMAGE_URI,
+        ROLE,
+        INSTANCE_COUNT,
+        INSTANCE_TYPE,
+        output_path=OUTPUT_PATH,
+        sagemaker_session=sagemaker_session,
+        max_retry_attempts=2,
+    )
+
+    e.fit()
+
+    sagemaker_session.train.assert_called_once()
+    args = sagemaker_session.train.call_args[1]
+    assert args["retry_strategy"] == {"MaximumRetryAttempts": 2}
+
+
 def test_generic_to_fit_with_sagemaker_metrics_enabled(sagemaker_session):
     e = Estimator(
         IMAGE_URI,
@@ -3159,6 +3180,27 @@ def test_prepare_init_params_from_job_description_with_spot_training():
     assert init_params["max_wait"] == 87000
 
 
+def test_prepare_init_params_from_job_description_with_retry_strategy():
+    job_description = RETURNED_JOB_DESCRIPTION.copy()
+    job_description["RetryStrategy"] = {
+        "MaximumRetryAttempts": 2
+    }
+    job_description["StoppingCondition"] = {
+        "MaxRuntimeInSeconds": 86400,
+        "MaxWaitTimeInSeconds": 87000,
+    }
+
+    init_params = EstimatorBase._prepare_init_params_from_job_description(
+        job_details=job_description
+    )
+
+    assert init_params["role"] == "arn:aws:iam::366:role/SageMakerRole"
+    assert init_params["instance_count"] == 1
+    assert init_params["max_run"] == 86400
+    assert init_params["max_wait"] == 87000
+    assert init_params["max_retry_attempts"] == 2
+
+
 def test_prepare_init_params_from_job_description_with_invalid_training_job():
 
     invalid_job_description = RETURNED_JOB_DESCRIPTION.copy()
diff --git a/tests/unit/test_mxnet.py b/tests/unit/test_mxnet.py
@@ -147,6 +147,7 @@ def _get_train_args(job_name):
         "vpc_config": None,
         "metric_definitions": None,
         "environment": None,
+        "retry_strategy": None,
         "experiment_config": None,
         "debugger_hook_config": {
             "CollectionConfigurations": [],
@@ -993,7 +994,6 @@ def test_mx_missing_environment_variables(
     )
     assert not mx.environment
 
-
 def test_mx_enable_sm_metrics(sagemaker_session, mxnet_training_version, mxnet_training_py_version):
     mx = MXNet(
         entry_point=SCRIPT_PATH,
diff --git a/tests/unit/test_pytorch.py b/tests/unit/test_pytorch.py
@@ -149,6 +149,7 @@ def _create_train_job(version, py_version):
         "vpc_config": None,
         "metric_definitions": None,
         "environment": None,
+        "retry_strategy": None,
         "experiment_config": None,
         "debugger_hook_config": {
             "CollectionConfigurations": [],
diff --git a/tests/unit/test_rl.py b/tests/unit/test_rl.py
@@ -162,6 +162,7 @@ def _create_train_job(toolkit, toolkit_version, framework):
         "profiler_config": {
             "S3OutputPath": "s3://{}/".format(BUCKET_NAME),
         },
+        "retry_strategy": None,
     }
 
 
diff --git a/tests/unit/test_session.py b/tests/unit/test_session.py
@@ -154,7 +154,7 @@ def test_process(boto_session):
         },
         "role_arn": ROLE,
         "tags": [{"Name": "my-tag", "Value": "my-tag-value"}],
-        "experiment_config": {"ExperimentName": "AnExperiment"},
+        "experiment_config": {"ExperimentName": "AnExperiment"}
     }
     session.process(**process_request_args)
 
@@ -1208,6 +1208,7 @@ def test_train_pack_to_request_with_optional_params(sagemaker_session):
     }
 
     stop_cond = {"MaxRuntimeInSeconds": MAX_TIME}
+    RETRY_STRATEGY = {"MaximumRetryAttempts": 2}
     hyperparameters = {"foo": "bar"}
 
     sagemaker_session.train(
@@ -1229,6 +1230,7 @@ def test_train_pack_to_request_with_optional_params(sagemaker_session):
         checkpoint_local_path="/tmp/checkpoints",
         enable_sagemaker_metrics=True,
         environment=ENV_INPUT,
+        retry_strategy=RETRY_STRATEGY,
     )
 
     _, _, actual_train_args = sagemaker_session.sagemaker_client.method_calls[0]
@@ -1243,6 +1245,7 @@ def test_train_pack_to_request_with_optional_params(sagemaker_session):
     assert actual_train_args["CheckpointConfig"]["S3Uri"] == "s3://mybucket/checkpoints/"
     assert actual_train_args["CheckpointConfig"]["LocalPath"] == "/tmp/checkpoints"
     assert actual_train_args["Environment"] == ENV_INPUT
+    assert actual_train_args["RetryStrategy"] == RETRY_STRATEGY
 
 
 def test_transform_pack_to_request(sagemaker_session):
diff --git a/tests/unit/test_sklearn.py b/tests/unit/test_sklearn.py
@@ -129,6 +129,7 @@ def _create_train_job(version):
             "sagemaker_region": '"us-west-2"',
         },
         "stop_condition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
+        "retry_strategy": None,
         "metric_definitions": None,
         "tags": None,
         "vpc_config": None,
diff --git a/tests/unit/test_xgboost.py b/tests/unit/test_xgboost.py
@@ -142,6 +142,7 @@ def _create_train_job(version, instance_count=1, instance_type="ml.c4.4xlarge"):
             "sagemaker_region": '"us-west-2"',
         },
         "stop_condition": {"MaxRuntimeInSeconds": 24 * 60 * 60},
+        "retry_strategy": None,
         "metric_definitions": None,
         "tags": None,
         "vpc_config": None,

Original file line number	Diff line number	Diff line change
`@@ -162,6 +162,7 @@ def _create_train_job(toolkit, toolkit_version, framework):`
`162`	`162`	`"profiler_config": {`
`163`	`163`	`"S3OutputPath": "s3://{}/".format(BUCKET_NAME),`
`164`	`164`	`},`
	`165`	`+ "retry_strategy": None,`
`165`	`166`	`}`
`166`	`167`
`167`	`168`