
Commit a543076

Merge branch 'master' into mhtrinhLIC-dereference

2 parents ea666ba + bacee04


42 files changed: +1199 −72 lines

.github/ISSUE_TEMPLATE/config.yml

Lines changed: 2 additions & 2 deletions

@@ -1,5 +1,5 @@
 blank_issues_enabled: false
 contact_links:
 - name: Ask a question
-  url: https://stackoverflow.com/questions/tagged/amazon-sagemaker
-  about: Use Stack Overflow to ask and answer questions
+  url: https://github.com/aws/sagemaker-python-sdk/discussions
+  about: Use GitHub Discussions to ask and answer questions

CHANGELOG.md

Lines changed: 29 additions & 0 deletions

@@ -1,5 +1,34 @@
 # Changelog

+## v2.41.0 (2021-05-17)
+
+### Features
+
+ * add pipeline experiment config
+ * add data wrangler processor
+ * support RetryStrategy for training jobs
+
+### Bug Fixes and Other Changes
+
+ * fix repack pipeline step by putting inference.py in "code" sub dir
+ * add data wrangler image uri
+ * fix black-check errors
+
+## v2.40.0 (2021-05-11)
+
+### Features
+
+ * add xgboost framework version 1.2-2
+
+### Bug Fixes and Other Changes
+
+ * fix get_execution_role on Studio
+ * [fix] Check py_version existence in RegisterModel step
+
+### Documentation Changes
+
+ * SM Distributed EFA Launch
+
 ## v2.39.1 (2021-05-05)

 ### Bug Fixes and Other Changes

VERSION

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-2.39.2.dev0
+2.41.1.dev0

doc/api/training/sdp_versions/latest/smd_data_parallel_tensorflow.rst

Lines changed: 3 additions & 1 deletion

@@ -443,7 +443,7 @@ TensorFlow API

 *  Supported compression types - ``none``, ``fp16``

-  - ``sparse_as_dense:`` Not supported. Raises not supported error.
+  - ``sparse_as_dense:`` Treats sparse gradient tensor as dense tensor. Defaults to ``False``.

 - ``op (smdistributed.dataparallel.tensorflow.ReduceOp)(optional)``: The reduction operation to combine tensors across different ranks. Defaults to ``Average`` if None is given.

@@ -482,6 +482,8 @@ TensorFlow API

 *  Supported compression types - ``none``, ``fp16``

+  - ``sparse_as_dense:`` Treats sparse gradient tensor as dense tensor. Defaults to ``False``.
+
 - ``op (smdistributed.dataparallel.tensorflow.ReduceOp)(optional)``: The reduction operation to combine tensors across different ranks. Defaults to ``Average`` if None is given.

 * Supported ops: ``AVERAGE``

doc/api/training/sdp_versions/v1.0.0/smd_data_parallel_tensorflow.rst

Lines changed: 3 additions & 1 deletion

@@ -456,7 +456,7 @@ TensorFlow API

 *  Supported compression types - ``none``, ``fp16``

-  - ``sparse_as_dense:`` Not supported. Raises not supported error.
+  - ``sparse_as_dense:`` Treats sparse gradient tensor as dense tensor. Defaults to ``False``.

 - ``op (smdistributed.dataparallel.tensorflow.ReduceOp)(optional)``: The reduction operation to combine tensors across different ranks. Defaults to ``Average`` if None is given.

@@ -496,6 +496,8 @@ TensorFlow API

 *  Supported compression types - ``none``, ``fp16``

+  - ``sparse_as_dense:`` Treats sparse gradient tensor as dense tensor. Defaults to ``False``.
+
 - ``op (smdistributed.dataparallel.tensorflow.ReduceOp)(optional)``: The reduction operation to combine tensors across different ranks. Defaults to ``Average`` if None is given.

 * Supported ops: ``AVERAGE``

doc/api/training/sdp_versions/v1.1.x/smd_data_parallel_tensorflow.rst

Lines changed: 3 additions & 1 deletion

@@ -459,7 +459,7 @@ library with TensorFlow.

 *  Supported compression types - ``none``, ``fp16``

-  - ``sparse_as_dense:`` Not supported. Raises not supported error.
+  - ``sparse_as_dense:`` Treats sparse gradient tensor as dense tensor. Defaults to ``False``.

 - ``op (smdistributed.dataparallel.tensorflow.ReduceOp)(optional)``: The reduction operation to combine tensors across different ranks. Defaults to ``Average`` if None is given.

@@ -499,6 +499,8 @@ library with TensorFlow.

 *  Supported compression types - ``none``, ``fp16``

+  - ``sparse_as_dense:`` Treats sparse gradient tensor as dense tensor. Defaults to ``False``.
+
 - ``op (smdistributed.dataparallel.tensorflow.ReduceOp)(optional)``: The reduction operation to combine tensors across different ranks. Defaults to ``Average`` if None is given.

 * Supported ops: ``AVERAGE``

doc/frameworks/sklearn/using_sklearn.rst

Lines changed: 4 additions & 0 deletions

@@ -84,6 +84,10 @@ inadvertently run your training code at the wrong point in execution.

 For more on training environment variables, please visit https://github.com/aws/sagemaker-containers.

+.. important::
+    The sagemaker-containers repository has been deprecated,
+    however it is still used to define Scikit-learn and XGBoost environment variables.
+
 Save the Model
 --------------
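The environment variables this note refers to are the ``SM_*`` values that the training toolkit injects into the container. A minimal sketch of how a training script typically reads them (the fallback paths are the conventional container locations; treat them as illustrative defaults, not SDK guarantees):

```python
import argparse
import os

# Read SageMaker training environment variables, falling back to the
# conventional container paths so the script also runs outside SageMaker.
parser = argparse.ArgumentParser()
parser.add_argument("--model-dir", type=str,
                    default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
parser.add_argument("--train", type=str,
                    default=os.environ.get("SM_CHANNEL_TRAIN",
                                           "/opt/ml/input/data/train"))
args, _ = parser.parse_known_args()
```
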

doc/frameworks/xgboost/using_xgboost.rst

Lines changed: 4 additions & 0 deletions

@@ -88,6 +88,10 @@ but you can access useful properties about the training environment through vari

 For the exhaustive list of available environment variables, see the `SageMaker Containers documentation <https://github.com/aws/sagemaker-containers#list-of-provided-environment-variables-by-sagemaker-containers>`__.

+.. important::
+    The sagemaker-containers repository has been deprecated,
+    however it is still used to define Scikit-learn and XGBoost environment variables.
+
 Let's look at the main elements of the script. Starting with the ``__main__`` guard,
 use a parser to read the hyperparameters passed to the estimator when creating the training job.
 These hyperparameters are made available as arguments to our input script.

doc/overview.rst

Lines changed: 2 additions & 2 deletions

@@ -374,7 +374,7 @@ Here are examples of how to use Amazon FSx for Lustre as input for training:

     file_system_input = FileSystemInput(file_system_id='fs-2',
                                         file_system_type='FSxLustre',
-                                        directory_path='/fsx/tensorflow',
+                                        directory_path='/<mount-id>/tensorflow',
                                         file_system_access_mode='ro')

     # Start an Amazon SageMaker training job with FSx using the FileSystemInput class

@@ -394,7 +394,7 @@ Here are examples of how to use Amazon FSx for Lustre as input for training:

     records = FileSystemRecordSet(file_system_id='fs-2',
                                   file_system_type='FSxLustre',
-                                  directory_path='/fsx/kmeans',
+                                  directory_path='/<mount-id>/kmeans',
                                   num_records=784,
                                   feature_dim=784)
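The substance of this documentation fix is that an FSx for Lustre ``directory_path`` must begin with the file system's mount name rather than a literal ``/fsx`` prefix. A tiny helper, not part of the SDK, that illustrates the path rule:

```python
def fsx_directory_path(mount_name, subdir):
    """Build an FSx for Lustre directory path of the form /<mount-name>/<subdir>.

    mount_name is the file system's mount name (visible in the FSx console).
    This helper is illustrative only; the SDK takes the path as a plain string.
    """
    return "/{}/{}".format(mount_name.strip("/"), subdir.strip("/"))
```
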

src/sagemaker/estimator.py

Lines changed: 31 additions & 0 deletions

@@ -124,6 +124,7 @@ def __init__(
         profiler_config=None,
         disable_profiler=False,
         environment=None,
+        max_retry_attempts=None,
         **kwargs,
     ):
         """Initialize an ``EstimatorBase`` instance.
@@ -269,6 +270,13 @@ def __init__(
                 will be disabled (default: ``False``).
             environment (dict[str, str]) : Environment variables to be set for
                 use during training job (default: ``None``)
+            max_retry_attempts (int): The number of times to move a job to the STARTING status.
+                You can specify between 1 and 30 attempts.
+                If the value of attempts is greater than zero,
+                the job is retried on InternalServerFailure
+                the same number of attempts as the value.
+                You can cap the total duration for your job by setting ``max_wait`` and ``max_run``
+                (default: ``None``)

         """
@@ -357,6 +365,8 @@ def __init__(

         self.environment = environment

+        self.max_retry_attempts = max_retry_attempts
+
         if not _region_supports_profiler(self.sagemaker_session.boto_region_name):
             self.disable_profiler = True

@@ -1114,6 +1124,13 @@ def _prepare_init_params_from_job_description(cls, job_details, model_channel_na
         if max_wait:
             init_params["max_wait"] = max_wait

+        if job_details.get("RetryStrategy", False):
+            init_params["max_retry_attempts"] = job_details.get("RetryStrategy", {}).get(
+                "MaximumRetryAttempts"
+            )
+            max_wait = job_details.get("StoppingCondition", {}).get("MaxWaitTimeInSeconds")
+            if max_wait:
+                init_params["max_wait"] = max_wait
         return init_params

     def transformer(
@@ -1489,6 +1506,11 @@ def _get_train_args(cls, estimator, inputs, experiment_config):
         if estimator.enable_network_isolation():
             train_args["enable_network_isolation"] = True

+        if estimator.max_retry_attempts is not None:
+            train_args["retry_strategy"] = {"MaximumRetryAttempts": estimator.max_retry_attempts}
+        else:
+            train_args["retry_strategy"] = None
+
         if estimator.encrypt_inter_container_traffic:
             train_args["encrypt_inter_container_traffic"] = True

@@ -1666,6 +1688,7 @@ def __init__(
         profiler_config=None,
         disable_profiler=False,
         environment=None,
+        max_retry_attempts=None,
         **kwargs,
     ):
         """Initialize an ``Estimator`` instance.
@@ -1816,6 +1839,13 @@ def __init__(
                 will be disabled (default: ``False``).
             environment (dict[str, str]) : Environment variables to be set for
                 use during training job (default: ``None``)
+            max_retry_attempts (int): The number of times to move a job to the STARTING status.
+                You can specify between 1 and 30 attempts.
+                If the value of attempts is greater than zero,
+                the job is retried on InternalServerFailure
+                the same number of attempts as the value.
+                You can cap the total duration for your job by setting ``max_wait`` and ``max_run``
+                (default: ``None``)
         """
         self.image_uri = image_uri
         self.hyperparam_dict = hyperparameters.copy() if hyperparameters else {}
@@ -1850,6 +1880,7 @@ def __init__(
             profiler_config=profiler_config,
             disable_profiler=disable_profiler,
             environment=environment,
+            max_retry_attempts=max_retry_attempts,
             **kwargs,
         )
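The new kwarg flows from the estimator into a CreateTrainingJob ``RetryStrategy`` structure in ``_get_train_args``. A standalone sketch of that mapping, with the 1-30 bound from the docstring made into an explicit check (the validation is an assumption for illustration; the diff itself does not validate the range):

```python
def build_retry_strategy(max_retry_attempts):
    """Mirror the diff's mapping: a max_retry_attempts integer becomes a
    RetryStrategy dict for CreateTrainingJob, or None when unset."""
    if max_retry_attempts is None:
        return None
    # The docstring allows 1 to 30 attempts; enforcing it here is illustrative.
    if not 1 <= max_retry_attempts <= 30:
        raise ValueError("max_retry_attempts must be between 1 and 30")
    return {"MaximumRetryAttempts": max_retry_attempts}
```
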

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+{
+    "processing": {
+        "versions": {
+            "1.x": {
+                "registries": {
+                    "af-south-1": "143210264188",
+                    "ap-east-1": "707077482487",
+                    "ap-northeast-1": "649008135260",
+                    "ap-northeast-2": "131546521161",
+                    "ap-south-1": "089933028263",
+                    "ap-southeast-1": "119527597002",
+                    "ap-southeast-2": "422173101802",
+                    "ca-central-1": "557239378090",
+                    "eu-central-1": "024640144536",
+                    "eu-north-1": "054986407534",
+                    "eu-south-1": "488287956546",
+                    "eu-west-1": "245179582081",
+                    "eu-west-2": "894491911112",
+                    "eu-west-3": "807237891255",
+                    "me-south-1": "376037874950",
+                    "sa-east-1": "424196993095",
+                    "us-east-1": "663277389841",
+                    "us-east-2": "415577184552",
+                    "us-west-1": "926135532090",
+                    "us-west-2": "174368400705",
+                    "cn-north-1": "245909111842",
+                    "cn-northwest-1": "249157047649"
+                },
+                "repository": "sagemaker-data-wrangler-container"
+            }
+        }
+    }
+}
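Consumers of this config resolve an ECR image URI from the region-to-account table above. A sketch of that lookup; the ``:1.x`` tag and the ``.amazonaws.com`` hostname (the cn- partitions actually use ``.amazonaws.com.cn``) are assumptions here, not something the JSON itself encodes:

```python
# Subset of the region-to-account table from the new config file.
DATA_WRANGLER_REGISTRIES = {
    "us-east-1": "663277389841",
    "us-west-2": "174368400705",
    # ... remaining regions elided; see the full table above
}

def data_wrangler_image_uri(region, tag="1.x"):
    """Assemble an ECR image URI from the registry table (illustrative only)."""
    account = DATA_WRANGLER_REGISTRIES[region]
    return "{}.dkr.ecr.{}.amazonaws.com/sagemaker-data-wrangler-container:{}".format(
        account, region, tag
    )
```
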

src/sagemaker/session.py

Lines changed: 12 additions & 0 deletions

@@ -457,6 +457,7 @@ def train(  # noqa: C901
         profiler_rule_configs=None,
         profiler_config=None,
         environment=None,
+        retry_strategy=None,
     ):
         """Create an Amazon SageMaker training job.
@@ -529,6 +530,9 @@ def train(  # noqa: C901
                 with SageMaker Profiler. (default: ``None``).
             environment (dict[str, str]) : Environment variables to be set for
                 use during training job (default: ``None``)
+            retry_strategy (dict): Defines the RetryStrategy for InternalServerFailures.
+                * max_retry_attempts (int): Number of times a job should be retried.
+                  The key in RetryStrategy is 'MaximumRetryAttempts'.

         Returns:
             str: ARN of the training job, if it is created.
@@ -561,6 +565,7 @@ def train(  # noqa: C901
             profiler_rule_configs=profiler_rule_configs,
             profiler_config=profiler_config,
             environment=environment,
+            retry_strategy=retry_strategy,
         )
         LOGGER.info("Creating training-job with name: %s", job_name)
         LOGGER.debug("train request: %s", json.dumps(train_request, indent=4))
@@ -594,6 +599,7 @@ def _get_train_request(  # noqa: C901
         profiler_rule_configs=None,
         profiler_config=None,
         environment=None,
+        retry_strategy=None,
     ):
         """Constructs a request compatible for creating an Amazon SageMaker training job.
@@ -665,6 +671,9 @@ def _get_train_request(  # noqa: C901
                 SageMaker Profiler. (default: ``None``).
             environment (dict[str, str]) : Environment variables to be set for
                 use during training job (default: ``None``)
+            retry_strategy (dict): Defines the RetryStrategy for InternalServerFailures.
+                * max_retry_attempts (int): Number of times a job should be retried.
+                  The key in RetryStrategy is 'MaximumRetryAttempts'.

         Returns:
             Dict: a training request dict
@@ -749,6 +758,9 @@ def _get_train_request(  # noqa: C901
         if profiler_config is not None:
             train_request["ProfilerConfig"] = profiler_config

+        if retry_strategy is not None:
+            train_request["RetryStrategy"] = retry_strategy
+
         return train_request

     def update_training_job(
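The net effect on the request body is a single optional top-level key. A stdlib-only sketch of the slice of ``_get_train_request`` this commit touches (all other fields elided):

```python
def get_train_request_sketch(job_name, retry_strategy=None, environment=None):
    """Build the slice of the CreateTrainingJob request touched by this commit."""
    train_request = {"TrainingJobName": job_name}
    if environment is not None:
        train_request["Environment"] = environment
    if retry_strategy is not None:
        # e.g. {"MaximumRetryAttempts": 3}, as produced by the estimator layer
        train_request["RetryStrategy"] = retry_strategy
    return train_request
```
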

src/sagemaker/workflow/_repack_model.py

Lines changed: 19 additions & 1 deletion

@@ -19,6 +19,13 @@
 import tarfile
 import tempfile

+# Repack Model
+# The following script is run via a training job which takes an existing model and a custom
+# entry point script as arguments. The script creates a new model archive with the custom
+# entry point in the "code" directory along with the existing model. Subsequently, when the model
+# is unpacked for inference, the custom entry point will be used.
+# Reference: https://docs.aws.amazon.com/sagemaker/latest/dg/amazon-sagemaker-toolkits.html
+
 # distutils.dir_util.copy_tree works way better than the half-baked
 # shutil.copytree which bombs on previously existing target dirs...
 # alas ... https://bugs.python.org/issue10948
@@ -33,17 +40,28 @@
     parser.add_argument("--model_archive", type=str, default="model.tar.gz")
     args = parser.parse_args()

+    # the data directory contains a model archive generated by a previous training job
     data_directory = "/opt/ml/input/data/training"
     model_path = os.path.join(data_directory, args.model_archive)

+    # create a temporary directory
     with tempfile.TemporaryDirectory() as tmp:
         local_path = os.path.join(tmp, "local.tar.gz")
+        # copy the previous training job's model archive to the temporary directory
         shutil.copy2(model_path, local_path)
         src_dir = os.path.join(tmp, "src")
+        # create the "code" directory which will contain the inference script
+        os.makedirs(os.path.join(src_dir, "code"))
+        # extract the contents of the previous training job's model archive to the "src"
+        # directory of this training job
         with tarfile.open(name=local_path, mode="r:gz") as tf:
             tf.extractall(path=src_dir)

+        # generate a path to the custom inference script
         entry_point = os.path.join("/opt/ml/code", args.inference_script)
-        shutil.copy2(entry_point, os.path.join(src_dir, args.inference_script))
+        # copy the custom inference script to the "src" dir
+        shutil.copy2(entry_point, os.path.join(src_dir, "code", args.inference_script))

+        # copy the "src" dir, which includes the previous training job's model and the
+        # custom inference script, to the output of this training job
         copy_tree(src_dir, "/opt/ml/model")
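The fixed behavior can be exercised end to end with stdlib tools. A sketch that repacks a model archive with the entry point under ``code/``, mirroring the script above but taking the hard-coded ``/opt/ml`` locations as parameters so it runs anywhere:

```python
import os
import shutil
import tarfile
import tempfile

def repack_model(model_archive, inference_script, output_dir):
    """Unpack model_archive, place inference_script under a "code" subdirectory,
    and copy the combined tree to output_dir (mirrors the fixed repack script)."""
    with tempfile.TemporaryDirectory() as tmp:
        src_dir = os.path.join(tmp, "src")
        # the "code" directory will hold the custom inference entry point
        os.makedirs(os.path.join(src_dir, "code"))
        with tarfile.open(model_archive, "r:gz") as tf:
            tf.extractall(path=src_dir)
        shutil.copy2(
            inference_script,
            os.path.join(src_dir, "code", os.path.basename(inference_script)),
        )
        # stand-in for copy_tree(src_dir, "/opt/ml/model")
        shutil.copytree(src_dir, output_dir, dirs_exist_ok=True)
```
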

src/sagemaker/workflow/conditions.py

Lines changed: 2 additions & 2 deletions

@@ -186,8 +186,8 @@ def to_request(self) -> RequestType:
         """Get the request structure for workflow service calls."""
         return {
             "Type": self.condition_type.value,
-            "Value": self.value.expr,
-            "In": [primitive_or_expr(in_value) for in_value in self.in_values],
+            "QueryValue": self.value.expr,
+            "Values": [primitive_or_expr(in_value) for in_value in self.in_values],
         }
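The corrected request shape for a ConditionIn can be shown standalone. In this sketch the ``"In"`` type string and the expression dict are illustrative stand-ins for ``self.condition_type.value`` and ``self.value.expr``:

```python
def condition_in_request(query_value, values):
    """Request body for a ConditionIn step condition, using the corrected
    "QueryValue"/"Values" keys in place of the old "Value"/"In" keys."""
    return {
        "Type": "In",
        "QueryValue": query_value,
        "Values": list(values),
    }
```
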
