
Commit 1e6b944

Merge branch 'master' into master
2 parents d2a8160 + 44fbcc9 commit 1e6b944


50 files changed: +843 -608 lines changed

CHANGELOG.md

Lines changed: 63 additions & 0 deletions
@@ -1,5 +1,68 @@
 # Changelog
 
+## v2.35.0 (2021-04-14)
+
+### Features
+
+* add support for PyTorch 1.8.1
+
+### Bug Fixes and Other Changes
+
+* boto3 client param updated for feature store
+* Updated release notes and API doc for smd model parallel 1.3.1
+
+## v2.34.0 (2021-04-12)
+
+### Features
+
+* Add support for accelerator in Clarify
+
+### Bug Fixes and Other Changes
+
+* add Documentation for how to use
+* enable local mode tests that were skipped
+* add integ test for HuggingFace with TensorFlow
+
+### Documentation Changes
+
+* release notes for smdistributed.dataparallel v1.1.1
+* fixing the SageMaker distributed version references
+
+### Testing and Release Infrastructure
+
+* pin version for ducutils
+
+## v2.33.0 (2021-04-05)
+
+### Features
+
+* Add environment variable support for SageMaker training job
+
+### Bug Fixes and Other Changes
+
+* add version length mismatch validation for HuggingFace
+* Disable debugger when checkpointing is enabled with distributed training
+* map user context is list associations response
+
+### Testing and Release Infrastructure
+
+* disable_profiler on mx-horovod test
+
+## v2.32.1 (2021-04-01)
+
+### Bug Fixes and Other Changes
+
+* disable profiler in some release tests
+* remove outdated notebook from test
+* add compilation option for ml_eia2
+* add short version to smdataparallel supported list
+
+### Documentation Changes
+
+* creating a "latest" version sm distributed docs
+* add docs for Sagemaker Model Parallel 1.3, released with PT 1.8
+* update PyTorch version in doc
+
 ## v2.32.0 (2021-03-26)
 
 ### Features

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.32.1.dev0
+2.35.1.dev0

doc/amazon_sagemaker_featurestore.rst

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ use the SageMaker default bucket and add a custom prefix to it.
 offline_feature_store_bucket = 's3://*{}*/*{}*'.format(default_bucket, prefix)
 
 sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
-featurestore_runtime = boto_session.client(service_name='featurestore-runtime', region_name=region)
+featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)
 
 feature_store_session = Session(
     boto_session=boto_session,
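
For orientation, a minimal sketch of how the renamed runtime client plugs into a Feature Store session. The `Session` keyword arguments shown are assumptions drawn from the surrounding doc page, not part of this diff:

```python
import boto3
from sagemaker.session import Session

region = "us-east-1"  # hypothetical region, for illustration only
boto_session = boto3.Session(region_name=region)

sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
# Note the renamed service: "sagemaker-featurestore-runtime" (was "featurestore-runtime").
featurestore_runtime = boto_session.client(
    service_name="sagemaker-featurestore-runtime", region_name=region
)

feature_store_session = Session(
    boto_session=boto_session,
    sagemaker_client=sagemaker_client,
    sagemaker_featurestore_runtime_client=featurestore_runtime,  # assumed parameter name
)
```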

doc/api/training/sdp_versions/latest.rst

Lines changed: 1 addition & 1 deletion
@@ -1,5 +1,5 @@
 
-Version 1.1.0 (Latest)
+Version 1.1.1 (Latest)
 ======================
 
 .. toctree::

doc/api/training/sdp_versions/latest/smd_data_parallel_pytorch.rst

Lines changed: 2 additions & 2 deletions
@@ -153,9 +153,9 @@ you will have for distributed training with the distributed data parallel librar
 PyTorch API
 ===========
 
-**Supported versions:**
+.. rubric:: Supported versions
 
-- PyTorch 1.6.0, 1.8.0
+**PyTorch 1.7.1, 1.8.0**
 
 
 .. function:: smdistributed.dataparallel.torch.distributed.is_available()
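
As a usage aside (not part of this diff), a minimal sketch of initializing the PyTorch client whose `is_available()` function is documented on the page above. It assumes the `smdistributed.dataparallel` package that ships in SageMaker training containers:

```python
# Sketch only: assumes a SageMaker training job where smdistributed.dataparallel
# is installed (PyTorch 1.7.1 / 1.8.x per the supported versions above).
import smdistributed.dataparallel.torch.distributed as dist

if dist.is_available():
    dist.init_process_group()  # start the data parallel process group
    print(f"worker {dist.get_rank()} of {dist.get_world_size()}")
```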

doc/api/training/sdp_versions/latest/smd_data_parallel_tensorflow.rst

Lines changed: 7 additions & 4 deletions
@@ -16,8 +16,9 @@ The following steps show you how to convert a TensorFlow 2.x training
 script to utilize the distributed data parallel library.
 
 The distributed data parallel library APIs are designed to be close to Horovod APIs.
-See `SageMaker distributed data parallel TensorFlow examples <https://sagemaker-examples.readthedocs.io/en/latest/training/distributed_training/index.html#tensorflow-distributed>`__ for additional details on how to implement the data parallel library
-API offered for TensorFlow.
+See `SageMaker distributed data parallel TensorFlow examples
+<https://sagemaker-examples.readthedocs.io/en/latest/training/distributed_training/index.html#tensorflow-distributed>`__
+for additional details on how to implement the data parallel library.
 
 - First import the distributed data parallel library’s TensorFlow client and initialize it:
 
@@ -156,8 +157,10 @@ TensorFlow API
 
 .. rubric:: Supported versions
 
-- TensorFlow 2.x - 2.3.1
-
+TensorFlow is supported in version 1.0.0 of ``sagemakerdistributed.dataparallel``.
+Reference version 1.0.0 `TensorFlow API documentation
+<https://sagemaker.readthedocs.io/en/stable/api/training/sdp_versions/latest/smd_data_parallel_tensorflow.html#tensorflow-sdp-api>`_
+for supported TensorFlow versions.
 
 .. function:: smdistributed.dataparallel.tensorflow.init()
 
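
The "import the library's TensorFlow client and initialize it" step referenced in the first hunk looks roughly like the sketch below; it assumes the `smdistributed.dataparallel` package available in SageMaker TensorFlow training containers:

```python
# Minimal sketch of the import-and-initialize step described above.
import smdistributed.dataparallel.tensorflow as sdp

sdp.init()  # initialize the distributed data parallel library
print("worker", sdp.rank(), "of", sdp.size(), "- local rank", sdp.local_rank())
```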

doc/api/training/sdp_versions/v1.0.0/smd_data_parallel_pytorch.rst

Lines changed: 6 additions & 8 deletions
@@ -4,11 +4,10 @@ PyTorch Guide to SageMaker's distributed data parallel library
 
 .. admonition:: Contents
 
-   - :ref:`pytorch-sdp-modify`
-   - :ref:`pytorch-sdp-api`
+   - :ref:`pytorch-sdp-modify-1.0.0`
+   - :ref:`pytorch-sdp-api-1.0.0`
 
-.. _pytorch-sdp-modify:
-   :noindex:
+.. _pytorch-sdp-modify-1.0.0:
 
 Modify a PyTorch training script to use SageMaker data parallel
 ======================================================================
@@ -149,15 +148,14 @@ you will have for distributed training with the distributed data parallel librar
     main()
 
 
-.. _pytorch-sdp-api:
-   :noindex:
+.. _pytorch-sdp-api-1.0.0:
 
 PyTorch API
 ===========
 
-**Supported versions:**
+.. rubric:: Supported versions
 
-- PyTorch 1.6.0
+**PyTorch 1.6.0, 1.7.1**
 
 
 .. function:: smdistributed.dataparallel.torch.distributed.is_available()

doc/api/training/sdp_versions/v1.0.0/smd_data_parallel_tensorflow.rst

Lines changed: 5 additions & 7 deletions
@@ -4,11 +4,10 @@ TensorFlow Guide to SageMaker's distributed data parallel library
 
 .. admonition:: Contents
 
-   - :ref:`tensorflow-sdp-modify`
-   - :ref:`tensorflow-sdp-api`
+   - :ref:`tensorflow-sdp-modify-1.0.0`
+   - :ref:`tensorflow-sdp-api-1.0.0`
 
-.. _tensorflow-sdp-modify:
-   :noindex:
+.. _tensorflow-sdp-modify-1.0.0:
 
 Modify a TensorFlow 2.x training script to use SageMaker data parallel
 ======================================================================
@@ -150,15 +149,14 @@ script you will have for distributed training with the library.
     checkpoint.save(checkpoint_dir)
 
 
-.. _tensorflow-sdp-api:
-   :noindex:
+.. _tensorflow-sdp-api-1.0.0:
 
 TensorFlow API
 ==============
 
 .. rubric:: Supported versions
 
-- TensorFlow 2.x - 2.3.1
+**TensorFlow 2.3.x - 2.4.1**
 
 
 .. function:: smdistributed.dataparallel.tensorflow.init()

doc/api/training/smd_data_parallel_release_notes/smd_data_parallel_change_log.md

Lines changed: 22 additions & 4 deletions
@@ -1,23 +1,41 @@
+# Sagemaker Distributed Data Parallel 1.1.1 Release Notes
+
+* New Features
+* Bug Fixes
+* Known Issues
+
+*New Features:*
+
+* Adds support for PyTorch 1.8.1
+
+*Bug Fixes:*
+
+* Fixes a bug that was causing gradients from one of the worker nodes to be added twice resulting in incorrect `all_reduce` results under some conditions.
+
+*Known Issues:*
+
+* SageMaker distributed data parallel still is not efficient when run using a single node. For the best performance, use multi-node distributed training with `smdistributed.dataparallel`. Use a single node only for experimental runs while preparing your training pipeline.
+
 # Sagemaker Distributed Data Parallel 1.1.0 Release Notes
 
 * New Features
 * Bug Fixes
 * Improvements
 * Known Issues
 
-New Features:
+*New Features:*
 
 * Adds support for PyTorch 1.8.0 with CUDA 11.1 and CUDNN 8
 
-Bug Fixes:
+*Bug Fixes:*
 
 * Fixes crash issue when importing `smdataparallel` before PyTorch
 
-Improvements:
+*Improvements:*
 
 * Update `smdataparallel` name in python packages, descriptions, and log outputs
 
-Known Issues:
+*Known Issues:*
 
 * SageMaker DataParallel is not efficient when run using a single node. For the best performance, use multi-node distributed training with `smdataparallel`. Use a single node only for experimental runs while preparing your training pipeline.

doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.md

Lines changed: 30 additions & 0 deletions
@@ -1,3 +1,33 @@
+# Sagemaker Distributed Model Parallel 1.3.1 Release Notes
+
+- New Features
+- Bug Fixes
+- Known Issues
+
+## New Features
+
+### TensorFlow
+
+- Exposes a new decorator ``register_post_partition_hook``. This allows invoking the decorated methods just after model partition but before executing the first step. For example loading a checkpoint. Refer to the [SageMaker distributed model parallel API documentation](https://sagemaker.readthedocs.io/en/stable/api/training/smp_versions/latest/smd_model_parallel_tensorflow.html) for more information.
+
+## Bug Fixes
+
+### PyTorch
+
+- Improved memory efficiency when using active microbatches by clearing activations at end of each microbatch.
+
+### TensorFlow
+
+- Fixed issue that caused hangs when training some models with XLA enabled.
+
+## Known Issues
+
+### PyTorch
+
+- A crash was observed when ``optimizer.step()`` was called for certain optimizers such as AdaDelta, when the partition on which this method was called has no local parameters assigned to it after partitioning. This is due to a bug in PyTorch which [has since been fixed](https://github.com/pytorch/pytorch/pull/52944). Till that makes its way to the next release of PyTorch, only call ``optimizer.step()`` on processes which have at least one local parameter. This can be checked like this ``len(list(model.local_parameters())) > 0``.
+
+- A performance regression still exists when training on SMP with PyTorch 1.7.1 compared to 1.6. The rootcause was found to be the slowdown in performance of `.grad` method calls in PyTorch 1.7.1 compared to 1.6. See the related discussion: https://github.com/pytorch/pytorch/issues/50636. This issue does not exist with PyTorch 1.8.
+
 # Sagemaker Distributed Model Parallel 1.3.0 Release Notes
 
 - New Features
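
The `optimizer.step()` workaround described in the PyTorch known issue above amounts to a small guard; a sketch, assuming `model` is an `smp.DistributedModel` and `optimizer` has already been created:

```python
# Workaround from the known issue: only call optimizer.step() on processes
# that hold at least one local parameter after partitioning (relevant for
# optimizers such as AdaDelta on PyTorch versions before 1.8).
if len(list(model.local_parameters())) > 0:
    optimizer.step()
```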

doc/api/training/smp_versions/latest/smd_model_parallel_pytorch.rst

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 PyTorch API
 ===========
 
-**Supported versions: 1.7.1, 1.8.0**
+**Supported versions: 1.6.0, 1.7.1, 1.8.0**
 
 This API document assumes you use the following import statements in your training scripts.
 

doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst

Lines changed: 17 additions & 9 deletions
@@ -83,7 +83,21 @@ TensorFlow API
     with smp.partition(3):
         z = tf.reduce_sum(y)             # placed in partition 3
 
-
+
+.. function:: register_post_partition_hook(hook)
+
+   Registers a callable ``hook`` to
+   be executed after the model is partitioned. This is useful in situations
+   where an operation needs to be executed after the model partition during
+   the first call to ``smp.step``, but before the actual execution of the
+   first forward pass.
+
+   .. code:: python
+
+      @smp.register_post_partition_hook
+      def test_eager():
+          # All statements here will be executed right after partition but before the first forward pass
+          tf.print("Entered hook through eager context")
 
 
 .. class:: smp.CheckpointManager
@@ -102,13 +116,6 @@ TensorFlow API
                       max_to_keep=None,
                       checkpoint_name="ckpt")
 
-
-**Important:** ``smp.CheckpointManager.restore()`` must be called after
-the first training step. This is because the first call of the
-``smp.step`` function constructs and partitions the model, which must
-take place before the checkpoint restore. Calling it before the first
-``smp.step`` call might result in hangs or unexpected behavior.
-
 **Parameters**
 
 - ``checkpoint``: A `tf.train.Checkpoint
@@ -154,7 +161,8 @@ TensorFlow API
 .. code:: python
 
    for step, inputs in enumerate(train_ds):
-       if step == 1:                    # NOTE: restore occurs on the second step
+       if step == 0:
            ckpt_manager.restore()
        loss = train_step(inputs)
 
+
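
Tying the two changes together, a hypothetical sketch (not taken from the diff) of using the new hook to restore a checkpoint right after partition, the use case called out in the 1.3.1 release notes; `smp` and `ckpt_manager` are assumed to be set up as in the surrounding API doc:

```python
import smdistributed.modelparallel.tensorflow as smp  # import alias assumed from the SMP docs

# Hypothetical hook: runs after the model is partitioned during the first
# smp.step call, but before the first forward pass.
@smp.register_post_partition_hook
def restore_after_partition():
    ckpt_manager.restore()  # assumes an smp.CheckpointManager created earlier
```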

doc/frameworks/huggingface/index.rst

Lines changed: 1 addition & 0 deletions
@@ -9,3 +9,4 @@ For general information about using the SageMaker Python SDK, see :ref:`overview
    :maxdepth: 2
 
    sagemaker.huggingface
+   Use Hugging Face with the SageMaker Python SDK <https://huggingface.co/transformers/sagemaker.html>

doc/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -1,2 +1,3 @@
 sphinx==3.1.1
 sphinx-rtd-theme==0.5.0
+docutils==0.15.2

src/sagemaker/clarify.py

Lines changed: 6 additions & 3 deletions
@@ -123,6 +123,7 @@ def __init__(
         content_type=None,
         content_template=None,
         custom_attributes=None,
+        accelerator_type=None,
     ):
         """Initializes a configuration of a model and the endpoint to be created for it.
 
@@ -151,6 +152,9 @@ def __init__(
                 Section 3.3.6. Field Value Components (
                 https://tools.ietf.org/html/rfc7230#section-3.2.6) of the Hypertext Transfer
                 Protocol (HTTP/1.1).
+            accelerator_type (str): The Elastic Inference accelerator type to deploy to the model
+                endpoint instance for making inferences to the model, see
+                https://docs.aws.amazon.com/sagemaker/latest/dg/ei.html.
         """
         self.predictor_config = {
             "model_name": model_name,
@@ -178,9 +182,8 @@ def __init__(
                 f" Please include a placeholder $features."
             )
             self.predictor_config["content_template"] = content_template
-
-        if custom_attributes is not None:
-            self.predictor_config["custom_attributes"] = custom_attributes
+        _set(custom_attributes, "custom_attributes", self.predictor_config)
+        _set(accelerator_type, "accelerator_type", self.predictor_config)
 
     def get_predictor_config(self):
         """Returns part of the predictor dictionary of the analysis config."""
