
Commit 1b03c5d

Merge branch 'master' into smddp_custom_mpi_support
2 parents 6343e19 + f7161f0

File tree

8 files changed: +130 -14 lines changed

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
@@ -1,5 +1,16 @@
 # Changelog
 
+## v2.35.0 (2021-04-14)
+
+### Features
+
+ * add support for PyTorch 1.8.1
+
+### Bug Fixes and Other Changes
+
+ * boto3 client param updated for feature store
+ * Updated release notes and API doc for smd model parallel 1.3.1
+
 ## v2.34.0 (2021-04-12)
 
 ### Features

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.34.1.dev0
+2.35.1.dev0

doc/amazon_sagemaker_featurestore.rst

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ use the SageMaker default bucket and add a custom prefix to it.
 offline_feature_store_bucket = 's3://*{}*/*{}*'.format(default_bucket, prefix)
 
 sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
-featurestore_runtime = boto_session.client(service_name='featurestore-runtime', region_name=region)
+featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)
 
 feature_store_session = Session(
     boto_session=boto_session,
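
For context, a minimal sketch of how the corrected service name fits into the surrounding example from that documentation page; the region value here is a placeholder:

    import boto3
    from sagemaker.session import Session

    region = "us-west-2"  # placeholder region
    boto_session = boto3.Session(region_name=region)

    sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
    # "sagemaker-featurestore-runtime" is the boto3 service name recognized by
    # the SDK; the previous "featurestore-runtime" string is not a valid service.
    featurestore_runtime = boto_session.client(
        service_name="sagemaker-featurestore-runtime", region_name=region
    )

    feature_store_session = Session(
        boto_session=boto_session,
        sagemaker_client=sagemaker_client,
        sagemaker_featurestore_runtime_client=featurestore_runtime,
    )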

doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.md

Lines changed: 30 additions & 0 deletions
@@ -1,3 +1,33 @@
+# Sagemaker Distributed Model Parallel 1.3.1 Release Notes
+
+- New Features
+- Bug Fixes
+- Known Issues
+
+## New Features
+
+### TensorFlow
+
+- Exposes a new decorator ``register_post_partition_hook``. This allows invoking the decorated methods just after model partition but before executing the first step, for example, loading a checkpoint. Refer to the [SageMaker distributed model parallel API documentation](https://sagemaker.readthedocs.io/en/stable/api/training/smp_versions/latest/smd_model_parallel_tensorflow.html) for more information.
+
+## Bug Fixes
+
+### PyTorch
+
+- Improved memory efficiency when using active microbatches by clearing activations at the end of each microbatch.
+
+### TensorFlow
+
+- Fixed an issue that caused hangs when training some models with XLA enabled.
+
+## Known Issues
+
+### PyTorch
+
+- A crash was observed when ``optimizer.step()`` was called for certain optimizers such as AdaDelta, when the partition on which this method was called has no local parameters assigned to it after partitioning. This is due to a bug in PyTorch which [has since been fixed](https://github.com/pytorch/pytorch/pull/52944). Until that fix makes its way into the next PyTorch release, only call ``optimizer.step()`` on processes which have at least one local parameter. This can be checked with ``len(list(model.local_parameters())) > 0``.
+
+- A performance regression still exists when training on SMP with PyTorch 1.7.1 compared to 1.6. The root cause was found to be a slowdown of `.grad` method calls in PyTorch 1.7.1 compared to 1.6. See the related discussion: https://github.com/pytorch/pytorch/issues/50636. This issue does not exist with PyTorch 1.8.
+
 # Sagemaker Distributed Model Parallel 1.3.0 Release Notes
 
 - New Features
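
The workaround described in the PyTorch known issue above can be sketched as follows. This is illustrative only, assuming ``model`` is an ``smp.DistributedModel`` and ``optimizer`` is the corresponding distributed optimizer:

    def safe_optimizer_step(model, optimizer):
        # Only call optimizer.step() on ranks whose partition received at
        # least one local parameter, to avoid the PyTorch crash noted above.
        if len(list(model.local_parameters())) > 0:
            optimizer.step()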

doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst

Lines changed: 17 additions & 9 deletions
@@ -83,7 +83,21 @@ TensorFlow API
     with smp.partition(3):
         z = tf.reduce_sum(y)             # placed in partition 3
 
-
+
+.. function:: register_post_partition_hook(hook)
+
+   Registers a callable ``hook`` to
+   be executed after the model is partitioned. This is useful in situations
+   where an operation needs to be executed after the model partition during
+   the first call to ``smp.step``, but before the actual execution of the
+   first forward pass.
+
+   .. code:: python
+
+      @smp.register_post_partition_hook
+      def test_eager():
+          # All statements here will be executed right after partition but before the first forward pass
+          tf.print("Entered hook through eager context")
 
 .. class:: smp.CheckpointManager
 

@@ -102,13 +116,6 @@ TensorFlow API
                       max_to_keep=None,
                       checkpoint_name="ckpt")
 
-
-   **Important:** ``smp.CheckpointManager.restore()`` must be called after
-   the first training step. This is because the first call of the
-   ``smp.step`` function constructs and partitions the model, which must
-   take place before the checkpoint restore. Calling it before the first
-   ``smp.step`` call might result in hangs or unexpected behavior.
-
    **Parameters**
 
    - ``checkpoint``: A `tf.train.Checkpoint
@@ -154,7 +161,8 @@ TensorFlow API
 .. code:: python
 
 for step, inputs in enumerate(train_ds):
-    if step == 1:                    # NOTE: restore occurs on the second step
+    if step == 0:
         ckpt_manager.restore()
     loss = train_step(inputs)
 
+
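
The decorator documented above is intended for exactly this kind of post-partition work (the release notes name checkpoint loading as the example use case). A minimal, illustrative sketch, assuming ``ckpt_manager`` is an ``smp.CheckpointManager`` created earlier for the model being trained:

    import smdistributed.modelparallel.tensorflow as smp

    @smp.register_post_partition_hook
    def restore_from_checkpoint():
        # Runs right after the first smp.step call partitions the model,
        # but before the first forward pass executes.
        ckpt_manager.restore()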

src/sagemaker/fw_utils.py

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@
 )
 SM_DATAPARALLEL_SUPPORTED_FRAMEWORK_VERSIONS = {
     "tensorflow": ["2.3", "2.3.1", "2.3.2", "2.4", "2.4.1"],
-    "pytorch": ["1.6", "1.6.0", "1.7", "1.7.1", "1.8", "1.8.0"],
+    "pytorch": ["1.6", "1.6.0", "1.7", "1.7.1", "1.8", "1.8.0", "1.8.1"],
 }
 SMDISTRIBUTED_SUPPORTED_STRATEGIES = ["dataparallel", "modelparallel"]
 
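
With "1.8.1" added to the supported list, an estimator configured for SageMaker data parallel passes this validation. A sketch, not taken from the repository, with placeholder role, entry point, and instance settings:

    from sagemaker.pytorch import PyTorch

    estimator = PyTorch(
        entry_point="train.py",                               # placeholder script
        role="arn:aws:iam::123456789012:role/SageMakerRole",  # placeholder role
        framework_version="1.8.1",
        py_version="py36",
        instance_type="ml.p3.16xlarge",
        instance_count=2,
        # smdistributed dataparallel is validated against
        # SM_DATAPARALLEL_SUPPORTED_FRAMEWORK_VERSIONS in fw_utils.py
        distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
    )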

src/sagemaker/image_uri_config/pytorch.json

Lines changed: 68 additions & 2 deletions
@@ -56,7 +56,7 @@
             "1.5": "1.5.0",
             "1.6": "1.6.0",
             "1.7": "1.7.1",
-            "1.8": "1.8.0"
+            "1.8": "1.8.1"
         },
         "versions": {
             "0.4.0": {
@@ -386,6 +386,39 @@
                     "us-west-2": "763104351884"
                 },
                 "repository": "pytorch-inference"
+            },
+            "1.8.1": {
+                "py_versions": [
+                    "py3",
+                    "py36"
+                ],
+                "registries": {
+                    "af-south-1": "626614931356",
+                    "ap-east-1": "871362719292",
+                    "ap-northeast-1": "763104351884",
+                    "ap-northeast-2": "763104351884",
+                    "ap-south-1": "763104351884",
+                    "ap-southeast-1": "763104351884",
+                    "ap-southeast-2": "763104351884",
+                    "ca-central-1": "763104351884",
+                    "cn-north-1": "727897471807",
+                    "cn-northwest-1": "727897471807",
+                    "eu-central-1": "763104351884",
+                    "eu-north-1": "763104351884",
+                    "eu-west-1": "763104351884",
+                    "eu-west-2": "763104351884",
+                    "eu-west-3": "763104351884",
+                    "eu-south-1": "692866216735",
+                    "me-south-1": "217643126080",
+                    "sa-east-1": "763104351884",
+                    "us-east-1": "763104351884",
+                    "us-east-2": "763104351884",
+                    "us-gov-west-1": "442386744353",
+                    "us-iso-east-1": "886529160074",
+                    "us-west-1": "763104351884",
+                    "us-west-2": "763104351884"
+                },
+                "repository": "pytorch-inference"
             }
         }
     },
@@ -404,7 +437,7 @@
             "1.5": "1.5.0",
             "1.6": "1.6.0",
             "1.7": "1.7.1",
-            "1.8": "1.8.0"
+            "1.8": "1.8.1"
         },
         "versions": {
             "0.4.0": {
@@ -735,6 +768,39 @@
                     "us-west-2": "763104351884"
                 },
                 "repository": "pytorch-training"
+            },
+            "1.8.1": {
+                "py_versions": [
+                    "py3",
+                    "py36"
+                ],
+                "registries": {
+                    "af-south-1": "626614931356",
+                    "ap-east-1": "871362719292",
+                    "ap-northeast-1": "763104351884",
+                    "ap-northeast-2": "763104351884",
+                    "ap-south-1": "763104351884",
+                    "ap-southeast-1": "763104351884",
+                    "ap-southeast-2": "763104351884",
+                    "ca-central-1": "763104351884",
+                    "cn-north-1": "727897471807",
+                    "cn-northwest-1": "727897471807",
+                    "eu-central-1": "763104351884",
+                    "eu-north-1": "763104351884",
+                    "eu-west-1": "763104351884",
+                    "eu-west-2": "763104351884",
+                    "eu-west-3": "763104351884",
+                    "eu-south-1": "692866216735",
+                    "me-south-1": "217643126080",
+                    "sa-east-1": "763104351884",
+                    "us-east-1": "763104351884",
+                    "us-east-2": "763104351884",
+                    "us-gov-west-1": "442386744353",
+                    "us-iso-east-1": "886529160074",
+                    "us-west-1": "763104351884",
+                    "us-west-2": "763104351884"
+                },
+                "repository": "pytorch-training"
             }
         }
     }
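
Once these registry entries exist, the SDK can resolve a 1.8.1 image URI directly. A sketch using the public image_uris helper; the region and instance type below are placeholders:

    from sagemaker import image_uris

    # Resolves to the pytorch-training repository in the per-region registry
    # defined in image_uri_config/pytorch.json.
    uri = image_uris.retrieve(
        framework="pytorch",
        region="us-west-2",
        version="1.8.1",
        py_version="py36",
        image_scope="training",
        instance_type="ml.p3.2xlarge",
    )
    print(uri)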

tests/unit/test_fw_utils.py

Lines changed: 1 addition & 0 deletions
@@ -649,6 +649,7 @@ def test_validate_smdataparallel_args_not_raises():
         ("ml.p3.16xlarge", "pytorch", "1.7.1", "py3", smdataparallel_enabled),
         ("ml.p3.16xlarge", "pytorch", "1.7", "py3", smdataparallel_enabled),
         ("ml.p3.16xlarge", "pytorch", "1.8.0", "py3", smdataparallel_enabled),
+        ("ml.p3.16xlarge", "pytorch", "1.8.1", "py3", smdataparallel_enabled),
         ("ml.p3.16xlarge", "pytorch", "1.8", "py3", smdataparallel_enabled),
         ("ml.p3.16xlarge", "tensorflow", "2.4.1", "py3", smdataparallel_enabled_custom_mpi),
         ("ml.p3.16xlarge", "pytorch", "1.8.0", "py3", smdataparallel_enabled_custom_mpi),
