Merge branch 'master' into fix-disable-profiler-settings

metrizable · web-flow · commit 7a91c3f5f305 · 2021-02-16T10:50:53.000-08:00
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,16 @@
 # Changelog
 
+## v2.24.5 (2021-02-12)
+
+### Bug Fixes and Other Changes
+
+ * test_tag/test_tags method assert fix in association tests
+
+### Documentation Changes
+
+ * removing mention of TF 2.4 from SM distributed model parallel docs
+ * adding details about mpi options, other small updates
+
 ## v2.24.4 (2021-02-09)
 
 ### Bug Fixes and Other Changes
diff --git a/VERSION b/VERSION
@@ -1 +1 @@
-2.24.5.dev0
+2.24.6.dev0
diff --git a/buildspec-deploy.yml b/buildspec-deploy.yml
@@ -3,7 +3,16 @@ version: 0.2
 phases:
   build:
     commands:
-      - PACKAGE_FILE="$CODEBUILD_SRC_DIR_ARTIFACT_1/sagemaker-*.tar.gz"
+      # prepare the release (update versions, changelog etc.)
+      - git-release --prepare
+
+      # generate the distribution package
+      - python3 setup.py sdist
+
+      # publish the release to github
+      - git-release --publish
+
+      - PACKAGE_FILE="dist/sagemaker-*.tar.gz"
       - PYPI_USER=$(aws secretsmanager get-secret-value --secret-id /codebuild/pypi/user --query SecretString --output text)
       - PYPI_PASSWORD=$(aws secretsmanager get-secret-value --secret-id /codebuild/pypi/password --query SecretString --output text)
       - GPG_PRIVATE_KEY=$(aws secretsmanager get-secret-value --secret-id /codebuild/gpg/private_key --query SecretString --output text)
diff --git a/buildspec-release.yml b/buildspec-release.yml
@@ -3,9 +3,6 @@ version: 0.2
 phases:
   build:
     commands:
-      # prepare the release (update versions, changelog etc.)
-      - git-release --prepare
-
       # run linters
       - tox -e flake8,pylint
 
@@ -22,15 +19,3 @@ phases:
 
       # run a subset of the integration tests
       - IGNORE_COVERAGE=- tox -e py36 -- tests/integ -m canary_quick -n 64 --boxed --reruns 2
-
-      # generate the distribution package
-      - python3 setup.py sdist
-
-      # publish the release to github
-      - git-release --publish
-
-artifacts:
-  files:
-    - dist/sagemaker-*.tar.gz
-  name: ARTIFACT_1
-  discard-paths: yes
diff --git a/doc/api/training/smd_model_parallel_general.rst b/doc/api/training/smd_model_parallel_general.rst
@@ -5,13 +5,13 @@
 
 .. _sm-sdk-modelparallel-params:
 
-SageMaker Python SDK ``modelparallel`` parameters
-=================================================
+Required SageMaker Python SDK parameters
+========================================
 
 The TensorFlow and PyTorch ``Estimator`` objects contains a ``distribution`` parameter,
 which is used to enable and specify parameters for the
 initialization of the SageMaker distributed model parallel library. The library internally uses MPI,
-so in order to use model parallelism, MPI must be enabled using the ``distribution`` parameter.
+so in order to use model parallelism, MPI must also be enabled using the ``distribution`` parameter.
 
 The following is an example of how you can launch a new PyTorch training job with the library.
 
@@ -55,6 +55,9 @@ The following is an example of how you can launch a new PyTorch training job wit
 
    smd_mp_estimator.fit('s3://my_bucket/my_training_data/')
 
+``smdistributed`` Parameters
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
 You can use the following parameters to initialize the library using the ``parameters``
 in the ``smdistributed`` of ``distribution``.
 
@@ -302,6 +305,41 @@ table are optional.
    |                   |                         |                 | SageMaker.                        |
    +-------------------+-------------------------+-----------------+-----------------------------------+
 
+``mpi`` Parameters
+^^^^^^^^^^^^^^^^^^
+For the ``"mpi"`` key, a dict must be passed which contains:
+
+* ``"enabled"``: Set to ``True`` to launch the training job with MPI.
+
+* ``"processes_per_host"``: Specifies the number of processes MPI should launch on each host.
+  In SageMaker a host is a single Amazon EC2 ml instance. The SageMaker distributed model parallel library maintains
+  a one-to-one mapping between processes and GPUs across model and data parallelism.
+  This means that SageMaker schedules each process on a single, separate GPU and no GPU contains more than one process.
+  If you are using PyTorch, you must restrict each process to its own device using
+  ``torch.cuda.set_device(smp.local_rank())``. To learn more, see
+  `Modify a PyTorch Training Script
+  <https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-customize-training-script.html#model-parallel-customize-training-script-pt-16>`_.
+
+  .. important::
+   ``processes_per_host`` must be less than or equal to the number of GPUs per instance, and typically will be equal to
+   the number of GPUs per instance.
+
+  For example, if you use one instance with 4-way model parallelism and 2-way data parallelism,
+  then ``processes_per_host`` should be 2 x 4 = 8. Therefore, you must choose an instance that has at least 8 GPUs,
+  such as an ml.p3.16xlarge.
+
+  The following image illustrates how 2-way data parallelism and 4-way model parallelism are distributed across 8 GPUs:
+  the model is partitioned across 4 GPUs, and each partition is added to 2 GPUs.
+
+  .. image:: smp_versions/model-data-parallel.png
+      :width: 650
+      :alt: 2-way data parallelism and 4-way model parallelism distributed across 8 GPUs
+
+
+* ``"custom_mpi_options"``: Use this key to pass any custom MPI options you might need.
+  To avoid Docker warnings from contaminating your training logs, we recommend the following flag:
+  ``--mca btl_vader_single_copy_mechanism none``
+
 
 .. _ranking-basics:
 
diff --git a/doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.md b/doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.md
@@ -17,10 +17,6 @@
 
 - Adds support for `_register_comm_hook` (PyTorch 1.7 only) which will register the callable as a communication hook for DDP. NOTE: Like in DDP, this is an experimental API and subject to change.
 
-### Tensorflow
-
-- Adds support for Tensorflow 2.4
-
 ## Bug Fixes
 
 ### PyTorch
diff --git a/doc/api/training/smp_versions/model-data-parallel.png b/doc/api/training/smp_versions/model-data-parallel.png
diff --git a/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_common_api.rst b/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_common_api.rst
@@ -118,6 +118,9 @@ The following SageMaker distribute model parallel APIs are common across all fra
    -  https://www.tensorflow.org/api_docs/python/tf/function\
    -  https://www.tensorflow.org/guide/function\
 
+   Each ``smp.step`` decorated function must have a return value that depends on the
+   output of ``smp.DistributedModel``.
+
    **Common parameters**
 
    -  ``non_split_inputs`` (``list``): The list of arguments to the decorated function
diff --git a/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_pytorch.rst b/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_pytorch.rst
@@ -31,7 +31,6 @@ This API document assumes you use the following import statements in your traini
    model in the training script can be wrapped with
    ``smp.DistributedModel``.
 
-
    **Example:**
 
    .. code:: python
@@ -89,6 +88,17 @@ This API document assumes you use the following import statements in your traini
    the model objects (``model(inputs)`` and ``model.backward(loss)``) must be made inside
    a ``smp.step``-decorated function.
 
+   **Using DDP**
+
+   If DDP is enabled, do not place a PyTorch
+   ``DistributedDataParallel`` wrapper around the ``DistributedModel`` because
+   the ``DistributedModel`` wrapper will also handle data parallelism.
+
+   Unlike the original DDP wrapper, when you use ``DistributedModel``,
+   model parameters and buffers are not immediately broadcast across
+   processes when the wrapper is called. Instead, the broadcast is deferred to the first call of the
+   ``smp.step``-decorated function when the partition is done.
+
    **Parameters**
 
    -  ``module`` (``torch.nn.Module``): Module to be distributed (data parallelism and model parallelism).
@@ -248,11 +258,14 @@ This API document assumes you use the following import statements in your traini
    .. function:: join( )
 
       **Available for PyTorch 1.7 only**
+
       A context manager to be used in conjunction with an instance of
-      ``smp.DistributedModel``to be able to train with uneven inputs across
+      ``smp.DistributedModel`` to be able to train with uneven inputs across
       participating processes. This is only supported when ``ddp=True`` for
       ``smp.DistributedModel``. This will use the join with the wrapped
-      ``DistributedDataParallel`` instance. Please see: `join <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel.join>`__.
+      ``DistributedDataParallel`` instance. For more information, see:
+      `join <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel.join>`__
+      in the PyTorch documentation.
 
 
 .. class:: smp.DistributedOptimizer
diff --git a/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_tensorflow.rst b/doc/api/training/smp_versions/v1.2.0/smd_model_parallel_tensorflow.rst
@@ -1,7 +1,7 @@
 TensorFlow API
 ==============
 
-**Supported version: 2.4, 2.3**
+**Supported version: 2.3**
 
 **Important**: This API document assumes you use the following import statement in your training scripts.
 
diff --git a/src/sagemaker/analytics.py b/src/sagemaker/analytics.py
@@ -43,6 +43,7 @@ class AnalyticsMetricsBase(with_metaclass(ABCMeta, object)):
     """
 
     def __init__(self):
+        """Initializes ``AnalyticsMetricsBase`` instance."""
         self._dataframe = None
 
     def export_csv(self, filename):
diff --git a/src/sagemaker/image_uri_config/inferentia-mxnet.json b/src/sagemaker/image_uri_config/inferentia-mxnet.json
@@ -5,8 +5,29 @@
         "1.5.1": {
             "py_versions": ["py3"],
             "registries": {
+                "af-south-1": "774647643957",
+                "ap-east-1": "110948597952",
+                "ap-northeast-1": "941853720454",
+                "ap-northeast-2": "151534178276",
+                "ap-south-1": "763008648453",
+                "ap-southeast-1": "324986816169",
+                "ap-southeast-2": "355873309152",
+                "ca-central-1": "464438896020",
+                "cn-north-1": "472730292857",
+                "cn-northwest-1": "474822919863",
+                "eu-central-1": "746233611703",
+                "eu-north-1": "601324751636",
+                "eu-south-1": "966458181534",
+                "eu-west-1": "802834080501",
+                "eu-west-2": "205493899709",
+                "eu-west-3": "254080097072",
+                "me-south-1": "836785723513",
+                "sa-east-1": "756306329178",
                 "us-east-1": "785573368785",
-                "us-west-2": "301217895009"
+                "us-east-2": "007439368137",
+                "us-gov-west-1": "263933020539",
+                "us-west-1": "710691900526",
+                "us-west-2": "301217895009"                
             },
             "repository": "sagemaker-neo-mxnet"
         }
diff --git a/src/sagemaker/image_uri_config/inferentia-pytorch.json b/src/sagemaker/image_uri_config/inferentia-pytorch.json
@@ -5,7 +5,28 @@
         "1.5.1": {
             "py_versions": ["py3"],
             "registries": {
+                "af-south-1": "774647643957",
+                "ap-east-1": "110948597952",
+                "ap-northeast-1": "941853720454",
+                "ap-northeast-2": "151534178276",
+                "ap-south-1": "763008648453",
+                "ap-southeast-1": "324986816169",
+                "ap-southeast-2": "355873309152",
+                "ca-central-1": "464438896020",
+                "cn-north-1": "472730292857",
+                "cn-northwest-1": "474822919863",
+                "eu-central-1": "746233611703",
+                "eu-north-1": "601324751636",
+                "eu-south-1": "966458181534",
+                "eu-west-1": "802834080501",
+                "eu-west-2": "205493899709",
+                "eu-west-3": "254080097072",
+                "me-south-1": "836785723513",
+                "sa-east-1": "756306329178",
                 "us-east-1": "785573368785",
+                "us-east-2": "007439368137",
+                "us-gov-west-1": "263933020539",
+                "us-west-1": "710691900526",
                 "us-west-2": "301217895009"
             },
             "repository": "sagemaker-neo-pytorch"
diff --git a/src/sagemaker/image_uri_config/inferentia-tensorflow.json b/src/sagemaker/image_uri_config/inferentia-tensorflow.json
@@ -5,8 +5,29 @@
         "1.15.0": {
             "py_versions": ["py3"],
             "registries": {
+                "af-south-1": "774647643957",
+                "ap-east-1": "110948597952",
+                "ap-northeast-1": "941853720454",
+                "ap-northeast-2": "151534178276",
+                "ap-south-1": "763008648453",
+                "ap-southeast-1": "324986816169",
+                "ap-southeast-2": "355873309152",
+                "ca-central-1": "464438896020",
+                "cn-north-1": "472730292857",
+                "cn-northwest-1": "474822919863",
+                "eu-central-1": "746233611703",
+                "eu-north-1": "601324751636",
+                "eu-south-1": "966458181534",
+                "eu-west-1": "802834080501",
+                "eu-west-2": "205493899709",
+                "eu-west-3": "254080097072",
+                "me-south-1": "836785723513",
+                "sa-east-1": "756306329178",
                 "us-east-1": "785573368785",
-                "us-west-2": "301217895009"
+                "us-east-2": "007439368137",
+                "us-gov-west-1": "263933020539",
+                "us-west-1": "710691900526",
+                "us-west-2": "301217895009"                
             },
             "repository": "sagemaker-neo-tensorflow"
         }
diff --git a/tests/integ/sagemaker/lineage/test_action.py b/tests/integ/sagemaker/lineage/test_action.py
@@ -90,8 +90,8 @@ def test_tag(action_obj, sagemaker_session):
         )["Tags"]
         if actual_tags:
             break
-    # When sagemaker-client-config endpoint-url is passed as argument to hit beta,
-    # length of actual tags will be 2
+    # When sagemaker-client-config endpoint-url is passed as argument to hit some endpoints,
+    # length of actual tags will be greater than 1
     assert len(actual_tags) > 0
     assert actual_tags[0] == tag
 
@@ -106,7 +106,7 @@ def test_tags(action_obj, sagemaker_session):
         )["Tags"]
         if actual_tags:
             break
-    # When sagemaker-client-config endpoint-url is passed as argument to hit beta,
-    # length of actual tags will be 2
+    # When sagemaker-client-config endpoint-url is passed as argument to hit some endpoints,
+    # length of actual tags will be greater than 1
     assert len(actual_tags) > 0
     assert [actual_tags[-1]] == tags
diff --git a/tests/integ/sagemaker/lineage/test_artifact.py b/tests/integ/sagemaker/lineage/test_artifact.py
@@ -121,8 +121,8 @@ def test_tag(artifact_obj, sagemaker_session):
         )["Tags"]
         if actual_tags:
             break
-    # When sagemaker-client-config endpoint-url is passed as argument to hit beta,
-    # length of actual tags will be 2
+    # When sagemaker-client-config endpoint-url is passed as argument to hit some endpoints,
+    # length of actual tags will be greater than 1
     assert len(actual_tags) > 0
     assert actual_tags[0] == tag
 
@@ -137,7 +137,7 @@ def test_tags(artifact_obj, sagemaker_session):
         )["Tags"]
         if actual_tags:
             break
-    # When sagemaker-client-config endpoint-url is passed as argument to hit beta,
-    # length of actual tags will be 2
+    # When sagemaker-client-config endpoint-url is passed as argument to hit some endpoints,
+    # length of actual tags will be greater than 1
     assert len(actual_tags) > 0
     assert [actual_tags[-1]] == tags
diff --git a/tests/integ/sagemaker/lineage/test_association.py b/tests/integ/sagemaker/lineage/test_association.py
@@ -66,7 +66,9 @@ def test_set_tag(association_obj, sagemaker_session):
         if actual_tags:
             break
         time.sleep(1)
-    assert len(actual_tags) == 1
+    # When sagemaker-client-config endpoint-url is passed as argument to hit some endpoints,
+    # length of actual tags will be greater than 1
+    assert len(actual_tags) > 0
     assert actual_tags[0] == tag
 
 
@@ -81,5 +83,7 @@ def test_tags(association_obj, sagemaker_session):
         if actual_tags:
             break
         time.sleep(1)
-    assert len(actual_tags) == 1
-    assert actual_tags == tags
+    # When sagemaker-client-config endpoint-url is passed as argument to hit some endpoints,
+    # length of actual tags will be greater than 1
+    assert len(actual_tags) > 0
+    assert [actual_tags[-1]] == tags
diff --git a/tests/integ/sagemaker/lineage/test_context.py b/tests/integ/sagemaker/lineage/test_context.py
@@ -88,8 +88,8 @@ def test_tag(context_obj, sagemaker_session):
         )["Tags"]
         if actual_tags:
             break
-    # When sagemaker-client-config endpoint-url is passed as argument to hit beta,
-    # length of actual tags will be 2
+    # When sagemaker-client-config endpoint-url is passed as argument to hit some endpoints,
+    # length of actual tags will be greater than 1
     assert len(actual_tags) > 0
     assert actual_tags[0] == tag
 
@@ -104,7 +104,7 @@ def test_tags(context_obj, sagemaker_session):
         )["Tags"]
         if actual_tags:
             break
-    # When sagemaker-client-config endpoint-url is passed as argument to hit beta,
-    # length of actual tags will be 2
+    # When sagemaker-client-config endpoint-url is passed as argument to hit some endpoints,
+    # length of actual tags will be greater than 1
     assert len(actual_tags) > 0
     assert [actual_tags[-1]] == tags
diff --git a/tests/integ/test_clarify_model_monitor.py b/tests/integ/test_clarify_model_monitor.py
diff --git a/tests/integ/test_experiments_analytics.py b/tests/integ/test_experiments_analytics.py
diff --git a/tests/unit/sagemaker/image_uris/test_neo.py b/tests/unit/sagemaker/image_uris/test_neo.py
diff --git a/tox.ini b/tox.ini