Commit 319d135

Merge branch 'master' into iss106487
2 parents e8e3d46 + a739945

File tree: 12 files changed, +258 −29 lines

CHANGELOG.md

Lines changed: 25 additions & 0 deletions

@@ -1,5 +1,30 @@
 # Changelog

+## v2.24.1 (2021-01-28)
+
+### Bug Fixes and Other Changes
+
+ * fix collect-tests tox env
+ * create profiler specific unsupported regions
+ * Update smd_model_parallel_pytorch.rst
+
+## v2.24.0 (2021-01-22)
+
+### Features
+
+ * add support for Std:Join for pipelines
+ * Map image name to image uri
+ * friendly names for short URIs
+
+### Bug Fixes and Other Changes
+
+ * increase allowed time for search to get updated
+ * refactor distribution config construction
+
+### Documentation Changes
+
+ * Add SMP 1.2.0 API docs
+
 ## v2.23.6 (2021-01-20)

 ### Bug Fixes and Other Changes

VERSION

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-2.23.7.dev0
+2.24.2.dev0

doc/api/training/smp_versions/v1.2.0/smd_model_parallel_pytorch.rst

Lines changed: 3 additions & 3 deletions (the removed and added lines render identically, so the changes are evidently whitespace-only)

@@ -140,16 +140,16 @@ This API document assumes you use the following import statements in your training script
     computation. \ ``bucket_cap_mb``\ controls the bucket size in MegaBytes
     (MB).

-   - ``trace_memory_usage`` (default: False): When set to True, the library attempts
+   - ``trace_memory_usage`` (default: False): When set to True, the library attempts
     to measure memory usage per module during tracing. If this is disabled,
     memory usage will be estimated through the sizes of tensors returned from
     the module.

-   - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``.
+   - ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``.
     This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper.
     Please see: `broadcast_buffer <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel>`__.

-   - ``gradient_as_bucket_view (PyTorch 1.7 only)`` (default: False): To be
+   - ``gradient_as_bucket_view (PyTorch 1.7 only)`` (default: False): To be
     used with ``ddp=True``. This parameter is forwarded to the underlying
     ``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel>`__.

src/sagemaker/estimator.py

Lines changed: 3 additions & 2 deletions

@@ -49,6 +49,7 @@
     UploadedCode,
     validate_source_dir,
     _region_supports_debugger,
+    _region_supports_profiler,
     get_mp_parameters,
 )
 from sagemaker.inputs import TrainingInput

@@ -494,7 +495,7 @@ def _prepare_profiler_for_training(self):
         """Set necessary values and do basic validations in profiler config and profiler rules.

         When user explicitly set rules to an empty list, default profiler rule won't be enabled.
-        Default profiler rule will be enabled when either:
+        Default profiler rule will be enabled in supported regions when either:
         1. user doesn't specify any rules, i.e., rules=None; or
         2. user only specify debugger rules, i.e., rules=[Rule.sagemaker(...)]
         """

@@ -503,7 +504,7 @@ def _prepare_profiler_for_training(self):
                 raise RuntimeError("profiler_config cannot be set when disable_profiler is True.")
             if self.profiler_rules:
                 raise RuntimeError("ProfilerRule cannot be set when disable_profiler is True.")
-        elif _region_supports_debugger(self.sagemaker_session.boto_region_name):
+        elif _region_supports_profiler(self.sagemaker_session.boto_region_name):
             if self.profiler_config is None:
                 self.profiler_config = ProfilerConfig(s3_output_path=self.output_path)
             if self.rules is None or (self.rules and not self.profiler_rules):
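
What this means for callers: an estimator created with rules=None (or with only Debugger rules) still gets the default profiler rule, but now only when the session's region passes _region_supports_profiler(); rules=[] keeps it disabled as before. A minimal sketch under those assumptions (the estimator class, script name, and role ARN are illustrative, not from this commit):

    from sagemaker.debugger import Rule, rule_configs
    from sagemaker.sklearn import SKLearn

    # Only Debugger rules are given, so _prepare_profiler_for_training() will
    # also attach the default profiler rule -- unless the session's region is
    # in PROFILER_UNSUPPORTED_REGIONS, in which case profiler setup is skipped.
    estimator = SKLearn(
        entry_point="train.py",  # hypothetical training script
        role="arn:aws:iam::123456789012:role/ExampleRole",  # hypothetical role
        instance_type="ml.m5.xlarge",
        instance_count=1,
        framework_version="0.20.0",
        py_version="py3",
        rules=[Rule.sagemaker(rule_configs.loss_not_decreasing())],
    )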

src/sagemaker/fw_utils.py

Lines changed: 15 additions & 0 deletions

@@ -49,6 +49,8 @@
 )

 DEBUGGER_UNSUPPORTED_REGIONS = ("us-iso-east-1",)
+PROFILER_UNSUPPORTED_REGIONS = ("us-iso-east-1", "cn-north-1", "cn-northwest-1")
+
 SINGLE_GPU_INSTANCE_TYPES = ("ml.p2.xlarge", "ml.p3.2xlarge")
 SM_DATAPARALLEL_SUPPORTED_INSTANCE_TYPES = (
     "ml.p3.16xlarge",

@@ -550,6 +552,19 @@ def _region_supports_debugger(region_name):
     return region_name.lower() not in DEBUGGER_UNSUPPORTED_REGIONS


+def _region_supports_profiler(region_name):
+    """Returns bool indicating whether region supports Amazon SageMaker Debugger profiling feature.
+
+    Args:
+        region_name (str): Name of the region to check against.
+
+    Returns:
+        bool: Whether or not the region supports Amazon SageMaker Debugger profiling feature.
+
+    """
+    return region_name.lower() not in PROFILER_UNSUPPORTED_REGIONS
+
+
 def validate_version_or_image_args(framework_version, py_version, image_uri):
     """Checks if version or image arguments are specified.
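
A quick sketch of the new helper's behavior, following directly from the tuple and the case-insensitive membership check above:

    from sagemaker.fw_utils import _region_supports_profiler

    # The region name is lowercased before the lookup, so mixed-case input
    # is handled the same as lowercase.
    _region_supports_profiler("us-west-2")      # True: not in the unsupported tuple
    _region_supports_profiler("cn-north-1")     # False
    _region_supports_profiler("US-ISO-EAST-1")  # False: lowercased, then matched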

src/sagemaker/processing.py

Lines changed: 5 additions & 0 deletions

@@ -31,6 +31,7 @@
 from sagemaker.session import Session
 from sagemaker.network import NetworkConfig  # noqa: F401 # pylint: disable=unused-import
 from sagemaker.workflow.properties import Properties
+from sagemaker.workflow.entities import Expression
 from sagemaker.dataset_definition.inputs import S3Input, DatasetDefinition
 from sagemaker.apiutils._base_types import ApiObject

@@ -338,6 +339,10 @@ def _normalize_outputs(self, outputs=None):
             # Generate a name for the ProcessingOutput if it doesn't have one.
             if output.output_name is None:
                 output.output_name = "output-{}".format(count)
+            # if the output's destination is a workflow expression, do no normalization
+            if isinstance(output.destination, Expression):
+                normalized_outputs.append(output)
+                continue
             # If the output's destination is not an s3_uri, create one.
             parse_result = urlparse(output.destination)
             if parse_result.scheme != "s3":
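
The early continue matters because a workflow expression is a structured object rather than a string: it is only resolved to an S3 URI by the Pipelines service at execution time, so feeding it to urlparse() would be meaningless. A minimal sketch of the case this guards (bucket and prefix values are hypothetical):

    from sagemaker.processing import ProcessingOutput
    from sagemaker.workflow.functions import Join

    # destination is an Expression, so _normalize_outputs() now appends the
    # output unchanged instead of trying to parse and rewrite the URI.
    output = ProcessingOutput(
        output_name="test_data",
        source="/opt/ml/processing/test",
        destination=Join(on="/", values=["s3:/", "my-bucket", "my-prefix"]),
    )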

src/sagemaker/workflow/functions.py

Lines changed: 46 additions & 0 deletions

@@ -0,0 +1,46 @@
+# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+"""The step definitions for workflow."""
+from __future__ import absolute_import
+
+from typing import List
+
+import attr
+
+from sagemaker.workflow.entities import Expression
+
+
+@attr.s
+class Join(Expression):
+    """Join together properties.
+
+    Attributes:
+        values (List[Union[PrimitiveType, Parameter]]): The primitive types
+            and parameters to join.
+        on (str): The string to join the values on (defaults to "").
+    """
+
+    on: str = attr.ib(factory=str)
+    values: List = attr.ib(factory=list)
+
+    @property
+    def expr(self):
+        """The expression dict for a `Join` function."""
+        return {
+            "Std:Join": {
+                "On": self.on,
+                "Values": [
+                    value.expr if hasattr(value, "expr") else value for value in self.values
+                ],
+            },
+        }
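
A short usage sketch of what the expression serializes to (the joined values here are hypothetical):

    from sagemaker.workflow.functions import Join

    # Values that expose an `expr` attribute (parameters, execution variables)
    # are serialized through it; plain primitives pass through unchanged.
    dest = Join(on="/", values=["s3:/", "my-bucket", "my-prefix"])
    print(dest.expr)
    # {'Std:Join': {'On': '/', 'Values': ['s3:/', 'my-bucket', 'my-prefix']}}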

tests/integ/test_experiments_analytics.py

Lines changed: 22 additions & 0 deletions

@@ -38,6 +38,17 @@ def experiment(sagemaker_session):

         time.sleep(15)  # wait for search to get updated

+        # allow search time thrice
+        for _ in range(3):
+            analytics = ExperimentAnalytics(
+                experiment_name=experiment_name, sagemaker_session=sagemaker_session
+            )
+
+            if len(analytics.dataframe().columns) > 0:
+                break
+
+            time.sleep(15)
+
         yield experiment_name
     finally:
         _delete_resources(sm, experiment_name, trials)

@@ -79,6 +90,17 @@ def experiment_with_artifacts(sagemaker_session):

         time.sleep(15)  # wait for search to get updated

+        # allow search time thrice
+        for _ in range(3):
+            analytics = ExperimentAnalytics(
+                experiment_name=experiment_name, sagemaker_session=sagemaker_session
+            )
+
+            if len(analytics.dataframe().columns) > 0:
+                break
+
+            time.sleep(15)
+
         yield experiment_name
     finally:
         _delete_resources(sm, experiment_name, trials)
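
This is the "increase allowed time for search to get updated" fix from the changelog: instead of assuming a single 15-second sleep is enough for Search to index the new experiment, each fixture now polls up to three more times at 15-second intervals, breaking out as soon as the analytics dataframe comes back non-empty.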

tests/integ/test_workflow.py

Lines changed: 41 additions & 21 deletions

@@ -38,6 +38,8 @@
 from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
 from sagemaker.workflow.condition_step import ConditionStep
 from sagemaker.dataset_definition.inputs import DatasetDefinition, AthenaDatasetDefinition
+from sagemaker.workflow.execution_variables import ExecutionVariables
+from sagemaker.workflow.functions import Join
 from sagemaker.workflow.parameters import (
     ParameterInteger,
     ParameterString,

@@ -72,16 +74,9 @@ def role(sagemaker_session):
     return get_execution_role(sagemaker_session)


-# TODO-reinvent-2020: remove use of specific region and this session
 @pytest.fixture(scope="module")
-def region():
-    return "us-east-2"
-
-
-# TODO-reinvent-2020: remove use of specific region and this session
-@pytest.fixture(scope="module")
-def workflow_session(region):
-    boto_session = boto3.Session(region_name=region)
+def workflow_session(region_name):
+    boto_session = boto3.Session(region_name=region_name)

     sagemaker_client_config = dict()
     sagemaker_client_config.setdefault("config", Config(retries=dict(max_attempts=2)))

@@ -134,6 +129,7 @@ def test_three_step_definition(
     framework_version = "0.20.0"
     instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
     instance_count = ParameterInteger(name="InstanceCount", default_value=1)
+    output_prefix = ParameterString(name="OutputPrefix", default_value="output")

     input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv"

@@ -154,7 +150,20 @@
         ],
         outputs=[
             ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"),
-            ProcessingOutput(output_name="test_data", source="/opt/ml/processing/test"),
+            ProcessingOutput(
+                output_name="test_data",
+                source="/opt/ml/processing/test",
+                destination=Join(
+                    on="/",
+                    values=[
+                        "s3:/",
+                        sagemaker_session.default_bucket(),
+                        "test-sklearn",
+                        output_prefix,
+                        ExecutionVariables.PIPELINE_EXECUTION_ID,
+                    ],
+                ),
+            ),
         ],
         code=os.path.join(script_dir, "preprocessing.py"),
     )

@@ -194,7 +203,7 @@
     pipeline = Pipeline(
         name=pipeline_name,
-        parameters=[instance_type, instance_count],
+        parameters=[instance_type, instance_count, output_prefix],
         steps=[step_process, step_train, step_model],
         sagemaker_session=workflow_session,
     )

@@ -208,6 +217,7 @@
         {"Name": "InstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge"}.items()
     ),
     tuple({"Name": "InstanceCount", "Type": "Integer", "DefaultValue": 1}.items()),
+    tuple({"Name": "OutputPrefix", "Type": "String", "DefaultValue": "output"}.items()),
 ]
 )

@@ -251,17 +261,28 @@
     assert model_args["PrimaryContainer"]["ModelDataUrl"] == {
         "Get": "Steps.my-train.ModelArtifacts.S3ModelArtifacts"
     }
+    try:
+        response = pipeline.create(role)
+        create_arn = response["PipelineArn"]
+        assert re.match(
+            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
+            create_arn,
+        )
+    finally:
+        try:
+            pipeline.delete()
+        except Exception:
+            pass


-# TODO-reinvent-2020: Modify use of the workflow client
 def test_one_step_sklearn_processing_pipeline(
     sagemaker_session,
     workflow_session,
     role,
     sklearn_latest_version,
     cpu_instance_type,
     pipeline_name,
-    region,
+    region_name,
     athena_dataset_definition,
 ):
     instance_count = ParameterInteger(name="InstanceCount", default_value=2)

@@ -305,21 +326,21 @@
         response = pipeline.create(role)
         create_arn = response["PipelineArn"]
         assert re.match(
-            fr"arn:aws:sagemaker:{region}:\d{{12}}:pipeline/{pipeline_name}",
+            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
             create_arn,
         )

         pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
         response = pipeline.update(role)
         update_arn = response["PipelineArn"]
         assert re.match(
-            fr"arn:aws:sagemaker:{region}:\d{{12}}:pipeline/{pipeline_name}",
+            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
             update_arn,
         )

         execution = pipeline.start(parameters={})
         assert re.match(
-            fr"arn:aws:sagemaker:{region}:\d{{12}}:pipeline/{pipeline_name}/execution/",
+            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
             execution.arn,
         )

@@ -340,14 +361,13 @@
             pass


-# TODO-reinvent-2020: Modify use of the workflow client
 def test_conditional_pytorch_training_model_registration(
     sagemaker_session,
     workflow_session,
     role,
     cpu_instance_type,
     pipeline_name,
-    region,
+    region_name,
 ):
     base_dir = os.path.join(DATA_DIR, "pytorch_mnist")
     entry_point = os.path.join(base_dir, "mnist.py")

@@ -420,18 +440,18 @@
         response = pipeline.create(role)
         create_arn = response["PipelineArn"]
         assert re.match(
-            fr"arn:aws:sagemaker:{region}:\d{{12}}:pipeline/{pipeline_name}", create_arn
+            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", create_arn
         )

         execution = pipeline.start(parameters={})
         assert re.match(
-            fr"arn:aws:sagemaker:{region}:\d{{12}}:pipeline/{pipeline_name}/execution/",
+            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
             execution.arn,
         )

         execution = pipeline.start(parameters={"GoodEnoughInput": 0})
         assert re.match(
-            fr"arn:aws:sagemaker:{region}:\d{{12}}:pipeline/{pipeline_name}/execution/",
+            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
             execution.arn,
         )
     finally:
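
Worth noting in the Join used for the processing output destination: joining on "/" with the literal "s3:/" as the first value yields the s3:// scheme, so at execution time the destination resolves to something like s3://<default-bucket>/test-sklearn/<OutputPrefix>/<execution-id>, giving each pipeline execution its own output prefix via ExecutionVariables.PIPELINE_EXECUTION_ID.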
