feature: add support for Std:Join for pipelines (#2103)

metrizable · ndodda-amazon · web-flow · commit 92115628086e · 2021-01-22T14:41:45.000-08:00
* feature: add support for Std:Join for pipelines * Update src/sagemaker/workflow/functions.py * Update tests/unit/sagemaker/workflow/test_functions.py * fix: ensure region is specified for workflow client * feature: Map image name to image uri (#2100) * Map image name to image uri * fix bug in test Co-authored-by: Neelesh Dodda <ndodda@amazon.com>
diff --git a/src/sagemaker/processing.py b/src/sagemaker/processing.py
@@ -31,6 +31,7 @@
 from sagemaker.session import Session
 from sagemaker.network import NetworkConfig  # noqa: F401 # pylint: disable=unused-import
 from sagemaker.workflow.properties import Properties
+from sagemaker.workflow.entities import Expression
 from sagemaker.dataset_definition.inputs import S3Input, DatasetDefinition
 from sagemaker.apiutils._base_types import ApiObject
 
@@ -338,6 +339,10 @@ def _normalize_outputs(self, outputs=None):
                 # Generate a name for the ProcessingOutput if it doesn't have one.
                 if output.output_name is None:
                     output.output_name = "output-{}".format(count)
+                # If the output's destination is a workflow expression, skip normalization.
+                if isinstance(output.destination, Expression):
+                    normalized_outputs.append(output)
+                    continue
                 # If the output's destination is not an s3_uri, create one.
                 parse_result = urlparse(output.destination)
                 if parse_result.scheme != "s3":
diff --git a/src/sagemaker/workflow/functions.py b/src/sagemaker/workflow/functions.py
@@ -0,0 +1,46 @@
+# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+"""The function definitions for workflow."""
+from __future__ import absolute_import
+
+from typing import List
+
+import attr
+
+from sagemaker.workflow.entities import Expression
+
+
+@attr.s
+class Join(Expression):
+    """Join together properties.
+
+    Attributes:
+        values (List[Union[PrimitiveType, Parameter]]): The primitive types
+            and parameters to join.
+        on (str): The string to join the values on (Defaults to "").
+    """
+
+    on: str = attr.ib(factory=str)
+    values: List = attr.ib(factory=list)
+
+    @property
+    def expr(self):
+        """The expression dict for a `Join` function."""
+        return {
+            "Std:Join": {
+                "On": self.on,
+                "Values": [
+                    value.expr if hasattr(value, "expr") else value for value in self.values
+                ],
+            },
+        }
diff --git a/tests/integ/test_workflow.py b/tests/integ/test_workflow.py
@@ -38,6 +38,8 @@
 from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
 from sagemaker.workflow.condition_step import ConditionStep
 from sagemaker.dataset_definition.inputs import DatasetDefinition, AthenaDatasetDefinition
+from sagemaker.workflow.execution_variables import ExecutionVariables
+from sagemaker.workflow.functions import Join
 from sagemaker.workflow.parameters import (
     ParameterInteger,
     ParameterString,
@@ -72,16 +74,9 @@ def role(sagemaker_session):
     return get_execution_role(sagemaker_session)
 
 
-# TODO-reinvent-2020: remove use of specific region and this session
 @pytest.fixture(scope="module")
-def region():
-    return "us-east-2"
-
-
-# TODO-reinvent-2020: remove use of specific region and this session
-@pytest.fixture(scope="module")
-def workflow_session(region):
-    boto_session = boto3.Session(region_name=region)
+def workflow_session(region_name):
+    boto_session = boto3.Session(region_name=region_name)
 
     sagemaker_client_config = dict()
     sagemaker_client_config.setdefault("config", Config(retries=dict(max_attempts=2)))
@@ -134,6 +129,7 @@ def test_three_step_definition(
     framework_version = "0.20.0"
     instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
     instance_count = ParameterInteger(name="InstanceCount", default_value=1)
+    output_prefix = ParameterString(name="OutputPrefix", default_value="output")
 
     input_data = f"s3://sagemaker-sample-data-{region_name}/processing/census/census-income.csv"
 
@@ -154,7 +150,20 @@ def test_three_step_definition(
         ],
         outputs=[
             ProcessingOutput(output_name="train_data", source="/opt/ml/processing/train"),
-            ProcessingOutput(output_name="test_data", source="/opt/ml/processing/test"),
+            ProcessingOutput(
+                output_name="test_data",
+                source="/opt/ml/processing/test",
+                destination=Join(
+                    on="/",
+                    values=[
+                        "s3:/",
+                        sagemaker_session.default_bucket(),
+                        "test-sklearn",
+                        output_prefix,
+                        ExecutionVariables.PIPELINE_EXECUTION_ID,
+                    ],
+                ),
+            ),
         ],
         code=os.path.join(script_dir, "preprocessing.py"),
     )
@@ -194,7 +203,7 @@ def test_three_step_definition(
 
     pipeline = Pipeline(
         name=pipeline_name,
-        parameters=[instance_type, instance_count],
+        parameters=[instance_type, instance_count, output_prefix],
         steps=[step_process, step_train, step_model],
         sagemaker_session=workflow_session,
     )
@@ -208,6 +217,7 @@ def test_three_step_definition(
                 {"Name": "InstanceType", "Type": "String", "DefaultValue": "ml.m5.xlarge"}.items()
             ),
             tuple({"Name": "InstanceCount", "Type": "Integer", "DefaultValue": 1}.items()),
+            tuple({"Name": "OutputPrefix", "Type": "String", "DefaultValue": "output"}.items()),
         ]
     )
 
@@ -251,17 +261,28 @@ def test_three_step_definition(
     assert model_args["PrimaryContainer"]["ModelDataUrl"] == {
         "Get": "Steps.my-train.ModelArtifacts.S3ModelArtifacts"
     }
+    try:
+        response = pipeline.create(role)
+        create_arn = response["PipelineArn"]
+        assert re.match(
+            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
+            create_arn,
+        )
+    finally:
+        try:
+            pipeline.delete()
+        except Exception:
+            pass
 
 
-# TODO-reinvent-2020: Modify use of the workflow client
 def test_one_step_sklearn_processing_pipeline(
     sagemaker_session,
     workflow_session,
     role,
     sklearn_latest_version,
     cpu_instance_type,
     pipeline_name,
-    region,
+    region_name,
     athena_dataset_definition,
 ):
     instance_count = ParameterInteger(name="InstanceCount", default_value=2)
@@ -305,21 +326,21 @@ def test_one_step_sklearn_processing_pipeline(
         response = pipeline.create(role)
         create_arn = response["PipelineArn"]
         assert re.match(
-            fr"arn:aws:sagemaker:{region}:\d{{12}}:pipeline/{pipeline_name}",
+            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
             create_arn,
         )
 
         pipeline.parameters = [ParameterInteger(name="InstanceCount", default_value=1)]
         response = pipeline.update(role)
         update_arn = response["PipelineArn"]
         assert re.match(
-            fr"arn:aws:sagemaker:{region}:\d{{12}}:pipeline/{pipeline_name}",
+            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}",
             update_arn,
         )
 
         execution = pipeline.start(parameters={})
         assert re.match(
-            fr"arn:aws:sagemaker:{region}:\d{{12}}:pipeline/{pipeline_name}/execution/",
+            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
             execution.arn,
         )
 
@@ -340,14 +361,13 @@ def test_one_step_sklearn_processing_pipeline(
             pass
 
 
-# TODO-reinvent-2020: Modify use of the workflow client
 def test_conditional_pytorch_training_model_registration(
     sagemaker_session,
     workflow_session,
     role,
     cpu_instance_type,
     pipeline_name,
-    region,
+    region_name,
 ):
     base_dir = os.path.join(DATA_DIR, "pytorch_mnist")
     entry_point = os.path.join(base_dir, "mnist.py")
@@ -420,18 +440,18 @@ def test_conditional_pytorch_training_model_registration(
         response = pipeline.create(role)
         create_arn = response["PipelineArn"]
         assert re.match(
-            fr"arn:aws:sagemaker:{region}:\d{{12}}:pipeline/{pipeline_name}", create_arn
+            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}", create_arn
         )
 
         execution = pipeline.start(parameters={})
         assert re.match(
-            fr"arn:aws:sagemaker:{region}:\d{{12}}:pipeline/{pipeline_name}/execution/",
+            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
             execution.arn,
         )
 
         execution = pipeline.start(parameters={"GoodEnoughInput": 0})
         assert re.match(
-            fr"arn:aws:sagemaker:{region}:\d{{12}}:pipeline/{pipeline_name}/execution/",
+            fr"arn:aws:sagemaker:{region_name}:\d{{12}}:pipeline/{pipeline_name}/execution/",
             execution.arn,
         )
     finally:
diff --git a/tests/unit/sagemaker/workflow/test_functions.py b/tests/unit/sagemaker/workflow/test_functions.py
@@ -0,0 +1,67 @@
+# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+from sagemaker.workflow.execution_variables import ExecutionVariables
+from sagemaker.workflow.functions import Join
+from sagemaker.workflow.parameters import (
+    ParameterFloat,
+    ParameterInteger,
+    ParameterString,
+)
+from sagemaker.workflow.properties import Properties
+
+
+def test_join_primitives_default_on():
+    assert Join(values=[1, "a", False, 1.1]).expr == {
+        "Std:Join": {
+            "On": "",
+            "Values": [1, "a", False, 1.1],
+        },
+    }
+
+
+def test_join_primitives():
+    assert Join(on=",", values=[1, "a", False, 1.1]).expr == {
+        "Std:Join": {
+            "On": ",",
+            "Values": [1, "a", False, 1.1],
+        },
+    }
+
+
+def test_join_expressions():
+    assert Join(
+        values=[
+            "foo",
+            ParameterFloat(name="MyFloat"),
+            ParameterInteger(name="MyInt"),
+            ParameterString(name="MyStr"),
+            Properties(path="Steps.foo.OutputPath.S3Uri"),
+            ExecutionVariables.PIPELINE_EXECUTION_ID,
+            Join(on=",", values=[1, "a", False, 1.1]),
+        ]
+    ).expr == {
+        "Std:Join": {
+            "On": "",
+            "Values": [
+                "foo",
+                {"Get": "Parameters.MyFloat"},
+                {"Get": "Parameters.MyInt"},
+                {"Get": "Parameters.MyStr"},
+                {"Get": "Steps.foo.OutputPath.S3Uri"},
+                {"Get": "Execution.PipelineExecutionId"},
+                {"Std:Join": {"On": ",", "Values": [1, "a", False, 1.1]}},
+            ],
+        },
+    }