Commit ca55485

add data wrangler processor
1 parent 7b1e5c1 commit ca55485

File tree

5 files changed: +380 −31 lines

src/sagemaker/workflow/processing.py

Lines changed: 143 additions & 0 deletions
@@ -0,0 +1,143 @@
# Copyright 2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
#     http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""The process definitions for workflow."""

from __future__ import absolute_import

from sagemaker.processing import (
    ProcessingInput,
    Processor,
)
from sagemaker import image_uris
from sagemaker.session import Session


class DataWranglerProcessor(Processor):
    """Handles Amazon SageMaker DataWrangler tasks."""

    def __init__(
        self,
        role,
        data_wrangler_recipe_source,
        instance_count,
        instance_type,
        volume_size_in_gb=30,
        volume_kms_key=None,
        output_kms_key=None,
        max_runtime_in_seconds=None,
        base_job_name=None,
        sagemaker_session=None,
        tags=None,
        network_config=None,
    ):
        """Initializes a ``DataWranglerProcessor`` instance.

        The ``DataWranglerProcessor`` handles Amazon SageMaker DataWrangler tasks.

        Args:
            role (str): An AWS IAM role name or ARN. Amazon SageMaker Processing
                uses this role to access AWS resources, such as
                data stored in Amazon S3.
            data_wrangler_recipe_source (str): The source of the DataWrangler recipe
                which will be used for the DataWrangler job. If a local path is
                provided, it will automatically be uploaded to S3 under:
                "s3://<default-bucket-name>/<job-name>/input/<input-name>".
            instance_count (int): The number of instances to run
                a processing job with.
            instance_type (str): The type of EC2 instance to use for
                processing, for example, 'ml.c4.xlarge'.
            volume_size_in_gb (int): Size in GB of the EBS volume
                to use for storing data during processing (default: 30).
            volume_kms_key (str): A KMS key for the processing
                volume (default: None).
            output_kms_key (str): The KMS key ID for processing job outputs (default: None).
            max_runtime_in_seconds (int): Timeout in seconds (default: None).
                After this amount of time, Amazon SageMaker terminates the job,
                regardless of its current status. If `max_runtime_in_seconds` is not
                specified, the default value is 24 hours.
            base_job_name (str): Prefix for processing job name. If not specified,
                the processor generates a default job name, based on the
                processing image name and current timestamp.
            sagemaker_session (:class:`~sagemaker.session.Session`):
                Session object which manages interactions with Amazon SageMaker and
                any other AWS services needed. If not specified, the processor creates
                one using the default AWS configuration chain.
            tags (list[dict]): List of tags to be passed to the processing job
                (default: None). For more, see
                https://docs.aws.amazon.com/sagemaker/latest/dg/API_Tag.html.
            network_config (:class:`~sagemaker.network.NetworkConfig`):
                A :class:`~sagemaker.network.NetworkConfig`
                object that configures network isolation, encryption of
                inter-container traffic, security group IDs, and subnets.
        """
        self.data_wrangler_recipe_source = data_wrangler_recipe_source
        self.sagemaker_session = sagemaker_session or Session()
        # The Data Wrangler container image is region-specific.
        image_uri = image_uris.retrieve(
            "data-wrangler", region=self.sagemaker_session.boto_region_name
        )
        super().__init__(
            role,
            image_uri,
            instance_count,
            instance_type,
            volume_size_in_gb=volume_size_in_gb,
            volume_kms_key=volume_kms_key,
            output_kms_key=output_kms_key,
            max_runtime_in_seconds=max_runtime_in_seconds,
            base_job_name=base_job_name,
            sagemaker_session=sagemaker_session,
            tags=tags,
            network_config=network_config,
        )

    def _normalize_args(
        self,
        job_name=None,
        arguments=None,
        inputs=None,
        outputs=None,
        code=None,
        kms_key=None,
    ):
        """Normalizes the arguments so that they can be passed to the job run.

        Args:
            job_name (str): Name of the processing job to be created. If not specified, one
                is generated, using the base name given to the constructor, if applicable
                (default: None).
            arguments (list[str]): A list of string arguments to be passed to a
                processing job (default: None).
            inputs (list[:class:`~sagemaker.processing.ProcessingInput`]): Input files for
                the processing job. These must be provided as
                :class:`~sagemaker.processing.ProcessingInput` objects (default: None).
            outputs (list[:class:`~sagemaker.processing.ProcessingOutput`]): Outputs for
                the processing job. These can be specified as either path strings or
                :class:`~sagemaker.processing.ProcessingOutput` objects (default: None).
            code (str): This can be an S3 URI or a local path to a file with the framework
                script to run (default: None). A no-op in the base class.
            kms_key (str): The ARN of the KMS key that is used to encrypt the
                user code file (default: None).
        """
        inputs = inputs or []
        # Always attach the Data Wrangler recipe as an additional "flow" input.
        inputs.append(self._get_recipe_input())
        return super()._normalize_args(job_name, arguments, inputs, outputs, code, kms_key)

    def _get_recipe_input(self):
        """Creates a ProcessingInput with the Data Wrangler recipe URI."""
        return ProcessingInput(
            source=self.data_wrangler_recipe_source,
            destination="/opt/ml/processing/flow",
            input_name="flow",
            s3_data_type="S3Prefix",
            s3_input_mode="File",
            s3_data_distribution_type="FullyReplicated",
        )
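For orientation, a minimal usage sketch of the new processor outside of a pipeline (not part of this commit). The role ARN, recipe path, bucket, instance type, and output node name below are placeholders; `Processor.run` is the generic SageMaker Processing entry point inherited by this class.

    from sagemaker.processing import ProcessingInput, ProcessingOutput
    from sagemaker.workflow.processing import DataWranglerProcessor

    # Hypothetical values: substitute your own role, recipe, bucket, and instance type.
    processor = DataWranglerProcessor(
        role="arn:aws:iam::111122223333:role/SageMakerRole",
        data_wrangler_recipe_source="my_recipe.flow",  # local path; uploaded to S3 automatically
        instance_count=1,
        instance_type="ml.m5.4xlarge",
    )

    # _normalize_args injects the recipe as the "flow" input, so only the raw data
    # input and the desired output mapping are supplied here.
    processor.run(
        inputs=[
            ProcessingInput(
                input_name="job_data",
                source="s3://my-bucket/raw/data.csv",
                destination="/opt/ml/processing",
            )
        ],
        outputs=[
            ProcessingOutput(
                output_name="node-id.default",  # placeholder; matches an output node in the .flow recipe
                source="/opt/ml/processing/output",
                destination="s3://my-bucket/wrangled/",
            )
        ],
    )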

tests/data/workflow/dummy_data.csv

Whitespace-only changes.

tests/data/workflow/dummy_recipe.flow

Whitespace-only changes.

tests/integ/test_workflow.py

Lines changed: 108 additions & 31 deletions
@@ -28,6 +28,7 @@
     rule_configs,
 )
 from datetime import datetime
+from sagemaker import image_uris
 from sagemaker.inputs import CreateModelInput, TrainingInput
 from sagemaker.model import Model
 from sagemaker.processing import ProcessingInput, ProcessingOutput
@@ -39,6 +40,7 @@
 from sagemaker.spark.processing import PySparkProcessor, SparkJarProcessor
 from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
 from sagemaker.workflow.condition_step import ConditionStep
+from sagemaker.workflow.processing import DataWranglerProcessor
 from sagemaker.dataset_definition.inputs import DatasetDefinition, AthenaDatasetDefinition
 from sagemaker.workflow.execution_variables import ExecutionVariables
 from sagemaker.workflow.functions import Join
@@ -84,7 +86,7 @@ def script_dir():
 
 @pytest.fixture
 def pipeline_name():
-    return f"my-pipeline-{int(time.time() * 10**7)}"
+    return f"my-pipeline-{int(time.time() * 10 ** 7)}"
 
 
 @pytest.fixture
@@ -228,12 +230,12 @@ def build_jar(): (whitespace-only re-indentation of the test_three_step_definition signature)
@@ -385,13 +387,13 @@ def test_three_step_definition( (whitespace-only re-indentation of the test_one_step_sklearn_processing_pipeline signature)
@@ -478,11 +480,11 @@ def test_one_step_sklearn_processing_pipeline( (whitespace-only re-indentation of the test_one_step_pyspark_processing_pipeline signature)
@@ -580,7 +582,7 @@ def test_one_step_pyspark_processing_pipeline( (whitespace-only re-indentation of the test_one_step_sparkjar_processing_pipeline signature)
@@ -677,11 +679,11 @@ def test_one_step_sparkjar_processing_pipeline( (whitespace-only re-indentation of the test_conditional_pytorch_training_model_registration signature)
@@ -777,11 +779,11 @@ def test_conditional_pytorch_training_model_registration( (whitespace-only re-indentation of the test_training_job_with_debugger_and_profiler signature)
@@ -858,7 +860,7 @@ def test_training_job_with_debugger_and_profiler( (whitespace-only re-indentation of an assert continuation line)
@@ -869,3 +871,78 @@ def test_training_job_with_debugger_and_profiler(
             pipeline.delete()
         except Exception:
             pass
+
+
+def test_one_step_data_wrangler_processing_pipeline(
+    sagemaker_session, role, cpu_instance_type, pipeline_name, region_name
+):
+    instance_count = ParameterInteger(name="InstanceCount", default_value=1)
+
+    recipe_file_path = os.path.join(DATA_DIR, "workflow", "dummy_recipe.flow")
+    input_file_path = os.path.join(DATA_DIR, "workflow", "dummy_data.csv")
+
+    output_name = "1bd0aaad-9c93-41b2-8d42-58e214f0843f.default"
+    output_content_type = "CSV"
+    output_config = {output_name: {"content_type": output_content_type}}
+    job_argument = [f"--output-config '{json.dumps(output_config)}'"]
+
+    inputs = [ProcessingInput(input_name="job_data", source=input_file_path, destination="/opt/ml/processing")]
+
+    output_s3_uri = f"s3://{sagemaker_session.default_bucket()}/output"
+    outputs = [
+        ProcessingOutput(
+            output_name=output_name,
+            source="/opt/ml/processing/output",
+            destination=output_s3_uri,
+            s3_upload_mode="EndOfJob",
+        )
+    ]
+
+    data_wrangler_processor = DataWranglerProcessor(
+        role=role,
+        data_wrangler_recipe_source=recipe_file_path,
+        instance_count=instance_count,
+        instance_type=cpu_instance_type,
+        max_runtime_in_seconds=86400,
+    )
+
+    data_wrangler_step = ProcessingStep(
+        name="data-wrangler-step",
+        processor=data_wrangler_processor,
+        inputs=inputs,
+        outputs=outputs,
+        job_arguments=job_argument,
+    )
+
+    pipeline = Pipeline(
+        name=pipeline_name,
+        parameters=[instance_count],
+        steps=[data_wrangler_step],
+        sagemaker_session=sagemaker_session,
+    )
+
+    definition = json.loads(pipeline.definition())
+    expected_image_uri = image_uris.retrieve("data-wrangler", region=sagemaker_session.boto_region_name)
+    assert len(definition["Steps"]) == 1
+    assert definition["Steps"][0]["Arguments"]["AppSpecification"]["ImageUri"] is not None
+    assert definition["Steps"][0]["Arguments"]["AppSpecification"]["ImageUri"] == expected_image_uri
+
+    assert definition["Steps"][0]["Arguments"]["ProcessingInputs"] is not None
+    processing_inputs = definition["Steps"][0]["Arguments"]["ProcessingInputs"]
+    assert len(processing_inputs) == 2
+    for processing_input in processing_inputs:
+        if processing_input["InputName"] == "flow":
+            assert processing_input["S3Input"]["S3Uri"].endswith(".flow")
+            assert processing_input["S3Input"]["LocalPath"] == "/opt/ml/processing/flow"
+        elif processing_input["InputName"] == "job_data":
+            assert processing_input["S3Input"]["S3Uri"].endswith(".csv")
+            assert processing_input["S3Input"]["LocalPath"] == "/opt/ml/processing"
+        else:
+            raise AssertionError("Unknown input name")
+
+    assert definition["Steps"][0]["Arguments"]["ProcessingOutputConfig"] is not None
+    processing_outputs = definition["Steps"][0]["Arguments"]["ProcessingOutputConfig"]["Outputs"]
+    assert len(processing_outputs) == 1
+    assert processing_outputs[0]["OutputName"] == output_name
+    assert processing_outputs[0]["S3Output"] is not None
+    assert processing_outputs[0]["S3Output"]["LocalPath"] == "/opt/ml/processing/output"
+    assert processing_outputs[0]["S3Output"]["S3Uri"] == output_s3_uri
