Skip to content

Commit a49cec2

Browse files
authored
fix: add Batch Transform data processing options to Airflow config (#1514)
1 parent 496f979 commit a49cec2

File tree

4 files changed

+92
-3
lines changed

4 files changed

+92
-3
lines changed

src/sagemaker/transformer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@ def transform(
135135
136136
* 'ManifestFile' - the S3 URI points to a single manifest file listing each S3
137137
object to use as an input for the transform job.
138+
138139
content_type (str): MIME type of the input data (default: None).
139140
compression_type (str): Compression type of the input data, if
140141
compressed (default: None). Valid values: 'Gzip', None.

src/sagemaker/workflow/airflow.py

Lines changed: 73 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -671,6 +671,9 @@ def transform_config(
671671
compression_type=None,
672672
split_type=None,
673673
job_name=None,
674+
input_filter=None,
675+
output_filter=None,
676+
join_source=None,
674677
):
675678
"""Export Airflow transform config from a SageMaker transformer
676679
@@ -686,13 +689,38 @@ def transform_config(
686689
687690
* 'ManifestFile' - the S3 URI points to a single manifest file listing each S3 object
688691
to use as an input for the transform job.
692+
689693
content_type (str): MIME type of the input data (default: None).
690694
compression_type (str): Compression type of the input data, if
691695
compressed (default: None). Valid values: 'Gzip', None.
692696
split_type (str): The record delimiter for the input object (default:
693697
'None'). Valid values: 'None', 'Line', 'RecordIO', and 'TFRecord'.
694698
job_name (str): job name (default: None). If not specified, one will be
695699
generated.
700+
input_filter (str): A JSONPath to select a portion of the input to
701+
pass to the algorithm container for inference. If you omit the
702+
field, it gets the value '$', representing the entire input.
703+
For CSV data, each row is taken as a JSON array,
704+
so only index-based JSONPaths can be applied, e.g. $[0], $[1:].
705+
CSV data should follow the `RFC format <https://tools.ietf.org/html/rfc4180>`_.
706+
See `Supported JSONPath Operators
707+
<https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform-data-processing.html#data-processing-operators>`_
708+
for a table of supported JSONPath operators.
709+
For more information, see the SageMaker API documentation for
710+
`CreateTransformJob
711+
<https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
712+
Some examples: "$[1:]", "$.features" (default: None).
713+
output_filter (str): A JSONPath to select a portion of the
714+
joined/original output to return as the output.
715+
For more information, see the SageMaker API documentation for
716+
`CreateTransformJob
717+
<https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
718+
Some examples: "$[1:]", "$.prediction" (default: None).
719+
join_source (str): The source of data to be joined to the transform
720+
output. It can be set to 'Input' meaning the entire input record
721+
will be joined to the inference result. You can use OutputFilter
722+
to select the useful portion before uploading to S3. (default:
723+
None). Valid values: Input, None.
696724
697725
Returns:
698726
dict: Transform config that can be directly used by
@@ -723,6 +751,12 @@ def transform_config(
723751
"TransformResources": job_config["resource_config"],
724752
}
725753

754+
data_processing = sagemaker.transformer._TransformJob._prepare_data_processing(
755+
input_filter, output_filter, join_source
756+
)
757+
if data_processing is not None:
758+
config["DataProcessing"] = data_processing
759+
726760
if transformer.strategy is not None:
727761
config["BatchStrategy"] = transformer.strategy
728762

@@ -768,6 +802,9 @@ def transform_config_from_estimator(
768802
model_server_workers=None,
769803
image=None,
770804
vpc_config_override=None,
805+
input_filter=None,
806+
output_filter=None,
807+
join_source=None,
771808
):
772809
"""Export Airflow transform config from a SageMaker estimator
773810
@@ -836,9 +873,35 @@ def transform_config_from_estimator(
836873
image (str): A container image to use for deploying the model
837874
vpc_config_override (dict[str, list[str]]): Override for VpcConfig set on
838875
the model. Default: use subnets and security groups from this Estimator.
876+
839877
* 'Subnets' (list[str]): List of subnet ids.
840878
* 'SecurityGroupIds' (list[str]): List of security group ids.
841879
880+
input_filter (str): A JSONPath to select a portion of the input to
881+
pass to the algorithm container for inference. If you omit the
882+
field, it gets the value '$', representing the entire input.
883+
For CSV data, each row is taken as a JSON array,
884+
so only index-based JSONPaths can be applied, e.g. $[0], $[1:].
885+
CSV data should follow the `RFC format <https://tools.ietf.org/html/rfc4180>`_.
886+
See `Supported JSONPath Operators
887+
<https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform-data-processing.html#data-processing-operators>`_
888+
for a table of supported JSONPath operators.
889+
For more information, see the SageMaker API documentation for
890+
`CreateTransformJob
891+
<https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
892+
Some examples: "$[1:]", "$.features" (default: None).
893+
output_filter (str): A JSONPath to select a portion of the
894+
joined/original output to return as the output.
895+
For more information, see the SageMaker API documentation for
896+
`CreateTransformJob
897+
<https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
898+
Some examples: "$[1:]", "$.prediction" (default: None).
899+
join_source (str): The source of data to be joined to the transform
900+
output. It can be set to 'Input' meaning the entire input record
901+
will be joined to the inference result. You can use OutputFilter
902+
to select the useful portion before uploading to S3. (default:
903+
None). Valid values: Input, None.
904+
842905
Returns:
843906
dict: Transform config that can be directly used by
844907
SageMakerTransformOperator in Airflow.
@@ -891,7 +954,16 @@ def transform_config_from_estimator(
891954
transformer.model_name = model_base_config["ModelName"]
892955

893956
transform_base_config = transform_config(
894-
transformer, data, data_type, content_type, compression_type, split_type, job_name
957+
transformer,
958+
data,
959+
data_type,
960+
content_type,
961+
compression_type,
962+
split_type,
963+
job_name,
964+
input_filter,
965+
output_filter,
966+
join_source,
895967
)
896968

897969
config = {"Model": model_base_config, "Transform": transform_base_config}

tests/integ/test_airflow_config.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -682,6 +682,8 @@ def _build_airflow_workflow(estimator, instance_type, inputs=None, mini_batch_si
682682
instance_type=estimator.train_instance_type,
683683
data=inputs,
684684
content_type="text/csv",
685+
input_filter="$",
686+
output_filter="$",
685687
)
686688

687689
default_args = {

tests/unit/test_airflow.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@
1010
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
1111
# ANY KIND, either express or implied. See the License for the specific
1212
# language governing permissions and limitations under the License.
13-
1413
from __future__ import absolute_import
1514

1615
import pytest
@@ -21,7 +20,6 @@
2120
from sagemaker.amazon import amazon_estimator
2221
from sagemaker.amazon import knn, linear_learner, ntm, pca
2322

24-
2523
REGION = "us-west-2"
2624
BUCKET_NAME = "output"
2725
TIME_STAMP = "1111"
@@ -1162,6 +1160,9 @@ def test_transform_config(sagemaker_session):
11621160
content_type="{{ content_type }}",
11631161
compression_type="{{ compression_type }}",
11641162
split_type="{{ split_type }}",
1163+
input_filter="{{ input_filter }}",
1164+
output_filter="{{ output_filter }}",
1165+
join_source="{{ join_source }}",
11651166
)
11661167
expected_config = {
11671168
"TransformJobName": "tensorflow-transform-%s" % TIME_STAMP,
@@ -1190,6 +1191,11 @@ def test_transform_config(sagemaker_session):
11901191
"MaxPayloadInMB": "{{ max_payload }}",
11911192
"Environment": {"{{ key }}": "{{ value }}"},
11921193
"Tags": [{"{{ key }}": "{{ value }}"}],
1194+
"DataProcessing": {
1195+
"InputFilter": "{{ input_filter }}",
1196+
"JoinSource": "{{ join_source }}",
1197+
"OutputFilter": "{{ output_filter }}",
1198+
},
11931199
}
11941200

11951201
assert config == expected_config
@@ -1238,6 +1244,9 @@ def test_transform_config_from_framework_estimator(ecr_prefix, sagemaker_session
12381244
instance_count="{{ instance_count }}",
12391245
instance_type="ml.p2.xlarge",
12401246
data=transform_data,
1247+
input_filter="{{ input_filter }}",
1248+
output_filter="{{ output_filter }}",
1249+
join_source="{{ join_source }}",
12411250
)
12421251
expected_config = {
12431252
"Model": {
@@ -1272,6 +1281,11 @@ def test_transform_config_from_framework_estimator(ecr_prefix, sagemaker_session
12721281
"InstanceType": "ml.p2.xlarge",
12731282
},
12741283
"Environment": {},
1284+
"DataProcessing": {
1285+
"InputFilter": "{{ input_filter }}",
1286+
"JoinSource": "{{ join_source }}",
1287+
"OutputFilter": "{{ output_filter }}",
1288+
},
12751289
},
12761290
}
12771291

0 commit comments

Comments
 (0)