fix: add Batch Transform data processing options to Airflow config #1514

Merged · 10 commits merged on May 20, 2020
1 change: 1 addition & 0 deletions src/sagemaker/transformer.py
@@ -135,6 +135,7 @@ def transform(

* 'ManifestFile' - the S3 URI points to a single manifest file listing each S3
object to use as an input for the transform job.

content_type (str): MIME type of the input data (default: None).
compression_type (str): Compression type of the input data, if
compressed (default: None). Valid values: 'Gzip', None.
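As context for the 'ManifestFile' option above, here is a minimal example of the manifest format SageMaker accepts (bucket and object keys are hypothetical): a JSON array whose first element names a common S3 prefix and whose remaining elements are keys relative to that prefix.

# Hypothetical manifest contents; the transform job reads each listed
# object as one input.
[
    {"prefix": "s3://example-bucket/inputs/"},
    "batch-1.csv",
    "batch-2.csv"
]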
74 changes: 73 additions & 1 deletion src/sagemaker/workflow/airflow.py
@@ -671,6 +671,9 @@ def transform_config(
compression_type=None,
split_type=None,
job_name=None,
input_filter=None,
output_filter=None,
join_source=None,
):
"""Export Airflow transform config from a SageMaker transformer

@@ -686,13 +689,38 @@

* 'ManifestFile' - the S3 URI points to a single manifest file listing each S3 object
to use as an input for the transform job.

content_type (str): MIME type of the input data (default: None).
compression_type (str): Compression type of the input data, if
compressed (default: None). Valid values: 'Gzip', None.
split_type (str): The record delimiter for the input object (default:
'None'). Valid values: 'None', 'Line', 'RecordIO', and 'TFRecord'.
job_name (str): job name (default: None). If not specified, one will be
generated.
input_filter (str): A JSONPath to select a portion of the input to
pass to the algorithm container for inference. If you omit the
field, it gets the value '$', representing the entire input.
For CSV data, each row is taken as a JSON array,
so only index-based JSONPaths can be applied, e.g. $[0], $[1:].
CSV data should follow the `RFC 4180 format <https://tools.ietf.org/html/rfc4180>`_.
See `Supported JSONPath Operators
<https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform-data-processing.html#data-processing-operators>`_
for a table of supported JSONPath operators.
For more information, see the SageMaker API documentation for
`CreateTransformJob
<https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
Some examples: "$[1:]", "$.features" (default: None).
output_filter (str): A JSONPath to select a portion of the
joined/original output to return as the output.
For more information, see the SageMaker API documentation for
`CreateTransformJob
<https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
Some examples: "$[1:]", "$.prediction" (default: None).
join_source (str): The source of data to be joined to the transform
output. It can be set to 'Input', meaning the entire input record
will be joined to the inference result. You can use output_filter
to select the useful portion before uploading to S3 (default:
None). Valid values: 'Input', None.

Returns:
dict: Transform config that can be directly used by
SageMakerTransformOperator in Airflow.
@@ -723,6 +751,12 @@ def transform_config(
"TransformResources": job_config["resource_config"],
}

data_processing = sagemaker.transformer._TransformJob._prepare_data_processing(
input_filter, output_filter, join_source
)
if data_processing is not None:
config["DataProcessing"] = data_processing

if transformer.strategy is not None:
config["BatchStrategy"] = transformer.strategy

@@ -768,6 +802,9 @@ def transform_config_from_estimator(
model_server_workers=None,
image=None,
vpc_config_override=None,
input_filter=None,
output_filter=None,
join_source=None,
):
"""Export Airflow transform config from a SageMaker estimator

@@ -836,9 +873,35 @@ def transform_config_from_estimator(
image (str): A container image to use for deploying the model.
vpc_config_override (dict[str, list[str]]): Override for VpcConfig set on
the model. Default: use subnets and security groups from this Estimator.

* 'Subnets' (list[str]): List of subnet ids.
* 'SecurityGroupIds' (list[str]): List of security group ids.

input_filter (str): A JSONPath to select a portion of the input to
pass to the algorithm container for inference. If you omit the
field, it gets the value '$', representing the entire input.
For CSV data, each row is taken as a JSON array,
so only index-based JSONPaths can be applied, e.g. $[0], $[1:].
CSV data should follow the `RFC 4180 format <https://tools.ietf.org/html/rfc4180>`_.
See `Supported JSONPath Operators
<https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform-data-processing.html#data-processing-operators>`_
for a table of supported JSONPath operators.
For more information, see the SageMaker API documentation for
`CreateTransformJob
<https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
Some examples: "$[1:]", "$.features" (default: None).
output_filter (str): A JSONPath to select a portion of the
joined/original output to return as the output.
For more information, see the SageMaker API documentation for
`CreateTransformJob
<https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
Some examples: "$[1:]", "$.prediction" (default: None).
join_source (str): The source of data to be joined to the transform
output. It can be set to 'Input', meaning the entire input record
will be joined to the inference result. You can use output_filter
to select the useful portion before uploading to S3 (default:
None). Valid values: 'Input', None.

Returns:
dict: Transform config that can be directly used by
SageMakerTransformOperator in Airflow.
@@ -891,7 +954,16 @@ def transform_config_from_estimator(
transformer.model_name = model_base_config["ModelName"]

transform_base_config = transform_config(
transformer, data, data_type, content_type, compression_type, split_type, job_name
transformer,
data,
data_type,
content_type,
compression_type,
split_type,
job_name,
input_filter,
output_filter,
join_source,
)

config = {"Model": model_base_config, "Transform": transform_base_config}
2 changes: 2 additions & 0 deletions tests/integ/test_airflow_config.py
@@ -682,6 +682,8 @@ def _build_airflow_workflow(estimator, instance_type, inputs=None, mini_batch_si
instance_type=estimator.train_instance_type,
data=inputs,
content_type="text/csv",
input_filter="$",
output_filter="$",
)

default_args = {
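For context, the config built in this integration test is what ultimately feeds Airflow's transform operator. A minimal sketch of that wiring (the contrib import path matches what these Airflow 1.10-era tests target; DAG id and arguments are illustrative):

import airflow
from airflow import DAG
from airflow.contrib.operators.sagemaker_transform_operator import (
    SageMakerTransformOperator,
)

default_args = {
    "owner": "airflow",
    "start_date": airflow.utils.dates.days_ago(2),
}

dag = DAG("sagemaker-transform", default_args=default_args, schedule_interval="@once")

# transform_config is the dict returned by transform_config_from_estimator(...)
transform_op = SageMakerTransformOperator(
    task_id="transform_operator", config=transform_config, dag=dag
)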
18 changes: 16 additions & 2 deletions tests/unit/test_airflow.py
@@ -10,7 +10,6 @@
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.

from __future__ import absolute_import

import pytest
@@ -21,7 +20,6 @@
from sagemaker.amazon import amazon_estimator
from sagemaker.amazon import knn, linear_learner, ntm, pca


REGION = "us-west-2"
BUCKET_NAME = "output"
TIME_STAMP = "1111"
@@ -1162,6 +1160,9 @@ def test_transform_config(sagemaker_session):
content_type="{{ content_type }}",
compression_type="{{ compression_type }}",
split_type="{{ split_type }}",
input_filter="{{ input_filter }}",
output_filter="{{ output_filter }}",
join_source="{{ join_source }}",
)
expected_config = {
"TransformJobName": "tensorflow-transform-%s" % TIME_STAMP,
@@ -1190,6 +1191,11 @@
"MaxPayloadInMB": "{{ max_payload }}",
"Environment": {"{{ key }}": "{{ value }}"},
"Tags": [{"{{ key }}": "{{ value }}"}],
"DataProcessing": {
"InputFilter": "{{ input_filter }}",
"JoinSource": "{{ join_source }}",
"OutputFilter": "{{ output_filter }}",
},
}

assert config == expected_config
@@ -1238,6 +1244,9 @@ def test_transform_config_from_framework_estimator(ecr_prefix, sagemaker_session
instance_count="{{ instance_count }}",
instance_type="ml.p2.xlarge",
data=transform_data,
input_filter="{{ input_filter }}",
output_filter="{{ output_filter }}",
join_source="{{ join_source }}",
)
expected_config = {
"Model": {
@@ -1272,6 +1281,11 @@
"InstanceType": "ml.p2.xlarge",
},
"Environment": {},
"DataProcessing": {
"InputFilter": "{{ input_filter }}",
"JoinSource": "{{ join_source }}",
"OutputFilter": "{{ output_filter }}",
},
},
}
