@@ -671,6 +671,9 @@ def transform_config(
671
671
compression_type = None ,
672
672
split_type = None ,
673
673
job_name = None ,
674
+ input_filter = None ,
675
+ output_filter = None ,
676
+ join_source = None ,
674
677
):
675
678
"""Export Airflow transform config from a SageMaker transformer
676
679
@@ -686,13 +689,38 @@ def transform_config(
686
689
687
690
* 'ManifestFile' - the S3 URI points to a single manifest file listing each S3 object
688
691
to use as an input for the transform job.
692
+
689
693
content_type (str): MIME type of the input data (default: None).
690
694
compression_type (str): Compression type of the input data, if
691
695
compressed (default: None). Valid values: 'Gzip', None.
692
696
split_type (str): The record delimiter for the input object (default:
693
697
'None'). Valid values: 'None', 'Line', 'RecordIO', and 'TFRecord'.
694
698
job_name (str): job name (default: None). If not specified, one will be
695
699
generated.
700
+ input_filter (str): A JSONPath to select a portion of the input to
701
+ pass to the algorithm container for inference. If you omit the
702
+ field, it gets the value '$', representing the entire input.
703
+ For CSV data, each row is taken as a JSON array,
704
+ so only index-based JSONPaths can be applied, e.g. $[0], $[1:].
705
+ CSV data should follow the `RFC 4180 format <https://tools.ietf.org/html/rfc4180>`_.
706
+ See `Supported JSONPath Operators
707
+ <https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform-data-processing.html#data-processing-operators>`_
708
+ for a table of supported JSONPath operators.
709
+ For more information, see the SageMaker API documentation for
710
+ `CreateTransformJob
711
+ <https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
712
+ Some examples: "$[1:]", "$.features" (default: None).
713
+ output_filter (str): A JSONPath to select a portion of the
714
+ joined/original output to return as the output.
715
+ For more information, see the SageMaker API documentation for
716
+ `CreateTransformJob
717
+ <https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
718
+ Some examples: "$[1:]", "$.prediction" (default: None).
719
+ join_source (str): The source of data to be joined to the transform
720
+ output. It can be set to 'Input', meaning the entire input record
721
+ will be joined to the inference result. You can use OutputFilter
722
+ to select the useful portion before uploading to S3. (default:
723
+ None). Valid values: Input, None.
696
724
697
725
Returns:
698
726
dict: Transform config that can be directly used by
@@ -723,6 +751,12 @@ def transform_config(
723
751
"TransformResources" : job_config ["resource_config" ],
724
752
}
725
753
754
+ data_processing = sagemaker .transformer ._TransformJob ._prepare_data_processing (
755
+ input_filter , output_filter , join_source
756
+ )
757
+ if data_processing is not None :
758
+ config ["DataProcessing" ] = data_processing
759
+
726
760
if transformer .strategy is not None :
727
761
config ["BatchStrategy" ] = transformer .strategy
728
762
@@ -768,6 +802,9 @@ def transform_config_from_estimator(
768
802
model_server_workers = None ,
769
803
image = None ,
770
804
vpc_config_override = None ,
805
+ input_filter = None ,
806
+ output_filter = None ,
807
+ join_source = None ,
771
808
):
772
809
"""Export Airflow transform config from a SageMaker estimator
773
810
@@ -836,9 +873,35 @@ def transform_config_from_estimator(
836
873
image (str): A container image to use for deploying the model
837
874
vpc_config_override (dict[str, list[str]]): Override for VpcConfig set on
838
875
the model. Default: use subnets and security groups from this Estimator.
876
+
839
877
* 'Subnets' (list[str]): List of subnet ids.
840
878
* 'SecurityGroupIds' (list[str]): List of security group ids.
841
879
880
+ input_filter (str): A JSONPath to select a portion of the input to
881
+ pass to the algorithm container for inference. If you omit the
882
+ field, it gets the value '$', representing the entire input.
883
+ For CSV data, each row is taken as a JSON array,
884
+ so only index-based JSONPaths can be applied, e.g. $[0], $[1:].
885
+ CSV data should follow the `RFC 4180 format <https://tools.ietf.org/html/rfc4180>`_.
886
+ See `Supported JSONPath Operators
887
+ <https://docs.aws.amazon.com/sagemaker/latest/dg/batch-transform-data-processing.html#data-processing-operators>`_
888
+ for a table of supported JSONPath operators.
889
+ For more information, see the SageMaker API documentation for
890
+ `CreateTransformJob
891
+ <https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
892
+ Some examples: "$[1:]", "$.features" (default: None).
893
+ output_filter (str): A JSONPath to select a portion of the
894
+ joined/original output to return as the output.
895
+ For more information, see the SageMaker API documentation for
896
+ `CreateTransformJob
897
+ <https://docs.aws.amazon.com/sagemaker/latest/dg/API_CreateTransformJob.html>`_.
898
+ Some examples: "$[1:]", "$.prediction" (default: None).
899
+ join_source (str): The source of data to be joined to the transform
900
+ output. It can be set to 'Input', meaning the entire input record
901
+ will be joined to the inference result. You can use OutputFilter
902
+ to select the useful portion before uploading to S3. (default:
903
+ None). Valid values: Input, None.
904
+
842
905
Returns:
843
906
dict: Transform config that can be directly used by
844
907
SageMakerTransformOperator in Airflow.
@@ -891,7 +954,16 @@ def transform_config_from_estimator(
891
954
transformer .model_name = model_base_config ["ModelName" ]
892
955
893
956
transform_base_config = transform_config (
894
- transformer , data , data_type , content_type , compression_type , split_type , job_name
957
+ transformer ,
958
+ data ,
959
+ data_type ,
960
+ content_type ,
961
+ compression_type ,
962
+ split_type ,
963
+ job_name ,
964
+ input_filter ,
965
+ output_filter ,
966
+ join_source ,
895
967
)
896
968
897
969
config = {"Model" : model_base_config , "Transform" : transform_base_config }
0 commit comments