change: refactor out batch transform job input generation (#1955)

icywang86rui · Rui Wang Napieralski · web-flow · commit 813625febb4b · 2020-10-15T20:57:25.000-07:00
* change: refactor out batch transform job input generation

* Remove unused imports

Co-authored-by: Rui Wang Napieralski <wru@amazon.com>
diff --git a/src/sagemaker/session.py b/src/sagemaker/session.py
@@ -2171,7 +2171,7 @@ def stop_tuning_job(self, name):
                 )
                 raise
 
-    def transform(
+    def _get_transform_request(
         self,
         job_name,
         model_name,
@@ -2187,7 +2187,7 @@ def transform(
         data_processing,
         model_client_config=None,
     ):
-        """Create an Amazon SageMaker transform job.
+        """Construct a dict that can be used to create an Amazon SageMaker transform job.
 
         Args:
             job_name (str): Name of the transform job being created.
@@ -2213,6 +2213,9 @@ def transform(
             model_client_config (dict): A dictionary describing the model configuration for the
                 job. Dictionary contains two optional keys,
                 'InvocationsTimeoutInSeconds', and 'InvocationsMaxRetries'.
+
+        Returns:
+            dict: A request dictionary for creating a transform job.
         """
         transform_request = {
             "TransformJobName": job_name,
@@ -2246,6 +2249,67 @@ def transform(
         if model_client_config and len(model_client_config) > 0:
             transform_request["ModelClientConfig"] = model_client_config
 
+        return transform_request
+
+    def transform(
+        self,
+        job_name,
+        model_name,
+        strategy,
+        max_concurrent_transforms,
+        max_payload,
+        env,
+        input_config,
+        output_config,
+        resource_config,
+        experiment_config,
+        tags,
+        data_processing,
+        model_client_config=None,
+    ):
+        """Create an Amazon SageMaker transform job.
+
+        Args:
+            job_name (str): Name of the transform job being created.
+            model_name (str): Name of the SageMaker model being used for the transform job.
+            strategy (str): The strategy used to decide how to batch records in a single request.
+                Possible values are 'MultiRecord' and 'SingleRecord'.
+            max_concurrent_transforms (int): The maximum number of HTTP requests to be made to
+                each individual transform container at one time.
+            max_payload (int): Maximum size of the payload in a single HTTP request to the
+                container in MB.
+            env (dict): Environment variables to be set for use during the transform job.
+            input_config (dict): A dictionary describing the input data (and its location) for the
+                job.
+            output_config (dict): A dictionary describing the output location for the job.
+            resource_config (dict): A dictionary describing the resources to complete the job.
+            experiment_config (dict): A dictionary describing the experiment configuration for the
+                job. Dictionary contains three optional keys,
+                'ExperimentName', 'TrialName', and 'TrialComponentDisplayName'.
+            tags (list[dict]): List of tags for labeling a transform job.
+            data_processing (dict): A dictionary describing config for combining the input data
+                and transformed data. For more, see
+                https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_DataProcessing.html.
+            model_client_config (dict): A dictionary describing the model configuration for the
+                job. Dictionary contains two optional keys,
+                'InvocationsTimeoutInSeconds', and 'InvocationsMaxRetries'.
+        """
+        transform_request = self._get_transform_request(
+            job_name=job_name,
+            model_name=model_name,
+            strategy=strategy,
+            max_concurrent_transforms=max_concurrent_transforms,
+            max_payload=max_payload,
+            env=env,
+            input_config=input_config,
+            output_config=output_config,
+            resource_config=resource_config,
+            experiment_config=experiment_config,
+            tags=tags,
+            data_processing=data_processing,
+            model_client_config=model_client_config,
+        )
+
         LOGGER.info("Creating transform job with name: %s", job_name)
         LOGGER.debug("Transform request: %s", json.dumps(transform_request, indent=4))
         self.sagemaker_client.create_transform_job(**transform_request)
diff --git a/src/sagemaker/transformer.py b/src/sagemaker/transformer.py
@@ -363,30 +363,78 @@ def start_new(
             experiment_config:
             model_client_config:
         """
+
+        transform_args = cls._get_transform_args(
+            transformer,
+            data,
+            data_type,
+            content_type,
+            compression_type,
+            split_type,
+            input_filter,
+            output_filter,
+            join_source,
+            experiment_config,
+            model_client_config,
+        )
+        transformer.sagemaker_session.transform(**transform_args)
+
+        return cls(transformer.sagemaker_session, transformer._current_job_name)
+
+    @classmethod
+    def _get_transform_args(
+        cls,
+        transformer,
+        data,
+        data_type,
+        content_type,
+        compression_type,
+        split_type,
+        input_filter,
+        output_filter,
+        join_source,
+        experiment_config,
+        model_client_config,
+    ):
+        """
+        Args:
+            transformer:
+            data:
+            data_type:
+            content_type:
+            compression_type:
+            split_type:
+            input_filter:
+            output_filter:
+            join_source:
+            experiment_config:
+            model_client_config:
+        """
+
         config = _TransformJob._load_config(
             data, data_type, content_type, compression_type, split_type, transformer
         )
         data_processing = _TransformJob._prepare_data_processing(
             input_filter, output_filter, join_source
         )
 
-        transformer.sagemaker_session.transform(
-            job_name=transformer._current_job_name,
-            model_name=transformer.model_name,
-            strategy=transformer.strategy,
-            max_concurrent_transforms=transformer.max_concurrent_transforms,
-            max_payload=transformer.max_payload,
-            env=transformer.env,
-            input_config=config["input_config"],
-            output_config=config["output_config"],
-            resource_config=config["resource_config"],
-            experiment_config=experiment_config,
-            model_client_config=model_client_config,
-            tags=transformer.tags,
-            data_processing=data_processing,
+        transform_args = config.copy()
+        transform_args.update(
+            {
+                "job_name": transformer._current_job_name,
+                "model_name": transformer.model_name,
+                "strategy": transformer.strategy,
+                "max_concurrent_transforms": transformer.max_concurrent_transforms,
+                "max_payload": transformer.max_payload,
+                "env": transformer.env,
+                "experiment_config": experiment_config,
+                "model_client_config": model_client_config,
+                "tags": transformer.tags,
+                "data_processing": data_processing,
+            }
         )
 
-        return cls(transformer.sagemaker_session, transformer._current_job_name)
+        return transform_args
 
     def wait(self, logs=True):
         if logs: