Commit 37a12ee

Author: Payton Staub (committed)
Interim commit #4 - support all processors for ProcessingStep
1 parent 81dfee2 commit 37a12ee

6 files changed (+47 −118 lines)


src/sagemaker/processing.py
Lines changed: 17 additions & 36 deletions

@@ -123,25 +123,6 @@ def __init__(
 
         self.sagemaker_session = sagemaker_session or Session()
 
-    def get_run_args(
-        self,
-        inputs=None,
-        outputs=None,
-        arguments=None,
-        job_name=None,
-        kms_key=None,
-    ):
-        # TODO: description
-        normalized_inputs, normalized_outputs = self._normalize_args(
-            job_name=job_name,
-            arguments=arguments,
-            inputs=inputs,
-            kms_key=kms_key,
-            outputs=outputs,
-        )
-
-        return RunArgs(inputs=normalized_inputs, outputs=normalized_outputs, code=None)
-
     def run(
         self,
         inputs=None,

@@ -1190,34 +1171,34 @@ def _to_request_dict(self):
 
 
 class RunArgs(object):
-    """Accepts parameters that specify an Amazon S3 output for a processing job.
+    """Provides an object containing the standard run arguments needed by
+    :class:`~sagemaker.processing.ScriptProcessor`.
 
-    It also provides a method to turn those parameters into a dictionary.
+    An instance of this class is returned from the ``get_run_args()`` method on processors,
+    and is used for normalizing the arguments so that they can be passed to
+    :class:`~sagemaker.workflow.steps.ProcessingStep`
     """
 
     def __init__(
         self,
+        code=None,
         inputs=None,
         outputs=None,
-        code=None,
         arguments=None,
     ):
-        """Initializes a ``ProcessingOutput`` instance.
-
-        ``ProcessingOutput`` accepts parameters that specify an Amazon S3 output for a
-        processing job and provides a method to turn those parameters into a dictionary.
+        """Initializes a ``RunArgs`` instance.
 
         Args:
-            source (str): The source for the output.
-            destination (str): The destination of the output. If a destination
-                is not provided, one will be generated:
-                "s3://<default-bucket-name>/<job-name>/output/<output-name>".
-            output_name (str): The name of the output. If a name
-                is not provided, one will be generated (eg. "output-1").
-            s3_upload_mode (str): Valid options are "EndOfJob" or "Continuous".
-            app_managed (bool): Whether the input are managed by SageMaker or application
-            feature_store_output (:class:`~sagemaker.processing.FeatureStoreOutput`)
-                Configuration for processing job outputs of FeatureStore.
+            code (str): This can be an S3 URI or a local path to a file with the framework
+                script to run.
+            inputs (list[:class:`~sagemaker.processing.ProcessingInput`]): Input files for
+                the processing job. These must be provided as
+                :class:`~sagemaker.processing.ProcessingInput` objects (default: None).
+            outputs (list[:class:`~sagemaker.processing.ProcessingOutput`]): Outputs for
+                the processing job. These can be specified as either path strings or
+                :class:`~sagemaker.processing.ProcessingOutput` objects (default: None).
+            arguments (list[str]): A list of string arguments to be passed to a
+                processing job (default: None).
         """
         self.inputs = inputs
         self.outputs = outputs
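
The ``get_run_args()``/``RunArgs`` pair exists so that a pipeline author can obtain a processor's normalized inputs, outputs and arguments without starting a job, and then wire those fields into a :class:`~sagemaker.workflow.steps.ProcessingStep`. A minimal sketch of that flow, assuming a ``ScriptProcessor``-style ``get_run_args(code, inputs, outputs, arguments)`` signature (as the unit-test change further below suggests) and the usual ``ProcessingStep`` keyword names; the processor, bucket, script and role values are placeholders, not taken from this diff:

from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.workflow.steps import ProcessingStep

# Hypothetical processor configuration, for illustration only.
sklearn_processor = SKLearnProcessor(
    framework_version="0.23-1",
    role="arn:aws:iam::111122223333:role/SageMakerRole",  # placeholder role
    instance_type="ml.m5.xlarge",
    instance_count=1,
)

# get_run_args() normalizes the arguments (the same way run() would) and
# returns them in a RunArgs container instead of launching a processing job.
run_args = sklearn_processor.get_run_args(
    code="preprocess.py",
    inputs=[ProcessingInput(source="s3://my-bucket/raw", destination="/opt/ml/processing/input")],
    outputs=[ProcessingOutput(source="/opt/ml/processing/output")],
    arguments=["--train-test-split-ratio", "0.2"],
)

# The normalized fields are then handed to the pipeline step.
step_process = ProcessingStep(
    name="sklearn-process",
    processor=sklearn_processor,
    code=run_args.code,
    inputs=run_args.inputs,
    outputs=run_args.outputs,
    job_arguments=run_args.arguments,
)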

src/sagemaker/spark/processing.py
Lines changed: 4 additions & 6 deletions

@@ -730,7 +730,7 @@ def get_run_args(
         spark_event_logs_s3_uri=None,
     ):
         """Returns a RunArgs object. This object contains the normalized inputs, outputs
-        and arguments needed when creating using a ``PySparkProcessor`` in a :class:`~sagemaker.workflow.steps.ProcessingStep`.
+        and arguments needed when using a ``PySparkProcessor`` in a :class:`~sagemaker.workflow.steps.ProcessingStep`.
 
         Args:
             submit_app (str): Path (local or S3) to Python file to submit to Spark

@@ -758,7 +758,6 @@ def get_run_args(
                 be published to.
         """
         self._current_job_name = self._generate_current_job_name(job_name=job_name)
-        self.command = [_SparkProcessorBase._default_command]
 
         if not submit_app:
             raise ValueError("submit_app is required")

@@ -833,7 +832,6 @@ def run(
                 user code file (default: None).
         """
         self._current_job_name = self._generate_current_job_name(job_name=job_name)
-        self.command = [_SparkProcessorBase._default_command]
 
         if not submit_app:
             raise ValueError("submit_app is required")

@@ -868,6 +866,7 @@ def _extend_processing_args(self, inputs, outputs, **kwargs):
             outputs: Processing outputs.
             kwargs: Additional keyword arguments passed to `super()`.
         """
+        self.command = [_SparkProcessorBase._default_command]
         extended_inputs = self._handle_script_dependencies(
             inputs, kwargs.get("submit_py_files"), FileType.PYTHON
         )

@@ -976,7 +975,7 @@ def get_run_args(
         spark_event_logs_s3_uri=None,
     ):
         """Returns a RunArgs object. This object contains the normalized inputs, outputs
-        and arguments needed when creating using a ``SparkJarProcessor`` in a :class:`~sagemaker.workflow.steps.ProcessingStep`.
+        and arguments needed when using a ``SparkJarProcessor`` in a :class:`~sagemaker.workflow.steps.ProcessingStep`.
 
         Args:
             submit_app (str): Path (local or S3) to Python file to submit to Spark

@@ -1004,7 +1003,6 @@ def get_run_args(
                 be published to.
         """
         self._current_job_name = self._generate_current_job_name(job_name=job_name)
-        self.command = [_SparkProcessorBase._default_command]
 
         if not submit_app:
             raise ValueError("submit_app is required")

@@ -1079,7 +1077,6 @@ def run(
                 user code file (default: None).
         """
         self._current_job_name = self._generate_current_job_name(job_name=job_name)
-        self.command = [_SparkProcessorBase._default_command]
 
         if not submit_app:
             raise ValueError("submit_app is required")

@@ -1107,6 +1104,7 @@ def run(
         )
 
     def _extend_processing_args(self, inputs, outputs, **kwargs):
+        self.command = [_SparkProcessorBase._default_command]
         if kwargs.get("submit_class"):
             self.command.extend(["--class", kwargs.get("submit_class")])
         else:
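
One detail worth noting in this file: the ``self.command = [_SparkProcessorBase._default_command]`` reset is deleted from both ``run()`` and ``get_run_args()`` and re-added inside ``_extend_processing_args()``. Since both public entry points apparently funnel through ``_extend_processing_args()``, the Spark command list is initialized in exactly one place no matter which method the caller uses. A toy sketch of that pattern (the class, default command and call signatures here are stand-ins, not the real SDK classes):

class FakeSparkProcessor:
    """Toy stand-in showing why the command reset lives in one shared hook."""

    _default_command = "spark-submit"  # placeholder value

    def run(self, submit_app, inputs=None, outputs=None, **kwargs):
        # Public entry point #1: no command handling here any more.
        return self._extend_processing_args(inputs, outputs, submit_app=submit_app, **kwargs)

    def get_run_args(self, submit_app, inputs=None, outputs=None, **kwargs):
        # Public entry point #2: defers to the same shared hook.
        return self._extend_processing_args(inputs, outputs, submit_app=submit_app, **kwargs)

    def _extend_processing_args(self, inputs, outputs, **kwargs):
        # Single place where the command is (re)initialized per invocation.
        self.command = [self._default_command]
        if kwargs.get("submit_class"):
            self.command.extend(["--class", kwargs["submit_class"]])
        return self.command


p = FakeSparkProcessor()
print(p.run("app.py"))                                  # ['spark-submit']
print(p.get_run_args("app.jar", submit_class="Main"))   # ['spark-submit', '--class', 'Main']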

tests/integ/test_workflow.py
Lines changed: 25 additions & 19 deletions

@@ -104,6 +104,7 @@ def athena_dataset_definition(sagemaker_session):
         ),
     )
 
+
 @pytest.fixture
 def configuration() -> list:
     configuration = [

@@ -183,6 +184,7 @@ def configuration() -> list:
     ]
     return configuration
 
+
 @pytest.fixture(scope="module")
 def build_jar():
     spark_path = os.path.join(DATA_DIR, "spark")

@@ -224,6 +226,7 @@ def build_jar():
     subprocess.run(["rm", os.path.join(jar_file_path, "hello-spark-java.jar")])
     subprocess.run(["rm", os.path.join(jar_file_path, java_file_path, "HelloJavaSparkApp.class")])
 
+
 def test_three_step_definition(
     sagemaker_session,
     region_name,

@@ -473,6 +476,7 @@ def test_one_step_sklearn_processing_pipeline(
     except Exception:
         pass
 
+
 def test_one_step_pyspark_processing_pipeline(
     sagemaker_session,
     role,

@@ -496,12 +500,18 @@ def test_one_step_pyspark_processing_pipeline(
     )
 
     spark_run_args = pyspark_processor.get_run_args(
-        submit_app=script_path,
-        arguments=["--s3_input_bucket", sagemaker_session.default_bucket(),
-                   "--s3_input_key_prefix", "spark-input",
-                   "--s3_output_bucket", sagemaker_session.default_bucket(),
-                   "--s3_output_key_prefix", "spark-output"],
-    )
+        submit_app=script_path,
+        arguments=[
+            "--s3_input_bucket",
+            sagemaker_session.default_bucket(),
+            "--s3_input_key_prefix",
+            "spark-input",
+            "--s3_output_bucket",
+            sagemaker_session.default_bucket(),
+            "--s3_output_key_prefix",
+            "spark-output",
+        ],
+    )
 
     step_pyspark = ProcessingStep(
         name="pyspark-process",

@@ -520,12 +530,12 @@ def test_one_step_pyspark_processing_pipeline(
     )
 
     try:
-        # NOTE: We should exercise the case when role used in the pipeline execution is
-        # different than that required of the steps in the pipeline itself. The role in
-        # the pipeline definition needs to create training and processing jobs and other
-        # sagemaker entities. However, the jobs created in the steps themselves execute
-        # under a potentially different role, often requiring access to S3 and other
-        # artifacts not required to during creation of the jobs in the pipeline steps.
+        # NOTE: We should exercise the case when role used in the pipeline execution is
+        # different than that required of the steps in the pipeline itself. The role in
+        # the pipeline definition needs to create training and processing jobs and other
+        # sagemaker entities. However, the jobs created in the steps themselves execute
+        # under a potentially different role, often requiring access to S3 and other
+        # artifacts not required to during creation of the jobs in the pipeline steps.
         response = pipeline.create(role)
         create_arn = response["PipelineArn"]
         assert re.match(

@@ -568,14 +578,9 @@ def test_one_step_pyspark_processing_pipeline(
     except Exception:
         pass
 
+
 def test_one_step_sparkjar_processing_pipeline(
-    sagemaker_session,
-    role,
-    cpu_instance_type,
-    pipeline_name,
-    region_name,
-    configuration,
-    build_jar
+    sagemaker_session, role, cpu_instance_type, pipeline_name, region_name, configuration, build_jar
 ):
     instance_count = ParameterInteger(name="InstanceCount", default_value=2)
     cache_config = CacheConfig(enable_caching=True, expire_after="T30m")

@@ -670,6 +675,7 @@ def test_one_step_sparkjar_processing_pipeline(
     except Exception:
         pass
 
+
 def test_conditional_pytorch_training_model_registration(
     sagemaker_session,
     role,

tests/unit/sagemaker/spark/test_processing.py
Lines changed: 1 addition & 15 deletions

@@ -776,7 +776,6 @@ def test_py_spark_processor_run(
                 "inputs": [],
                 "opt": None,
                 "arguments": ["arg1"],
-                "kms_key": "test_kms_key",
             },
             ValueError,
         ),

@@ -787,7 +786,6 @@ def test_py_spark_processor_run(
                 "inputs": [processing_input],
                 "opt": None,
                 "arguments": ["arg1"],
-                "kms_key": "test_kms_key",
             },
             [processing_input],
         ),

@@ -798,7 +796,6 @@ def test_py_spark_processor_run(
                 "inputs": [processing_input],
                 "opt": None,
                 "arguments": ["arg1"],
-                "kms_key": "test_kms_key",
             },
             [processing_input, processing_input, processing_input, processing_input],
         ),

@@ -809,7 +806,6 @@ def test_py_spark_processor_run(
                 "inputs": None,
                 "opt": None,
                 "arguments": ["arg1"],
-                "kms_key": "test_kms_key",
             },
             [processing_input, processing_input, processing_input],
         ),

@@ -820,7 +816,6 @@ def test_py_spark_processor_run(
                 "inputs": None,
                 "opt": "opt",
                 "arguments": ["arg1"],
-                "kms_key": "test_kms_key",
             },
             [processing_input, processing_input, processing_input],
         ),

@@ -878,7 +873,6 @@ def test_py_spark_processor_get_run_args(
                 "inputs": [],
                 "opt": None,
                 "arguments": ["arg1"],
-                "kms_key": "test_kms_key",
             },
             ValueError,
         ),

@@ -889,7 +883,6 @@ def test_py_spark_processor_get_run_args(
                 "inputs": [processing_input],
                 "opt": None,
                 "arguments": ["arg1"],
-                "kms_key": "test_kms_key",
             },
             [processing_input],
         ),

@@ -900,7 +893,6 @@ def test_py_spark_processor_get_run_args(
                 "inputs": [processing_input],
                 "opt": None,
                 "arguments": ["arg1"],
-                "kms_key": "test_kms_key",
             },
             [processing_input, processing_input, processing_input, processing_input],
         ),

@@ -911,7 +903,6 @@ def test_py_spark_processor_get_run_args(
                 "inputs": None,
                 "opt": None,
                 "arguments": ["arg1"],
-                "kms_key": "test_kms_key",
             },
             [processing_input, processing_input, processing_input],
         ),

@@ -922,7 +913,6 @@ def test_py_spark_processor_get_run_args(
                 "inputs": None,
                 "opt": "opt",
                 "arguments": ["arg1"],
-                "kms_key": "test_kms_key",
             },
             [processing_input, processing_input, processing_input],
         ),

@@ -951,7 +941,6 @@ def test_py_spark_processor_get_run_args(
             submit_files=config["files"],
             inputs=config["inputs"],
             arguments=config["arguments"],
-            kms_key=config["kms_key"],
         )
     else:
         py_spark_processor.get_run_args(

@@ -961,16 +950,13 @@ def test_py_spark_processor_get_run_args(
             submit_files=config["files"],
             inputs=config["inputs"],
             arguments=config["arguments"],
-            kms_key=config["kms_key"],
         )
 
     mock_super_get_run_args.assert_called_with(
-        submit_app=config["submit_app"],
+        code=config["submit_app"],
         inputs=expected,
         outputs=None,
         arguments=config["arguments"],
-        job_name="jobName",
-        kms_key=config["kms_key"],
     )
tests/unit/sagemaker/workflow/test_steps.py
Lines changed: 0 additions & 1 deletion

@@ -183,7 +183,6 @@ def test_processing_step(sagemaker_session):
         outputs=[],
         cache_config=cache_config,
     )
-    print(f"StepToRequest is {step.to_request()}")
     assert step.to_request() == {
         "Name": "MyProcessingStep",
         "Type": "Processing",
