Commit db69b5f

Brock Wade committed
fix: frameworkprocessor side effects, testing
1 parent 2a1274c commit db69b5f

9 files changed, +219 -18 lines changed

src/sagemaker/processing.py

Lines changed: 24 additions & 6 deletions
@@ -1704,6 +1704,7 @@ def _pack_and_upload_code(
         self, code, source_dir, dependencies, git_config, job_name, inputs, kms_key=None
     ):
         """Pack local code bundle and upload to Amazon S3."""
+        from sagemaker.workflow.utilities import _pipeline_config, hash_object
         if code.startswith("s3://"):
             return code, inputs, job_name
 
@@ -1737,12 +1738,29 @@ def _pack_and_upload_code(
             "runproc.sh",
         )
         script = estimator.uploaded_code.script_name
-        s3_runproc_sh = S3Uploader.upload_string_as_file_body(
-            self._generate_framework_script(script),
-            desired_s3_uri=entrypoint_s3_uri,
-            kms_key=kms_key,
-            sagemaker_session=self.sagemaker_session,
-        )
+
+        # If we are leveraging a pipeline session with optimized s3 artifact paths,
+        # we need to hash and upload the runproc.sh file to a separate location.
+        if _pipeline_config and _pipeline_config.pipeline_name:
+            runproc_file_str = self._generate_framework_script(script)
+            runproc_file_hash = hash_object(runproc_file_str)
+            s3_uri = (
+                f"s3://{self.sagemaker_session.default_bucket()}/{_pipeline_config.pipeline_name}/"
+                f"code/{runproc_file_hash}/runproc.sh"
+            )
+            s3_runproc_sh = S3Uploader.upload_string_as_file_body(
+                runproc_file_str,
+                desired_s3_uri=s3_uri,
+                kms_key=kms_key,
+                sagemaker_session=self.sagemaker_session,
+            )
+        else:
+            s3_runproc_sh = S3Uploader.upload_string_as_file_body(
+                self._generate_framework_script(script),
+                desired_s3_uri=entrypoint_s3_uri,
+                kms_key=kms_key,
+                sagemaker_session=self.sagemaker_session,
+            )
         logger.info("runproc.sh uploaded to %s", s3_runproc_sh)
 
         return s3_runproc_sh, inputs, job_name
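
Note: the new branch makes the runproc.sh upload content-addressed when a pipeline session is active; the generated script body is hashed and the hash becomes part of the S3 key, so repeated uploads of the same script land at a stable prefix and different scripts get distinct prefixes. A rough sketch of the idea, using hashlib as a stand-in for the SDK's hash_object helper (whose exact hashing scheme is not shown in this diff):

# Illustrative sketch only: hashlib stands in for sagemaker.workflow.utilities.hash_object.
import hashlib


def runproc_s3_uri(bucket: str, pipeline_name: str, runproc_body: str) -> str:
    """Build a content-addressed S3 key for runproc.sh, mirroring the pattern above."""
    body_hash = hashlib.sha256(runproc_body.encode("utf-8")).hexdigest()
    return f"s3://{bucket}/{pipeline_name}/code/{body_hash}/runproc.sh"


# Identical script bodies map to the same key, so pipeline step caching sees a stable
# artifact; any change to the generated script changes the hash and the upload location.
print(runproc_s3_uri("my-bucket", "my-pipeline", "#!/bin/bash\npython preprocess.py\n"))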

src/sagemaker/spark/processing.py

Lines changed: 15 additions & 5 deletions
@@ -279,6 +279,12 @@ def run(
     def _extend_processing_args(self, inputs, outputs, **kwargs):
         """Extends processing job args such as inputs."""
 
+        # make a copy of user outputs
+        outputs = outputs or []
+        extended_outputs = []
+        for user_output in outputs:
+            extended_outputs.append(user_output)
+
         if kwargs.get("spark_event_logs_s3_uri"):
             spark_event_logs_s3_uri = kwargs.get("spark_event_logs_s3_uri")
             self._validate_s3_uri(spark_event_logs_s3_uri)
@@ -297,16 +303,20 @@ def _extend_processing_args(self, inputs, outputs, **kwargs):
                 s3_upload_mode="Continuous",
             )
 
-            outputs = outputs or []
-            outputs.append(output)
+            extended_outputs.append(output)
+
+        # make a copy of user inputs
+        inputs = inputs or []
+        extended_inputs = []
+        for user_input in inputs:
+            extended_inputs.append(user_input)
 
         if kwargs.get("configuration"):
             configuration = kwargs.get("configuration")
             self._validate_configuration(configuration)
-            inputs = inputs or []
-            inputs.append(self._stage_configuration(configuration))
+            extended_inputs.append(self._stage_configuration(configuration))
 
-        return inputs, outputs
+        return extended_inputs, extended_outputs
 
     def start_history_server(self, spark_event_logs_s3_uri=None):
         """Starts a Spark history server.

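Note: the _extend_processing_args rewrite stops mutating the inputs and outputs lists supplied by the caller; the Spark event-log output and the staged configuration input are now appended to internal copies and returned. Previously, building the same step arguments twice grew the caller's lists each time. A minimal, self-contained sketch of that difference (toy function names, not SageMaker APIs):

# Toy illustration of the side effect being fixed; not SageMaker code.
def extend_in_place(outputs):
    outputs = outputs or []
    outputs.append("spark-event-logs")      # mutates the list the caller passed in
    return outputs


def extend_with_copy(outputs):
    extended = list(outputs or [])          # work on a copy, as the patched code does
    extended.append("spark-event-logs")
    return extended


user_outputs = ["evaluation"]
extend_in_place(user_outputs)
extend_in_place(user_outputs)
print(user_outputs)   # ['evaluation', 'spark-event-logs', 'spark-event-logs']

user_outputs = ["evaluation"]
extend_with_copy(user_outputs)
extend_with_copy(user_outputs)
print(user_outputs)   # ['evaluation'] -- the caller's list is untouched
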
src/sagemaker/workflow/utilities.py

Lines changed: 2 additions & 1 deletion
@@ -114,11 +114,12 @@ def get_code_hash(step: Entity) -> str:
     if isinstance(step, ProcessingStep) and step.step_args:
         kwargs = step.step_args.func_kwargs
         source_dir = kwargs.get("source_dir")
+        submit_class = kwargs.get("submit_class")
         dependencies = get_processing_dependencies(
             [
                 kwargs.get("dependencies"),
                 kwargs.get("submit_py_files"),
-                kwargs.get("submit_class"),
+                [submit_class] if submit_class else None,
                 kwargs.get("submit_jars"),
                 kwargs.get("submit_files"),
             ]
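
Note: submit_class is a single class-name string, while the other values fed to get_processing_dependencies are lists of file paths; wrapping it as [submit_class] keeps every entry list-shaped before the dependency set is hashed for step caching. Assuming the helper flattens its list-of-lists (an assumption; its body is not part of this diff), passing the bare string would leak individual characters into the hash input, roughly like this:

# Sketch of the shape mismatch, under the assumption that dependency groups are flattened.
def flatten(groups):
    flat = []
    for group in groups:
        if group:
            flat.extend(group)   # extending with a bare string adds it character by character
    return flat


print(flatten([["dep1.py"], "com.example.Main", ["job.jar"]]))
# ['dep1.py', 'c', 'o', 'm', '.', ...]  -- the class name dissolves into characters

print(flatten([["dep1.py"], ["com.example.Main"], ["job.jar"]]))
# ['dep1.py', 'com.example.Main', 'job.jar']  -- the wrapped form stays intact
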
framework_processor_data/evaluate.py (new file)

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+"""
+Integ test file evaluate.py
+"""
+
+def evaluate():
+    print("evaluate")

framework_processor_data/preprocess.py (new file)

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+"""
+Integ test file preprocess.py
+"""
+
+def preprocess():
+    print("preprocess")

framework_processor_data/query_data.py (new file)

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+"""
+Integ test file query_data.py
+"""
+
+def query_data():
+    print("query data")

framework_processor_data/train_test_split.py (new file)

Lines changed: 6 additions & 0 deletions
@@ -0,0 +1,6 @@
+"""
+Integ test file train_test_split.py
+"""
+
+def train_test_split():
+    print("train, test, split")

tests/integ/sagemaker/workflow/test_workflow.py

Lines changed: 151 additions & 0 deletions
@@ -19,11 +19,14 @@
 import time
 import shutil
 
+from pathlib import Path
 from contextlib import contextmanager
 import pytest
 
 from botocore.exceptions import WaiterError
 import pandas as pd
+from sagemaker.network import NetworkConfig
+from sagemaker.tensorflow import TensorFlow
 
 from tests.integ.s3_utils import extract_files_from_s3
 from sagemaker.workflow.model_step import (
@@ -47,6 +50,7 @@
     ProcessingOutput,
     FeatureStoreOutput,
     ScriptProcessor,
+    FrameworkProcessor
 )
 from sagemaker.s3 import S3Uploader
 from sagemaker.session import get_execution_role
@@ -83,6 +87,7 @@
     TransformInput,
     PropertyFile,
     TuningStep,
+    CacheConfig
 )
 from sagemaker.workflow.step_collections import RegisterModel
 from sagemaker.workflow.pipeline import Pipeline
@@ -1310,3 +1315,149 @@ def test_caching_behavior(
     except Exception:
         os.remove(script_dir + "/dummy_script.py")
         pass
+
+def test_processing_steps_with_framework_processor(pipeline_session, role):
+    default_bucket = pipeline_session.default_bucket()
+    cache_config = CacheConfig(enable_caching=True, expire_after="PT1H")
+    evaluation_report = PropertyFile(
+        name="EvaluationReport", output_name="evaluation", path="evaluation.json"
+    )
+    query_processor = ScriptProcessor(
+        command=["python3"],
+        image_uri="my-img",
+        role=role,
+        instance_count=1,
+        instance_type="ml.m5.xlarge",
+        network_config=NetworkConfig(
+            enable_network_isolation=False,
+            # VPC-Prod
+            subnets=["subnet-something"],
+            security_group_ids=["sg-something"],
+        ),
+        sagemaker_session=pipeline_session,
+    )
+
+    data_processor = FrameworkProcessor(
+        role=role,
+        instance_type="ml.m5.xlarge",
+        instance_count=1,
+        estimator_cls=TensorFlow,
+        framework_version="2.9",
+        py_version="py39",
+        sagemaker_session=pipeline_session,
+    )
+
+    query_step = ProcessingStep(
+        name="Query-Data",
+        step_args=query_processor.run(
+            code=os.path.join(DATA_DIR, "framework_processor_data/query_data.py"),
+            arguments=[
+                "--output-path",
+                "s3://out1",
+                "--region",
+                "s3://out2",
+            ],
+        ),
+        cache_config=cache_config,
+    )
+
+    input_path = "/opt/ml/processing/input"
+    output_path = "/opt/ml/processing/output"
+
+    prepare_step = ProcessingStep(
+        name="Prepare-Data",
+        step_args=data_processor.run(
+            code="preprocess.py",
+            source_dir=DATA_DIR + "/framework_processor_data",
+            inputs=[
+                ProcessingInput(
+                    input_name="task_preprocess_input",
+                    source=query_step.properties.ProcessingOutputConfig.Outputs["task_query_output"].S3Output.S3Uri,
+                    destination=input_path,
+                )
+            ],
+            arguments=[
+                "--input-path",
+                input_path,
+                "--output-path",
+                output_path,
+            ],
+        ),
+        cache_config=cache_config,
+    )
+
+    split_step = ProcessingStep(
+        name="Split-Data",
+        step_args=data_processor.run(
+            code="train_test_split.py",
+            source_dir=DATA_DIR + "/framework_processor_data",
+            inputs=[
+                ProcessingInput(
+                    source=prepare_step.properties.ProcessingOutputConfig.Outputs[
+                        "task_preprocess_output"
+                    ].S3Output.S3Uri,
+                    destination=input_path,
+                ),
+            ],
+            arguments=["--input-path", input_path, "--output-path", output_path],
+        ),
+        cache_config=cache_config,
+    )

+    sk_processor = FrameworkProcessor(
+        framework_version="1.0-1",
+        instance_type="ml.m5.xlarge",
+        instance_count=1,
+        base_job_name="my-job",
+        role=role,
+        estimator_cls=SKLearn,
+        sagemaker_session=pipeline_session,
+    )
+
+    evaluate_step = ProcessingStep(
+        name="Evaluate-Model",
+        step_args=sk_processor.run(
+            code="evaluate.py",
+            source_dir=DATA_DIR + "/framework_processor_data",
+            outputs=[
+                ProcessingOutput(
+                    output_name="evaluation",
+                    source="/opt/ml/processing/evaluation",
+                ),
+            ],
+        ),
+        property_files=[evaluation_report],
+        cache_config=cache_config,
+    )
+
+    pipeline = Pipeline(
+        name="test-fw-proc-steps-pipeline",
+        steps=[query_step, prepare_step, split_step, evaluate_step]
+    )
+    try:
+        # create pipeline
+        pipeline.create(role)
+        definition = json.loads(pipeline.definition())
+
+        source_dir_tar_prefix = f"s3://{default_bucket}/{pipeline.name}" \
+            f"/code/{hash_files_or_dirs([DATA_DIR + '/framework_processor_data'])}"
+
+        run_procs = []
+
+        for step in definition["Steps"]:
+            for input_obj in step["Arguments"]["ProcessingInputs"]:
+                if input_obj["InputName"] == "entrypoint":
+                    s3_uri = input_obj["S3Input"]["S3Uri"]
+                    run_procs.append(s3_uri)
+
+                    # verify runproc.sh prefix is different from code artifact prefix
+                    assert Path(s3_uri).parent != source_dir_tar_prefix
+
+        # verify all the run_proc.sh artifact paths are distinct
+        assert len(run_procs) == len(set(run_procs))
+
+    finally:
+        try:
+            pipeline.delete()
+        except Exception:
+            pass

tests/unit/sagemaker/workflow/test_pipeline.py

Lines changed: 3 additions & 6 deletions
@@ -17,7 +17,7 @@
 
 import pytest
 
-from mock import Mock
+from mock import Mock, patch
 
 from sagemaker import s3
 from sagemaker.workflow.condition_step import ConditionStep
@@ -78,6 +78,7 @@ def test_pipeline_create_with_parallelism_config(sagemaker_session_mock, role_arn):
     )
 
 
+@patch("sagemaker.spark.processing.S3Uploader.upload_string_as_file_body")
 def test_large_pipeline_create(sagemaker_session_mock, role_arn):
     parameter = ParameterString("MyStr")
     pipeline = Pipeline(
@@ -87,8 +88,6 @@ def test_large_pipeline_create(sagemaker_session_mock, role_arn):
         sagemaker_session=sagemaker_session_mock,
     )
 
-    s3.S3Uploader.upload_string_as_file_body = Mock()
-
     pipeline.create(role_arn=role_arn)
 
     assert s3.S3Uploader.upload_string_as_file_body.called_with(
@@ -150,7 +149,7 @@ def test_pipeline_update_with_parallelism_config(sagemaker_session_mock, role_arn):
        ParallelismConfiguration={"MaxParallelExecutionSteps": 10},
    )
 
-
+@patch("sagemaker.spark.processing.S3Uploader.upload_string_as_file_body")
 def test_large_pipeline_update(sagemaker_session_mock, role_arn):
     parameter = ParameterString("MyStr")
     pipeline = Pipeline(
@@ -160,8 +159,6 @@ def test_large_pipeline_update(sagemaker_session_mock, role_arn):
         sagemaker_session=sagemaker_session_mock,
     )
 
-    s3.S3Uploader.upload_string_as_file_body = Mock()
-
    pipeline.create(role_arn=role_arn)
 
    assert s3.S3Uploader.upload_string_as_file_body.called_with(
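
Note: swapping the in-test assignment s3.S3Uploader.upload_string_as_file_body = Mock() for a @patch decorator keeps the stub scoped to each test. mock.patch restores the original attribute when the test finishes, whereas the old assignment left the module monkey-patched for every test that ran afterwards, which is the same class of cross-test side effect this commit removes elsewhere. A small standalone illustration of the scoping difference (toy class, not the real uploader):

# Toy example: "Uploader" stands in for S3Uploader; only the patch scoping is the point.
from unittest import mock


class Uploader:
    @staticmethod
    def upload(body):
        return f"uploaded {len(body)} bytes"


def test_upload_is_stubbed_only_inside_the_patch():
    with mock.patch.object(Uploader, "upload", return_value="stubbed") as stub:
        assert Uploader.upload("abc") == "stubbed"
        stub.assert_called_once()
    # Once the patch exits, the real implementation is back for later tests.
    assert Uploader.upload("abc") == "uploaded 3 bytes"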
