
Commit 9e70866

Merge branch 'support-estimator-output-param'

2 parents 8c52f1b + e315711

4 files changed: +173 -11 lines changed

src/sagemaker/estimator.py

Lines changed: 11 additions & 3 deletions

@@ -695,14 +695,19 @@ def _stage_user_code_in_s3(self) -> str:
 
         Returns: S3 URI
         """
-        local_mode = self.output_path.startswith("file://")
+        local_mode = not is_pipeline_variable(self.output_path) and self.output_path.startswith(
+            "file://"
+        )
 
         if self.code_location is None and local_mode:
             code_bucket = self.sagemaker_session.default_bucket()
             code_s3_prefix = "{}/{}".format(self._current_job_name, "source")
             kms_key = None
         elif self.code_location is None:
-            code_bucket, _ = parse_s3_url(self.output_path)
+            if is_pipeline_variable(self.output_path):
+                code_bucket = self.sagemaker_session.default_bucket()
+            else:
+                code_bucket, _ = parse_s3_url(self.output_path)
             code_s3_prefix = "{}/{}".format(self._current_job_name, "source")
             kms_key = self.output_kms_key
         elif local_mode:
@@ -713,7 +718,10 @@ def _stage_user_code_in_s3(self) -> str:
             code_bucket, key_prefix = parse_s3_url(self.code_location)
             code_s3_prefix = "/".join(filter(None, [key_prefix, self._current_job_name, "source"]))
 
-            output_bucket, _ = parse_s3_url(self.output_path)
+            if is_pipeline_variable(self.output_path):
+                output_bucket = self.sagemaker_session.default_bucket()
+            else:
+                output_bucket, _ = parse_s3_url(self.output_path)
             kms_key = self.output_kms_key if code_bucket == output_bucket else None
 
         return tar_and_upload_dir(
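
The guard above matters because a pipeline parameter is a placeholder object, not a string, so string operations on output_path cannot run at pipeline-definition time. A minimal sketch of the distinction (not part of the commit; the parameter name is illustrative):

    from sagemaker.workflow import is_pipeline_variable
    from sagemaker.workflow.parameters import ParameterString

    output_path = ParameterString(name="OutputPath")

    # String operations such as output_path.startswith("file://") or
    # parse_s3_url(output_path) would fail or misbehave here, because the
    # value is only resolved by the Pipelines service at execution time.
    assert is_pipeline_variable(output_path)

    # What the SDK serializes into the pipeline definition instead:
    print(output_path.expr)  # {"Get": "Parameters.OutputPath"}

This is why code staging falls back to sagemaker_session.default_bucket() whenever output_path is a pipeline variable.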

tests/data/_repack_model.py

Lines changed: 110 additions & 0 deletions (new file)

@@ -0,0 +1,110 @@

# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
"""Repack model script for training jobs to inject entry points"""
from __future__ import absolute_import

import argparse
import os
import shutil
import tarfile
import tempfile

# Repack Model
# The following script is run via a training job which takes an existing model and a custom
# entry point script as arguments. The script creates a new model archive with the custom
# entry point in the "code" directory along with the existing model. Subsequently, when the model
# is unpacked for inference, the custom entry point will be used.
# Reference: https://docs.aws.amazon.com/sagemaker/latest/dg/amazon-sagemaker-toolkits.html

# distutils.dir_util.copy_tree works way better than the half-baked
# shutil.copytree which bombs on previously existing target dirs...
# alas ... https://bugs.python.org/issue10948
# we'll go ahead and use the copy_tree function anyways because this
# repacking is some short-lived hackery, right??
from distutils.dir_util import copy_tree


def repack(inference_script, model_archive, dependencies=None, source_dir=None):  # pragma: no cover
    """Repack custom dependencies and code into an existing model TAR archive

    Args:
        inference_script (str): The path to the custom entry point.
        model_archive (str): The name or path (e.g. s3 uri) of the model TAR archive.
        dependencies (str): A space-delimited string of paths to custom dependencies.
        source_dir (str): The path to a custom source directory.
    """

    # the data directory contains a model archive generated by a previous training job
    data_directory = "/opt/ml/input/data/training"
    model_path = os.path.join(data_directory, model_archive.split("/")[-1])

    # create a temporary directory
    with tempfile.TemporaryDirectory() as tmp:
        local_path = os.path.join(tmp, "local.tar.gz")
        # copy the previous training job's model archive to the temporary directory
        shutil.copy2(model_path, local_path)
        src_dir = os.path.join(tmp, "src")
        # create the "code" directory which will contain the inference script
        code_dir = os.path.join(src_dir, "code")
        os.makedirs(code_dir)
        # extract the contents of the previous training job's model archive to the "src"
        # directory of this training job
        with tarfile.open(name=local_path, mode="r:gz") as tf:
            tf.extractall(path=src_dir)

        if source_dir:
            # copy /opt/ml/code to code/
            if os.path.exists(code_dir):
                shutil.rmtree(code_dir)
            shutil.copytree("/opt/ml/code", code_dir)
        else:
            # copy the custom inference script to code/
            entry_point = os.path.join("/opt/ml/code", inference_script)
            shutil.copy2(entry_point, os.path.join(code_dir, inference_script))

        # copy any dependencies to code/lib/
        if dependencies:
            for dependency in dependencies.split(" "):
                actual_dependency_path = os.path.join("/opt/ml/code", dependency)
                lib_dir = os.path.join(code_dir, "lib")
                if not os.path.exists(lib_dir):
                    os.mkdir(lib_dir)
                if os.path.isfile(actual_dependency_path):
                    shutil.copy2(actual_dependency_path, lib_dir)
                else:
                    if os.path.exists(lib_dir):
                        shutil.rmtree(lib_dir)
                    # a directory is in the dependencies. we have to copy
                    # all of /opt/ml/code into the lib dir because the original directory
                    # was flattened by the SDK training job upload..
                    shutil.copytree("/opt/ml/code", lib_dir)
                    break

        # copy the "src" dir, which includes the previous training job's model and the
        # custom inference script, to the output of this training job
        copy_tree(src_dir, "/opt/ml/model")


if __name__ == "__main__":  # pragma: no cover
    parser = argparse.ArgumentParser()
    parser.add_argument("--inference_script", type=str, default="inference.py")
    parser.add_argument("--dependencies", type=str, default=None)
    parser.add_argument("--source_dir", type=str, default=None)
    parser.add_argument("--model_archive", type=str, default="model.tar.gz")
    args, extra = parser.parse_known_args()
    repack(
        inference_script=args.inference_script,
        dependencies=args.dependencies,
        source_dir=args.source_dir,
        model_archive=args.model_archive,
    )
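
For orientation, a small sketch (not part of the commit) of what repack() leaves under /opt/ml/model, which SageMaker then re-tars as the training job's model artifact:

    import os

    # Walk the output directory after repack() has run inside the container.
    for root, _, files in os.walk("/opt/ml/model"):
        for name in files:
            print(os.path.join(root, name))

    # Expected layout, given the copy operations above:
    #   /opt/ml/model/<model files from the previous training job>
    #   /opt/ml/model/code/<the inference script, or the whole source_dir>
    #   /opt/ml/model/code/lib/<dependencies, when provided>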

tests/integ/sagemaker/workflow/test_training_steps.py

Lines changed: 5 additions & 1 deletion

@@ -60,6 +60,9 @@ def test_training_job_with_debugger_and_profiler(
 ):
     instance_count = ParameterInteger(name="InstanceCount", default_value=1)
     instance_type = ParameterString(name="InstanceType", default_value="ml.m5.xlarge")
+    output_path = ParameterString(
+        name="OutputPath", default_value=f"s3://{sagemaker_session.default_bucket()}/test/"
+    )
 
     rules = [
         Rule.sagemaker(rule_configs.vanishing_gradient()),
@@ -88,6 +91,7 @@ def test_training_job_with_debugger_and_profiler(
         sagemaker_session=sagemaker_session,
         rules=rules,
         debugger_hook_config=debugger_hook_config,
+        output_path=output_path,
     )
 
     step_train = TrainingStep(
@@ -98,7 +102,7 @@ def test_training_job_with_debugger_and_profiler(
 
     pipeline = Pipeline(
         name=pipeline_name,
-        parameters=[instance_count, instance_type],
+        parameters=[instance_count, instance_type, output_path],
         steps=[step_train],
         sagemaker_session=sagemaker_session,
     )
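
With OutputPath defined as a pipeline parameter, the integration test (and any caller) can redirect training output per execution instead of baking the S3 URI into the pipeline definition. A minimal sketch (not part of the diff; the bucket name is made up):

    # Uses the parameter's default_value unless overridden at start time.
    execution = pipeline.start(
        parameters={"OutputPath": "s3://my-other-bucket/debugger-test/"}
    )
    execution.wait()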

tests/unit/sagemaker/workflow/test_training_step.py

Lines changed: 47 additions & 7 deletions

@@ -13,13 +13,21 @@
 # language governing permissions and limitations under the License.
 from __future__ import absolute_import
 
+import os
 import json
 
 import pytest
 import sagemaker
 import warnings
 
 from sagemaker.workflow.pipeline_context import PipelineSession
+from sagemaker.workflow.parameters import ParameterString
 
 from sagemaker.workflow.steps import TrainingStep
 from sagemaker.workflow.pipeline import Pipeline
@@ -46,12 +54,14 @@
 from sagemaker.amazon.ntm import NTM
 from sagemaker.amazon.object2vec import Object2Vec
 
+from tests.integ import DATA_DIR
 
 from sagemaker.inputs import TrainingInput
 
 REGION = "us-west-2"
 IMAGE_URI = "fakeimage"
 MODEL_NAME = "gisele"
+DUMMY_LOCAL_SCRIPT_PATH = os.path.join(DATA_DIR, "dummy_script.py")
 DUMMY_S3_SCRIPT_PATH = "s3://dummy-s3/dummy_script.py"
 DUMMY_S3_SOURCE_DIR = "s3://dummy-s3-source-dir/"
 INSTANCE_TYPE = "ml.m4.xlarge"
@@ -119,6 +129,36 @@ def test_training_step_with_estimator(pipeline_session, training_input, hyperpar
     assert step.properties.TrainingJobName.expr == {"Get": "Steps.MyTrainingStep.TrainingJobName"}
 
 
+def test_estimator_with_parameterized_output(pipeline_session, training_input):
+    output_path = ParameterString(name="OutputPath")
+    estimator = XGBoost(
+        framework_version="1.3-1",
+        py_version="py3",
+        role=sagemaker.get_execution_role(),
+        instance_type=INSTANCE_TYPE,
+        instance_count=1,
+        entry_point=DUMMY_LOCAL_SCRIPT_PATH,
+        output_path=output_path,
+        sagemaker_session=pipeline_session,
+    )
+    step_args = estimator.fit(inputs=training_input)
+    step = TrainingStep(
+        name="MyTrainingStep",
+        step_args=step_args,
+        description="TrainingStep description",
+        display_name="MyTrainingStep",
+    )
+    pipeline = Pipeline(
+        name="MyPipeline",
+        steps=[step],
+        sagemaker_session=pipeline_session,
+    )
+    step_def = json.loads(pipeline.definition())["Steps"][0]
+    assert step_def["Arguments"]["OutputDataConfig"]["S3OutputPath"] == {
+        "Get": "Parameters.OutputPath"
+    }
+
+
 @pytest.mark.parametrize(
     "estimator",
     [
@@ -128,23 +168,23 @@ def test_training_step_with_estimator(pipeline_session, training_input, hyperpar
             instance_type=INSTANCE_TYPE,
             instance_count=1,
             role=sagemaker.get_execution_role(),
-            entry_point="entry_point.py",
+            entry_point=DUMMY_LOCAL_SCRIPT_PATH,
         ),
         PyTorch(
             role=sagemaker.get_execution_role(),
             instance_type=INSTANCE_TYPE,
             instance_count=1,
             framework_version="1.8.0",
             py_version="py36",
-            entry_point="entry_point.py",
+            entry_point=DUMMY_LOCAL_SCRIPT_PATH,
         ),
         TensorFlow(
             role=sagemaker.get_execution_role(),
             instance_type=INSTANCE_TYPE,
             instance_count=1,
             framework_version="2.0",
             py_version="py3",
-            entry_point="entry_point.py",
+            entry_point=DUMMY_LOCAL_SCRIPT_PATH,
         ),
         HuggingFace(
             transformers_version="4.6",
@@ -153,23 +193,23 @@ def test_training_step_with_estimator(pipeline_session, training_input, hyperpar
             instance_type="ml.p3.2xlarge",
             instance_count=1,
             py_version="py36",
-            entry_point="entry_point.py",
+            entry_point=DUMMY_LOCAL_SCRIPT_PATH,
         ),
         XGBoost(
            framework_version="1.3-1",
            py_version="py3",
            role=sagemaker.get_execution_role(),
            instance_type=INSTANCE_TYPE,
            instance_count=1,
-           entry_point="entry_point.py",
+           entry_point=DUMMY_LOCAL_SCRIPT_PATH,
         ),
         MXNet(
             framework_version="1.4.1",
             py_version="py3",
             role=sagemaker.get_execution_role(),
             instance_type=INSTANCE_TYPE,
             instance_count=1,
-            entry_point="entry_point.py",
+            entry_point=DUMMY_LOCAL_SCRIPT_PATH,
         ),
         RLEstimator(
             entry_point="cartpole.py",
@@ -182,7 +222,7 @@ def test_training_step_with_estimator(pipeline_session, training_input, hyperpar
         ),
         Chainer(
             role=sagemaker.get_execution_role(),
-            entry_point="entry_point.py",
+            entry_point=DUMMY_LOCAL_SCRIPT_PATH,
             use_mpi=True,
             num_processes=4,
             framework_version="5.0.0",
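
The new unit test pins down the contract on both sides of the change: the rendered training step carries a deferred Get expression rather than a literal S3 URI, and user code is staged to the session's default bucket because no bucket can be parsed out of the parameter at definition time. A condensed sketch (not from the diff) of that contract:

    from sagemaker.workflow.parameters import ParameterString

    output_path = ParameterString(name="OutputPath")

    # The parameter serializes to a reference that the Pipelines service
    # resolves at execution time; the SDK never sees a concrete URI here.
    assert output_path.expr == {"Get": "Parameters.OutputPath"}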
