feature: include estimator-centric transformer step (aws#467)

metrizable · Dan Choi · commit 24aad93b16c8 · 2020-11-19T10:47:29.000-08:00
diff --git a/.pylintrc b/.pylintrc
@@ -314,7 +314,7 @@ ignored-modules=distutils
 # List of class names for which member attributes should not be checked (useful
 # for classes with dynamically set attributes). This supports the use of
 # qualified names.
-ignored-classes=optparse.Values,thread._local,_thread._local,matplotlib.cm,tensorflow.python,tensorflow,tensorflow.train.Example,RunOptions
+ignored-classes=optparse.Values,thread._local,_thread._local,matplotlib.cm,tensorflow.python,tensorflow,tensorflow.train.Example,RunOptions,sagemaker.workflow.properties.Properties
 
 # List of members which are set dynamically and missed by pylint inference
 # system, and so shouldn't trigger E1101 when accessed. Python regular
diff --git a/src/sagemaker/estimator.py b/src/sagemaker/estimator.py
@@ -63,7 +63,7 @@
 logger = logging.getLogger(__name__)
 
 
-class EstimatorBase(with_metaclass(ABCMeta, object)):
+class EstimatorBase(with_metaclass(ABCMeta, object)):  # pylint: disable=too-many-public-methods
     """Handle end-to-end Amazon SageMaker training and deployment tasks.
 
     For introduction to model training and deployment, see
diff --git a/src/sagemaker/workflow/_utils.py b/src/sagemaker/workflow/_utils.py
@@ -98,6 +98,7 @@ def __init__(
                 "model_archive": self._model_archive,
             },
         )
+        repacker.disable_profiler = True
         inputs = TrainingInput(self._model_prefix)
 
         # super!
diff --git a/src/sagemaker/workflow/step_collections.py b/src/sagemaker/workflow/step_collections.py
@@ -18,8 +18,15 @@
 import attr
 
 from sagemaker.estimator import EstimatorBase
+from sagemaker.model import Model
+from sagemaker.predictor import Predictor
+from sagemaker.transformer import Transformer
 from sagemaker.workflow.entities import RequestType
-from sagemaker.workflow.steps import Step
+from sagemaker.workflow.steps import (
+    CreateModelStep,
+    Step,
+    TransformStep,
+)
 from sagemaker.workflow._utils import (
     _RegisterModelStep,
     _RepackModelStep,
@@ -114,3 +121,131 @@ def __init__(
         )
         steps.append(register_model_step)
         self.steps = steps
+
+
+class EstimatorTransformer(StepCollection):
+    """Creates a Transformer step collection for workflow.
+
+    The collection is built in `__init__` from an optional `_RepackModelStep`,
+    a `CreateModelStep` and a `TransformStep`, appended in that order.
+
+    Attributes:
+        steps (List[Step]): A list of steps.
+    """
+
+    def __init__(
+        self,
+        name: str,
+        estimator: EstimatorBase,
+        model_data,
+        model_inputs,
+        instance_count,
+        instance_type,
+        transform_inputs,
+        # model arguments (note: `env` below is forwarded to the Transformer, not the Model)
+        image_uri=None,
+        predictor_cls=None,
+        env=None,
+        # transformer arguments
+        strategy=None,
+        assemble_with=None,
+        output_path=None,
+        output_kms_key=None,
+        accept=None,
+        max_concurrent_transforms=None,
+        max_payload=None,
+        tags=None,
+        volume_kms_key=None,
+        **kwargs,
+    ):
+        """Constructs steps required for transformation.
+
+        An estimator-centric step collection, it models what occurs in current workflows
+        with invoking the `transform()` method on an estimator instance: first, if custom
+        model artifacts are required, a `_RepackModelStep` is included; second, a
+        `CreateModelStep` with the model data passed in from a training step or other
+        training job output; finally, a `TransformStep`.
+
+        If repacking the model artifacts is not necessary, only the `CreateModelStep`
+        and `TransformStep` are in the step collection. Repacking is requested by
+        passing `entry_point` as a keyword argument.
+
+        Args:
+            name (str): Prefix used to name every step in the collection
+                (`{name}RepackModel`, `{name}CreateModelStep`, `{name}TransformStep`).
+                It is also passed to the `Transformer` as `base_transform_job_name`.
+            estimator: The estimator instance. Its `sagemaker_session` and `role` are
+                reused for the model and transformer, and `training_image_uri()` supplies
+                the image when `image_uri` is not given.
+            model_data: The model artifacts to build the model from, for example an S3 URI.
+                When a repack step is added, the model is built from that step's
+                `ModelArtifacts.S3ModelArtifacts` property instead.
+            model_inputs: Passed as `inputs` to the `CreateModelStep`
+                (for example a `CreateModelInput`).
+            instance_count (int): Number of EC2 instances to use.
+            instance_type (str): Type of EC2 instance to use, for example,
+                'ml.c4.xlarge'.
+            transform_inputs: Passed as `inputs` to the `TransformStep`
+                (for example a `TransformInput`).
+            image_uri (str): Image for the model (default: None). If not specified,
+                `estimator.training_image_uri()` is used.
+            predictor_cls (callable): Passed to the `Model` as `predictor_cls`
+                (default: None). If not specified, a function taking
+                `(endpoint, session)` and returning a `Predictor` is used.
+            env (dict): Environment variables to be set for use during the
+                transform job (default: None). Forwarded to the `Transformer` only.
+            strategy (str): The strategy used to decide how to batch records in
+                a single request (default: None). Valid values: 'MultiRecord'
+                and 'SingleRecord'.
+            assemble_with (str): How the output is assembled (default: None).
+                Valid values: 'Line' or 'None'.
+            output_path (str): S3 location for saving the transform result. If
+                not specified, results are stored to a default bucket.
+            output_kms_key (str): Optional. KMS key ID for encrypting the
+                transform output (default: None).
+            accept (str): The accept header passed by the client to
+                the inference endpoint. If it is supported by the endpoint,
+                it will be the format of the batch transform output.
+            max_concurrent_transforms: Forwarded unchanged to the `Transformer`
+                (default: None).
+            max_payload: Forwarded unchanged to the `Transformer` (default: None).
+            tags: Forwarded unchanged to the `Transformer` (default: None).
+            volume_kms_key: Forwarded unchanged to the `Transformer` (default: None).
+            **kwargs: Extra keyword arguments forwarded to the `Model` constructor.
+                If `entry_point` is present, a `_RepackModelStep` is added first, built
+                from `entry_point` and the optional `source_dir` and `dependencies`.
+        """
+        steps = []
+        # Optional first step: repack the model artifacts with the custom entry point.
+        if "entry_point" in kwargs:
+            entry_point = kwargs["entry_point"]
+            source_dir = kwargs.get("source_dir")
+            dependencies = kwargs.get("dependencies")
+            repack_model_step = _RepackModelStep(
+                name=f"{name}RepackModel",
+                estimator=estimator,
+                model_data=model_data,
+                entry_point=entry_point,
+                source_dir=source_dir,
+                dependencies=dependencies,
+            )
+            steps.append(repack_model_step)
+            # From here on the model is created from the repacked artifacts,
+            # referenced through the repack step's properties.
+            model_data = repack_model_step.properties.ModelArtifacts.S3ModelArtifacts
+
+        # Default predictor factory, used only when the caller gives no predictor_cls.
+        def predict_wrapper(endpoint, session):
+            return Predictor(endpoint, session)
+
+        predictor_cls = predictor_cls or predict_wrapper
+
+        # NOTE(review): kwargs is forwarded as-is, so `entry_point`, `source_dir` and
+        # `dependencies` (read above, not popped) also reach `Model` - confirm that
+        # `Model.__init__` accepts them.
+        model = Model(
+            image_uri=image_uri or estimator.training_image_uri(),
+            model_data=model_data,
+            predictor_cls=predictor_cls,
+            vpc_config=None,
+            sagemaker_session=estimator.sagemaker_session,
+            role=estimator.role,
+            **kwargs,
+        )
+        model_step = CreateModelStep(
+            name=f"{name}CreateModelStep",
+            model=model,
+            inputs=model_inputs,
+        )
+        steps.append(model_step)
+
+        # The transformer refers to the model through the create-model step's
+        # `ModelName` property rather than a literal model name.
+        transformer = Transformer(
+            model_name=model_step.properties.ModelName,
+            instance_count=instance_count,
+            instance_type=instance_type,
+            strategy=strategy,
+            assemble_with=assemble_with,
+            output_path=output_path,
+            output_kms_key=output_kms_key,
+            accept=accept,
+            max_concurrent_transforms=max_concurrent_transforms,
+            max_payload=max_payload,
+            env=env,
+            tags=tags,
+            base_transform_job_name=name,
+            volume_kms_key=volume_kms_key,
+            sagemaker_session=estimator.sagemaker_session,
+        )
+        transform_step = TransformStep(
+            name=f"{name}TransformStep",
+            transformer=transformer,
+            inputs=transform_inputs,
+        )
+        steps.append(transform_step)
+
+        self.steps = steps
diff --git a/tests/unit/sagemaker/workflow/test_step_collections.py b/tests/unit/sagemaker/workflow/test_step_collections.py
@@ -23,12 +23,14 @@
 )
 
 from sagemaker.estimator import Estimator
+from sagemaker.inputs import CreateModelInput, TransformInput
 from sagemaker.workflow.properties import Properties
 from sagemaker.workflow.steps import (
     Step,
     StepTypeEnum,
 )
 from sagemaker.workflow.step_collections import (
+    EstimatorTransformer,
     StepCollection,
     RegisterModel,
 )
@@ -101,7 +103,7 @@ def estimator(sagemaker_session):
         image_uri=IMAGE_URI,
         role=ROLE,
         instance_count=1,
-        instance_type="c4.4xlarge",
+        instance_type="ml.c4.4xlarge",
         sagemaker_session=sagemaker_session,
     )
 
@@ -145,3 +147,56 @@ def test_register_model(estimator):
             },
         ]
     )
+
+
+def test_estimator_transformer(estimator):
+    """Check the request dicts produced by `EstimatorTransformer` without repacking.
+
+    No `entry_point` keyword is passed, so the collection is expected to hold
+    exactly two steps: one of type "CreateModel" and one of type "Transform".
+    """
+    model_data = f"s3://{BUCKET}/model.tar.gz"
+    # NOTE(review): this instance type has no "ml." prefix, unlike the transformer's
+    # "ml.c4.4xlarge" below - confirm it is intended. The assertions in this test
+    # do not check it.
+    model_inputs = CreateModelInput(
+        instance_type="c4.4xlarge",
+        accelerator_type="ml.eia1.medium",
+    )
+    transform_inputs = TransformInput(data=f"s3://{BUCKET}/transform_manifest")
+    estimator_transformer = EstimatorTransformer(
+        name="EstimatorTransformerStep",
+        estimator=estimator,
+        model_data=model_data,
+        model_inputs=model_inputs,
+        instance_count=1,
+        instance_type="ml.c4.4xlarge",
+        transform_inputs=transform_inputs,
+    )
+    request_dicts = estimator_transformer.request_dicts()
+    # Two steps only: create-model and transform (no repack step).
+    assert len(request_dicts) == 2
+    for request_dict in request_dicts:
+        if request_dict["Type"] == "CreateModel":
+            # The literals below presumably mirror the module constants ROLE, IMAGE_URI
+            # and BUCKET, which are defined outside this hunk - verify.
+            assert request_dict == {
+                "Name": "EstimatorTransformerStepCreateModelStep",
+                "Type": "CreateModel",
+                "Arguments": {
+                    "ExecutionRoleArn": "DummyRole",
+                    "PrimaryContainer": {
+                        "Environment": {},
+                        "Image": "fakeimage",
+                        "ModelDataUrl": "s3://my-bucket/model.tar.gz",
+                    },
+                },
+            }
+        elif request_dict["Type"] == "Transform":
+            assert request_dict["Name"] == "EstimatorTransformerStepTransformStep"
+            arguments = request_dict["Arguments"]
+            # ModelName is a `Properties` reference, not a plain string, so it is
+            # type-checked and removed before the remaining arguments are compared.
+            assert isinstance(arguments["ModelName"], Properties)
+            arguments.pop("ModelName")
+            assert arguments == {
+                "TransformInput": {
+                    "DataSource": {
+                        "S3DataSource": {
+                            "S3DataType": "S3Prefix",
+                            "S3Uri": f"s3://{BUCKET}/transform_manifest",
+                        }
+                    }
+                },
+                "TransformOutput": {"S3OutputPath": None},
+                "TransformResources": {"InstanceCount": 1, "InstanceType": "ml.c4.4xlarge"},
+            }
+        else:
+            # Any other step type means the collection contains an unexpected step.
+            raise Exception("A step exists in the collection of an invalid type.")
diff --git a/tox.ini b/tox.ini
@@ -147,7 +147,7 @@ commands =
 [testenv:docstyle]
 deps = pydocstyle
 commands = 
-    pydocstyle src/sagemaker/{posargs}
+    pydocstyle src/sagemaker
 
 [testenv:collect-tests]
 # this needs to succeed for tests to display in some IDEs

Original file line number	Diff line number	Diff line change
`@@ -98,6 +98,7 @@ def __init__(`
`98`	`98`	`"model_archive": self._model_archive,`
`99`	`99`	`},`
`100`	`100`	`)`
	`101`	`+ repacker.disable_profiler = True`
`101`	`102`	`inputs = TrainingInput(self._model_prefix)`
`102`	`103`
`103`	`104`	`# super!`