feat(processing): add HuggingFaceProcessor

athewsey · athewsey · commit bb9613b0ebee · 2021-05-22T01:23:23.000+08:00
Add a FrameworkProcessor class for HuggingFace, including a refactor
of how FrameworkProcessor creates Estimators to enable the addition.
diff --git a/src/sagemaker/huggingface/__init__.py b/src/sagemaker/huggingface/__init__.py
@@ -14,3 +14,4 @@
 from __future__ import absolute_import
 
 from sagemaker.huggingface.estimator import HuggingFace  # noqa: F401
+from sagemaker.huggingface.processing import HuggingFaceProcessor  # noqa:F401
diff --git a/src/sagemaker/huggingface/processing.py b/src/sagemaker/huggingface/processing.py
@@ -0,0 +1,132 @@
+# Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+"""This module contains code related to HuggingFace Processors which are used for Processing jobs.
+
+These jobs let customers perform data pre-processing, post-processing, feature engineering,
+data validation, and model evaluation and interpretation on SageMaker.
+"""
+from __future__ import absolute_import
+
+from sagemaker.processing import FrameworkProcessor
+from sagemaker.huggingface.estimator import HuggingFace
+
+
+class HuggingFaceProcessor(FrameworkProcessor):
+    """Handles Amazon SageMaker processing tasks for jobs using HuggingFace containers."""
+
+    # Estimator class used to resolve the container image and to package the user code.
+    estimator_cls = HuggingFace
+
+    def __init__(
+        self,
+        role,
+        instance_count,
+        instance_type,
+        transformers_version=None,
+        tensorflow_version=None,
+        pytorch_version=None,
+        py_version="py36",
+        image_uri=None,
+        command=None,
+        volume_size_in_gb=30,
+        volume_kms_key=None,
+        output_kms_key=None,
+        code_location=None,
+        max_runtime_in_seconds=None,
+        base_job_name=None,
+        sagemaker_session=None,
+        env=None,
+        tags=None,
+        network_config=None,
+    ):
+        """This processor executes a Python script in a HuggingFace execution environment.
+
+        Unless ``image_uri`` is specified, the environment is an Amazon-built Docker container
+        that executes functions defined in the supplied ``code`` Python script.
+
+        The arguments have the same meaning as in ``FrameworkProcessor``, with the following
+        exceptions.
+
+        Args:
+            transformers_version (str): Transformers version you want to use for
+                executing your processing code. Defaults to ``None``. Required unless
+                ``image_uri`` is provided. The current supported version is ``4.4.2``.
+            tensorflow_version (str): TensorFlow version you want to use for
+                executing your processing code. Defaults to ``None``. Required unless
+                ``pytorch_version`` is provided. The current supported version is ``2.4.1``.
+            pytorch_version (str): PyTorch version you want to use for
+                executing your processing code. Defaults to ``None``. Required unless
+                ``tensorflow_version`` is provided. The current supported version is ``1.6.0``.
+            py_version (str): Python version you want to use for executing your processing
+                code. Defaults to ``"py36"``. If using PyTorch, the current supported version
+                is ``py36``. If using TensorFlow, the current supported version is ``py37``.
+            command ([str]): Command (and flags) used to invoke the script. Defaults to
+                ``None``, in which case ``["python"]`` is used.
+
+        .. tip::
+
+            You can find additional parameters for initializing this class at
+            :class:`~sagemaker.processing.FrameworkProcessor`.
+        """
+        # Build a fresh list per instance instead of sharing one mutable default object
+        # between every processor created without an explicit ``command``.
+        if command is None:
+            command = ["python"]
+
+        # These attributes are read by ``_create_estimator()``, so they are stored before the
+        # parent initializer runs in case it needs to build an estimator.
+        self.pytorch_version = pytorch_version
+        self.tensorflow_version = tensorflow_version
+        super().__init__(
+            self.estimator_cls,
+            transformers_version,
+            role,
+            instance_count,
+            instance_type,
+            py_version,
+            image_uri,
+            command,
+            volume_size_in_gb,
+            volume_kms_key,
+            output_kms_key,
+            code_location,
+            max_runtime_in_seconds,
+            base_job_name,
+            sagemaker_session,
+            env,
+            tags,
+            network_config,
+        )
+
+    def _create_estimator(
+        self,
+        entry_point="",
+        source_dir=None,
+        dependencies=None,
+        git_config=None,
+    ):
+        """Override default estimator factory function for HuggingFace's different parameters
+
+        HuggingFace estimators have 3 framework version parameters instead of one: The version for
+        Transformers, PyTorch, and TensorFlow.
+
+        Args:
+            entry_point (str): Entry point script forwarded to the estimator. Defaults to ``""``.
+            source_dir (str): Source directory forwarded to the estimator. Defaults to ``None``.
+            dependencies (list): Dependencies forwarded to the estimator. Defaults to ``None``.
+            git_config (dict): Git configuration forwarded to the estimator. Defaults to
+                ``None``.
+
+        Returns:
+            A new ``estimator_cls`` instance configured from this processor's settings.
+        """
+        return self.estimator_cls(
+            # The generic ``framework_version`` slot of the parent holds the Transformers version.
+            transformers_version=self.framework_version,
+            tensorflow_version=self.tensorflow_version,
+            pytorch_version=self.pytorch_version,
+            py_version=self.py_version,
+            entry_point=entry_point,
+            source_dir=source_dir,
+            dependencies=dependencies,
+            git_config=git_config,
+            code_location=self.code_location,
+            enable_network_isolation=False,
+            image_uri=self.image_uri,
+            role=self.role,
+            instance_count=self.instance_count,
+            instance_type=self.instance_type,
+            sagemaker_session=self.sagemaker_session,
+            debugger_hook_config=False,
+            disable_profiler=True,
+        )
diff --git a/src/sagemaker/processing.py b/src/sagemaker/processing.py
@@ -1298,10 +1298,15 @@ def __init__(
         self.framework_version = framework_version
         self.py_version = py_version
 
-        image_uri, base_job_name = self._pre_init_normalization(
-            instance_type, image_uri, base_job_name, sagemaker_session
-        )
-
+        # 1. To finalize/normalize the image_uri or base_job_name, we need to create an
+        #    estimator_cls instance.
+        # 2. We want to make it easy for children of FrameworkProcessor to override estimator
+        #    creation via a function (to create FrameworkProcessors for Estimators that may have
+        #    different signatures - like HuggingFace or others in future).
+        # 3. Super-class __init__ doesn't (currently) do anything with these params besides
+        #    storing them
+        #
+        # Therefore we'll init the superclass first and then customize the setup after:
         super().__init__(
             role=role,
             image_uri=image_uri,
@@ -1318,6 +1323,7 @@ def __init__(
             tags=tags,
             network_config=network_config,
         )
+
         # This subclass uses the "code" input for actual payload and the ScriptProcessor parent's
         # functionality for uploading just a small entrypoint script to invoke it.
         self._CODE_CONTAINER_INPUT_NAME = "entrypoint"
@@ -1326,38 +1332,45 @@ def __init__(
             code_location[:-1] if (code_location and code_location.endswith("/")) else code_location
         )
 
-    def _pre_init_normalization(
-        self,
-        instance_type: str,
-        image_uri: Optional[str] = None,
-        base_job_name: Optional[str] = None,
-        sagemaker_session: Optional[str] = None,
-    ) -> Tuple[str, str]:
-        """Normalize job name and container image uri."""
-        # Normalize base_job_name
-        if base_job_name is None:
-            base_job_name = self.estimator_cls._framework_name
+        if image_uri is None or base_job_name is None:
+            # For these default configuration purposes, we don't need the optional args:
+            est = self._create_estimator()
+            if image_uri is None:
+                self.image_uri = est.training_image_uri()
             if base_job_name is None:
-                logger.warning("Framework name is None. Please check with the maintainer.")
-                base_job_name = str(base_job_name)  # Keep mypy happy.
-
-        # Normalize image uri.
-        if image_uri is None:
-            # Estimator used only to probe image uri, so can get away with some dummy values.
-            est = self.estimator_cls(
-                framework_version=self.framework_version,
-                instance_type=instance_type,
-                py_version=self.py_version,
-                image_uri=image_uri,
-                entry_point="",
-                role="",
-                enable_network_isolation=False,
-                instance_count=1,  # SKLearn estimator explicitly disables instance_count>1
-                sagemaker_session=sagemaker_session,
-            )
-            image_uri = est.training_image_uri()
+                self.base_job_name = est.base_job_name or estimator_cls._framework_name
+                if base_job_name is None:
+                    base_job_name = "framework-processor"
 
-        return image_uri, base_job_name
+    def _create_estimator(
+        self,
+        entry_point="",
+        source_dir=None,
+        dependencies=None,
+        git_config=None,
+    ):
+        """Build a new Framework Estimator instance backing this Processor.
+
+        Subclasses whose estimator class takes different constructor arguments can override
+        this factory.
+
+        Args:
+            entry_point (str): Entry point script forwarded to the estimator. Defaults to ``""``.
+            source_dir (str): Source directory forwarded to the estimator. Defaults to ``None``.
+            dependencies (list): Dependencies forwarded to the estimator. Defaults to ``None``.
+            git_config (dict): Git configuration forwarded to the estimator. Defaults to
+                ``None``.
+
+        Returns:
+            A new ``estimator_cls`` instance configured from this processor's settings.
+        """
+        estimator_kwargs = {
+            "framework_version": self.framework_version,
+            "py_version": self.py_version,
+            "entry_point": entry_point,
+            "source_dir": source_dir,
+            "dependencies": dependencies,
+            "git_config": git_config,
+            "code_location": self.code_location,
+            # True -> uploads to input channel. Not what we want!
+            "enable_network_isolation": False,
+            "image_uri": self.image_uri,
+            "role": self.role,
+            # Estimator instance_count doesn't currently matter to FrameworkProcessor, and the
+            # SKLearn Framework Estimator requires instance_count==1, so it is hard-wired to 1
+            # here. If it matters in future, this could take self.instance_count and
+            # SKLearnProcessor could override this function instead.
+            "instance_count": 1,
+            "instance_type": self.instance_type,
+            "sagemaker_session": self.sagemaker_session,
+            "debugger_hook_config": False,
+            "disable_profiler": True,
+        }
+        return self.estimator_cls(**estimator_kwargs)
 
     def get_run_args(
         self,
@@ -1623,22 +1636,11 @@ def _upload_payload(
         """Upload payload sourcedir.tar.gz to S3."""
         # A new estimator instance is required, because each call to ScriptProcessor.run() can
         # use different codes.
-        estimator = self.estimator_cls(
+        estimator = self._create_estimator(
             entry_point=entry_point,
             source_dir=source_dir,
             dependencies=dependencies,
             git_config=git_config,
-            framework_version=self.framework_version,
-            py_version=self.py_version,
-            code_location=self.code_location,  # Upload to <code_loc>/jobname/output/source.tar.gz
-            enable_network_isolation=False,  # If true, uploads to input channel. Not what we want!
-            image_uri=self.image_uri,  # The image uri is already normalized by this point.
-            role=self.role,
-            instance_type=self.instance_type,
-            instance_count=1,
-            sagemaker_session=self.sagemaker_session,
-            debugger_hook_config=False,
-            disable_profiler=True,
         )
 
         estimator._prepare_for_training(job_name=job_name)
diff --git a/tests/integ/test_huggingface.py b/tests/integ/test_huggingface.py
@@ -16,11 +16,47 @@
 
 import pytest
 
-from sagemaker.huggingface import HuggingFace
+from sagemaker.huggingface import HuggingFace, HuggingFaceProcessor
 from tests import integ
 from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
 from tests.integ.timeout import timeout
 
+# Value passed as the ``role`` argument to the estimators/processors built in these tests.
+ROLE = "SageMakerRole"
+
+
+@pytest.mark.release
+@pytest.mark.skipif(
+    integ.test_region() in integ.TRAINING_NO_P2_REGIONS,
+    reason="no ml.p2 instances in this region",
+)
+def test_framework_processing_job_with_deps(
+    sagemaker_session,
+    gpu_instance_type,
+    huggingface_training_latest_version,
+    huggingface_pytorch_latest_version,
+):
+    """Run a HuggingFaceProcessor job from a source bundle and wait for it to finish."""
+    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
+        bundle_dir = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs")
+        script_name = "main_script.py"
+
+        processor_kwargs = {
+            "transformers_version": huggingface_training_latest_version,
+            "pytorch_version": huggingface_pytorch_latest_version,
+            "py_version": "py36",
+            "role": ROLE,
+            "instance_count": 1,
+            "instance_type": gpu_instance_type,
+            "sagemaker_session": sagemaker_session,
+            "base_job_name": "test-huggingface",
+        }
+        hf_processor = HuggingFaceProcessor(**processor_kwargs)
+
+        # The entry point is given relative to the uploaded source directory.
+        hf_processor.run(
+            code=script_name,
+            source_dir=bundle_dir,
+            inputs=[],
+            wait=True,
+        )
+
+
 
 @pytest.mark.release
 @pytest.mark.skipif(
@@ -39,7 +75,7 @@ def test_huggingface_training(
         hf = HuggingFace(
             py_version="py36",
             entry_point="examples/text-classification/run_glue.py",
-            role="SageMakerRole",
+            role=ROLE,
             transformers_version=huggingface_training_latest_version,
             pytorch_version=huggingface_pytorch_latest_version,
             instance_count=1,
@@ -86,7 +122,7 @@ def test_huggingface_training_tf(
         hf = HuggingFace(
             py_version="py37",
             entry_point=os.path.join(data_path, "run_tf.py"),
-            role="SageMakerRole",
+            role=ROLE,
             transformers_version=huggingface_training_latest_version,
             tensorflow_version=huggingface_tensorflow_latest_version,
             instance_count=1,
diff --git a/tests/unit/sagemaker/huggingface/huggingface_utils.py b/tests/unit/sagemaker/huggingface/huggingface_utils.py
@@ -0,0 +1,36 @@
+# Copyright 2017-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+from sagemaker import image_uris
+
+# Default region and instance type used by ``get_full_gpu_image_uri`` when none are given.
+REGION = "us-east-1"
+GPU_INSTANCE_TYPE = "ml.p2.xlarge"
+
+
+def get_full_gpu_image_uri(
+    version,
+    base_framework_version,
+    region=REGION,
+    instance_type=GPU_INSTANCE_TYPE,
+):
+    """Look up the HuggingFace training image URI via ``image_uris.retrieve``.
+
+    The Python version (``py36``), image scope (``training``) and container version
+    (``cu110-ubuntu18.04``) are fixed; the remaining options come from the arguments.
+
+    Args:
+        version: Passed to ``image_uris.retrieve`` as ``version``.
+        base_framework_version: Passed through as ``base_framework_version``.
+        region: Region to resolve the image for. Defaults to ``REGION``.
+        instance_type: Instance type to resolve the image for. Defaults to
+            ``GPU_INSTANCE_TYPE``.
+
+    Returns:
+        Whatever ``image_uris.retrieve`` returns for these options.
+    """
+    retrieve_options = {
+        "version": version,
+        "py_version": "py36",
+        "instance_type": instance_type,
+        "image_scope": "training",
+        "base_framework_version": base_framework_version,
+        "container_version": "cu110-ubuntu18.04",
+    }
+    return image_uris.retrieve("huggingface", region, **retrieve_options)
diff --git a/tests/unit/sagemaker/huggingface/test_estimator.py b/tests/unit/sagemaker/huggingface/test_estimator.py
diff --git a/tests/unit/sagemaker/huggingface/test_processing.py b/tests/unit/sagemaker/huggingface/test_processing.py

Original file line number	Diff line number	Diff line change
`@@ -14,3 +14,4 @@`
`14`	`14`	`from __future__ import absolute_import`
`15`	`15`
`16`	`16`	`from sagemaker.huggingface.estimator import HuggingFace # noqa: F401`
	`17`	`+from sagemaker.huggingface.processing import HuggingFaceProcessor # noqa:F401`