Skip to content

Commit bb9613b

Browse files
committed
feat(processing): add HuggingFaceProcessor
Add a FrameworkProcessor class for HuggingFace, including a refactor of how FrameworkProcessor creates Estimators to enable the addition.
1 parent 4312db9 commit bb9613b

File tree

7 files changed

+412
-74
lines changed

7 files changed

+412
-74
lines changed

src/sagemaker/huggingface/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,3 +14,4 @@
1414
from __future__ import absolute_import
1515

1616
from sagemaker.huggingface.estimator import HuggingFace # noqa: F401
17+
from sagemaker.huggingface.processing import HuggingFaceProcessor # noqa:F401
Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
# Copyright 2019-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
"""This module contains code related to HuggingFace Processors which are used for Processing jobs.
14+
15+
These jobs let customers perform data pre-processing, post-processing, feature engineering,
16+
data validation, and model evaluation and interpretation on SageMaker.
17+
"""
18+
from __future__ import absolute_import
19+
20+
from sagemaker.processing import FrameworkProcessor
21+
from sagemaker.huggingface.estimator import HuggingFace
22+
23+
24+
class HuggingFaceProcessor(FrameworkProcessor):
    """Handles Amazon SageMaker processing tasks for jobs using HuggingFace containers."""

    estimator_cls = HuggingFace

    def __init__(
        self,
        role,
        instance_count,
        instance_type,
        transformers_version=None,
        tensorflow_version=None,
        pytorch_version=None,
        py_version="py36",
        image_uri=None,
        command=None,
        volume_size_in_gb=30,
        volume_kms_key=None,
        output_kms_key=None,
        code_location=None,
        max_runtime_in_seconds=None,
        base_job_name=None,
        sagemaker_session=None,
        env=None,
        tags=None,
        network_config=None,
    ):
        """This processor executes a Python script in a HuggingFace execution environment.

        Unless ``image_uri`` is specified, the environment is an Amazon-built Docker container
        that executes functions defined in the supplied ``code`` Python script.

        The arguments have the same meaning as in ``FrameworkProcessor``, with the following
        exceptions.

        Args:
            transformers_version (str): Transformers version you want to use for
                executing your processing code. Defaults to ``None``. Required unless
                ``image_uri`` is provided. The current supported version is ``4.4.2``.
            tensorflow_version (str): TensorFlow version you want to use for
                executing your processing code. Defaults to ``None``. Required unless
                ``pytorch_version`` is provided. The current supported version is ``2.4.1``.
            pytorch_version (str): PyTorch version you want to use for
                executing your processing code. Defaults to ``None``. Required unless
                ``tensorflow_version`` is provided. The current supported version is ``1.6.0``.
            py_version (str): Python version you want to use for executing your processing
                code. Defaults to ``py36``. Required unless ``image_uri`` is provided. If
                using PyTorch, the current supported version is ``py36``. If using TensorFlow,
                the current supported version is ``py37``.
            command ([str]): Command forwarded to ``FrameworkProcessor``. Defaults to
                ``None``, in which case ``["python"]`` is used.

        .. tip::

            You can find additional parameters for initializing this class at
            :class:`~sagemaker.processing.FrameworkProcessor`.
        """
        # ``None`` stands in for the former ``["python"]`` default so that a single list
        # object is not shared between every processor created without ``command``.
        if command is None:
            command = ["python"]

        # These must be stored before calling the parent constructor: it may call
        # ``self._create_estimator()`` (to resolve a default image URI / job name), and the
        # override below reads both attributes.
        self.pytorch_version = pytorch_version
        self.tensorflow_version = tensorflow_version
        super().__init__(
            self.estimator_cls,
            transformers_version,
            role,
            instance_count,
            instance_type,
            py_version,
            image_uri,
            command,
            volume_size_in_gb,
            volume_kms_key,
            output_kms_key,
            code_location,
            max_runtime_in_seconds,
            base_job_name,
            sagemaker_session,
            env,
            tags,
            network_config,
        )

    def _create_estimator(
        self,
        entry_point="",
        source_dir=None,
        dependencies=None,
        git_config=None,
    ):
        """Override default estimator factory function for HuggingFace's different parameters.

        HuggingFace estimators have 3 framework version parameters instead of one: The version for
        Transformers, PyTorch, and TensorFlow.

        Args:
            entry_point (str): Forwarded to the estimator as ``entry_point``. Defaults to ``""``.
            source_dir (str): Forwarded to the estimator as ``source_dir``.
            dependencies (list): Forwarded to the estimator as ``dependencies``.
            git_config (dict): Forwarded to the estimator as ``git_config``.

        Returns:
            A new ``estimator_cls`` (``HuggingFace``) instance configured from this
            processor's attributes.
        """
        return self.estimator_cls(
            # The generic ``framework_version`` attribute holds the Transformers version
            # (it is what ``__init__`` passes to the parent in that position).
            transformers_version=self.framework_version,
            tensorflow_version=self.tensorflow_version,
            pytorch_version=self.pytorch_version,
            py_version=self.py_version,
            entry_point=entry_point,
            source_dir=source_dir,
            dependencies=dependencies,
            git_config=git_config,
            code_location=self.code_location,
            enable_network_isolation=False,
            image_uri=self.image_uri,
            role=self.role,
            instance_count=self.instance_count,
            instance_type=self.instance_type,
            sagemaker_session=self.sagemaker_session,
            debugger_hook_config=False,
            disable_profiler=True,
        )

src/sagemaker/processing.py

Lines changed: 48 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -1298,10 +1298,15 @@ def __init__(
12981298
self.framework_version = framework_version
12991299
self.py_version = py_version
13001300

1301-
image_uri, base_job_name = self._pre_init_normalization(
1302-
instance_type, image_uri, base_job_name, sagemaker_session
1303-
)
1304-
1301+
# 1. To finalize/normalize the image_uri or base_job_name, we need to create an
1302+
# estimator_cls instance.
1303+
# 2. We want to make it easy for children of FrameworkProcessor to override estimator
1304+
# creation via a function (to create FrameworkProcessors for Estimators that may have
1305+
# different signatures - like HuggingFace or others in future).
1306+
# 3. Super-class __init__ doesn't (currently) do anything with these params besides
1307+
# storing them
1308+
#
1309+
# Therefore we'll init the superclass first and then customize the setup after:
13051310
super().__init__(
13061311
role=role,
13071312
image_uri=image_uri,
@@ -1318,6 +1323,7 @@ def __init__(
13181323
tags=tags,
13191324
network_config=network_config,
13201325
)
1326+
13211327
# This subclass uses the "code" input for actual payload and the ScriptProcessor parent's
13221328
# functionality for uploading just a small entrypoint script to invoke it.
13231329
self._CODE_CONTAINER_INPUT_NAME = "entrypoint"
@@ -1326,38 +1332,45 @@ def __init__(
13261332
code_location[:-1] if (code_location and code_location.endswith("/")) else code_location
13271333
)
13281334

1329-
def _pre_init_normalization(
1330-
self,
1331-
instance_type: str,
1332-
image_uri: Optional[str] = None,
1333-
base_job_name: Optional[str] = None,
1334-
sagemaker_session: Optional[str] = None,
1335-
) -> Tuple[str, str]:
1336-
"""Normalize job name and container image uri."""
1337-
# Normalize base_job_name
1338-
if base_job_name is None:
1339-
base_job_name = self.estimator_cls._framework_name
1335+
if image_uri is None or base_job_name is None:
1336+
# For these default configuration purposes, we don't need the optional args:
1337+
est = self._create_estimator()
1338+
if image_uri is None:
1339+
self.image_uri = est.training_image_uri()
13401340
if base_job_name is None:
1341-
logger.warning("Framework name is None. Please check with the maintainer.")
1342-
base_job_name = str(base_job_name) # Keep mypy happy.
1343-
1344-
# Normalize image uri.
1345-
if image_uri is None:
1346-
# Estimator used only to probe image uri, so can get away with some dummy values.
1347-
est = self.estimator_cls(
1348-
framework_version=self.framework_version,
1349-
instance_type=instance_type,
1350-
py_version=self.py_version,
1351-
image_uri=image_uri,
1352-
entry_point="",
1353-
role="",
1354-
enable_network_isolation=False,
1355-
instance_count=1, # SKLearn estimator explicitly disables instance_count>1
1356-
sagemaker_session=sagemaker_session,
1357-
)
1358-
image_uri = est.training_image_uri()
1341+
self.base_job_name = est.base_job_name or estimator_cls._framework_name
1342+
if base_job_name is None:
1343+
base_job_name = "framework-processor"
13591344

1360-
return image_uri, base_job_name
1345+
def _create_estimator(
    self,
    entry_point="",
    source_dir=None,
    dependencies=None,
    git_config=None,
):
    """Build the framework estimator instance that backs this processor.

    Args:
        entry_point: Forwarded to the estimator as ``entry_point``. Defaults to ``""``.
        source_dir: Forwarded to the estimator as ``source_dir``.
        dependencies: Forwarded to the estimator as ``dependencies``.
        git_config: Forwarded to the estimator as ``git_config``.

    Returns:
        A new ``self.estimator_cls`` instance configured from this processor's attributes.
    """
    estimator_kwargs = {
        "framework_version": self.framework_version,
        "py_version": self.py_version,
        "entry_point": entry_point,
        "source_dir": source_dir,
        "dependencies": dependencies,
        "git_config": git_config,
        "code_location": self.code_location,
        # True would upload the code to an input channel, which is not wanted here.
        "enable_network_isolation": False,
        "image_uri": self.image_uri,
        "role": self.role,
        # The estimator's instance_count does not currently matter to FrameworkProcessor,
        # and the SKLearn framework estimator requires instance_count == 1, so it is
        # hard-wired to 1. If it matters in future, self.instance_count could be used here
        # with SKLearnProcessor overriding this method instead.
        "instance_count": 1,
        "instance_type": self.instance_type,
        "sagemaker_session": self.sagemaker_session,
        "debugger_hook_config": False,
        "disable_profiler": True,
    }
    return self.estimator_cls(**estimator_kwargs)
13611374

13621375
def get_run_args(
13631376
self,
@@ -1623,22 +1636,11 @@ def _upload_payload(
16231636
"""Upload payload sourcedir.tar.gz to S3."""
16241637
# A new estimator instance is required, because each call to ScriptProcessor.run() can
16251638
# use different codes.
1626-
estimator = self.estimator_cls(
1639+
estimator = self._create_estimator(
16271640
entry_point=entry_point,
16281641
source_dir=source_dir,
16291642
dependencies=dependencies,
16301643
git_config=git_config,
1631-
framework_version=self.framework_version,
1632-
py_version=self.py_version,
1633-
code_location=self.code_location, # Upload to <code_loc>/jobname/output/source.tar.gz
1634-
enable_network_isolation=False, # If true, uploads to input channel. Not what we want!
1635-
image_uri=self.image_uri, # The image uri is already normalized by this point.
1636-
role=self.role,
1637-
instance_type=self.instance_type,
1638-
instance_count=1,
1639-
sagemaker_session=self.sagemaker_session,
1640-
debugger_hook_config=False,
1641-
disable_profiler=True,
16421644
)
16431645

16441646
estimator._prepare_for_training(job_name=job_name)

tests/integ/test_huggingface.py

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,11 +16,47 @@
1616

1717
import pytest
1818

19-
from sagemaker.huggingface import HuggingFace
19+
from sagemaker.huggingface import HuggingFace, HuggingFaceProcessor
2020
from tests import integ
2121
from tests.integ import DATA_DIR, TRAINING_DEFAULT_TIMEOUT_MINUTES
2222
from tests.integ.timeout import timeout
2323

24+
# Role name passed as ``role`` to the HuggingFace estimators and processor built in this module.
ROLE = "SageMakerRole"
25+
26+
27+
@pytest.mark.release
@pytest.mark.skipif(
    integ.test_region() in integ.TRAINING_NO_P2_REGIONS,
    reason="no ml.p2 instances in this region",
)
def test_framework_processing_job_with_deps(
    sagemaker_session,
    gpu_instance_type,
    huggingface_training_latest_version,
    huggingface_pytorch_latest_version,
):
    """Run a HuggingFaceProcessor job from the ``dummy_code_bundle_with_reqs`` source dir."""
    with timeout(minutes=TRAINING_DEFAULT_TIMEOUT_MINUTES):
        bundle_dir = os.path.join(DATA_DIR, "dummy_code_bundle_with_reqs")
        script_name = "main_script.py"

        processor_kwargs = {
            "transformers_version": huggingface_training_latest_version,
            "pytorch_version": huggingface_pytorch_latest_version,
            "py_version": "py36",
            "role": ROLE,
            "instance_count": 1,
            "instance_type": gpu_instance_type,
            "sagemaker_session": sagemaker_session,
            "base_job_name": "test-huggingface",
        }
        hf_processor = HuggingFaceProcessor(**processor_kwargs)

        # Block until the processing job finishes so that a failure surfaces in this test.
        hf_processor.run(code=script_name, source_dir=bundle_dir, inputs=[], wait=True)
59+
2460

2561
@pytest.mark.release
2662
@pytest.mark.skipif(
@@ -39,7 +75,7 @@ def test_huggingface_training(
3975
hf = HuggingFace(
4076
py_version="py36",
4177
entry_point="examples/text-classification/run_glue.py",
42-
role="SageMakerRole",
78+
role=ROLE,
4379
transformers_version=huggingface_training_latest_version,
4480
pytorch_version=huggingface_pytorch_latest_version,
4581
instance_count=1,
@@ -86,7 +122,7 @@ def test_huggingface_training_tf(
86122
hf = HuggingFace(
87123
py_version="py37",
88124
entry_point=os.path.join(data_path, "run_tf.py"),
89-
role="SageMakerRole",
125+
role=ROLE,
90126
transformers_version=huggingface_training_latest_version,
91127
tensorflow_version=huggingface_tensorflow_latest_version,
92128
instance_count=1,
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Copyright 2017-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
from __future__ import absolute_import
14+
15+
from sagemaker import image_uris
16+
17+
# Default region used by get_full_gpu_image_uri when the caller does not pass one.
REGION = "us-east-1"
18+
# Default instance type used by get_full_gpu_image_uri when the caller does not pass one.
GPU_INSTANCE_TYPE = "ml.p2.xlarge"
19+
20+
21+
def get_full_gpu_image_uri(
    version,
    base_framework_version,
    region=REGION,
    instance_type=GPU_INSTANCE_TYPE,
    py_version="py36",
    container_version="cu110-ubuntu18.04",
):
    """Look up the HuggingFace training image URI for the given versions.

    Args:
        version (str): Passed to ``image_uris.retrieve`` as ``version``.
        base_framework_version (str): Passed to ``image_uris.retrieve`` as
            ``base_framework_version``.
        region (str): Region to look the image up in. Defaults to ``REGION``.
        instance_type (str): Instance type the image is for. Defaults to
            ``GPU_INSTANCE_TYPE``.
        py_version (str): Python version of the image. Defaults to ``"py36"``, the value
            that was previously hard-coded.
        container_version (str): Container version of the image. Defaults to
            ``"cu110-ubuntu18.04"``, the value that was previously hard-coded.

    Returns:
        The value returned by ``image_uris.retrieve`` for the ``"huggingface"`` framework
        with ``image_scope="training"``.
    """
    return image_uris.retrieve(
        "huggingface",
        region,
        version=version,
        py_version=py_version,
        instance_type=instance_type,
        image_scope="training",
        base_framework_version=base_framework_version,
        container_version=container_version,
    )

0 commit comments

Comments
 (0)