Skip to content

Commit d64ad40

Browse files
navinsoniLokiiiiiimchoi8739SergTogulSergey Togulev
authored and committed
feature: SM Training Compiler with a UI to enable/disable compilation for HuggingFace DLCs to speed up training
* feature: SM Training Compiler with a UI to enable/disable compilation for HuggingFace DLCs to speed up training Co-authored-by: Miyoung <[email protected]> * change: New repo for training compiler Co-authored-by: Sergey Togulev <[email protected]> Co-authored-by: Loki <[email protected]> Co-authored-by: Miyoung <[email protected]> Co-authored-by: Sergey Togulev <[email protected]> Co-authored-by: Sergey Togulev <[email protected]>
1 parent 2c915ce commit d64ad40

File tree

14 files changed

+947
-27
lines changed

14 files changed

+947
-27
lines changed

doc/frameworks/huggingface/sagemaker.huggingface.rst

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,15 @@ Hugging Face
44
Hugging Face Estimator
55
----------------------
66

7-
.. autoclass:: sagemaker.huggingface.estimator.HuggingFace
7+
.. autoclass:: sagemaker.huggingface.HuggingFace
8+
:members:
9+
:undoc-members:
10+
:show-inheritance:
11+
12+
Hugging Face Training Compiler Configuration
13+
--------------------------------------------
14+
15+
.. autoclass:: sagemaker.huggingface.TrainingCompilerConfig
816
:members:
917
:undoc-members:
1018
:show-inheritance:
@@ -17,8 +25,8 @@ Hugging Face Model
1725
:undoc-members:
1826
:show-inheritance:
1927

20-
HuggingFace Predictor
21-
---------------------
28+
Hugging Face Predictor
29+
----------------------
2230

2331
.. autoclass:: sagemaker.huggingface.model.HuggingFacePredictor
2432
:members:

src/sagemaker/estimator.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2432,10 +2432,12 @@ def training_image_uri(self):
24322432
distribution = self.distribution # pylint: disable=no-member
24332433
else:
24342434
distribution = None
2435+
compiler_config = getattr(self, "compiler_config", None)
24352436

24362437
if hasattr(self, "tensorflow_version") or hasattr(self, "pytorch_version"):
24372438
processor = image_uris._processor(self.instance_type, ["cpu", "gpu"])
2438-
container_version = "cu110-ubuntu18.04" if processor == "gpu" else None
2439+
is_native_huggingface_gpu = processor == "gpu" and not compiler_config
2440+
container_version = "cu110-ubuntu18.04" if is_native_huggingface_gpu else None
24392441
if self.tensorflow_version is not None: # pylint: disable=no-member
24402442
base_framework_version = (
24412443
f"tensorflow{self.tensorflow_version}" # pylint: disable=no-member
@@ -2458,6 +2460,7 @@ def training_image_uri(self):
24582460
distribution=distribution,
24592461
base_framework_version=base_framework_version,
24602462
container_version=container_version,
2463+
training_compiler_config=compiler_config,
24612464
)
24622465

24632466
@classmethod

src/sagemaker/fw_utils.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -296,7 +296,8 @@ def framework_name_from_image(image_uri):
296296
name_pattern = re.compile(
297297
r"""^(?:sagemaker(?:-rl)?-)?
298298
(tensorflow|mxnet|chainer|pytorch|scikit-learn|xgboost
299-
|huggingface-tensorflow|huggingface-pytorch)(?:-)?
299+
|huggingface-tensorflow|huggingface-pytorch
300+
|huggingface-tensorflow-trcomp|huggingface-pytorch-trcomp)(?:-)?
300301
(scriptmode|training)?
301302
:(.*)-(.*?)-(py2|py3\d*)(?:.*)$""",
302303
re.VERBOSE,

src/sagemaker/huggingface/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,3 +16,5 @@
1616
from sagemaker.huggingface.estimator import HuggingFace # noqa: F401
1717
from sagemaker.huggingface.model import HuggingFaceModel, HuggingFacePredictor # noqa: F401
1818
from sagemaker.huggingface.processing import HuggingFaceProcessor # noqa:F401
19+
20+
from sagemaker.training_compiler.config import TrainingCompilerConfig # noqa: F401

src/sagemaker/huggingface/estimator.py

Lines changed: 45 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
from sagemaker.huggingface.model import HuggingFaceModel
2727
from sagemaker.vpc_utils import VPC_CONFIG_DEFAULT
2828

29+
from sagemaker.training_compiler.config import TrainingCompilerConfig
30+
2931
logger = logging.getLogger("sagemaker")
3032

3133

@@ -45,7 +47,8 @@ def __init__(
4547
hyperparameters=None,
4648
image_uri=None,
4749
distribution=None,
48-
**kwargs
50+
compiler_config=None,
51+
**kwargs,
4952
):
5053
"""This ``Estimator`` executes a HuggingFace script in a managed execution environment.
5154
@@ -135,6 +138,8 @@ def __init__(
135138
}
136139
}
137140
}
141+
compiler_config (:class:`~sagemaker.huggingface.TrainingCompilerConfig`):
142+
Configures SageMaker Training Compiler to accelerate training.
138143
139144
**kwargs: Additional kwargs passed to the :class:`~sagemaker.estimator.Framework`
140145
constructor.
@@ -152,16 +157,16 @@ def __init__(
152157

153158
self._validate_args(image_uri=image_uri)
154159

155-
if distribution is not None:
156-
instance_type = renamed_kwargs(
157-
"train_instance_type", "instance_type", kwargs.get("instance_type"), kwargs
158-
)
160+
instance_type = renamed_kwargs(
161+
"train_instance_type", "instance_type", kwargs.get("instance_type"), kwargs
162+
)
159163

160-
base_framework_name = "tensorflow" if tensorflow_version is not None else "pytorch"
161-
base_framework_version = (
162-
tensorflow_version if tensorflow_version is not None else pytorch_version
163-
)
164+
base_framework_name = "tensorflow" if tensorflow_version is not None else "pytorch"
165+
base_framework_version = (
166+
tensorflow_version if tensorflow_version is not None else pytorch_version
167+
)
164168

169+
if distribution is not None:
165170
validate_smdistributed(
166171
instance_type=instance_type,
167172
framework_name=base_framework_name,
@@ -183,7 +188,24 @@ def __init__(
183188
super(HuggingFace, self).__init__(
184189
entry_point, source_dir, hyperparameters, image_uri=image_uri, **kwargs
185190
)
191+
192+
if compiler_config is not None:
193+
if not isinstance(compiler_config, TrainingCompilerConfig):
194+
error_string = (
195+
f"Expected instance of type {TrainingCompilerConfig}"
196+
f"for argument compiler_config. "
197+
f"Instead got {type(compiler_config)}"
198+
)
199+
raise ValueError(error_string)
200+
if compiler_config:
201+
compiler_config.validate(
202+
image_uri=image_uri,
203+
instance_type=instance_type,
204+
distribution=distribution,
205+
)
206+
186207
self.distribution = distribution or {}
208+
self.compiler_config = compiler_config
187209

188210
def _validate_args(self, image_uri):
189211
"""Placeholder docstring"""
@@ -220,10 +242,19 @@ def _validate_args(self, image_uri):
220242
def hyperparameters(self):
221243
"""Return hyperparameters used by your custom PyTorch code during model training."""
222244
hyperparameters = super(HuggingFace, self).hyperparameters()
223-
additional_hyperparameters = self._distribution_configuration(
245+
distributed_training_hyperparameters = self._distribution_configuration(
224246
distribution=self.distribution
225247
)
226-
hyperparameters.update(Framework._json_encode_hyperparameters(additional_hyperparameters))
248+
hyperparameters.update(
249+
Framework._json_encode_hyperparameters(distributed_training_hyperparameters)
250+
)
251+
252+
if self.compiler_config:
253+
training_compiler_hyperparameters = self.compiler_config._to_hyperparameter_dict()
254+
hyperparameters.update(
255+
Framework._json_encode_hyperparameters(training_compiler_hyperparameters)
256+
)
257+
227258
return hyperparameters
228259

229260
def create_model(
@@ -234,7 +265,7 @@ def create_model(
234265
entry_point=None,
235266
source_dir=None,
236267
dependencies=None,
237-
**kwargs
268+
**kwargs,
238269
):
239270
"""Create a SageMaker ``HuggingFaceModel`` object that can be deployed to an ``Endpoint``.
240271
@@ -286,7 +317,7 @@ def create_model(
286317
sagemaker_session=self.sagemaker_session,
287318
vpc_config=self.get_vpc_config(vpc_config_override),
288319
dependencies=(dependencies or self.dependencies),
289-
**kwargs
320+
**kwargs,
290321
)
291322

292323
@classmethod
@@ -311,7 +342,7 @@ def _prepare_init_params_from_job_description(cls, job_details, model_channel_na
311342
if tag is None:
312343
framework_version = None
313344
else:
314-
framework, pt_or_tf = framework.split("-")
345+
framework, pt_or_tf = framework.split("-")[:2]
315346
tag_pattern = re.compile(r"^(.*)-transformers(.*)-(cpu|gpu)-(py2|py3\d*)$")
316347
tag_match = tag_pattern.match(tag)
317348
pt_or_tf_version = tag_match.group(1)
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
{
2+
"training": {
3+
"processors": ["gpu"],
4+
"version_aliases": {
5+
"4.10": "4.10.2"
6+
},
7+
"versions": {
8+
"4.10.2": {
9+
"version_aliases": {
10+
"pytorch1.9": "pytorch1.9.0",
11+
"tensorflow2.5": "tensorflow2.5.1"
12+
},
13+
"pytorch1.9.0": {
14+
"py_versions": ["py38"],
15+
"registries": {
16+
"eu-west-1": "763104351884",
17+
"us-east-1": "763104351884",
18+
"us-east-2": "763104351884",
19+
"us-west-2": "763104351884"
20+
},
21+
"repository": "huggingface-pytorch-trcomp-training",
22+
"container_version": {"gpu":"cu111-ubuntu20.04"}
23+
},
24+
"tensorflow2.5.1": {
25+
"py_versions": ["py37"],
26+
"registries": {
27+
"eu-west-1": "763104351884",
28+
"us-east-1": "763104351884",
29+
"us-east-2": "763104351884",
30+
"us-west-2": "763104351884"
31+
},
32+
"repository": "huggingface-tensorflow-trcomp-training",
33+
"container_version": {"gpu":"cu112-ubuntu18.04"}
34+
}
35+
}
36+
}
37+
}
38+
}

src/sagemaker/image_uris.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ def retrieve(
3838
container_version=None,
3939
distribution=None,
4040
base_framework_version=None,
41+
training_compiler_config=None,
4142
):
4243
"""Retrieves the ECR URI for the Docker image matching the given arguments.
4344
@@ -65,6 +66,8 @@ def retrieve(
6566
https://github.com/aws/deep-learning-containers/blob/master/available_images.md
6667
(default: None).
6768
distribution (dict): A dictionary with information on how to run distributed training (default: None).
69+
training_compiler_config (:class:`~sagemaker.training_compiler.TrainingCompilerConfig`):
70+
A configuration class for the SageMaker Training Compiler
6871
(default: None).
6972
7073
Returns:
@@ -73,8 +76,16 @@ def retrieve(
7376
Raises:
7477
ValueError: If the combination of arguments specified is not supported.
7578
"""
76-
77-
config = _config_for_framework_and_scope(framework, image_scope, accelerator_type)
79+
if training_compiler_config is None:
80+
config = _config_for_framework_and_scope(framework, image_scope, accelerator_type)
81+
elif framework == HUGGING_FACE_FRAMEWORK:
82+
config = _config_for_framework_and_scope(
83+
framework + "-training-compiler", image_scope, accelerator_type
84+
)
85+
else:
86+
raise ValueError(
87+
"Unsupported Configuration: Training Compiler is only supported with HuggingFace"
88+
)
7889
original_version = version
7990
version = _validate_version_and_set_if_needed(version, config, framework)
8091
version_config = config["versions"][_version_for_config(version, config)]
@@ -108,7 +119,6 @@ def retrieve(
108119
re.compile("^(pytorch|tensorflow)(.*)$").match(base_framework_version).group(2)
109120
)
110121
tag_prefix = f"{pt_or_tf_version}-transformers{original_version}"
111-
112122
else:
113123
tag_prefix = version_config.get("tag_prefix", version)
114124

@@ -134,9 +144,7 @@ def retrieve(
134144
"pytorch-1.6-gpu-py3": "cu110-ubuntu18.04-v3",
135145
"pytorch-1.6.0-gpu-py3": "cu110-ubuntu18.04",
136146
}
137-
138147
key = "-".join([framework, tag])
139-
140148
if key in container_versions:
141149
tag = "-".join([tag, container_versions[key]])
142150

src/sagemaker/training_compiler/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)