Skip to content

Commit b9c6c82

Browse files
authored
Merge branch 'master' into master
2 parents 5a9a31a + ebd48c9 commit b9c6c82

File tree

12 files changed

+550
-39
lines changed

12 files changed

+550
-39
lines changed

CHANGELOG.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
# Changelog
22

3+
## v2.147.0 (2023-04-18)
4+
5+
### Features
6+
7+
* support different types of deletion mode
8+
39
## v2.146.1 (2023-04-17)
410

511
### Bug Fixes and Other Changes

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.146.2.dev0
1+
2.147.1.dev0

src/sagemaker/fw_utils.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,7 @@
135135
"1.12.0",
136136
"1.12.1",
137137
"1.13.1",
138+
"2.0.0",
138139
],
139140
}
140141

@@ -148,10 +149,11 @@
148149
"1.12.0",
149150
"1.12.1",
150151
"1.13.1",
152+
"2.0.0",
151153
]
152154

153155

154-
TORCH_DISTRIBUTED_GPU_SUPPORTED_FRAMEWORK_VERSIONS = ["1.13.1"]
156+
TORCH_DISTRIBUTED_GPU_SUPPORTED_FRAMEWORK_VERSIONS = ["1.13.1", "2.0.0"]
155157

156158
TRAINIUM_SUPPORTED_DISTRIBUTION_STRATEGIES = ["torch_distributed"]
157159
TRAINIUM_SUPPORTED_TORCH_DISTRIBUTED_FRAMEWORK_VERSIONS = [
@@ -161,6 +163,7 @@
161163
"1.12.0",
162164
"1.12.1",
163165
"1.13.1",
166+
"2.0.0",
164167
]
165168

166169
SMDISTRIBUTED_SUPPORTED_STRATEGIES = ["dataparallel", "modelparallel"]

src/sagemaker/huggingface/estimator.py

Lines changed: 85 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,10 @@
1717
import re
1818
from typing import Optional, Union, Dict
1919

20-
from sagemaker.deprecations import renamed_kwargs
2120
from sagemaker.estimator import Framework, EstimatorBase
2221
from sagemaker.fw_utils import (
2322
framework_name_from_image,
24-
warn_if_parameter_server_with_multi_gpu,
25-
validate_smdistributed,
23+
validate_distribution,
2624
)
2725
from sagemaker.huggingface.model import HuggingFaceModel
2826
from sagemaker.vpc_utils import VPC_CONFIG_DEFAULT
@@ -37,6 +35,9 @@ class HuggingFace(Framework):
3735
"""Handle training of custom HuggingFace code."""
3836

3937
_framework_name = "huggingface"
38+
LAUNCH_PYTORCH_DDP_ENV_NAME = "sagemaker_pytorch_ddp_enabled"
39+
LAUNCH_TORCH_DISTRIBUTED_ENV_NAME = "sagemaker_torch_distributed_enabled"
40+
INSTANCE_TYPE_ENV_NAME = "sagemaker_instance_type"
4041

4142
def __init__(
4243
self,
@@ -142,6 +143,36 @@ def __init__(
142143
}
143144
}
144145
146+
**To enable PyTorch DDP:**
147+
148+
.. code:: python
149+
150+
{
151+
"pytorchddp": {
152+
"enabled": True
153+
}
154+
}
155+
156+
To learn more, see `Distributed PyTorch Training
157+
<https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#distributed-pytorch-training>`_.
158+
159+
**To enable Torch Distributed:**
160+
161+
This is available for general distributed training on
162+
GPU instances from PyTorch v1.13.1 and later.
163+
164+
.. code:: python
165+
166+
{
167+
"torch_distributed": {
168+
"enabled": True
169+
}
170+
}
171+
172+
This option also supports distributed training on Trn1.
173+
To learn more, see `Distributed PyTorch Training on Trainium
174+
<https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#distributed-pytorch-training-on-trainium>`_.
175+
145176
To enable distributed training with
146177
`SageMaker Training Compiler <https://docs.aws.amazon.com/sagemaker/latest/dg/training-compiler.html>`_
147178
for Hugging Face Transformers with PyTorch:
@@ -182,29 +213,6 @@ def __init__(
182213

183214
self._validate_args(image_uri=image_uri)
184215

185-
instance_type = renamed_kwargs(
186-
"train_instance_type", "instance_type", kwargs.get("instance_type"), kwargs
187-
)
188-
189-
base_framework_name = "tensorflow" if tensorflow_version is not None else "pytorch"
190-
base_framework_version = (
191-
tensorflow_version if tensorflow_version is not None else pytorch_version
192-
)
193-
194-
if distribution is not None:
195-
validate_smdistributed(
196-
instance_type=instance_type,
197-
framework_name=base_framework_name,
198-
framework_version=base_framework_version,
199-
py_version=self.py_version,
200-
distribution=distribution,
201-
image_uri=image_uri,
202-
)
203-
204-
warn_if_parameter_server_with_multi_gpu(
205-
training_instance_type=instance_type, distribution=distribution
206-
)
207-
208216
if "enable_sagemaker_metrics" not in kwargs:
209217
kwargs["enable_sagemaker_metrics"] = True
210218

@@ -214,6 +222,25 @@ def __init__(
214222
entry_point, source_dir, hyperparameters, image_uri=image_uri, **kwargs
215223
)
216224

225+
if "entry_point" not in kwargs:
226+
kwargs["entry_point"] = entry_point
227+
228+
self.base_framework_name = "tensorflow" if tensorflow_version is not None else "pytorch"
229+
self.base_framework_version = (
230+
tensorflow_version if tensorflow_version is not None else pytorch_version
231+
)
232+
233+
if distribution is not None:
234+
distribution = validate_distribution(
235+
distribution,
236+
self.instance_groups,
237+
self.base_framework_name,
238+
self.base_framework_version,
239+
py_version,
240+
image_uri,
241+
kwargs,
242+
)
243+
217244
self.distribution = distribution or {}
218245

219246
if compiler_config is not None:
@@ -267,14 +294,44 @@ def _validate_args(self, image_uri):
267294
"transformers_version, tensorflow_version and pytorch_version."
268295
)
269296

297+
def _huggingface_distribution_configuration(self, distribution):
298+
"""Returns a dict of distribution config for Hugging Face training
299+
300+
Args:
301+
distribution (dict): A dictionary with information on how to run distributed training.
302+
Returns:
303+
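dict containing the PyTorch DDP or Torch Distributed launch config when one of them is enabled; otherwise the default distribution config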
304+
"""
305+
distribution_config = {}
306+
pytorch_ddp_enabled = False
307+
torch_distributed_enabled = False
308+
309+
if "pytorchddp" in distribution:
310+
pytorch_ddp_enabled = distribution.get("pytorchddp").get("enabled", False)
311+
elif "torch_distributed" in distribution:
312+
torch_distributed_enabled = distribution.get("torch_distributed").get("enabled", False)
313+
314+
if pytorch_ddp_enabled:
315+
distribution_config[self.LAUNCH_PYTORCH_DDP_ENV_NAME] = pytorch_ddp_enabled
316+
if self.instance_type is not None:
317+
distribution_config[self.INSTANCE_TYPE_ENV_NAME] = self.instance_type
318+
elif torch_distributed_enabled:
319+
distribution_config[self.LAUNCH_TORCH_DISTRIBUTED_ENV_NAME] = torch_distributed_enabled
320+
if self.instance_type is not None:
321+
distribution_config[self.INSTANCE_TYPE_ENV_NAME] = self.instance_type
322+
else:
323+
distribution_config = self._distribution_configuration(distribution=distribution)
324+
325+
return distribution_config
326+
270327
def hyperparameters(self):
271328
"""Return hyperparameters used by your custom PyTorch code during model training."""
272329
hyperparameters = super(HuggingFace, self).hyperparameters()
273-
distributed_training_hyperparameters = self._distribution_configuration(
330+
additional_hyperparameters = self._huggingface_distribution_configuration(
274331
distribution=self.distribution
275332
)
276333
hyperparameters.update(
277-
EstimatorBase._json_encode_hyperparameters(distributed_training_hyperparameters)
334+
EstimatorBase._json_encode_hyperparameters(additional_hyperparameters)
278335
)
279336

280337
if self.compiler_config:

src/sagemaker/image_uri_config/pytorch.json

Lines changed: 115 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,8 @@
7777
"1.10": "1.10.2",
7878
"1.11": "1.11.0",
7979
"1.12": "1.12.1",
80-
"1.13": "1.13.1"
80+
"1.13": "1.13.1",
81+
"2.0": "2.0.0"
8182
},
8283
"versions": {
8384
"0.4.0": {
@@ -838,6 +839,43 @@
838839
"us-west-2": "763104351884"
839840
},
840841
"repository": "pytorch-inference"
842+
},
843+
"2.0.0": {
844+
"py_versions": [
845+
"py310"
846+
],
847+
"registries": {
848+
"af-south-1": "626614931356",
849+
"ap-east-1": "871362719292",
850+
"ap-northeast-1": "763104351884",
851+
"ap-northeast-2": "763104351884",
852+
"ap-northeast-3": "364406365360",
853+
"ap-south-1": "763104351884",
854+
"ap-southeast-1": "763104351884",
855+
"ap-southeast-2": "763104351884",
856+
"ap-southeast-3": "907027046896",
857+
"ap-southeast-4": "457447274322",
858+
"ca-central-1": "763104351884",
859+
"cn-north-1": "727897471807",
860+
"cn-northwest-1": "727897471807",
861+
"eu-central-1": "763104351884",
862+
"eu-north-1": "763104351884",
863+
"eu-west-1": "763104351884",
864+
"eu-west-2": "763104351884",
865+
"eu-west-3": "763104351884",
866+
"eu-south-1": "692866216735",
867+
"me-south-1": "217643126080",
868+
"sa-east-1": "763104351884",
869+
"us-east-1": "763104351884",
870+
"us-east-2": "763104351884",
871+
"us-gov-east-1": "446045086412",
872+
"us-gov-west-1": "442386744353",
873+
"us-iso-east-1": "886529160074",
874+
"us-isob-east-1": "094389454867",
875+
"us-west-1": "763104351884",
876+
"us-west-2": "763104351884"
877+
},
878+
"repository": "pytorch-inference"
841879
}
842880
}
843881
},
@@ -846,7 +884,8 @@
846884
"cpu"
847885
],
848886
"version_aliases": {
849-
"1.12": "1.12.1"
887+
"1.12": "1.12.1",
888+
"2.0": "2.0.0"
850889
},
851890
"versions": {
852891
"1.12.1": {
@@ -889,6 +928,41 @@
889928
},
890929
"repository": "pytorch-inference-graviton",
891930
"container_version": {"cpu": "ubuntu20.04"}
931+
},
932+
"2.0.0": {
933+
"py_versions": [
934+
"py310"
935+
],
936+
"registries": {
937+
"af-south-1": "626614931356",
938+
"ap-east-1": "871362719292",
939+
"ap-northeast-1": "763104351884",
940+
"ap-northeast-2": "763104351884",
941+
"ap-northeast-3": "364406365360",
942+
"ap-south-1": "763104351884",
943+
"ap-south-2": "772153158452",
944+
"ap-southeast-1": "763104351884",
945+
"ap-southeast-2": "763104351884",
946+
"ap-southeast-3": "907027046896",
947+
"ap-southeast-4": "457447274322",
948+
"ca-central-1": "763104351884",
949+
"eu-central-1": "763104351884",
950+
"eu-central-2": "380420809688",
951+
"eu-north-1": "763104351884",
952+
"eu-west-1": "763104351884",
953+
"eu-west-2": "763104351884",
954+
"eu-west-3": "763104351884",
955+
"eu-south-1": "692866216735",
956+
"eu-south-2": "503227376785",
957+
"me-south-1": "217643126080",
958+
"sa-east-1": "763104351884",
959+
"us-east-1": "763104351884",
960+
"us-east-2": "763104351884",
961+
"us-west-1": "763104351884",
962+
"us-west-2": "763104351884"
963+
},
964+
"repository": "pytorch-inference-graviton",
965+
"container_version": {"cpu": "ubuntu20.04"}
892966
}
893967
}
894968
},
@@ -912,7 +986,8 @@
912986
"1.10": "1.10.2",
913987
"1.11": "1.11.0",
914988
"1.12": "1.12.1",
915-
"1.13": "1.13.1"
989+
"1.13": "1.13.1",
990+
"2.0": "2.0.0"
916991
},
917992
"versions": {
918993
"0.4.0": {
@@ -1674,6 +1749,43 @@
16741749
"us-west-2": "763104351884"
16751750
},
16761751
"repository": "pytorch-training"
1752+
},
1753+
"2.0.0": {
1754+
"py_versions": [
1755+
"py310"
1756+
],
1757+
"registries": {
1758+
"af-south-1": "626614931356",
1759+
"ap-east-1": "871362719292",
1760+
"ap-northeast-1": "763104351884",
1761+
"ap-northeast-2": "763104351884",
1762+
"ap-northeast-3": "364406365360",
1763+
"ap-south-1": "763104351884",
1764+
"ap-southeast-1": "763104351884",
1765+
"ap-southeast-2": "763104351884",
1766+
"ap-southeast-3": "907027046896",
1767+
"ap-southeast-4": "457447274322",
1768+
"ca-central-1": "763104351884",
1769+
"cn-north-1": "727897471807",
1770+
"cn-northwest-1": "727897471807",
1771+
"eu-central-1": "763104351884",
1772+
"eu-north-1": "763104351884",
1773+
"eu-west-1": "763104351884",
1774+
"eu-west-2": "763104351884",
1775+
"eu-west-3": "763104351884",
1776+
"eu-south-1": "692866216735",
1777+
"me-south-1": "217643126080",
1778+
"sa-east-1": "763104351884",
1779+
"us-east-1": "763104351884",
1780+
"us-east-2": "763104351884",
1781+
"us-gov-east-1": "446045086412",
1782+
"us-gov-west-1": "442386744353",
1783+
"us-iso-east-1": "886529160074",
1784+
"us-isob-east-1": "094389454867",
1785+
"us-west-1": "763104351884",
1786+
"us-west-2": "763104351884"
1787+
},
1788+
"repository": "pytorch-training"
16771789
}
16781790
}
16791791
}

0 commit comments

Comments
 (0)