Skip to content

Commit 260dae2

Browse files
authored
Merge branch 'master' into update-hf-pt-train-dlc
2 parents adc46a7 + 5ca7f28 commit 260dae2

21 files changed

+398
-45
lines changed

CHANGELOG.md

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,38 @@
11
# Changelog
22

3+
## v2.135.1 (2023-03-01)
4+
5+
### Bug Fixes and Other Changes
6+
7+
* Revert back to stable apache-airflow-providers-amazon from 7.2.1 to 4.0.0.
8+
* Typo in graviton algos
9+
* build(deps): bump apache-airflow-providers-amazon from 4.0.0 to 7.2.1 in /requirements/extras
10+
* Support cloning private repo using ssh key
11+
* Create a default SageMaker Session inside FeatureGroup class
12+
13+
### Documentation Changes
14+
15+
* fix typo in README
16+
17+
## v2.135.0 (2023-02-23)
18+
19+
### Features
20+
21+
* Add DLC accounts for MEL Region
22+
* allow use of short lived creds for local container
23+
24+
### Bug Fixes and Other Changes
25+
26+
* update lambda function when function arn is provided
27+
28+
## v2.134.1 (2023-02-22)
29+
30+
### Bug Fixes and Other Changes
31+
32+
* local mode deletion of temp files on job end
33+
* Cron expression resetting on update monitor
34+
* added support to update arguments in create_monitoring_schedule
35+
336
## v2.134.0 (2023-02-22)
437

538
### Features

README.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ To run the unit tests with tox, run:
126126

127127
tox tests/unit
128128

129-
**Integrations tests**
129+
**Integration tests**
130130

131131
To run the integration tests, the following prerequisites must be met
132132

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.134.1.dev0
1+
2.135.2.dev0

doc/overview.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1578,6 +1578,7 @@ A few important notes:
15781578
- If you are using S3 data as input, it is pulled from S3 to your local environment. Ensure you have sufficient space to store the data locally.
15791579
- If you run into problems, it is often due to different Docker containers conflicting. Killing these containers and re-running often solves your problems.
15801580
- Local Mode requires Docker Compose and `nvidia-docker2 <https://github.com/NVIDIA/nvidia-docker>`__ for ``local_gpu``.
1581+
- Set ``USE_SHORT_LIVED_CREDENTIALS=1`` if running on EC2 and you would like to use the session credentials instead of EC2 Metadata Service credentials.
15811582
15821583
.. warning::
15831584

src/sagemaker/feature_store/feature_group.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -473,11 +473,12 @@ class FeatureGroup:
473473
Attributes:
474474
name (str): name of the FeatureGroup instance.
475475
sagemaker_session (Session): session instance to perform boto calls.
476+
If not specified, a new Session will be created.
476477
feature_definitions (Sequence[FeatureDefinition]): list of FeatureDefinitions.
477478
"""
478479

479480
name: str = attr.ib(factory=str)
480-
sagemaker_session: Session = attr.ib(default=Session)
481+
sagemaker_session: Session = attr.ib(factory=Session)
481482
feature_definitions: Sequence[FeatureDefinition] = attr.ib(factory=list)
482483

483484
_INTEGER_TYPES = [

src/sagemaker/fw_utils.py

Lines changed: 66 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -148,11 +148,17 @@
148148
]
149149

150150

151-
TORCH_DISTRIBUTED_SUPPORTED_FRAMEWORK_VERSIONS = ["1.11", "1.11.0"]
152-
151+
TORCH_DISTRIBUTED_GPU_SUPPORTED_FRAMEWORK_VERSIONS = ["1.13.1"]
153152

154153
TRAINIUM_SUPPORTED_DISTRIBUTION_STRATEGIES = ["torch_distributed"]
155-
154+
TRAINIUM_SUPPORTED_TORCH_DISTRIBUTED_FRAMEWORK_VERSIONS = [
155+
"1.11",
156+
"1.11.0",
157+
"1.12",
158+
"1.12.0",
159+
"1.12.1",
160+
"1.13.1",
161+
]
156162

157163
SMDISTRIBUTED_SUPPORTED_STRATEGIES = ["dataparallel", "modelparallel"]
158164

@@ -1055,9 +1061,8 @@ def validate_torch_distributed_distribution(
10551061
Raises:
10561062
ValueError: if
10571063
`py_version` is not python3 or
1058-
`framework_version` is not in TORCH_DISTRIBUTED_SUPPORTED_FRAMEWORK_VERSIONS
1064+
`framework_version` is not compatible with instance types
10591065
"""
1060-
10611066
torch_distributed_enabled = False
10621067
if "torch_distributed" in distribution:
10631068
torch_distributed_enabled = distribution.get("torch_distributed").get("enabled", False)
@@ -1066,30 +1071,36 @@ def validate_torch_distributed_distribution(
10661071
return
10671072

10681073
err_msg = ""
1074+
10691075
if not image_uri:
10701076
# ignore framework_version and py_version if image_uri is set
10711077
# in case image_uri is not set, then both are mandatory
1072-
if framework_version not in TORCH_DISTRIBUTED_SUPPORTED_FRAMEWORK_VERSIONS:
1073-
err_msg += (
1074-
f"Provided framework_version {framework_version} is not supported by"
1075-
" torch_distributed.\n"
1076-
"Please specify one of the supported framework versions:"
1077-
f" {TORCH_DISTRIBUTED_SUPPORTED_FRAMEWORK_VERSIONS} \n"
1078-
)
10791078
if "py3" not in py_version:
10801079
err_msg += (
10811080
f"Provided py_version {py_version} is not supported by torch_distributed.\n"
1082-
"Please specify py_version>=py3"
1081+
"Please specify py_version>=py3\n"
10831082
)
10841083

1085-
# Check instance compatibility
1086-
match = re.match(r"^ml[\._]([a-z\d]+)\.?\w*$", instance_type)
1087-
if match:
1088-
if not match[1].startswith("trn"):
1084+
# Check instance and framework_version compatibility
1085+
if _is_gpu_instance(instance_type):
1086+
if framework_version not in TORCH_DISTRIBUTED_GPU_SUPPORTED_FRAMEWORK_VERSIONS:
1087+
err_msg += (
1088+
f"Provided framework_version {framework_version} is not supported by"
1089+
f" torch_distributed for instance {instance_type}.\n"
1090+
"Please specify one of the supported framework versions:"
1091+
f"{TORCH_DISTRIBUTED_GPU_SUPPORTED_FRAMEWORK_VERSIONS} \n"
1092+
)
1093+
elif _is_trainium_instance(instance_type):
1094+
if framework_version not in TRAINIUM_SUPPORTED_TORCH_DISTRIBUTED_FRAMEWORK_VERSIONS:
1095+
err_msg += (
1096+
f"Provided framework_version {framework_version} is not supported by"
1097+
f" torch_distributed for instance {instance_type}.\n"
1098+
"Please specify one of the supported framework versions:"
1099+
f"{TRAINIUM_SUPPORTED_TORCH_DISTRIBUTED_FRAMEWORK_VERSIONS} \n"
1100+
)
1101+
else:
10891102
err_msg += (
1090-
"torch_distributed is currently supported only for trainium instances.\n"
1091-
" Please refer https://sagemaker.readthedocs.io/en/stable/frameworks/pytorch/using_pytorch.html#distributed-pytorch-training \n" # noqa E501 # pylint: disable=c0301
1092-
"for information regarding distributed training on non-trainium instances"
1103+
"Currently torch_distributed is supported only for GPU and Trainium instances.\n"
10931104
)
10941105

10951106
# Check entry point type
@@ -1103,6 +1114,41 @@ def validate_torch_distributed_distribution(
11031114
raise ValueError(err_msg)
11041115

11051116

1117+
def _is_gpu_instance(instance_type):
1118+
"""Return whether instance_type is treated as a GPU instance.
1119+
1120+
Args:
1121+
instance_type (str): Name of the instance type to check, e.g. "ml.p3.2xlarge" or "local_gpu".
1122+
1123+
Returns:
1124+
bool: True if the "ml." family name starts with "p" or "g", or if instance_type is "local_gpu"; False otherwise.
1125+
"""
1126+
if isinstance(instance_type, str):
1127+
match = re.match(r"^ml[\._]([a-z\d]+)\.?\w*$", instance_type)  # group 1 is the instance family
1128+
if match:
1129+
if match[1].startswith("p") or match[1].startswith("g"):
1130+
return True
1131+
if instance_type == "local_gpu":
1132+
return True
1133+
return False
1134+
1135+
1136+
def _is_trainium_instance(instance_type):
1137+
"""Return whether instance_type is treated as a Trainium instance.
1138+
1139+
Args:
1140+
instance_type (str): Name of the instance type to check, e.g. "ml.trn1.2xlarge".
1141+
1142+
Returns:
1143+
bool: True if instance_type is a string whose "ml." family name starts with "trn"; False otherwise.
1144+
"""
1145+
if isinstance(instance_type, str):
1146+
match = re.match(r"^ml[\._]([a-z\d]+)\.?\w*$", instance_type)  # group 1 is the instance family
1147+
if match and match[1].startswith("trn"):
1148+
return True
1149+
return False
1150+
1151+
11061152
def python_deprecation_warning(framework, latest_supported_version):
11071153
"""Placeholder docstring"""
11081154
return PYTHON_2_DEPRECATION_WARNING.format(

src/sagemaker/git_utils.py

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,7 @@ def _clone_command_for_github_like(git_config, dest_dir):
174174
CalledProcessError: If failed to clone git repo.
175175
"""
176176
is_https = git_config["repo"].startswith("https://")
177-
is_ssh = git_config["repo"].startswith("git@")
177+
is_ssh = git_config["repo"].startswith("git@") or git_config["repo"].startswith("ssh://")
178178
if not is_https and not is_ssh:
179179
raise ValueError("Invalid Git url provided.")
180180
if is_ssh:
@@ -277,12 +277,16 @@ def _run_clone_command(repo_url, dest_dir):
277277
if repo_url.startswith("https://"):
278278
my_env["GIT_TERMINAL_PROMPT"] = "0"
279279
subprocess.check_call(["git", "clone", repo_url, dest_dir], env=my_env)
280-
elif repo_url.startswith("git@"):
281-
with tempfile.NamedTemporaryFile() as sshnoprompt:
282-
with open(sshnoprompt.name, "w") as write_pipe:
283-
write_pipe.write("ssh -oBatchMode=yes $@")
284-
os.chmod(sshnoprompt.name, 0o511)
285-
my_env["GIT_SSH"] = sshnoprompt.name
280+
elif repo_url.startswith("git@") or repo_url.startswith("ssh://"):
281+
try:
282+
with tempfile.NamedTemporaryFile() as sshnoprompt:
283+
with open(sshnoprompt.name, "w") as write_pipe:
284+
write_pipe.write("ssh -oBatchMode=yes $@")
285+
os.chmod(sshnoprompt.name, 0o511)
286+
my_env["GIT_SSH"] = sshnoprompt.name
287+
subprocess.check_call(["git", "clone", repo_url, dest_dir], env=my_env)
288+
except subprocess.CalledProcessError:
289+
del my_env["GIT_SSH"]
286290
subprocess.check_call(["git", "clone", repo_url, dest_dir], env=my_env)
287291

288292

src/sagemaker/image_uri_config/autogluon.json

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,7 @@
298298
"ap-southeast-1": "763104351884",
299299
"ap-southeast-2": "763104351884",
300300
"ap-southeast-3": "907027046896",
301+
"ap-southeast-4": "457447274322",
301302
"ca-central-1": "763104351884",
302303
"cn-north-1": "727897471807",
303304
"cn-northwest-1": "727897471807",
@@ -337,6 +338,7 @@
337338
"ap-southeast-1": "763104351884",
338339
"ap-southeast-2": "763104351884",
339340
"ap-southeast-3": "907027046896",
341+
"ap-southeast-4": "457447274322",
340342
"ca-central-1": "763104351884",
341343
"cn-north-1": "727897471807",
342344
"cn-northwest-1": "727897471807",
@@ -376,6 +378,7 @@
376378
"ap-southeast-1": "763104351884",
377379
"ap-southeast-2": "763104351884",
378380
"ap-southeast-3": "907027046896",
381+
"ap-southeast-4": "457447274322",
379382
"ca-central-1": "763104351884",
380383
"cn-north-1": "727897471807",
381384
"cn-northwest-1": "727897471807",
@@ -415,6 +418,7 @@
415418
"ap-southeast-1": "763104351884",
416419
"ap-southeast-2": "763104351884",
417420
"ap-southeast-3": "907027046896",
421+
"ap-southeast-4": "457447274322",
418422
"ca-central-1": "763104351884",
419423
"cn-north-1": "727897471807",
420424
"cn-northwest-1": "727897471807",
@@ -454,6 +458,7 @@
454458
"ap-southeast-1": "763104351884",
455459
"ap-southeast-2": "763104351884",
456460
"ap-southeast-3": "907027046896",
461+
"ap-southeast-4": "457447274322",
457462
"ca-central-1": "763104351884",
458463
"cn-north-1": "727897471807",
459464
"cn-northwest-1": "727897471807",
@@ -493,6 +498,7 @@
493498
"ap-southeast-1": "763104351884",
494499
"ap-southeast-2": "763104351884",
495500
"ap-southeast-3": "907027046896",
501+
"ap-southeast-4": "457447274322",
496502
"ca-central-1": "763104351884",
497503
"cn-north-1": "727897471807",
498504
"cn-northwest-1": "727897471807",

src/sagemaker/image_uri_config/huggingface-neuron.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"ap-south-2": "772153158452",
1919
"ap-southeast-1": "763104351884",
2020
"ap-southeast-2": "763104351884",
21+
"ap-southeast-4": "457447274322",
2122
"ca-central-1": "763104351884",
2223
"cn-north-1": "727897471807",
2324
"cn-northwest-1": "727897471807",

src/sagemaker/image_uri_config/huggingface.json

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -774,6 +774,7 @@
774774
"ap-southeast-1": "763104351884",
775775
"ap-southeast-2": "763104351884",
776776
"ap-southeast-3": "907027046896",
777+
"ap-southeast-4": "457447274322",
777778
"ca-central-1": "763104351884",
778779
"cn-north-1": "727897471807",
779780
"cn-northwest-1": "727897471807",
@@ -813,6 +814,7 @@
813814
"ap-southeast-1": "763104351884",
814815
"ap-southeast-2": "763104351884",
815816
"ap-southeast-3": "907027046896",
817+
"ap-southeast-4": "457447274322",
816818
"ca-central-1": "763104351884",
817819
"cn-north-1": "727897471807",
818820
"cn-northwest-1": "727897471807",
@@ -852,6 +854,7 @@
852854
"ap-southeast-1": "763104351884",
853855
"ap-southeast-2": "763104351884",
854856
"ap-southeast-3": "907027046896",
857+
"ap-southeast-4": "457447274322",
855858
"ca-central-1": "763104351884",
856859
"cn-north-1": "727897471807",
857860
"cn-northwest-1": "727897471807",
@@ -899,6 +902,7 @@
899902
"ap-southeast-1": "763104351884",
900903
"ap-southeast-2": "763104351884",
901904
"ap-southeast-3": "907027046896",
905+
"ap-southeast-4": "457447274322",
902906
"ca-central-1": "763104351884",
903907
"cn-north-1": "727897471807",
904908
"cn-northwest-1": "727897471807",
@@ -938,6 +942,7 @@
938942
"ap-southeast-1": "763104351884",
939943
"ap-southeast-2": "763104351884",
940944
"ap-southeast-3": "907027046896",
945+
"ap-southeast-4": "457447274322",
941946
"ca-central-1": "763104351884",
942947
"cn-north-1": "727897471807",
943948
"cn-northwest-1": "727897471807",
@@ -977,6 +982,7 @@
977982
"ap-southeast-1": "763104351884",
978983
"ap-southeast-2": "763104351884",
979984
"ap-southeast-3": "907027046896",
985+
"ap-southeast-4": "457447274322",
980986
"ca-central-1": "763104351884",
981987
"cn-north-1": "727897471807",
982988
"cn-northwest-1": "727897471807",
@@ -1016,6 +1022,7 @@
10161022
"ap-southeast-1": "763104351884",
10171023
"ap-southeast-2": "763104351884",
10181024
"ap-southeast-3": "907027046896",
1025+
"ap-southeast-4": "457447274322",
10191026
"ca-central-1": "763104351884",
10201027
"cn-north-1": "727897471807",
10211028
"cn-northwest-1": "727897471807",
@@ -1061,6 +1068,7 @@
10611068
"ap-southeast-1": "763104351884",
10621069
"ap-southeast-2": "763104351884",
10631070
"ap-southeast-3": "907027046896",
1071+
"ap-southeast-4": "457447274322",
10641072
"ca-central-1": "763104351884",
10651073
"cn-north-1": "727897471807",
10661074
"cn-northwest-1": "727897471807",
@@ -1100,6 +1108,7 @@
11001108
"ap-southeast-1": "763104351884",
11011109
"ap-southeast-2": "763104351884",
11021110
"ap-southeast-3": "907027046896",
1111+
"ap-southeast-4": "457447274322",
11031112
"ca-central-1": "763104351884",
11041113
"cn-north-1": "727897471807",
11051114
"cn-northwest-1": "727897471807",
@@ -1145,6 +1154,7 @@
11451154
"ap-southeast-1": "763104351884",
11461155
"ap-southeast-2": "763104351884",
11471156
"ap-southeast-3": "907027046896",
1157+
"ap-southeast-4": "457447274322",
11481158
"ca-central-1": "763104351884",
11491159
"cn-north-1": "727897471807",
11501160
"cn-northwest-1": "727897471807",
@@ -1184,6 +1194,7 @@
11841194
"ap-southeast-1": "763104351884",
11851195
"ap-southeast-2": "763104351884",
11861196
"ap-southeast-3": "907027046896",
1197+
"ap-southeast-4": "457447274322",
11871198
"ca-central-1": "763104351884",
11881199
"cn-north-1": "727897471807",
11891200
"cn-northwest-1": "727897471807",
@@ -1229,6 +1240,7 @@
12291240
"ap-southeast-1": "763104351884",
12301241
"ap-southeast-2": "763104351884",
12311242
"ap-southeast-3": "907027046896",
1243+
"ap-southeast-4": "457447274322",
12321244
"ca-central-1": "763104351884",
12331245
"cn-north-1": "727897471807",
12341246
"cn-northwest-1": "727897471807",
@@ -1268,6 +1280,7 @@
12681280
"ap-southeast-1": "763104351884",
12691281
"ap-southeast-2": "763104351884",
12701282
"ap-southeast-3": "907027046896",
1283+
"ap-southeast-4": "457447274322",
12711284
"ca-central-1": "763104351884",
12721285
"cn-north-1": "727897471807",
12731286
"cn-northwest-1": "727897471807",

0 commit comments

Comments
 (0)