Skip to content

Commit fd25ea0

Browse files
committed
Merge branch 'update-hf-pt-train-dlc' of https://github.com/JingyaHuang/sagemaker-python-sdk into update-hf-pt-train-dlc
2 parents eabd6b7 + 3a25ba8 commit fd25ea0

29 files changed

+857
-214
lines changed

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,16 @@
11
# Changelog
22

3+
## v2.134.0 (2023-02-22)
4+
5+
### Features
6+
7+
* Add python 3.9 and spark 3.2 support for spark processor
8+
* Adding support for Multi Worker Mirrored Strategy in TF estimator
9+
10+
### Bug Fixes and Other Changes
11+
12+
* tag permission issue - remove describe before create
13+
314
## v2.133.0 (2023-02-18)
415

516
### Features

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.133.1.dev0
1+
2.134.1.dev0

doc/api/training/sdp_versions/latest.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,8 @@ depending on the version of the library you use.
2626
<https://docs.aws.amazon.com/sagemaker/latest/dg/data-parallel-use-api.html#data-parallel-use-python-skd-api>`_
2727
for more information.
2828

29-
Version 1.4.0, 1.4.1, 1.5.0, 1.6.0 (Latest)
30-
===========================================
29+
For versions between 1.4.0 and 1.7.0 (Latest)
30+
=============================================
3131

3232
.. toctree::
3333
:maxdepth: 1

doc/api/training/smd_data_parallel_release_notes/smd_data_parallel_change_log.rst

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,9 +7,40 @@ Release Notes
77
New features, bug fixes, and improvements are regularly made to the SageMaker
88
distributed data parallel library.
99

10-
SageMaker Distributed Data Parallel 1.6.0 Release Notes
10+
SageMaker Distributed Data Parallel 1.7.0 Release Notes
1111
=======================================================
1212

13+
*Date: Feb. 10. 2023*
14+
15+
**Currency Updates**
16+
17+
* Added support for PyTorch 1.13.1.
18+
19+
**Migration to AWS Deep Learning Containers**
20+
21+
This version passed benchmark testing and is migrated to the following AWS Deep Learning Containers (DLC):
22+
23+
- PyTorch 1.13.1 DLC
24+
25+
.. code::
26+
27+
763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:1.13.1-gpu-py39-cu117-ubuntu20.04-sagemaker
28+
29+
Binary file of this version of the library for custom container users:
30+
31+
.. code::
32+
33+
https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.13.1/cu117/2023-01-09/smdistributed_dataparallel-1.7.0-cp39-cp39-linux_x86_64.whl
34+
35+
36+
----
37+
38+
Release History
39+
===============
40+
41+
SageMaker Distributed Data Parallel 1.6.0 Release Notes
42+
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
43+
1344
*Date: Dec. 15. 2022*
1445

1546
**New Features**
@@ -44,11 +75,6 @@ Binary file of this version of the library for `custom container
4475
https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.12.1/cu113/2022-12-05/smdistributed_dataparallel-1.6.0-cp38-cp38-linux_x86_64.whl
4576
4677
47-
----
48-
49-
Release History
50-
===============
51-
5278
SageMaker Distributed Data Parallel 1.5.0 Release Notes
5379
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
5480

src/sagemaker/experiments/experiment.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
import time
1717

18+
from botocore.exceptions import ClientError
19+
1820
from sagemaker.apiutils import _base_types
1921
from sagemaker.experiments.trial import _Trial
2022
from sagemaker.experiments.trial_component import _TrialComponent
@@ -154,17 +156,21 @@ def _load_or_create(
154156
Returns:
155157
experiments.experiment._Experiment: A SageMaker `_Experiment` object
156158
"""
157-
sagemaker_client = sagemaker_session.sagemaker_client
158159
try:
159-
experiment = _Experiment.load(experiment_name, sagemaker_session)
160-
except sagemaker_client.exceptions.ResourceNotFound:
161160
experiment = _Experiment.create(
162161
experiment_name=experiment_name,
163162
display_name=display_name,
164163
description=description,
165164
tags=tags,
166165
sagemaker_session=sagemaker_session,
167166
)
167+
except ClientError as ce:
168+
error_code = ce.response["Error"]["Code"]
169+
error_message = ce.response["Error"]["Message"]
170+
if not (error_code == "ValidationException" and "already exists" in error_message):
171+
raise ce
172+
# already exists
173+
experiment = _Experiment.load(experiment_name, sagemaker_session)
168174
return experiment
169175

170176
def list_trials(self, created_before=None, created_after=None, sort_by=None, sort_order=None):

src/sagemaker/experiments/trial.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
"""Contains the Trial class."""
1414
from __future__ import absolute_import
1515

16+
from botocore.exceptions import ClientError
17+
1618
from sagemaker.apiutils import _base_types
1719
from sagemaker.experiments import _api_types
1820
from sagemaker.experiments.trial_component import _TrialComponent
@@ -268,8 +270,20 @@ def _load_or_create(
268270
Returns:
269271
experiments.trial._Trial: A SageMaker `_Trial` object
270272
"""
271-
sagemaker_client = sagemaker_session.sagemaker_client
272273
try:
274+
trial = _Trial.create(
275+
experiment_name=experiment_name,
276+
trial_name=trial_name,
277+
display_name=display_name,
278+
tags=tags,
279+
sagemaker_session=sagemaker_session,
280+
)
281+
except ClientError as ce:
282+
error_code = ce.response["Error"]["Code"]
283+
error_message = ce.response["Error"]["Message"]
284+
if not (error_code == "ValidationException" and "already exists" in error_message):
285+
raise ce
286+
# already exists
273287
trial = _Trial.load(trial_name, sagemaker_session)
274288
if trial.experiment_name != experiment_name: # pylint: disable=no-member
275289
raise ValueError(
@@ -278,12 +292,4 @@ def _load_or_create(
278292
trial.experiment_name # pylint: disable=no-member
279293
)
280294
)
281-
except sagemaker_client.exceptions.ResourceNotFound:
282-
trial = _Trial.create(
283-
experiment_name=experiment_name,
284-
trial_name=trial_name,
285-
display_name=display_name,
286-
tags=tags,
287-
sagemaker_session=sagemaker_session,
288-
)
289295
return trial

src/sagemaker/experiments/trial_component.py

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
import time
1717

18+
from botocore.exceptions import ClientError
19+
1820
from sagemaker.apiutils import _base_types
1921
from sagemaker.experiments import _api_types
2022
from sagemaker.experiments._api_types import TrialComponentSearchResult
@@ -326,16 +328,20 @@ def _load_or_create(
326328
experiments.trial_component._TrialComponent: A SageMaker `_TrialComponent` object.
327329
bool: A boolean variable indicating whether the trial component already exists
328330
"""
329-
sagemaker_client = sagemaker_session.sagemaker_client
330331
is_existed = False
331332
try:
332-
run_tc = _TrialComponent.load(trial_component_name, sagemaker_session)
333-
is_existed = True
334-
except sagemaker_client.exceptions.ResourceNotFound:
335333
run_tc = _TrialComponent.create(
336334
trial_component_name=trial_component_name,
337335
display_name=display_name,
338336
tags=tags,
339337
sagemaker_session=sagemaker_session,
340338
)
339+
except ClientError as ce:
340+
error_code = ce.response["Error"]["Code"]
341+
error_message = ce.response["Error"]["Message"]
342+
if not (error_code == "ValidationException" and "already exists" in error_message):
343+
raise ce
344+
# already exists
345+
run_tc = _TrialComponent.load(trial_component_name, sagemaker_session)
346+
is_existed = True
341347
return run_tc, is_existed

src/sagemaker/image_uri_config/spark.json

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,36 @@
9191
"ap-southeast-3": "732049463269"
9292
},
9393
"repository": "sagemaker-spark-processing"
94+
},
95+
"3.2": {
96+
"py_versions": ["py39"],
97+
"registries": {
98+
"me-south-1": "750251592176",
99+
"ap-south-1": "105495057255",
100+
"eu-north-1": "330188676905",
101+
"eu-west-3": "136845547031",
102+
"us-east-2": "314815235551",
103+
"eu-west-1": "571004829621",
104+
"eu-central-1": "906073651304",
105+
"sa-east-1": "737130764395",
106+
"ap-east-1": "732049463269",
107+
"us-east-1": "173754725891",
108+
"ap-northeast-2": "860869212795",
109+
"eu-west-2": "836651553127",
110+
"ap-northeast-1": "411782140378",
111+
"us-west-2": "153931337802",
112+
"us-west-1": "667973535471",
113+
"ap-southeast-1": "759080221371",
114+
"ap-southeast-2": "440695851116",
115+
"ca-central-1": "446299261295",
116+
"cn-north-1": "671472414489",
117+
"cn-northwest-1": "844356804704",
118+
"eu-south-1": "753923664805",
119+
"af-south-1": "309385258863",
120+
"us-gov-west-1": "271483468897",
121+
"ap-southeast-3": "732049463269"
122+
},
123+
"repository": "sagemaker-spark-processing"
94124
}
95125
}
96126
}

src/sagemaker/local/utils.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,20 @@
1414
from __future__ import absolute_import
1515

1616
import os
17+
import logging
1718
import shutil
1819
import subprocess
1920
import json
2021
import re
22+
import errno
2123

2224
from distutils.dir_util import copy_tree
2325
from six.moves.urllib.parse import urlparse
2426

2527
from sagemaker import s3
2628

29+
logger = logging.getLogger(__name__)
30+
2731

2832
def copy_directory_structure(destination_directory, relative_path):
2933
"""Creates intermediate directory structure for relative_path.
@@ -77,7 +81,19 @@ def move_to_destination(source, destination, job_name, sagemaker_session):
7781
else:
7882
raise ValueError("Invalid destination URI, must be s3:// or file://, got: %s" % destination)
7983

80-
shutil.rmtree(source)
84+
try:
85+
shutil.rmtree(source)
86+
except OSError as exc:
87+
# on Linux, when docker writes to any mounted volume, it uses the container's user. In most
88+
# cases this is root. When the container exits and we try to delete them we can't because
89+
# root owns those files. We expect this to happen, so we handle EACCES. Any other error
90+
# we will raise the exception up.
91+
if exc.errno == errno.EACCES:
92+
logger.warning("Failed to delete: %s Please remove it manually.", source)
93+
else:
94+
logger.error("Failed to delete: %s", source)
95+
raise
96+
8197
return final_uri
8298

8399

src/sagemaker/model_monitor/dataset_format.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,6 @@ def csv(header=True):
7171
Args:
7272
header (bool): Whether the csv dataset to baseline and monitor has a header.
7373
Default: True.
74-
output_columns_position (str): The position of the output columns.
75-
Must be one of ("START", "END"). Default: "START".
7674
7775
Returns:
7876
dict: JSON string containing DatasetFormat to be used by DefaultModelMonitor.

src/sagemaker/model_monitor/model_monitoring.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,7 @@ def create_monitoring_schedule(
233233
monitor_schedule_name=None,
234234
schedule_cron_expression=None,
235235
batch_transform_input=None,
236+
arguments=None,
236237
):
237238
"""Creates a monitoring schedule to monitor an Amazon SageMaker Endpoint.
238239
@@ -262,6 +263,7 @@ def create_monitoring_schedule(
262263
batch_transform_input (sagemaker.model_monitor.BatchTransformInput): Inputs to
263264
run the monitoring schedule on the batch transform
264265
(default: None)
266+
arguments ([str]): A list of string arguments to be passed to a processing job.
265267
266268
"""
267269
if self.monitoring_schedule_name is not None:
@@ -326,6 +328,9 @@ def create_monitoring_schedule(
326328
if self.network_config is not None:
327329
network_config_dict = self.network_config._to_request_dict()
328330

331+
if arguments is not None:
332+
self.arguments = arguments
333+
329334
self.sagemaker_session.create_monitoring_schedule(
330335
monitoring_schedule_name=self.monitoring_schedule_name,
331336
schedule_expression=schedule_cron_expression,
@@ -2054,6 +2059,21 @@ def _update_data_quality_monitoring_schedule(
20542059
self._update_monitoring_schedule(self.job_definition_name, schedule_cron_expression)
20552060
return
20562061

2062+
existing_desc = self.sagemaker_session.describe_monitoring_schedule(
2063+
monitoring_schedule_name=self.monitoring_schedule_name
2064+
)
2065+
2066+
if (
2067+
existing_desc.get("MonitoringScheduleConfig") is not None
2068+
and existing_desc["MonitoringScheduleConfig"].get("ScheduleConfig") is not None
2069+
and existing_desc["MonitoringScheduleConfig"]["ScheduleConfig"]["ScheduleExpression"]
2070+
is not None
2071+
and schedule_cron_expression is None
2072+
):
2073+
schedule_cron_expression = existing_desc["MonitoringScheduleConfig"]["ScheduleConfig"][
2074+
"ScheduleExpression"
2075+
]
2076+
20572077
# Need to update schedule with a new job definition
20582078
job_desc = self.sagemaker_session.sagemaker_client.describe_data_quality_job_definition(
20592079
JobDefinitionName=self.job_definition_name

0 commit comments

Comments
 (0)