Skip to content

Commit fe382bf

Browse files
authored
Merge branch 'master' into collect-tests
2 parents d245153 + 9a8d066 commit fe382bf

File tree

7 files changed

+88
-6
lines changed

7 files changed

+88
-6
lines changed

CHANGELOG.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,22 @@
11
# Changelog
22

3+
## v2.24.0 (2021-01-22)
4+
5+
### Features
6+
7+
* add support for Std:Join for pipelines
8+
* Map image name to image uri
9+
* friendly names for short URIs
10+
11+
### Bug Fixes and Other Changes
12+
13+
* increase allowed time for search to get updated
14+
* refactor distribution config construction
15+
16+
### Documentation Changes
17+
18+
* Add SMP 1.2.0 API docs
19+
320
## v2.23.6 (2021-01-20)
421

522
### Bug Fixes and Other Changes

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.23.7.dev0
1+
2.24.1.dev0

doc/api/training/smp_versions/v1.2.0/smd_model_parallel_pytorch.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -140,16 +140,16 @@ This API document assumes you use the following import statements in your traini
140140
computation. \ ``bucket_cap_mb``\ controls the bucket size in MegaBytes
141141
(MB).
142142

143-
- ``trace_memory_usage`` (default: False): When set to True, the library attempts
143+
- ``trace_memory_usage`` (default: False): When set to True, the library attempts
144144
to measure memory usage per module during tracing. If this is disabled,
145145
memory usage will be estimated through the sizes of tensors returned from
146146
the module.
147147

148-
- ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``.
148+
- ``broadcast_buffers`` (default: True): Flag to be used with ``ddp=True``.
149149
This parameter is forwarded to the underlying ``DistributedDataParallel`` wrapper.
150150
Please see: `broadcast_buffer <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel>`__.
151151

152-
- ``gradient_as_bucket_view (PyTorch 1.7 only)`` (default: False): To be
152+
- ``gradient_as_bucket_view (PyTorch 1.7 only)`` (default: False): To be
153153
used with ``ddp=True``. This parameter is forwarded to the underlying
154154
``DistributedDataParallel`` wrapper. Please see `gradient_as_bucket_view <https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html#torch.nn.parallel.DistributedDataParallel>`__.
155155

src/sagemaker/estimator.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
UploadedCode,
5050
validate_source_dir,
5151
_region_supports_debugger,
52+
_region_supports_profiler,
5253
get_mp_parameters,
5354
)
5455
from sagemaker.inputs import TrainingInput
@@ -494,7 +495,7 @@ def _prepare_profiler_for_training(self):
494495
"""Set necessary values and do basic validations in profiler config and profiler rules.
495496
496497
When the user explicitly sets rules to an empty list, the default profiler rule won't be enabled.
497-
Default profiler rule will be enabled when either:
498+
Default profiler rule will be enabled in supported regions when either:
498499
1. user doesn't specify any rules, i.e., rules=None; or
499500
2. the user only specifies debugger rules, i.e., rules=[Rule.sagemaker(...)]
500501
"""
@@ -503,7 +504,7 @@ def _prepare_profiler_for_training(self):
503504
raise RuntimeError("profiler_config cannot be set when disable_profiler is True.")
504505
if self.profiler_rules:
505506
raise RuntimeError("ProfilerRule cannot be set when disable_profiler is True.")
506-
elif _region_supports_debugger(self.sagemaker_session.boto_region_name):
507+
elif _region_supports_profiler(self.sagemaker_session.boto_region_name):
507508
if self.profiler_config is None:
508509
self.profiler_config = ProfilerConfig(s3_output_path=self.output_path)
509510
if self.rules is None or (self.rules and not self.profiler_rules):

src/sagemaker/fw_utils.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
)
5050

5151
DEBUGGER_UNSUPPORTED_REGIONS = ("us-iso-east-1",)
52+
PROFILER_UNSUPPORTED_REGIONS = ("us-iso-east-1", "cn-north-1", "cn-northwest-1")
53+
5254
SINGLE_GPU_INSTANCE_TYPES = ("ml.p2.xlarge", "ml.p3.2xlarge")
5355
SM_DATAPARALLEL_SUPPORTED_INSTANCE_TYPES = (
5456
"ml.p3.16xlarge",
@@ -550,6 +552,19 @@ def _region_supports_debugger(region_name):
550552
return region_name.lower() not in DEBUGGER_UNSUPPORTED_REGIONS
551553

552554

555+
def _region_supports_profiler(region_name):
556+
"""Returns bool indicating whether region supports Amazon SageMaker Debugger profiling feature.
557+
558+
Args:
559+
region_name (str): Name of the region to check against.
560+
561+
Returns:
562+
bool: Whether or not the region supports Amazon SageMaker Debugger profiling feature.
563+
564+
"""
565+
return region_name.lower() not in PROFILER_UNSUPPORTED_REGIONS
566+
567+
553568
def validate_version_or_image_args(framework_version, py_version, image_uri):
554569
"""Checks if version or image arguments are specified.
555570

tests/integ/test_experiments_analytics.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,17 @@ def experiment(sagemaker_session):
3838

3939
time.sleep(15) # wait for search to get updated
4040

41+
# retry up to three times, waiting for the search index to reflect the new experiment
42+
for _ in range(3):
43+
analytics = ExperimentAnalytics(
44+
experiment_name=experiment_name, sagemaker_session=sagemaker_session
45+
)
46+
47+
if len(analytics.dataframe().columns) > 0:
48+
break
49+
50+
time.sleep(15)
51+
4152
yield experiment_name
4253
finally:
4354
_delete_resources(sm, experiment_name, trials)
@@ -79,6 +90,17 @@ def experiment_with_artifacts(sagemaker_session):
7990

8091
time.sleep(15) # wait for search to get updated
8192

93+
# retry up to three times, waiting for the search index to reflect the new experiment
94+
for _ in range(3):
95+
analytics = ExperimentAnalytics(
96+
experiment_name=experiment_name, sagemaker_session=sagemaker_session
97+
)
98+
99+
if len(analytics.dataframe().columns) > 0:
100+
break
101+
102+
time.sleep(15)
103+
82104
yield experiment_name
83105
finally:
84106
_delete_resources(sm, experiment_name, trials)

tests/unit/test_estimator.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535
Rule,
3636
)
3737
from sagemaker.estimator import Estimator, EstimatorBase, Framework, _TrainingJob
38+
from sagemaker.fw_utils import PROFILER_UNSUPPORTED_REGIONS
3839
from sagemaker.inputs import ShuffleConfig
3940
from sagemaker.model import FrameworkModel
4041
from sagemaker.predictor import Predictor
@@ -632,6 +633,32 @@ def test_framework_with_profiler_config_without_s3_output_path(time, sagemaker_s
632633
]
633634

634635

636+
@pytest.mark.parametrize("region", PROFILER_UNSUPPORTED_REGIONS)
637+
def test_framework_with_no_default_profiler_in_unsupported_region(region):
638+
boto_mock = Mock(name="boto_session", region_name=region)
639+
sms = MagicMock(
640+
name="sagemaker_session",
641+
boto_session=boto_mock,
642+
boto_region_name=region,
643+
config=None,
644+
local_mode=False,
645+
s3_client=None,
646+
s3_resource=None,
647+
)
648+
f = DummyFramework(
649+
entry_point=SCRIPT_PATH,
650+
role=ROLE,
651+
sagemaker_session=sms,
652+
instance_count=INSTANCE_COUNT,
653+
instance_type=INSTANCE_TYPE,
654+
)
655+
f.fit("s3://mydata")
656+
sms.train.assert_called_once()
657+
_, args = sms.train.call_args
658+
assert args.get("profiler_config") is None
659+
assert args.get("profiler_rule_configs") is None
660+
661+
635662
def test_framework_with_profiler_config_and_profiler_disabled(sagemaker_session):
636663
with pytest.raises(RuntimeError) as error:
637664
f = DummyFramework(

0 commit comments

Comments
 (0)