
Commit 1b03c5d

Merge branch 'master' into smddp_custom_mpi_support
2 parents 6343e19 + f7161f0

File tree

8 files changed: +130 -14 lines changed

CHANGELOG.md

Lines changed: 11 additions & 0 deletions
@@ -1,5 +1,16 @@
 # Changelog
 
+## v2.35.0 (2021-04-14)
+
+### Features
+
+ * add support for PyTorch 1.8.1
+
+### Bug Fixes and Other Changes
+
+ * boto3 client param updated for feature store
+ * Updated release notes and API doc for smd model parallel 1.3.1
+
 ## v2.34.0 (2021-04-12)
 
 ### Features

VERSION

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-2.34.1.dev0
+2.35.1.dev0

doc/amazon_sagemaker_featurestore.rst

Lines changed: 1 addition & 1 deletion
@@ -67,7 +67,7 @@ use the SageMaker default bucket and add a custom prefix to it.
 offline_feature_store_bucket = 's3://*{}*/*{}*'.format(default_bucket, prefix)
 
 sagemaker_client = boto_session.client(service_name='sagemaker', region_name=region)
-featurestore_runtime = boto_session.client(service_name='featurestore-runtime', region_name=region)
+featurestore_runtime = boto_session.client(service_name='sagemaker-featurestore-runtime', region_name=region)
 
 feature_store_session = Session(
     boto_session=boto_session,
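
For context, a minimal sketch of how the corrected service name fits into the surrounding example from that documentation page; the region value here is a placeholder:

    import boto3
    from sagemaker.session import Session

    region = "us-west-2"  # placeholder region
    boto_session = boto3.Session(region_name=region)

    sagemaker_client = boto_session.client(service_name="sagemaker", region_name=region)
    # "sagemaker-featurestore-runtime" is the boto3 service name recognized by
    # the SDK; the previous "featurestore-runtime" string is not a valid service.
    featurestore_runtime = boto_session.client(
        service_name="sagemaker-featurestore-runtime", region_name=region
    )

    feature_store_session = Session(
        boto_session=boto_session,
        sagemaker_client=sagemaker_client,
        sagemaker_featurestore_runtime_client=featurestore_runtime,
    )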

doc/api/training/smd_model_parallel_release_notes/smd_model_parallel_change_log.md

Lines changed: 30 additions & 0 deletions
@@ -1,3 +1,33 @@
+# Sagemaker Distributed Model Parallel 1.3.1 Release Notes
+
+- New Features
+- Bug Fixes
+- Known Issues
+
+## New Features
+
+### TensorFlow
+
+- Exposes a new decorator ``register_post_partition_hook``. This allows invoking the decorated methods just after model partition but before executing the first step, for example, loading a checkpoint. Refer to the [SageMaker distributed model parallel API documentation](https://sagemaker.readthedocs.io/en/stable/api/training/smp_versions/latest/smd_model_parallel_tensorflow.html) for more information.
+
+## Bug Fixes
+
+### PyTorch
+
+- Improved memory efficiency when using active microbatches by clearing activations at the end of each microbatch.
+
+### TensorFlow
+
+- Fixed an issue that caused hangs when training some models with XLA enabled.
+
+## Known Issues
+
+### PyTorch
+
+- A crash was observed when ``optimizer.step()`` was called for certain optimizers such as AdaDelta, when the partition on which this method was called has no local parameters assigned to it after partitioning. This is due to a bug in PyTorch which [has since been fixed](https://github.com/pytorch/pytorch/pull/52944). Until that fix makes its way into the next PyTorch release, only call ``optimizer.step()`` on processes which have at least one local parameter. This can be checked with ``len(list(model.local_parameters())) > 0``.
+
+- A performance regression still exists when training on SMP with PyTorch 1.7.1 compared to 1.6. The root cause was found to be a slowdown of `.grad` method calls in PyTorch 1.7.1 compared to 1.6. See the related discussion: https://github.com/pytorch/pytorch/issues/50636. This issue does not exist with PyTorch 1.8.
+
 # Sagemaker Distributed Model Parallel 1.3.0 Release Notes
 
 - New Features
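
The workaround described in the PyTorch known issue above can be sketched as follows. This is illustrative only, assuming ``model`` is an ``smp.DistributedModel`` and ``optimizer`` is the corresponding distributed optimizer:

    def safe_optimizer_step(model, optimizer):
        # Only call optimizer.step() on ranks whose partition received at
        # least one local parameter, to avoid the PyTorch crash noted above.
        if len(list(model.local_parameters())) > 0:
            optimizer.step()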

doc/api/training/smp_versions/latest/smd_model_parallel_tensorflow.rst

Lines changed: 17 additions & 9 deletions
@@ -83,7 +83,21 @@ TensorFlow API
     with smp.partition(3):
         z = tf.reduce_sum(y)             # placed in partition 3
 
-
+
+.. function:: register_post_partition_hook(hook)
+
+   Registers a callable ``hook`` to
+   be executed after the model is partitioned. This is useful in situations
+   where an operation needs to be executed after the model partition during
+   the first call to ``smp.step``, but before the actual execution of the
+   first forward pass.
+
+   .. code:: python
+
+      @smp.register_post_partition_hook
+      def test_eager():
+          # All statements here will be executed right after partition but before the first forward pass
+          tf.print("Entered hook through eager context")
 
 .. class:: smp.CheckpointManager
 

@@ -102,13 +116,6 @@ TensorFlow API
                       max_to_keep=None,
                       checkpoint_name="ckpt")
 
-
-   **Important:** ``smp.CheckpointManager.restore()`` must be called after
-   the first training step. This is because the first call of the
-   ``smp.step`` function constructs and partitions the model, which must
-   take place before the checkpoint restore. Calling it before the first
-   ``smp.step`` call might result in hangs or unexpected behavior.
-
    **Parameters**
 
    - ``checkpoint``: A `tf.train.Checkpoint
@@ -154,7 +161,8 @@ TensorFlow API
 .. code:: python
 
 for step, inputs in enumerate(train_ds):
-    if step == 1:                    # NOTE: restore occurs on the second step
+    if step == 0:
         ckpt_manager.restore()
     loss = train_step(inputs)
 
+
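
The decorator documented above is intended for exactly this kind of post-partition work (the release notes name checkpoint loading as the example use case). A minimal, illustrative sketch, assuming ``ckpt_manager`` is an ``smp.CheckpointManager`` created earlier for the model being trained:

    import smdistributed.modelparallel.tensorflow as smp

    @smp.register_post_partition_hook
    def restore_from_checkpoint():
        # Runs right after the first smp.step call partitions the model,
        # but before the first forward pass executes.
        ckpt_manager.restore()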

src/sagemaker/fw_utils.py

Lines changed: 1 addition & 1 deletion
@@ -60,7 +60,7 @@
 )
 SM_DATAPARALLEL_SUPPORTED_FRAMEWORK_VERSIONS = {
     "tensorflow": ["2.3", "2.3.1", "2.3.2", "2.4", "2.4.1"],
-    "pytorch": ["1.6", "1.6.0", "1.7", "1.7.1", "1.8", "1.8.0"],
+    "pytorch": ["1.6", "1.6.0", "1.7", "1.7.1", "1.8", "1.8.0", "1.8.1"],
 }
 SMDISTRIBUTED_SUPPORTED_STRATEGIES = ["dataparallel", "modelparallel"]
 
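
With "1.8.1" added to the supported list, an estimator configured for SageMaker data parallel passes this validation. A sketch, not taken from the repository, with placeholder role, entry point, and instance settings:

    from sagemaker.pytorch import PyTorch

    estimator = PyTorch(
        entry_point="train.py",                               # placeholder script
        role="arn:aws:iam::123456789012:role/SageMakerRole",  # placeholder role
        framework_version="1.8.1",
        py_version="py36",
        instance_type="ml.p3.16xlarge",
        instance_count=2,
        # smdistributed dataparallel is validated against
        # SM_DATAPARALLEL_SUPPORTED_FRAMEWORK_VERSIONS in fw_utils.py
        distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
    )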

src/sagemaker/image_uri_config/pytorch.json

Lines changed: 68 additions & 2 deletions
@@ -56,7 +56,7 @@
             "1.5": "1.5.0",
             "1.6": "1.6.0",
             "1.7": "1.7.1",
-            "1.8": "1.8.0"
+            "1.8": "1.8.1"
         },
         "versions": {
             "0.4.0": {
@@ -386,6 +386,39 @@
                     "us-west-2": "763104351884"
                 },
                 "repository": "pytorch-inference"
+            },
+            "1.8.1": {
+                "py_versions": [
+                    "py3",
+                    "py36"
+                ],
+                "registries": {
+                    "af-south-1": "626614931356",
+                    "ap-east-1": "871362719292",
+                    "ap-northeast-1": "763104351884",
+                    "ap-northeast-2": "763104351884",
+                    "ap-south-1": "763104351884",
+                    "ap-southeast-1": "763104351884",
+                    "ap-southeast-2": "763104351884",
+                    "ca-central-1": "763104351884",
+                    "cn-north-1": "727897471807",
+                    "cn-northwest-1": "727897471807",
+                    "eu-central-1": "763104351884",
+                    "eu-north-1": "763104351884",
+                    "eu-west-1": "763104351884",
+                    "eu-west-2": "763104351884",
+                    "eu-west-3": "763104351884",
+                    "eu-south-1": "692866216735",
+                    "me-south-1": "217643126080",
+                    "sa-east-1": "763104351884",
+                    "us-east-1": "763104351884",
+                    "us-east-2": "763104351884",
+                    "us-gov-west-1": "442386744353",
+                    "us-iso-east-1": "886529160074",
+                    "us-west-1": "763104351884",
+                    "us-west-2": "763104351884"
+                },
+                "repository": "pytorch-inference"
             }
         }
     },
@@ -404,7 +437,7 @@
             "1.5": "1.5.0",
             "1.6": "1.6.0",
             "1.7": "1.7.1",
-            "1.8": "1.8.0"
+            "1.8": "1.8.1"
         },
         "versions": {
             "0.4.0": {
@@ -735,6 +768,39 @@
                     "us-west-2": "763104351884"
                 },
                 "repository": "pytorch-training"
+            },
+            "1.8.1": {
+                "py_versions": [
+                    "py3",
+                    "py36"
+                ],
+                "registries": {
+                    "af-south-1": "626614931356",
+                    "ap-east-1": "871362719292",
+                    "ap-northeast-1": "763104351884",
+                    "ap-northeast-2": "763104351884",
+                    "ap-south-1": "763104351884",
+                    "ap-southeast-1": "763104351884",
+                    "ap-southeast-2": "763104351884",
+                    "ca-central-1": "763104351884",
+                    "cn-north-1": "727897471807",
+                    "cn-northwest-1": "727897471807",
+                    "eu-central-1": "763104351884",
+                    "eu-north-1": "763104351884",
+                    "eu-west-1": "763104351884",
+                    "eu-west-2": "763104351884",
+                    "eu-west-3": "763104351884",
+                    "eu-south-1": "692866216735",
+                    "me-south-1": "217643126080",
+                    "sa-east-1": "763104351884",
+                    "us-east-1": "763104351884",
+                    "us-east-2": "763104351884",
+                    "us-gov-west-1": "442386744353",
+                    "us-iso-east-1": "886529160074",
+                    "us-west-1": "763104351884",
+                    "us-west-2": "763104351884"
+                },
+                "repository": "pytorch-training"
             }
         }
     }
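
Once these registry entries exist, the SDK can resolve a 1.8.1 image URI directly. A sketch using the public image_uris helper; the region and instance type below are placeholders:

    from sagemaker import image_uris

    # Resolves to the pytorch-training repository in the per-region registry
    # defined in image_uri_config/pytorch.json.
    uri = image_uris.retrieve(
        framework="pytorch",
        region="us-west-2",
        version="1.8.1",
        py_version="py36",
        image_scope="training",
        instance_type="ml.p3.2xlarge",
    )
    print(uri)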

tests/unit/test_fw_utils.py

Lines changed: 1 addition & 0 deletions
@@ -649,6 +649,7 @@ def test_validate_smdataparallel_args_not_raises():
         ("ml.p3.16xlarge", "pytorch", "1.7.1", "py3", smdataparallel_enabled),
         ("ml.p3.16xlarge", "pytorch", "1.7", "py3", smdataparallel_enabled),
         ("ml.p3.16xlarge", "pytorch", "1.8.0", "py3", smdataparallel_enabled),
+        ("ml.p3.16xlarge", "pytorch", "1.8.1", "py3", smdataparallel_enabled),
         ("ml.p3.16xlarge", "pytorch", "1.8", "py3", smdataparallel_enabled),
         ("ml.p3.16xlarge", "tensorflow", "2.4.1", "py3", smdataparallel_enabled_custom_mpi),
         ("ml.p3.16xlarge", "pytorch", "1.8.0", "py3", smdataparallel_enabled_custom_mpi),
