Skip to content

Commit 165052a

Browse files
authored
Merge branch 'master' into smddp_custom_mpi_support
2 parents b464716 + a167396 commit 165052a

34 files changed

+222
-21
lines changed

CHANGELOG.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,20 @@
11
# Changelog
22

3+
## v2.32.1 (2021-04-01)
4+
5+
### Bug Fixes and Other Changes
6+
7+
* disable profiler in some release tests
8+
* remove outdated notebook from test
9+
* add compilation option for ml_eia2
10+
* add short version to smdataparallel supported list
11+
12+
### Documentation Changes
13+
14+
* create a "latest" version of the SageMaker distributed docs
15+
* add docs for SageMaker Model Parallel 1.3, released with PT 1.8
16+
* update PyTorch version in doc
17+
318
## v2.32.0 (2021-03-26)
419

520
### Features

VERSION

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
2.32.1.dev0
1+
2.32.2.dev0
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
2+
Version 1.1.0 (Latest)
3+
======================
4+
5+
.. toctree::
6+
:maxdepth: 1
7+
8+
latest/smd_data_parallel_pytorch.rst
9+
latest/smd_data_parallel_tensorflow.rst

doc/api/training/sdp_versions/v1_1_0.rst

Lines changed: 0 additions & 9 deletions
This file was deleted.

doc/api/training/smd_data_parallel.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ Select a version to see the API documentation for version.
8484
.. toctree::
8585
:maxdepth: 1
8686

87-
sdp_versions/v1_1_0.rst
87+
sdp_versions/latest.rst
8888
sdp_versions/v1_0_0.rst
8989

9090
.. important::

doc/api/training/smd_model_parallel.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ Select a version to see the API documentation for version. To use the library, r
3434
.. toctree::
3535
:maxdepth: 1
3636

37-
smp_versions/v1_3_0.rst
37+
smp_versions/latest.rst
3838
smp_versions/v1_2_0.rst
3939
smp_versions/v1_1_0.rst
4040

doc/api/training/smp_versions/v1_3_0.rst renamed to doc/api/training/smp_versions/latest.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,6 @@ To use the library, reference the Common API documentation alongside the framewo
77
.. toctree::
88
:maxdepth: 1
99

10-
v1.3.0/smd_model_parallel_common_api
11-
v1.3.0/smd_model_parallel_pytorch
12-
v1.3.0/smd_model_parallel_tensorflow
10+
latest/smd_model_parallel_common_api
11+
latest/smd_model_parallel_pytorch
12+
latest/smd_model_parallel_tensorflow

src/sagemaker/estimator.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,7 @@ def __init__(
123123
enable_network_isolation=False,
124124
profiler_config=None,
125125
disable_profiler=False,
126+
environment=None,
126127
**kwargs,
127128
):
128129
"""Initialize an ``EstimatorBase`` instance.
@@ -266,6 +267,8 @@ def __init__(
266267
``disable_profiler`` parameter to ``True``.
267268
disable_profiler (bool): Specifies whether Debugger monitoring and profiling
268269
will be disabled (default: ``False``).
270+
environment (dict[str, str]): Environment variables to be set for
271+
use during the training job (default: ``None``).
269272
270273
"""
271274
instance_count = renamed_kwargs(
@@ -352,6 +355,8 @@ def __init__(
352355
self.profiler_config = profiler_config
353356
self.disable_profiler = disable_profiler
354357

358+
self.environment = environment
359+
355360
if not _region_supports_profiler(self.sagemaker_session.boto_region_name):
356361
self.disable_profiler = True
357362

@@ -1471,6 +1476,7 @@ def _get_train_args(cls, estimator, inputs, experiment_config):
14711476
train_args["tags"] = estimator.tags
14721477
train_args["metric_definitions"] = estimator.metric_definitions
14731478
train_args["experiment_config"] = experiment_config
1479+
train_args["environment"] = estimator.environment
14741480

14751481
if isinstance(inputs, TrainingInput):
14761482
if "InputMode" in inputs.config:
@@ -1659,6 +1665,7 @@ def __init__(
16591665
enable_sagemaker_metrics=None,
16601666
profiler_config=None,
16611667
disable_profiler=False,
1668+
environment=None,
16621669
**kwargs,
16631670
):
16641671
"""Initialize an ``Estimator`` instance.
@@ -1807,6 +1814,8 @@ def __init__(
18071814
``disable_profiler`` parameter to ``True``.
18081815
disable_profiler (bool): Specifies whether Debugger monitoring and profiling
18091816
will be disabled (default: ``False``).
1817+
environment (dict[str, str]): Environment variables to be set for
1818+
use during the training job (default: ``None``).
18101819
"""
18111820
self.image_uri = image_uri
18121821
self.hyperparam_dict = hyperparameters.copy() if hyperparameters else {}
@@ -1840,6 +1849,7 @@ def __init__(
18401849
enable_network_isolation=enable_network_isolation,
18411850
profiler_config=profiler_config,
18421851
disable_profiler=disable_profiler,
1852+
environment=environment,
18431853
**kwargs,
18441854
)
18451855

src/sagemaker/image_uri_config/tensorflow.json

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@
1010
"1.13": "1.13.0",
1111
"1.14": "1.14.0",
1212
"1.15": "1.15.0",
13-
"2.0": "2.0.0"
13+
"2.0": "2.0.0",
14+
"2.3": "2.3.0"
1415
},
1516
"versions": {
1617
"1.10.0": {
@@ -218,6 +219,35 @@
218219
"us-west-2": "763104351884"
219220
},
220221
"repository": "tensorflow-inference-eia"
222+
},
223+
"2.3.0": {
224+
"registries": {
225+
"af-south-1": "626614931356",
226+
"ap-east-1": "871362719292",
227+
"ap-northeast-1": "763104351884",
228+
"ap-northeast-2": "763104351884",
229+
"ap-south-1": "763104351884",
230+
"ap-southeast-1": "763104351884",
231+
"ap-southeast-2": "763104351884",
232+
"ca-central-1": "763104351884",
233+
"cn-north-1": "727897471807",
234+
"cn-northwest-1": "727897471807",
235+
"eu-central-1": "763104351884",
236+
"eu-north-1": "763104351884",
237+
"eu-south-1": "692866216735",
238+
"eu-west-1": "763104351884",
239+
"eu-west-2": "763104351884",
240+
"eu-west-3": "763104351884",
241+
"me-south-1": "217643126080",
242+
"sa-east-1": "763104351884",
243+
"us-east-1": "763104351884",
244+
"us-east-2": "763104351884",
245+
"us-gov-west-1": "442386744353",
246+
"us-iso-east-1": "886529160074",
247+
"us-west-1": "763104351884",
248+
"us-west-2": "763104351884"
249+
},
250+
"repository": "tensorflow-inference-eia"
221251
}
222252
}
223253
},

src/sagemaker/model.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -651,7 +651,9 @@ def compile(
651651
job_status = self.sagemaker_session.wait_for_compilation_job(job_name)
652652
self.model_data = job_status["ModelArtifacts"]["S3ModelArtifacts"]
653653
if target_instance_family is not None:
654-
if target_instance_family.startswith("ml_"):
654+
if target_instance_family == "ml_eia2":
655+
pass
656+
elif target_instance_family.startswith("ml_"):
655657
self.image_uri = self._compilation_image_uri(
656658
self.sagemaker_session.boto_region_name,
657659
target_instance_family,

src/sagemaker/session.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,7 @@ def train( # noqa: C901
456456
enable_sagemaker_metrics=None,
457457
profiler_rule_configs=None,
458458
profiler_config=None,
459+
environment=None,
459460
):
460461
"""Create an Amazon SageMaker training job.
461462
@@ -522,9 +523,12 @@ def train( # noqa: C901
522523
Series. For more information see:
523524
https://docs.aws.amazon.com/sagemaker/latest/dg/API_AlgorithmSpecification.html#SageMaker-Type-AlgorithmSpecification-EnableSageMakerMetricsTimeSeries
524525
(default: ``None``).
525-
profiler_rule_configs (list[dict]): A list of profiler rule configurations.
526+
profiler_rule_configs (list[dict]): A list of profiler rule
527+
configurations.
526528
profiler_config (dict): Configuration for how profiling information is emitted
527529
with SageMaker Profiler. (default: ``None``).
530+
environment (dict[str, str]): Environment variables to be set for
531+
use during the training job (default: ``None``).
528532
529533
Returns:
530534
str: ARN of the training job, if it is created.
@@ -556,6 +560,7 @@ def train( # noqa: C901
556560
enable_sagemaker_metrics=enable_sagemaker_metrics,
557561
profiler_rule_configs=profiler_rule_configs,
558562
profiler_config=profiler_config,
563+
environment=environment,
559564
)
560565
LOGGER.info("Creating training-job with name: %s", job_name)
561566
LOGGER.debug("train request: %s", json.dumps(train_request, indent=4))
@@ -588,6 +593,7 @@ def _get_train_request( # noqa: C901
588593
enable_sagemaker_metrics=None,
589594
profiler_rule_configs=None,
590595
profiler_config=None,
596+
environment=None,
591597
):
592598
"""Constructs a request compatible for creating an Amazon SageMaker training job.
593599
@@ -657,6 +663,8 @@ def _get_train_request( # noqa: C901
657663
profiler_rule_configs (list[dict]): A list of profiler rule configurations.
658664
profiler_config(dict): Configuration for how profiling information is emitted with
659665
SageMaker Profiler. (default: ``None``).
666+
environment (dict[str, str]): Environment variables to be set for
667+
use during the training job (default: ``None``).
660668
661669
Returns:
662670
Dict: a training request dict
@@ -699,6 +707,9 @@ def _get_train_request( # noqa: C901
699707
if hyperparameters and len(hyperparameters) > 0:
700708
train_request["HyperParameters"] = hyperparameters
701709

710+
if environment is not None:
711+
train_request["Environment"] = environment
712+
702713
if tags is not None:
703714
train_request["Tags"] = tags
704715

src/sagemaker/tensorflow/model.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ class TensorFlowModel(sagemaker.model.FrameworkModel):
118118
logging.ERROR: "error",
119119
logging.CRITICAL: "crit",
120120
}
121-
LATEST_EIA_VERSION = [2, 0]
121+
LATEST_EIA_VERSION = [2, 3]
122122

123123
def __init__(
124124
self,
@@ -289,7 +289,12 @@ def deploy(
289289

290290
def _eia_supported(self):
291291
"""Return true if TF version is EIA enabled"""
292-
return [int(s) for s in self.framework_version.split(".")][:2] <= self.LATEST_EIA_VERSION
292+
framework_version = [int(s) for s in self.framework_version.split(".")][:2]
293+
return (
294+
framework_version != [2, 1]
295+
and framework_version != [2, 2]
296+
and framework_version <= self.LATEST_EIA_VERSION
297+
)
293298

294299
def prepare_container_def(self, instance_type=None, accelerator_type=None):
295300
"""Prepare the container definition.

tests/data/cuteCat.jpg

6.43 KB
Loading

tests/integ/test_horovod.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ def _create_and_fit_estimator(sagemaker_session, tf_version, py_version, instanc
9191
py_version=py_version,
9292
framework_version=tf_version,
9393
distribution={"mpi": {"enabled": True}},
94+
disable_profiler=True,
9495
)
9596

9697
with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):

tests/integ/test_horovod_mx.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ def _create_and_fit_estimator(mxnet_version, py_version, sagemaker_session, inst
9292
py_version=py_version,
9393
framework_version=mxnet_version,
9494
distribution={"mpi": {"enabled": True}},
95+
disable_profiler=True,
9596
)
9697

9798
with timeout.timeout(minutes=integ.TRAINING_DEFAULT_TIMEOUT_MINUTES):

tests/integ/test_huggingface.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ def test_huggingface_training(
5858
"repo": "https://github.com/huggingface/transformers.git",
5959
"branch": f"v{huggingface_training_latest_version}",
6060
},
61+
disable_profiler=True,
6162
)
6263

6364
train_input = hf.sagemaker_session.upload_data(

tests/integ/test_tf.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
PARAMETER_SERVER_DISTRIBUTION = {"parameter_server": {"enabled": True}}
3737
MPI_DISTRIBUTION = {"mpi": {"enabled": True}}
3838
TAGS = [{"Key": "some-key", "Value": "some-value"}]
39+
ENV_INPUT = {"env_key1": "env_val1", "env_key2": "env_val2", "env_key3": "env_val3"}
3940

4041

4142
def test_mnist_with_checkpoint_config(
@@ -59,6 +60,7 @@ def test_mnist_with_checkpoint_config(
5960
metric_definitions=[{"Name": "train:global_steps", "Regex": r"global_step\/sec:\s(.*)"}],
6061
checkpoint_s3_uri=checkpoint_s3_uri,
6162
checkpoint_local_path=checkpoint_local_path,
63+
environment=ENV_INPUT,
6264
)
6365
inputs = estimator.sagemaker_session.upload_data(
6466
path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/mnist"
@@ -82,7 +84,13 @@ def test_mnist_with_checkpoint_config(
8284
actual_training_checkpoint_config = sagemaker_session.sagemaker_client.describe_training_job(
8385
TrainingJobName=training_job_name
8486
)["CheckpointConfig"]
87+
actual_training_environment_variable_config = (
88+
sagemaker_session.sagemaker_client.describe_training_job(TrainingJobName=training_job_name)[
89+
"Environment"
90+
]
91+
)
8592
assert actual_training_checkpoint_config == expected_training_checkpoint_config
93+
assert actual_training_environment_variable_config == ENV_INPUT
8694

8795

8896
def test_server_side_encryption(sagemaker_session, tf_full_version, tf_full_py_version):
@@ -141,6 +149,7 @@ def test_mnist_distributed(
141149
framework_version=tensorflow_training_latest_version,
142150
py_version=tensorflow_training_latest_py_version,
143151
distribution=PARAMETER_SERVER_DISTRIBUTION,
152+
disable_profiler=True,
144153
)
145154
inputs = estimator.sagemaker_session.upload_data(
146155
path=os.path.join(MNIST_RESOURCE_PATH, "data"), key_prefix="scriptmode/distributed_mnist"

tests/scripts/run-notebook-test.sh

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,5 +32,4 @@ echo "set SAGEMAKER_ROLE_ARN=$SAGEMAKER_ROLE_ARN"
3232
./amazon-sagemaker-examples/sagemaker-python-sdk/tensorflow_moving_from_framework_mode_to_script_mode/tensorflow_moving_from_framework_mode_to_script_mode.ipynb \
3333
./amazon-sagemaker-examples/sagemaker-python-sdk/tensorflow_script_mode_pipe_mode/tensorflow_script_mode_pipe_mode.ipynb \
3434
./amazon-sagemaker-examples/sagemaker-python-sdk/tensorflow_script_mode_quickstart/tensorflow_script_mode_quickstart.ipynb \
35-
./amazon-sagemaker-examples/sagemaker-python-sdk/tensorflow_script_mode_using_shell_commands/tensorflow_script_mode_using_shell_commands.ipynb \
3635
./amazon-sagemaker-examples/sagemaker-python-sdk/tensorflow_serving_using_elastic_inference_with_your_own_model/tensorflow_serving_pretrained_model_elastic_inference.ipynb

tests/unit/sagemaker/huggingface/test_estimator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,7 @@ def _create_train_job(version, base_framework_version):
149149
"tags": None,
150150
"vpc_config": None,
151151
"metric_definitions": None,
152+
"environment": None,
152153
"experiment_config": None,
153154
"debugger_hook_config": {
154155
"CollectionConfigurations": [],

tests/unit/sagemaker/tensorflow/test_estimator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ def _create_train_job(tf_version, horovod=False, ps=False, py_version="py2", smd
130130
"tags": None,
131131
"vpc_config": None,
132132
"metric_definitions": None,
133+
"environment": None,
133134
"experiment_config": None,
134135
"profiler_rule_configs": [
135136
{

tests/unit/sagemaker/tensorflow/test_estimator_init.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@
2020

2121
REGION = "us-west-2"
2222

23+
ENV_INPUT = {"env_key1": "env_val1", "env_key2": "env_val2", "env_key3": "env_val3"}
24+
2325

2426
@pytest.fixture()
2527
def sagemaker_session():
@@ -68,6 +70,26 @@ def test_framework_name(sagemaker_session):
6870
assert tf._framework_name == "tensorflow"
6971

7072

73+
def test_tf_add_environment_variables(sagemaker_session):
74+
tf = _build_tf(
75+
sagemaker_session,
76+
framework_version="1.15.2",
77+
py_version="py3",
78+
environment=ENV_INPUT,
79+
)
80+
assert tf.environment == ENV_INPUT
81+
82+
83+
def test_tf_miss_environment_variables(sagemaker_session):
84+
tf = _build_tf(
85+
sagemaker_session,
86+
framework_version="1.15.2",
87+
py_version="py3",
88+
environment=None,
89+
)
90+
assert not tf.environment
91+
92+
7193
def test_enable_sm_metrics(sagemaker_session):
7294
tf = _build_tf(
7395
sagemaker_session,

tests/unit/test_chainer.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,7 @@ def _create_train_job(version, py_version):
143143
"tags": None,
144144
"vpc_config": None,
145145
"metric_definitions": None,
146+
"environment": None,
146147
"experiment_config": None,
147148
"debugger_hook_config": {
148149
"CollectionConfigurations": [],

0 commit comments

Comments
 (0)