Skip to content

Commit 6b86a62

Browse files
authored
Merge branch 'master' into remove_tf
2 parents 2a0d73d + 9061757 commit 6b86a62

20 files changed

+345
-291
lines changed

CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ CHANGELOG
66
==========
77

88
* bug-fix: Remove unnecessary dependency tensorflow
9+
* doc-fix: Change ``distribution`` to ``distributions``
10+
* bug-fix: Increase docker-compose http timeout and health check timeout to 120.
911

1012
1.16.1.post1
1113
============

src/sagemaker/fw_utils.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -219,9 +219,11 @@ def framework_name_from_image(image_name):
219219
else:
220220
# extract framework, python version and image tag
221221
# We must support both the legacy and current image name format.
222-
name_pattern = \
223-
re.compile('^sagemaker(?:-rl)?-(tensorflow|mxnet|chainer|pytorch|scikit-learn):(.*)-(.*?)-(py2|py3)$')
224-
legacy_name_pattern = re.compile('^sagemaker-(tensorflow|mxnet)-(py2|py3)-(cpu|gpu):(.*)$')
222+
name_pattern = re.compile(
223+
r'^sagemaker(?:-rl)?-(tensorflow|mxnet|chainer|pytorch|scikit-learn):(.*)-(.*?)-(py2|py3)$')
224+
legacy_name_pattern = re.compile(
225+
r'^sagemaker-(tensorflow|mxnet)-(py2|py3)-(cpu|gpu):(.*)$')
226+
225227
name_match = name_pattern.match(sagemaker_match.group(8))
226228
legacy_match = legacy_name_pattern.match(sagemaker_match.group(8))
227229

src/sagemaker/local/entities.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
logger.setLevel(logging.WARNING)
3030

3131
_UNUSED_ARN = 'local:arn-does-not-matter'
32-
HEALTH_CHECK_TIMEOUT_LIMIT = 30
32+
HEALTH_CHECK_TIMEOUT_LIMIT = 120
3333

3434

3535
class _LocalTrainingJob(object):
@@ -405,7 +405,7 @@ def _wait_for_serving_container(serving_port):
405405

406406
endpoint_url = 'http://localhost:%s/ping' % serving_port
407407
while True:
408-
i += 1
408+
i += 5
409409
if i >= HEALTH_CHECK_TIMEOUT_LIMIT:
410410
raise RuntimeError('Giving up, endpoint didn\'t launch correctly')
411411

@@ -416,7 +416,7 @@ def _wait_for_serving_container(serving_port):
416416
else:
417417
return
418418

419-
time.sleep(1)
419+
time.sleep(5)
420420

421421

422422
def _perform_request(endpoint_url, pool_manager=None):

src/sagemaker/local/image.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@
3939

4040
CONTAINER_PREFIX = 'algo'
4141
DOCKER_COMPOSE_FILENAME = 'docker-compose.yaml'
42+
DOCKER_COMPOSE_HTTP_TIMEOUT_ENV = 'COMPOSE_HTTP_TIMEOUT'
43+
DOCKER_COMPOSE_HTTP_TIMEOUT = '120'
44+
4245

4346
# Environment variables to be set during training
4447
REGION_ENV_NAME = 'AWS_REGION'
@@ -359,6 +362,9 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en
359362
additional_env_var_list = ['{}={}'.format(k, v) for k, v in additional_env_vars.items()]
360363
environment.extend(additional_env_var_list)
361364

365+
if os.environ.get(DOCKER_COMPOSE_HTTP_TIMEOUT_ENV) is None:
366+
os.environ[DOCKER_COMPOSE_HTTP_TIMEOUT_ENV] = DOCKER_COMPOSE_HTTP_TIMEOUT
367+
362368
if command == 'train':
363369
optml_dirs = {'output', 'output/data', 'input'}
364370

src/sagemaker/mxnet/README.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -209,15 +209,15 @@ If you were previously relying on the default save method, you can now import on
209209
210210
save(args.model_dir, model)
211211
212-
Lastly, if you were relying on the container launching a parameter server for use with distributed training, you must now set ``distribution`` to the following dictionary when creating an MXNet estimator:
212+
Lastly, if you were relying on the container launching a parameter server for use with distributed training, you must now set ``distributions`` to the following dictionary when creating an MXNet estimator:
213213

214214
.. code:: python
215215
216216
from sagemaker.mxnet import MXNet
217217
218218
estimator = MXNet('path-to-distributed-training-script.py',
219219
...,
220-
distribution={'parameter_server': {'enabled': True}})
220+
distributions={'parameter_server': {'enabled': True}})
221221
222222
223223
Using third-party libraries
@@ -323,7 +323,7 @@ The following are optional arguments. When you create an ``MXNet`` object, you c
323323
framework_version and py_version. Refer to: `SageMaker MXNet Docker Containers
324324
<#sagemaker-mxnet-docker-containers>`_ for details on what the Official images support
325325
and where to find the source code to build your custom image.
326-
- ``distribution`` For versions 1.3 and above only.
326+
- ``distributions`` For versions 1.3 and above only.
327327
Specifies information for how to run distributed training.
328328
To launch a parameter server during training, set this argument to:
329329

src/sagemaker/mxnet/estimator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, py_versio
6767
Examples:
6868
123.dkr.ecr.us-west-2.amazonaws.com/my-custom-image:1.0
6969
custom-image:latest.
70-
distribution (dict): A dictionary with information on how to run distributed training
70+
distributions (dict): A dictionary with information on how to run distributed training
7171
(default: None).
7272
**kwargs: Additional kwargs passed to the :class:`~sagemaker.estimator.Framework` constructor.
7373
"""

src/sagemaker/tensorflow/estimator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ def __init__(self, training_steps=None, evaluation_steps=None, checkpoint_path=N
199199
custom-image:latest.
200200
script_mode (bool): If set to True will the estimator will use the Script Mode containers (default: False).
201201
This will be ignored if py_version is set to 'py3'.
202-
distribution (dict): A dictionary with information on how to run distributed training
202+
distributions (dict): A dictionary with information on how to run distributed training
203203
(default: None). Currently we only support distributed training with parameter servers. To enable it
204204
use the following setup:
205205
{

src/sagemaker/tensorflow/serving.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
from __future__ import absolute_import
1414

1515
import logging
16-
1716
import sagemaker
1817
from sagemaker.content_types import CONTENT_TYPE_JSON
1918
from sagemaker.fw_utils import create_image_uri
@@ -144,7 +143,6 @@ def _get_image_uri(self, instance_type):
144143
if self.image:
145144
return self.image
146145

147-
# reuse standard image uri function, then strip unwanted python component
148146
region_name = self.sagemaker_session.boto_region_name
149147
return create_image_uri(region_name, Model.FRAMEWORK_NAME, instance_type,
150148
self._framework_version)

src/sagemaker/utils.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import errno
1616
import os
17+
import random
1718
import re
1819
import sys
1920
import tarfile
@@ -64,6 +65,14 @@ def name_from_base(base, max_length=63, short=False):
6465
return '{}-{}'.format(trimmed_base, timestamp)
6566

6667

68+
def unique_name_from_base(base, max_length=63):
69+
unique = '%04x' % random.randrange(16**4) # 4-digit hex
70+
ts = str(int(time.time()))
71+
available_length = max_length - 2 - len(ts) - len(unique)
72+
trimmed = base[:available_length]
73+
return '{}-{}-{}'.format(trimmed, ts, unique)
74+
75+
6776
def airflow_name_from_base(base, max_length=63, short=False):
6877
"""Append airflow execution_date macro (https://airflow.apache.org/code.html?#macros)
6978
to the provided string. The macro will be evaluated in Airflow operator runtime.

tests/conftest.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,11 @@ def mxnet_version(request):
9595
return request.param
9696

9797

98+
@pytest.fixture(scope='module', params=['1.3', '1.3.0'])
99+
def ei_mxnet_version(request):
100+
return request.param
101+
102+
98103
@pytest.fixture(scope='module', params=['0.4', '0.4.0'])
99104
def pytorch_version(request):
100105
return request.param
@@ -112,6 +117,11 @@ def tf_version(request):
112117
return request.param
113118

114119

120+
@pytest.fixture(scope='module', params=['1.11', '1.11.0'])
121+
def ei_tf_version(request):
122+
return request.param
123+
124+
115125
@pytest.fixture(scope='module', params=['0.10.1', '0.10.1', '0.11', '0.11.0'])
116126
def rl_coach_tf_version(request):
117127
return request.param

tests/integ/local_mode_utils.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
from __future__ import absolute_import
14+
15+
import fcntl
16+
import os
17+
import time
18+
from contextlib import contextmanager
19+
20+
import tests.integ
21+
22+
LOCK_PATH = os.path.join(tests.integ.DATA_DIR, 'local_mode_lock')
23+
24+
25+
@contextmanager
26+
def lock():
27+
# Since Local Mode uses the same port for serving, we need a lock in order
28+
# to allow concurrent test execution.
29+
local_mode_lock_fd = open(LOCK_PATH, 'w')
30+
local_mode_lock = local_mode_lock_fd.fileno()
31+
32+
fcntl.lockf(local_mode_lock, fcntl.LOCK_EX)
33+
34+
try:
35+
yield
36+
finally:
37+
time.sleep(5)
38+
fcntl.lockf(local_mode_lock, fcntl.LOCK_UN)

tests/integ/test_inference_pipeline.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,30 +16,30 @@
1616
import os
1717

1818
import pytest
19+
from tests.integ import DATA_DIR
20+
from tests.integ.timeout import timeout_and_delete_endpoint_by_name
1921

2022
from sagemaker.amazon.amazon_estimator import get_image_uri
2123
from sagemaker.content_types import CONTENT_TYPE_CSV
2224
from sagemaker.model import Model
2325
from sagemaker.pipeline import PipelineModel
2426
from sagemaker.predictor import RealTimePredictor, json_serializer
25-
from sagemaker.session import Session
2627
from sagemaker.sparkml.model import SparkMLModel
2728
from sagemaker.utils import sagemaker_timestamp
28-
from tests.integ import DATA_DIR
29-
from tests.integ.timeout import timeout_and_delete_endpoint_by_name
3029

3130

3231
@pytest.mark.continuous_testing
3332
@pytest.mark.regional_testing
3433
def test_inference_pipeline_model_deploy(sagemaker_session):
35-
# Creates a Pipeline model comprising of SparkML (serialized by MLeap) and XGBoost and deploys to one endpoint
3634
sparkml_data_path = os.path.join(DATA_DIR, 'sparkml_model')
3735
xgboost_data_path = os.path.join(DATA_DIR, 'xgboost_model')
3836
endpoint_name = 'test-inference-pipeline-deploy-{}'.format(sagemaker_timestamp())
39-
sparkml_model_data = sagemaker_session.upload_data(path=os.path.join(sparkml_data_path, 'mleap_model.tar.gz'),
40-
key_prefix='integ-test-data/sparkml/model')
41-
xgb_model_data = sagemaker_session.upload_data(path=os.path.join(xgboost_data_path, 'xgb_model.tar.gz'),
42-
key_prefix='integ-test-data/xgboost/model')
37+
sparkml_model_data = sagemaker_session.upload_data(
38+
path=os.path.join(sparkml_data_path, 'mleap_model.tar.gz'),
39+
key_prefix='integ-test-data/sparkml/model')
40+
xgb_model_data = sagemaker_session.upload_data(
41+
path=os.path.join(xgboost_data_path, 'xgb_model.tar.gz'),
42+
key_prefix='integ-test-data/xgboost/model')
4343
schema = json.dumps({
4444
"input": [
4545
{
@@ -74,10 +74,12 @@ def test_inference_pipeline_model_deploy(sagemaker_session):
7474
}
7575
})
7676
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
77-
sparkml_model = SparkMLModel(model_data=sparkml_model_data, env={'SAGEMAKER_SPARKML_SCHEMA': schema},
77+
sparkml_model = SparkMLModel(model_data=sparkml_model_data,
78+
env={'SAGEMAKER_SPARKML_SCHEMA': schema},
7879
sagemaker_session=sagemaker_session)
79-
xgb_image = get_image_uri(Session().boto_region_name, 'xgboost')
80-
xgb_model = Model(model_data=xgb_model_data, image=xgb_image, sagemaker_session=sagemaker_session)
80+
xgb_image = get_image_uri(sagemaker_session.boto_region_name, 'xgboost')
81+
xgb_model = Model(model_data=xgb_model_data, image=xgb_image,
82+
sagemaker_session=sagemaker_session)
8183
model = PipelineModel(models=[sparkml_model, xgb_model], role='SageMakerRole',
8284
sagemaker_session=sagemaker_session, name=endpoint_name)
8385
model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)

0 commit comments

Comments
 (0)