Skip to content

Commit 6b86a62

Browse files
authored
Merge branch 'master' into remove_tf
2 parents 2a0d73d + 9061757 commit 6b86a62

20 files changed

+345
-291
lines changed

CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@ CHANGELOG
66
==========
77

88
* bug-fix: Remove unnecessary dependency tensorflow
9+
* doc-fix: Change ``distribution`` to ``distributions``
10+
* bug-fix: Increase docker-compose http timeout and health check timeout to 120.
911

1012
1.16.1.post1
1113
============

src/sagemaker/fw_utils.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -219,9 +219,11 @@ def framework_name_from_image(image_name):
219219
else:
220220
# extract framework, python version and image tag
221221
# We must support both the legacy and current image name format.
222-
name_pattern = \
223-
re.compile('^sagemaker(?:-rl)?-(tensorflow|mxnet|chainer|pytorch|scikit-learn):(.*)-(.*?)-(py2|py3)$')
224-
legacy_name_pattern = re.compile('^sagemaker-(tensorflow|mxnet)-(py2|py3)-(cpu|gpu):(.*)$')
222+
name_pattern = re.compile(
223+
r'^sagemaker(?:-rl)?-(tensorflow|mxnet|chainer|pytorch|scikit-learn):(.*)-(.*?)-(py2|py3)$')
224+
legacy_name_pattern = re.compile(
225+
r'^sagemaker-(tensorflow|mxnet)-(py2|py3)-(cpu|gpu):(.*)$')
226+
225227
name_match = name_pattern.match(sagemaker_match.group(8))
226228
legacy_match = legacy_name_pattern.match(sagemaker_match.group(8))
227229

src/sagemaker/local/entities.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@
2929
logger.setLevel(logging.WARNING)
3030

3131
_UNUSED_ARN = 'local:arn-does-not-matter'
32-
HEALTH_CHECK_TIMEOUT_LIMIT = 30
32+
HEALTH_CHECK_TIMEOUT_LIMIT = 120
3333

3434

3535
class _LocalTrainingJob(object):
@@ -405,7 +405,7 @@ def _wait_for_serving_container(serving_port):
405405

406406
endpoint_url = 'http://localhost:%s/ping' % serving_port
407407
while True:
408-
i += 1
408+
i += 5
409409
if i >= HEALTH_CHECK_TIMEOUT_LIMIT:
410410
raise RuntimeError('Giving up, endpoint didn\'t launch correctly')
411411

@@ -416,7 +416,7 @@ def _wait_for_serving_container(serving_port):
416416
else:
417417
return
418418

419-
time.sleep(1)
419+
time.sleep(5)
420420

421421

422422
def _perform_request(endpoint_url, pool_manager=None):

src/sagemaker/local/image.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@
3939

4040
CONTAINER_PREFIX = 'algo'
4141
DOCKER_COMPOSE_FILENAME = 'docker-compose.yaml'
42+
DOCKER_COMPOSE_HTTP_TIMEOUT_ENV = 'COMPOSE_HTTP_TIMEOUT'
43+
DOCKER_COMPOSE_HTTP_TIMEOUT = '120'
44+
4245

4346
# Environment variables to be set during training
4447
REGION_ENV_NAME = 'AWS_REGION'
@@ -359,6 +362,9 @@ def _generate_compose_file(self, command, additional_volumes=None, additional_en
359362
additional_env_var_list = ['{}={}'.format(k, v) for k, v in additional_env_vars.items()]
360363
environment.extend(additional_env_var_list)
361364

365+
if os.environ.get(DOCKER_COMPOSE_HTTP_TIMEOUT_ENV) is None:
366+
os.environ[DOCKER_COMPOSE_HTTP_TIMEOUT_ENV] = DOCKER_COMPOSE_HTTP_TIMEOUT
367+
362368
if command == 'train':
363369
optml_dirs = {'output', 'output/data', 'input'}
364370

src/sagemaker/mxnet/README.rst

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -209,15 +209,15 @@ If you were previously relying on the default save method, you can now import on
209209
210210
save(args.model_dir, model)
211211
212-
Lastly, if you were relying on the container launching a parameter server for use with distributed training, you must now set ``distribution`` to the following dictionary when creating an MXNet estimator:
212+
Lastly, if you were relying on the container launching a parameter server for use with distributed training, you must now set ``distributions`` to the following dictionary when creating an MXNet estimator:
213213

214214
.. code:: python
215215
216216
from sagemaker.mxnet import MXNet
217217
218218
estimator = MXNet('path-to-distributed-training-script.py',
219219
...,
220-
distribution={'parameter_server': {'enabled': True}})
220+
distributions={'parameter_server': {'enabled': True}})
221221
222222
223223
Using third-party libraries
@@ -323,7 +323,7 @@ The following are optional arguments. When you create an ``MXNet`` object, you c
323323
framework_version and py_version. Refer to: `SageMaker MXNet Docker Containers
324324
<#sagemaker-mxnet-docker-containers>`_ for details on what the Official images support
325325
and where to find the source code to build your custom image.
326-
- ``distribution`` For versions 1.3 and above only.
326+
- ``distributions`` For versions 1.3 and above only.
327327
Specifies information for how to run distributed training.
328328
To launch a parameter server during training, set this argument to:
329329

src/sagemaker/mxnet/estimator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def __init__(self, entry_point, source_dir=None, hyperparameters=None, py_versio
6767
Examples:
6868
123.dkr.ecr.us-west-2.amazonaws.com/my-custom-image:1.0
6969
custom-image:latest.
70-
distribution (dict): A dictionary with information on how to run distributed training
70+
distributions (dict): A dictionary with information on how to run distributed training
7171
(default: None).
7272
**kwargs: Additional kwargs passed to the :class:`~sagemaker.estimator.Framework` constructor.
7373
"""

src/sagemaker/tensorflow/estimator.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ def __init__(self, training_steps=None, evaluation_steps=None, checkpoint_path=N
199199
custom-image:latest.
200200
script_mode (bool): If set to True will the estimator will use the Script Mode containers (default: False).
201201
This will be ignored if py_version is set to 'py3'.
202-
distribution (dict): A dictionary with information on how to run distributed training
202+
distributions (dict): A dictionary with information on how to run distributed training
203203
(default: None). Currently we only support distributed training with parameter servers. To enable it
204204
use the following setup:
205205
{

src/sagemaker/tensorflow/serving.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
from __future__ import absolute_import
1414

1515
import logging
16-
1716
import sagemaker
1817
from sagemaker.content_types import CONTENT_TYPE_JSON
1918
from sagemaker.fw_utils import create_image_uri
@@ -144,7 +143,6 @@ def _get_image_uri(self, instance_type):
144143
if self.image:
145144
return self.image
146145

147-
# reuse standard image uri function, then strip unwanted python component
148146
region_name = self.sagemaker_session.boto_region_name
149147
return create_image_uri(region_name, Model.FRAMEWORK_NAME, instance_type,
150148
self._framework_version)

src/sagemaker/utils.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
import errno
1616
import os
17+
import random
1718
import re
1819
import sys
1920
import tarfile
@@ -64,6 +65,14 @@ def name_from_base(base, max_length=63, short=False):
6465
return '{}-{}'.format(trimmed_base, timestamp)
6566

6667

68+
def unique_name_from_base(base, max_length=63):
69+
unique = '%04x' % random.randrange(16**4) # 4-digit hex
70+
ts = str(int(time.time()))
71+
available_length = max_length - 2 - len(ts) - len(unique)
72+
trimmed = base[:available_length]
73+
return '{}-{}-{}'.format(trimmed, ts, unique)
74+
75+
6776
def airflow_name_from_base(base, max_length=63, short=False):
6877
"""Append airflow execution_date macro (https://airflow.apache.org/code.html?#macros)
6978
to the provided string. The macro will be evaluated in Airflow operator runtime.

tests/conftest.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,11 @@ def mxnet_version(request):
9595
return request.param
9696

9797

98+
@pytest.fixture(scope='module', params=['1.3', '1.3.0'])
99+
def ei_mxnet_version(request):
100+
return request.param
101+
102+
98103
@pytest.fixture(scope='module', params=['0.4', '0.4.0'])
99104
def pytorch_version(request):
100105
return request.param
@@ -112,6 +117,11 @@ def tf_version(request):
112117
return request.param
113118

114119

120+
@pytest.fixture(scope='module', params=['1.11', '1.11.0'])
121+
def ei_tf_version(request):
122+
return request.param
123+
124+
115125
@pytest.fixture(scope='module', params=['0.10.1', '0.10.1', '0.11', '0.11.0'])
116126
def rl_coach_tf_version(request):
117127
return request.param

tests/integ/local_mode_utils.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
# Copyright 2017-2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License"). You
4+
# may not use this file except in compliance with the License. A copy of
5+
# the License is located at
6+
#
7+
# http://aws.amazon.com/apache2.0/
8+
#
9+
# or in the "license" file accompanying this file. This file is
10+
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11+
# ANY KIND, either express or implied. See the License for the specific
12+
# language governing permissions and limitations under the License.
13+
from __future__ import absolute_import
14+
15+
import fcntl
16+
import os
17+
import time
18+
from contextlib import contextmanager
19+
20+
import tests.integ
21+
22+
LOCK_PATH = os.path.join(tests.integ.DATA_DIR, 'local_mode_lock')
23+
24+
25+
@contextmanager
26+
def lock():
27+
# Since Local Mode uses the same port for serving, we need a lock in order
28+
# to allow concurrent test execution.
29+
local_mode_lock_fd = open(LOCK_PATH, 'w')
30+
local_mode_lock = local_mode_lock_fd.fileno()
31+
32+
fcntl.lockf(local_mode_lock, fcntl.LOCK_EX)
33+
34+
try:
35+
yield
36+
finally:
37+
time.sleep(5)
38+
fcntl.lockf(local_mode_lock, fcntl.LOCK_UN)

tests/integ/test_inference_pipeline.py

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,30 +16,30 @@
1616
import os
1717

1818
import pytest
19+
from tests.integ import DATA_DIR
20+
from tests.integ.timeout import timeout_and_delete_endpoint_by_name
1921

2022
from sagemaker.amazon.amazon_estimator import get_image_uri
2123
from sagemaker.content_types import CONTENT_TYPE_CSV
2224
from sagemaker.model import Model
2325
from sagemaker.pipeline import PipelineModel
2426
from sagemaker.predictor import RealTimePredictor, json_serializer
25-
from sagemaker.session import Session
2627
from sagemaker.sparkml.model import SparkMLModel
2728
from sagemaker.utils import sagemaker_timestamp
28-
from tests.integ import DATA_DIR
29-
from tests.integ.timeout import timeout_and_delete_endpoint_by_name
3029

3130

3231
@pytest.mark.continuous_testing
3332
@pytest.mark.regional_testing
3433
def test_inference_pipeline_model_deploy(sagemaker_session):
35-
# Creates a Pipeline model comprising of SparkML (serialized by MLeap) and XGBoost and deploys to one endpoint
3634
sparkml_data_path = os.path.join(DATA_DIR, 'sparkml_model')
3735
xgboost_data_path = os.path.join(DATA_DIR, 'xgboost_model')
3836
endpoint_name = 'test-inference-pipeline-deploy-{}'.format(sagemaker_timestamp())
39-
sparkml_model_data = sagemaker_session.upload_data(path=os.path.join(sparkml_data_path, 'mleap_model.tar.gz'),
40-
key_prefix='integ-test-data/sparkml/model')
41-
xgb_model_data = sagemaker_session.upload_data(path=os.path.join(xgboost_data_path, 'xgb_model.tar.gz'),
42-
key_prefix='integ-test-data/xgboost/model')
37+
sparkml_model_data = sagemaker_session.upload_data(
38+
path=os.path.join(sparkml_data_path, 'mleap_model.tar.gz'),
39+
key_prefix='integ-test-data/sparkml/model')
40+
xgb_model_data = sagemaker_session.upload_data(
41+
path=os.path.join(xgboost_data_path, 'xgb_model.tar.gz'),
42+
key_prefix='integ-test-data/xgboost/model')
4343
schema = json.dumps({
4444
"input": [
4545
{
@@ -74,10 +74,12 @@ def test_inference_pipeline_model_deploy(sagemaker_session):
7474
}
7575
})
7676
with timeout_and_delete_endpoint_by_name(endpoint_name, sagemaker_session):
77-
sparkml_model = SparkMLModel(model_data=sparkml_model_data, env={'SAGEMAKER_SPARKML_SCHEMA': schema},
77+
sparkml_model = SparkMLModel(model_data=sparkml_model_data,
78+
env={'SAGEMAKER_SPARKML_SCHEMA': schema},
7879
sagemaker_session=sagemaker_session)
79-
xgb_image = get_image_uri(Session().boto_region_name, 'xgboost')
80-
xgb_model = Model(model_data=xgb_model_data, image=xgb_image, sagemaker_session=sagemaker_session)
80+
xgb_image = get_image_uri(sagemaker_session.boto_region_name, 'xgboost')
81+
xgb_model = Model(model_data=xgb_model_data, image=xgb_image,
82+
sagemaker_session=sagemaker_session)
8183
model = PipelineModel(models=[sparkml_model, xgb_model], role='SageMakerRole',
8284
sagemaker_session=sagemaker_session, name=endpoint_name)
8385
model.deploy(1, 'ml.m4.xlarge', endpoint_name=endpoint_name)

0 commit comments

Comments
 (0)