Skip to content

Add distributed training support #98

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Nov 6, 2018
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions docker/1.11.0/Dockerfile.cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
FROM ubuntu:16.04

# MAINTAINER is deprecated (hadolint DL4000); use a LABEL instead.
LABEL maintainer="Amazon AI"

# framework_installable: the TensorFlow pip-installable to bake in (required)
# framework_support_installable: the SageMaker TF container support package
# py_version: Python major version to install, 2 or 3 (required)
ARG framework_installable
ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz
ARG py_version

# Fail the build early if required arguments were not specified
RUN test $framework_installable || exit 1 \
 && test $py_version || exit 1

# The deadsnakes PPA provides python3.6 packages for ubuntu:16.04
RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common \
 && add-apt-repository ppa:deadsnakes/ppa -y \
 && rm -rf /var/lib/apt/lists/*

# Install build dependencies and the requested Python, cleaning the apt lists
# in the same layer so they do not bloat the image.
RUN buildDeps=" \
        ca-certificates \
        curl \
        nginx \
    " \
 && apt-get update && apt-get install -y --no-install-recommends $buildDeps \
 && if [ $py_version -eq 3 ]; \
       then apt-get install -y --no-install-recommends python3.6 \
        && ln -s -f /usr/bin/python3.6 /usr/bin/python; \
       else apt-get install -y --no-install-recommends python; fi \
 && rm -rf /var/lib/apt/lists/*

# Python won't try to write .pyc or .pyo files on the import of source modules
ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1

# Bootstrap a pinned pip, then strip test directories and bytecode to keep the
# image small.
RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py \
        --disable-pip-version-check \
        --no-cache-dir \
        "pip==18.1" \
    ; \
    pip --version; \
    find /usr/local -depth \
        \( \
            \( -type d -a \( -name test -o -name tests \) \) \
            -o \
            \( -type f -a \( -name '*.pyc' -o -name '*.pyo' \) \) \
        \) -exec rm -rf '{}' +; \
    rm get-pip.py

# Set environment variables for MKL
# TODO: investigate the right value for OMP_NUM_THREADS
# For more about MKL with TensorFlow see:
# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
ENV KMP_AFFINITY=granularity=fine,compact,1,0 KMP_BLOCKTIME=1 KMP_SETTINGS=0

WORKDIR /

COPY $framework_installable .
COPY $framework_support_installable .

# Use the basenames (as the GPU Dockerfile does): COPY places the files at the
# root of WORKDIR even when the build arguments are given as paths, so
# installing the raw argument value would fail for path-qualified arguments.
# tensorboard (and its markdown dependency) is not needed for training, so it
# is uninstalled to reduce image size.
RUN framework_installable_local=$(basename $framework_installable) \
 && framework_support_installable_local=$(basename $framework_support_installable) \
 && pip install --no-cache-dir $framework_installable_local \
                               $framework_support_installable_local \
                               "sagemaker-tensorflow>=1.11,<1.12" \
 && rm -f $framework_installable_local \
 && rm -f $framework_support_installable_local \
 && pip uninstall -y --no-cache-dir \
        markdown \
        tensorboard

ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
89 changes: 89 additions & 0 deletions docker/1.11.0/Dockerfile.gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
FROM nvidia/cuda:9.0-base-ubuntu16.04

# MAINTAINER is deprecated (hadolint DL4000); use a LABEL instead.
LABEL maintainer="Amazon AI"

# framework_installable: the TensorFlow pip-installable to bake in (required)
# framework_support_installable: the SageMaker TF container support package
# py_version: Python major version to install, 2 or 3 (required)
ARG framework_installable
ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz
ARG py_version

# Fail the build early if required arguments were not specified
RUN test $framework_installable || exit 1 \
 && test $py_version || exit 1

# The deadsnakes PPA provides python3.6 packages for ubuntu:16.04
RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common \
 && add-apt-repository ppa:deadsnakes/ppa -y \
 && rm -rf /var/lib/apt/lists/*

# Pinned CUDA library versions (must stay consistent with the base image's
# CUDA 9.0 toolchain).
ENV NCCL_VERSION=2.3.5-2+cuda9.0
ENV CUDNN_VERSION=7.3.1.20-1+cuda9.0
ENV TF_TENSORRT_VERSION=4.1.2

# Install CUDA build/runtime dependencies, TensorRT runtime, and the requested
# Python, cleaning the apt lists in the same layer.
RUN buildDeps=" \
        ca-certificates \
        cuda-command-line-tools-9-0 \
        cuda-cublas-dev-9-0 \
        cuda-cudart-dev-9-0 \
        cuda-cufft-dev-9-0 \
        cuda-curand-dev-9-0 \
        cuda-cusolver-dev-9-0 \
        cuda-cusparse-dev-9-0 \
        curl \
        libcudnn7=${CUDNN_VERSION} \
        libnccl2=${NCCL_VERSION} \
        libgomp1 \
    " \
 && apt-get update && apt-get install -y --no-install-recommends $buildDeps \
 # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0
 # adds a new list which contains libnvinfer library, so it needs another
 # 'apt-get update' to retrieve that list before it can actually install the
 # library.
 # We don't install libnvinfer-dev since we don't need to build against TensorRT,
 # and libnvinfer4 doesn't contain libnvinfer.a static library.
 && apt-get update && apt-get install -y --no-install-recommends \
        nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 \
 && apt-get update && apt-get install -y --no-install-recommends \
        libnvinfer4=${TF_TENSORRT_VERSION}-1+cuda9.0 \
 # The plugin/parser libraries are not needed at runtime; remove them to
 # reduce image size.
 && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
 && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
 && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \
 && if [ $py_version -eq 3 ]; \
       then apt-get install -y --no-install-recommends python3.6 \
        && ln -s -f /usr/bin/python3.6 /usr/bin/python; \
       else apt-get install -y --no-install-recommends python; fi \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# Python won't try to write .pyc or .pyo files on the import of source modules
ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1

# Bootstrap a pinned pip, then strip test directories and bytecode to keep the
# image small.
RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py \
        --disable-pip-version-check \
        --no-cache-dir \
        "pip==18.1" \
    ; \
    pip --version; \
    find /usr/local -depth \
        \( \
            \( -type d -a \( -name test -o -name tests \) \) \
            -o \
            \( -type f -a \( -name '*.pyc' -o -name '*.pyo' \) \) \
        \) -exec rm -rf '{}' +; \
    rm get-pip.py

WORKDIR /root

COPY $framework_installable .
COPY $framework_support_installable .

# Use the basenames: COPY places the files at the root of WORKDIR even when
# the build arguments are given as paths. Use the canonical --no-cache-dir
# spelling consistently on every install (the original mixed the abbreviated
# --no-cache with uncached installs).
RUN framework_installable_local=$(basename $framework_installable) \
 && framework_support_installable_local=$(basename $framework_support_installable) \
 \
 && pip install --no-cache-dir --upgrade $framework_installable_local \
 && pip install --no-cache-dir $framework_support_installable_local \
 && pip install --no-cache-dir "sagemaker-tensorflow>=1.11,<1.12" \
 \
 && rm $framework_installable_local \
 && rm $framework_support_installable_local

ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can use a linter like https://github.com/hadolint/hadolint to help you here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have run this locally; the only thing that comes up now is the version-pinning warnings. I will add it to the build in the next PR.

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def read(fname):
'Programming Language :: Python :: 3.6',
],

install_requires=['sagemaker-containers==2.1', 'numpy', 'scipy', 'sklearn',
install_requires=['sagemaker-containers>=2.2.6', 'numpy', 'scipy', 'sklearn',
'pandas', 'Pillow', 'h5py'],
extras_require={
'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist', 'mock',
Expand Down
136 changes: 134 additions & 2 deletions src/sagemaker_tensorflow_container/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,151 @@

from __future__ import absolute_import

import json
import logging
import os
import subprocess
import time

import sagemaker_containers.beta.framework as framework


logger = logging.getLogger(__name__)


SAGEMAKER_PARAMETER_SERVER_NUM = 'sagemaker_parameter_server_num'


def _is_host_master(hosts, current_host):
return current_host == hosts[0]


def _build_tf_config(hosts, current_host, ps_num=0, ps_task=False):
    """Builds a dictionary containing cluster information based on number of hosts and number of
    parameter servers.

    Args:
        hosts (list[str]): List of host names in the cluster
        current_host (str): Current host name
        ps_num (int): Number of parameter servers (default: 0). ``None`` is treated
            the same as 0 so callers can pass the raw (possibly unset)
            hyperparameter value directly.
        ps_task (bool): Set to True if this config is built for a parameter server process
            (default: False)

    Returns:
        dict[str: dict]: A dictionary describing the cluster setup for distributed training.
        For more information regarding TF_CONFIG:
        https://cloud.google.com/ml-engine/docs/tensorflow/distributed-training-details

    Raises:
        ValueError: If ``ps_task`` is True but the cluster has no parameter servers.
    """
    # Normalize a missing hyperparameter: comparing None > 0 below would raise
    # TypeError under Python 3.
    ps_num = ps_num or 0

    # Assign the first host as the master. Rest of the hosts if any will be worker hosts.
    # The first ps_num hosts will also have a parameter task assign to them.
    masters = hosts[:1]
    workers = hosts[1:]
    ps = hosts[:ps_num] if len(hosts) > 1 and ps_num > 0 else None

    def host_addresses(hosts, port=2222):
        return ['{}:{}'.format(host, port) for host in hosts]

    tf_config = {
        'cluster': {
            'master': host_addresses(masters)
        },
        'environment': 'cloud'
    }

    if ps:
        # Parameter servers listen on a separate port so a ps process can share
        # a host with the master/worker process.
        tf_config['cluster']['ps'] = host_addresses(ps, port=2223)

    if workers:
        tf_config['cluster']['worker'] = host_addresses(workers)

    if ps_task:
        if ps is None:
            raise ValueError(
                'Cannot have a ps task if there are no parameter servers in the cluster')
        task_type = 'ps'
        task_index = ps.index(current_host)
    elif _is_host_master(hosts, current_host):
        task_type = 'master'
        task_index = 0
    else:
        task_type = 'worker'
        task_index = workers.index(current_host)

    tf_config['task'] = {'index': task_index, 'type': task_type}
    return tf_config


def _env_vars_with_tf_config(env, ps_task):
    """Return the environment variables for ``env`` with a TF_CONFIG entry added.

    Args:
        env (sagemaker_containers.beta.framework.env.TrainingEnv): training environment
        ps_task (bool): whether TF_CONFIG should describe a parameter server task

    Returns:
        dict: environment variables including a JSON-serialized TF_CONFIG
    """
    tf_config = _build_tf_config(
        hosts=env.hosts,
        current_host=env.current_host,
        ps_num=env.additional_framework_parameters.get(SAGEMAKER_PARAMETER_SERVER_NUM),
        ps_task=ps_task)

    env_vars = env.to_env_vars()
    env_vars['TF_CONFIG'] = json.dumps(tf_config)
    return env_vars


def _run_ps(env):
    """Launch the parameter server process for this host in the background.

    Args:
        env (sagemaker_containers.beta.framework.env.TrainingEnv): training environment

    Returns:
        Whatever ``framework.modules.run_module`` returns when called with
        ``wait=False`` (the ps runs alongside the worker process on this host).
    """
    ps_env_vars = _env_vars_with_tf_config(env, ps_task=True)
    # wait=False so the caller can go on to launch the worker process.
    return framework.modules.run_module(
        env.module_dir, env.to_cmd_args(), ps_env_vars, env.module_name, wait=False)


def _run_worker(env, install_module=False):
    """Run the worker (or master) training process on this host.

    Args:
        env (sagemaker_containers.beta.framework.env.TrainingEnv): training environment
        install_module (bool): when True, install the user module and run it via
            ``run_module``; when False, run the module directly — presumably
            because an earlier launch on this host already installed it
            (NOTE(review): confirm against the caller in ``train``).
    """
    worker_env_vars = _env_vars_with_tf_config(env, ps_task=False)
    if not install_module:
        framework.modules.write_env_vars(worker_env_vars)
        framework.modules.run(env.module_name, env.to_cmd_args(), worker_env_vars)
    else:
        return framework.modules.run_module(
            env.module_dir, env.to_cmd_args(), worker_env_vars, env.module_name)


def _should_run_ps_on_this_host(hosts, current_host, parameter_server_num):
return current_host in hosts[:parameter_server_num]


def _wait_until_master_is_down(master):
    """Block until the master host stops accepting connections on port 2222.

    Polls the master every 10 seconds with curl; returns once a connection
    attempt fails (non-zero curl exit), which indicates the master process
    has exited.

    Args:
        master (str): host name of the master host
    """
    # Discard curl's output instead of capturing it with subprocess.PIPE: a
    # PIPE that is never read can fill up and deadlock the child process.
    with open(os.devnull, 'wb') as devnull:
        while True:
            try:
                subprocess.check_call(
                    ['curl', '{}:2222'.format(master)], stdout=devnull, stderr=devnull)
                logger.info('master {} is still up, waiting for it to exit'.format(master))
                time.sleep(10)
            except subprocess.CalledProcessError:
                logger.info('master {} is down, stopping parameter server'.format(master))
                return


def train(env):
    """Get training job environment from env and run the training job.

    Args:
        env (sagemaker_containers.beta.framework.env.TrainingEnv): Instance of TrainingEnv class
    """
    # NOTE: the unconditional run_module call that previously sat above this
    # docstring was leftover from the pre-distributed implementation; it ran the
    # module before any of the logic below and made the docstring dead code.
    parameter_server_num = env.additional_framework_parameters.get(SAGEMAKER_PARAMETER_SERVER_NUM)
    if len(env.hosts) > 1 and parameter_server_num:
        logger.info('Running distributed training job with {} parameter servers'.
                    format(parameter_server_num))
        if _should_run_ps_on_this_host(env.hosts, env.current_host, parameter_server_num):
            logger.info('Launching parameter server process')
            _run_ps(env)
            logger.info('Launching worker process')
            # The ps launch above already installed the module on this host.
            _run_worker(env, install_module=False)
        else:
            _run_worker(env, install_module=True)

        # Keep non-master hosts alive until the master finishes so their
        # parameter server processes are not torn down prematurely.
        if not _is_host_master(env.hosts, env.current_host):
            _wait_until_master_is_down(env.hosts[0])

    else:
        # Single host or no parameter servers requested: plain single-process run.
        framework.modules.run_module(env.module_dir, env.to_cmd_args(),
                                     env.to_env_vars(), env.module_name)


def main():
"""Training entry point
"""
hyperparameters = framework.env.read_hyperparameters()
env = framework.training_env(hyperparameters=hyperparameters)
logger.setLevel(env.log_level)
Expand Down
2 changes: 1 addition & 1 deletion test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def pytest_addoption(parser):
parser.addoption('--docker-base-name', default='preprod-tensorflow')
parser.addoption('--tag', default=None)
parser.addoption('--region', default='us-west-2')
parser.addoption('--framework-version', default='1.10.0')
parser.addoption('--framework-version', default='1.11.0')
parser.addoption('--processor', default='cpu', choices=['gpu', 'cpu'])
parser.addoption('--py-version', default='3', choices=['2', '3'])
parser.addoption('--account-id', default='142577830533')
Expand Down
Loading