Skip to content

Add distributed training support #98

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Nov 6, 2018
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 68 additions & 0 deletions docker/1.11.0/Dockerfile.cpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
FROM ubuntu:16.04

# MAINTAINER is deprecated (hadolint DL4000); use a LABEL instead.
LABEL maintainer="Amazon AI"

# framework_installable: the TensorFlow pip-installable to bake in (required)
# framework_support_installable: the SageMaker TF container support package
# py_version: Python major version to install, 2 or 3 (required)
ARG framework_installable
ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz
ARG py_version

# Fail the build early if required arguments were not specified
RUN test $framework_installable || exit 1 \
 && test $py_version || exit 1

# The deadsnakes PPA provides python3.6 packages for ubuntu:16.04
RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common \
 && add-apt-repository ppa:deadsnakes/ppa -y \
 && rm -rf /var/lib/apt/lists/*

# Install build dependencies and the requested Python, cleaning the apt lists
# in the same layer so they do not bloat the image.
RUN buildDeps=" \
        ca-certificates \
        curl \
        nginx \
    " \
 && apt-get update && apt-get install -y --no-install-recommends $buildDeps \
 && if [ $py_version -eq 3 ]; \
       then apt-get install -y --no-install-recommends python3.6 \
        && ln -s -f /usr/bin/python3.6 /usr/bin/python; \
       else apt-get install -y --no-install-recommends python; fi \
 && rm -rf /var/lib/apt/lists/*

# Python won't try to write .pyc or .pyo files on the import of source modules
ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1

# Bootstrap a pinned pip, then strip test directories and bytecode to keep the
# image small.
RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py \
        --disable-pip-version-check \
        --no-cache-dir \
        "pip==18.1" \
    ; \
    pip --version; \
    find /usr/local -depth \
        \( \
            \( -type d -a \( -name test -o -name tests \) \) \
            -o \
            \( -type f -a \( -name '*.pyc' -o -name '*.pyo' \) \) \
        \) -exec rm -rf '{}' +; \
    rm get-pip.py

# Set environment variables for MKL
# TODO: investigate the right value for OMP_NUM_THREADS
# For more about MKL with TensorFlow see:
# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
ENV KMP_AFFINITY=granularity=fine,compact,1,0 KMP_BLOCKTIME=1 KMP_SETTINGS=0

WORKDIR /

COPY $framework_installable .
COPY $framework_support_installable .

# Use the basenames (as the GPU Dockerfile does): COPY places the files at the
# root of WORKDIR even when the build arguments are given as paths, so
# installing the raw argument value would fail for path-qualified arguments.
# tensorboard (and its markdown dependency) is not needed for training, so it
# is uninstalled to reduce image size.
RUN framework_installable_local=$(basename $framework_installable) \
 && framework_support_installable_local=$(basename $framework_support_installable) \
 && pip install --no-cache-dir $framework_installable_local \
                               $framework_support_installable_local \
                               "sagemaker-tensorflow>=1.11,<1.12" \
 && rm -f $framework_installable_local \
 && rm -f $framework_support_installable_local \
 && pip uninstall -y --no-cache-dir \
        markdown \
        tensorboard

ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
89 changes: 89 additions & 0 deletions docker/1.11.0/Dockerfile.gpu
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
FROM nvidia/cuda:9.0-base-ubuntu16.04

# MAINTAINER is deprecated (hadolint DL4000); use a LABEL instead.
LABEL maintainer="Amazon AI"

# framework_installable: the TensorFlow pip-installable to bake in (required)
# framework_support_installable: the SageMaker TF container support package
# py_version: Python major version to install, 2 or 3 (required)
ARG framework_installable
ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz
ARG py_version

# Fail the build early if required arguments were not specified
RUN test $framework_installable || exit 1 \
 && test $py_version || exit 1

# The deadsnakes PPA provides python3.6 packages for ubuntu:16.04
RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common \
 && add-apt-repository ppa:deadsnakes/ppa -y \
 && rm -rf /var/lib/apt/lists/*

# Pinned CUDA library versions (must stay consistent with the base image's
# CUDA 9.0 toolchain).
ENV NCCL_VERSION=2.3.5-2+cuda9.0
ENV CUDNN_VERSION=7.3.1.20-1+cuda9.0
ENV TF_TENSORRT_VERSION=4.1.2

# Install CUDA build/runtime dependencies, TensorRT runtime, and the requested
# Python, cleaning the apt lists in the same layer.
RUN buildDeps=" \
        ca-certificates \
        cuda-command-line-tools-9-0 \
        cuda-cublas-dev-9-0 \
        cuda-cudart-dev-9-0 \
        cuda-cufft-dev-9-0 \
        cuda-curand-dev-9-0 \
        cuda-cusolver-dev-9-0 \
        cuda-cusparse-dev-9-0 \
        curl \
        libcudnn7=${CUDNN_VERSION} \
        libnccl2=${NCCL_VERSION} \
        libgomp1 \
    " \
 && apt-get update && apt-get install -y --no-install-recommends $buildDeps \
 # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0
 # adds a new list which contains libnvinfer library, so it needs another
 # 'apt-get update' to retrieve that list before it can actually install the
 # library.
 # We don't install libnvinfer-dev since we don't need to build against TensorRT,
 # and libnvinfer4 doesn't contain libnvinfer.a static library.
 && apt-get update && apt-get install -y --no-install-recommends \
        nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 \
 && apt-get update && apt-get install -y --no-install-recommends \
        libnvinfer4=${TF_TENSORRT_VERSION}-1+cuda9.0 \
 # The plugin/parser libraries are not needed at runtime; remove them to
 # reduce image size.
 && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
 && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
 && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \
 && if [ $py_version -eq 3 ]; \
       then apt-get install -y --no-install-recommends python3.6 \
        && ln -s -f /usr/bin/python3.6 /usr/bin/python; \
       else apt-get install -y --no-install-recommends python; fi \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*

# Python won't try to write .pyc or .pyo files on the import of source modules
ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1

# Bootstrap a pinned pip, then strip test directories and bytecode to keep the
# image small.
RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py \
        --disable-pip-version-check \
        --no-cache-dir \
        "pip==18.1" \
    ; \
    pip --version; \
    find /usr/local -depth \
        \( \
            \( -type d -a \( -name test -o -name tests \) \) \
            -o \
            \( -type f -a \( -name '*.pyc' -o -name '*.pyo' \) \) \
        \) -exec rm -rf '{}' +; \
    rm get-pip.py

WORKDIR /root

COPY $framework_installable .
COPY $framework_support_installable .

# Use the basenames: COPY places the files at the root of WORKDIR even when
# the build arguments are given as paths. Use the canonical --no-cache-dir
# spelling consistently on every install (the original mixed the abbreviated
# --no-cache with uncached installs).
RUN framework_installable_local=$(basename $framework_installable) \
 && framework_support_installable_local=$(basename $framework_support_installable) \
 \
 && pip install --no-cache-dir --upgrade $framework_installable_local \
 && pip install --no-cache-dir $framework_support_installable_local \
 && pip install --no-cache-dir "sagemaker-tensorflow>=1.11,<1.12" \
 \
 && rm $framework_installable_local \
 && rm $framework_support_installable_local

ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can use a linter like https://github.com/hadolint/hadolint to help you here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have run this locally; the only thing that comes up now is the version-pinning warnings. I will add it to the build in the next PR.

2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def read(fname):
'Programming Language :: Python :: 3.6',
],

install_requires=['sagemaker-containers==2.1', 'numpy', 'scipy', 'sklearn',
install_requires=['sagemaker-containers>=2.2.6', 'numpy', 'scipy', 'sklearn',
'pandas', 'Pillow', 'h5py'],
extras_require={
'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist', 'mock',
Expand Down
136 changes: 134 additions & 2 deletions src/sagemaker_tensorflow_container/training.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,19 +13,151 @@

from __future__ import absolute_import

import json
import logging
import os
import subprocess
import time

import sagemaker_containers.beta.framework as framework


logger = logging.getLogger(__name__)


SAGEMAKER_PARAMETER_SERVER_NUM = 'sagemaker_parameter_server_num'


def _is_host_master(hosts, current_host):
return current_host == hosts[0]


def _build_tf_config(hosts, current_host, ps_num=0, ps_task=False):
    """Builds a dictionary containing cluster information based on number of hosts and number of
    parameter servers.

    Args:
        hosts (list[str]): List of host names in the cluster
        current_host (str): Current host name
        ps_num (int): Number of parameter servers (default: 0). ``None`` is treated
            the same as 0 so callers can pass the raw (possibly unset)
            hyperparameter value directly.
        ps_task (bool): Set to True if this config is built for a parameter server process
            (default: False)

    Returns:
        dict[str: dict]: A dictionary describing the cluster setup for distributed training.
        For more information regarding TF_CONFIG:
        https://cloud.google.com/ml-engine/docs/tensorflow/distributed-training-details

    Raises:
        ValueError: If ``ps_task`` is True but the cluster has no parameter servers.
    """
    # Normalize a missing hyperparameter: comparing None > 0 below would raise
    # TypeError under Python 3.
    ps_num = ps_num or 0

    # Assign the first host as the master. Rest of the hosts if any will be worker hosts.
    # The first ps_num hosts will also have a parameter task assign to them.
    masters = hosts[:1]
    workers = hosts[1:]
    ps = hosts[:ps_num] if len(hosts) > 1 and ps_num > 0 else None

    def host_addresses(hosts, port=2222):
        return ['{}:{}'.format(host, port) for host in hosts]

    tf_config = {
        'cluster': {
            'master': host_addresses(masters)
        },
        'environment': 'cloud'
    }

    if ps:
        # Parameter servers listen on a separate port so a ps process can share
        # a host with the master/worker process.
        tf_config['cluster']['ps'] = host_addresses(ps, port=2223)

    if workers:
        tf_config['cluster']['worker'] = host_addresses(workers)

    if ps_task:
        if ps is None:
            raise ValueError(
                'Cannot have a ps task if there are no parameter servers in the cluster')
        task_type = 'ps'
        task_index = ps.index(current_host)
    elif _is_host_master(hosts, current_host):
        task_type = 'master'
        task_index = 0
    else:
        task_type = 'worker'
        task_index = workers.index(current_host)

    tf_config['task'] = {'index': task_index, 'type': task_type}
    return tf_config


def _env_vars_with_tf_config(env, ps_task):
    """Return the environment variables for ``env`` with a TF_CONFIG entry added.

    Args:
        env (sagemaker_containers.beta.framework.env.TrainingEnv): training environment
        ps_task (bool): whether TF_CONFIG should describe a parameter server task

    Returns:
        dict: environment variables including a JSON-serialized TF_CONFIG
    """
    tf_config = _build_tf_config(
        hosts=env.hosts,
        current_host=env.current_host,
        ps_num=env.additional_framework_parameters.get(SAGEMAKER_PARAMETER_SERVER_NUM),
        ps_task=ps_task)

    env_vars = env.to_env_vars()
    env_vars['TF_CONFIG'] = json.dumps(tf_config)
    return env_vars


def _run_ps(env):
    """Launch the parameter server process for this host in the background.

    Args:
        env (sagemaker_containers.beta.framework.env.TrainingEnv): training environment

    Returns:
        Whatever ``framework.modules.run_module`` returns when called with
        ``wait=False`` (the ps runs alongside the worker process on this host).
    """
    ps_env_vars = _env_vars_with_tf_config(env, ps_task=True)
    # wait=False so the caller can go on to launch the worker process.
    return framework.modules.run_module(
        env.module_dir, env.to_cmd_args(), ps_env_vars, env.module_name, wait=False)


def _run_worker(env, install_module=False):
    """Run the worker (or master) training process on this host.

    Args:
        env (sagemaker_containers.beta.framework.env.TrainingEnv): training environment
        install_module (bool): when True, install the user module and run it via
            ``run_module``; when False, run the module directly — presumably
            because an earlier launch on this host already installed it
            (NOTE(review): confirm against the caller in ``train``).
    """
    worker_env_vars = _env_vars_with_tf_config(env, ps_task=False)
    if not install_module:
        framework.modules.write_env_vars(worker_env_vars)
        framework.modules.run(env.module_name, env.to_cmd_args(), worker_env_vars)
    else:
        return framework.modules.run_module(
            env.module_dir, env.to_cmd_args(), worker_env_vars, env.module_name)


def _should_run_ps_on_this_host(hosts, current_host, parameter_server_num):
return current_host in hosts[:parameter_server_num]


def _wait_until_master_is_down(master):
    """Block until the master host stops accepting connections on port 2222.

    Polls the master every 10 seconds with curl; returns once a connection
    attempt fails (non-zero curl exit), which indicates the master process
    has exited.

    Args:
        master (str): host name of the master host
    """
    # Discard curl's output instead of capturing it with subprocess.PIPE: a
    # PIPE that is never read can fill up and deadlock the child process.
    with open(os.devnull, 'wb') as devnull:
        while True:
            try:
                subprocess.check_call(
                    ['curl', '{}:2222'.format(master)], stdout=devnull, stderr=devnull)
                logger.info('master {} is still up, waiting for it to exit'.format(master))
                time.sleep(10)
            except subprocess.CalledProcessError:
                logger.info('master {} is down, stopping parameter server'.format(master))
                return


def train(env):
    """Get training job environment from env and run the training job.

    Args:
        env (sagemaker_containers.beta.framework.env.TrainingEnv): Instance of TrainingEnv class
    """
    # NOTE: the unconditional run_module call that previously sat above this
    # docstring was leftover from the pre-distributed implementation; it ran the
    # module before any of the logic below and made the docstring dead code.
    parameter_server_num = env.additional_framework_parameters.get(SAGEMAKER_PARAMETER_SERVER_NUM)
    if len(env.hosts) > 1 and parameter_server_num:
        logger.info('Running distributed training job with {} parameter servers'.
                    format(parameter_server_num))
        if _should_run_ps_on_this_host(env.hosts, env.current_host, parameter_server_num):
            logger.info('Launching parameter server process')
            _run_ps(env)
            logger.info('Launching worker process')
            # The ps launch above already installed the module on this host.
            _run_worker(env, install_module=False)
        else:
            _run_worker(env, install_module=True)

        # Keep non-master hosts alive until the master finishes so their
        # parameter server processes are not torn down prematurely.
        if not _is_host_master(env.hosts, env.current_host):
            _wait_until_master_is_down(env.hosts[0])

    else:
        # Single host or no parameter servers requested: plain single-process run.
        framework.modules.run_module(env.module_dir, env.to_cmd_args(),
                                     env.to_env_vars(), env.module_name)


def main():
"""Training entry point
"""
hyperparameters = framework.env.read_hyperparameters()
env = framework.training_env(hyperparameters=hyperparameters)
logger.setLevel(env.log_level)
Expand Down
2 changes: 1 addition & 1 deletion test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def pytest_addoption(parser):
parser.addoption('--docker-base-name', default='preprod-tensorflow')
parser.addoption('--tag', default=None)
parser.addoption('--region', default='us-west-2')
parser.addoption('--framework-version', default='1.10.0')
parser.addoption('--framework-version', default='1.11.0')
parser.addoption('--processor', default='cpu', choices=['gpu', 'cpu'])
parser.addoption('--py-version', default='3', choices=['2', '3'])
parser.addoption('--account-id', default='142577830533')
Expand Down
Loading