Skip to content

Commit 032cf60

Browse files
authored
Add distributed training support (#98)
* Implement distributed support * Launch a parameter server if the user sets sagemaker_parameter_server_enabled to True * Add integ tests * Add unit tests * Add distributed sagemaker integ test * Add 1.11.0 and modify Dockerfile to reduce image size
1 parent 7047101 commit 032cf60

File tree

15 files changed

+762
-39
lines changed

15 files changed

+762
-39
lines changed

docker/1.11.0/Dockerfile.cpu

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
FROM ubuntu:16.04

LABEL maintainer="Amazon AI"

# Build arguments:
#   framework_installable          package file in the build context that is pip-installed
#                                  as the framework (required)
#   framework_support_installable  sagemaker_tensorflow_container sdist in the build context
#   py_version                     3 installs python3.6; any other value installs the
#                                  distribution's default "python" package (required)
ARG framework_installable
ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz
ARG py_version

# Validate that arguments are specified
RUN test -n "$framework_installable" \
 && test -n "$py_version"

# The deadsnakes PPA provides the python3.6 package installed below
RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common \
 && add-apt-repository ppa:deadsnakes/ppa -y \
 && rm -rf /var/lib/apt/lists/*

RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        nginx \
 && if [ "$py_version" -eq 3 ]; \
       then apt-get install -y --no-install-recommends python3.6 \
        && ln -s -f /usr/bin/python3.6 /usr/bin/python; \
       else apt-get install -y --no-install-recommends python; fi \
 && rm -rf /var/lib/apt/lists/*

# Python won't try to write .pyc or .pyo files on the import of source modules,
# and stdout/stderr are left unbuffered
ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1

# Every step is chained with && so that a failed download or a failed pip
# bootstrap fails the build instead of being masked by the final cleanup.
RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py \
 && python get-pip.py \
        --disable-pip-version-check \
        --no-cache-dir \
        "pip==18.1" \
 && pip --version \
 && find /usr/local -depth \
        \( \
            \( -type d -a \( -name test -o -name tests \) \) \
            -o \
            \( -type f -a \( -name '*.pyc' -o -name '*.pyo' \) \) \
        \) -exec rm -rf '{}' + \
 && rm get-pip.py

# Set environment variables for MKL
# TODO: investigate the right value for OMP_NUM_THREADS
# For more about MKL with TensorFlow see:
# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
ENV KMP_AFFINITY=granularity=fine,compact,1,0 KMP_BLOCKTIME=1 KMP_SETTINGS=0

WORKDIR /

COPY $framework_installable .
COPY $framework_support_installable .

# Install the framework and the container support package, drop the copied
# archives in the same layer, then remove markdown and tensorboard again.
RUN pip install --no-cache-dir \
        "$framework_installable" \
        "$framework_support_installable" \
        "sagemaker-tensorflow>=1.11,<1.12" \
 && rm -f "$framework_installable" \
 && rm -f "$framework_support_installable" \
 && pip uninstall -y --no-cache-dir \
        markdown \
        tensorboard

ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main

docker/1.11.0/Dockerfile.gpu

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
FROM nvidia/cuda:9.0-base-ubuntu16.04

LABEL maintainer="Amazon AI"

# Build arguments:
#   framework_installable          package file in the build context that is pip-installed
#                                  as the framework (required)
#   framework_support_installable  sagemaker_tensorflow_container sdist in the build context
#   py_version                     3 installs python3.6; any other value installs the
#                                  distribution's default "python" package (required)
ARG framework_installable
ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz
ARG py_version

# Validate that arguments are specified
RUN test -n "$framework_installable" \
 && test -n "$py_version"

# The deadsnakes PPA provides the python3.6 package installed below
RUN apt-get update && apt-get install -y --no-install-recommends software-properties-common \
 && add-apt-repository ppa:deadsnakes/ppa -y \
 && rm -rf /var/lib/apt/lists/*

# Versions of the NVIDIA libraries pinned in the apt-get install below
ENV NCCL_VERSION=2.3.5-2+cuda9.0
ENV CUDNN_VERSION=7.3.1.20-1+cuda9.0
ENV TF_TENSORRT_VERSION=4.1.2

RUN apt-get update && apt-get install -y --no-install-recommends \
        ca-certificates \
        cuda-command-line-tools-9-0 \
        cuda-cublas-dev-9-0 \
        cuda-cudart-dev-9-0 \
        cuda-cufft-dev-9-0 \
        cuda-curand-dev-9-0 \
        cuda-cusolver-dev-9-0 \
        cuda-cusparse-dev-9-0 \
        curl \
        libcudnn7=${CUDNN_VERSION} \
        libnccl2=${NCCL_VERSION} \
        libgomp1 \
    # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0
    # adds a new list which contains libnvinfer library, so it needs another
    # 'apt-get update' to retrieve that list before it can actually install the
    # library.
    # We don't install libnvinfer-dev since we don't need to build against TensorRT,
    # and libnvinfer4 doesn't contain libnvinfer.a static library.
 && apt-get update && apt-get install -y --no-install-recommends \
        nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 \
 && apt-get update && apt-get install -y --no-install-recommends \
        libnvinfer4=${TF_TENSORRT_VERSION}-1+cuda9.0 \
 && apt-get clean \
 && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
 && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
 && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \
 && if [ "$py_version" -eq 3 ]; \
       then apt-get install -y --no-install-recommends python3.6 \
        && ln -s -f /usr/bin/python3.6 /usr/bin/python; \
       else apt-get install -y --no-install-recommends python; fi \
 && rm -rf /var/lib/apt/lists/*

# Python won't try to write .pyc or .pyo files on the import of source modules,
# and stdout/stderr are left unbuffered
ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1

# Every step is chained with && so that a failed download or a failed pip
# bootstrap fails the build instead of being masked by the final cleanup.
RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py \
 && python get-pip.py \
        --disable-pip-version-check \
        --no-cache-dir \
        "pip==18.1" \
 && pip --version \
 && find /usr/local -depth \
        \( \
            \( -type d -a \( -name test -o -name tests \) \) \
            -o \
            \( -type f -a \( -name '*.pyc' -o -name '*.pyo' \) \) \
        \) -exec rm -rf '{}' + \
 && rm get-pip.py

WORKDIR /

COPY $framework_installable .
COPY $framework_support_installable .

# Install the framework and the container support package, drop the copied
# archives in the same layer, then remove markdown and tensorboard again.
RUN pip install --no-cache-dir \
        "$framework_installable" \
        "$framework_support_installable" \
        "sagemaker-tensorflow>=1.11,<1.12" \
 && rm -f "$framework_installable" \
 && rm -f "$framework_support_installable" \
 && pip uninstall -y --no-cache-dir \
        markdown \
        tensorboard

ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main

setup.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def read(fname):
4949
'Programming Language :: Python :: 3.6',
5050
],
5151

52-
install_requires=['sagemaker-containers==2.1', 'numpy', 'scipy', 'sklearn',
52+
install_requires=['sagemaker-containers>=2.2.6', 'numpy', 'scipy', 'sklearn',
5353
'pandas', 'Pillow', 'h5py'],
5454
extras_require={
5555
'test': ['tox', 'flake8', 'pytest', 'pytest-cov', 'pytest-xdist', 'mock',

src/sagemaker_tensorflow_container/training.py

Lines changed: 125 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,142 @@
1313

1414
from __future__ import absolute_import
1515

16+
import json
1617
import logging
18+
import subprocess
19+
import time
1720

1821
import sagemaker_containers.beta.framework as framework
1922

23+
2024
logger = logging.getLogger(__name__)
2125

2226

27+
SAGEMAKER_PARAMETER_SERVER_ENABLED = 'sagemaker_parameter_server_enabled'
28+
29+
30+
def _is_host_master(hosts, current_host):
31+
return current_host == hosts[0]
32+
33+
34+
def _build_tf_config(hosts, current_host, ps_task=False):
35+
"""Builds a dictionary containing cluster information based on number of hosts and number of
36+
parameter servers.
37+
38+
Args:
39+
hosts (list[str]): List of host names in the cluster
40+
current_host (str): Current host name
41+
ps_task (bool): Set to True if this config is built for a parameter server process
42+
(default: False)
43+
44+
Returns:
45+
dict[str: dict]: A dictionary describing the cluster setup for distributed training.
46+
For more information regarding TF_CONFIG:
47+
https://cloud.google.com/ml-engine/docs/tensorflow/distributed-training-details
48+
"""
49+
# Assign the first host as the master. Rest of the hosts if any will be worker hosts.
50+
# The first ps_num hosts will also have a parameter task assign to them.
51+
masters = hosts[:1]
52+
workers = hosts[1:]
53+
ps = hosts if len(hosts) > 1 else None
54+
55+
def host_addresses(hosts, port=2222):
56+
return ['{}:{}'.format(host, port) for host in hosts]
57+
58+
tf_config = {
59+
'cluster': {
60+
'master': host_addresses(masters)
61+
},
62+
'environment': 'cloud'
63+
}
64+
65+
if ps:
66+
tf_config['cluster']['ps'] = host_addresses(ps, port='2223')
67+
68+
if workers:
69+
tf_config['cluster']['worker'] = host_addresses(workers)
70+
71+
if ps_task:
72+
if ps is None:
73+
raise ValueError(
74+
'Cannot have a ps task if there are no parameter servers in the cluster')
75+
task_type = 'ps'
76+
task_index = ps.index(current_host)
77+
elif _is_host_master(hosts, current_host):
78+
task_type = 'master'
79+
task_index = 0
80+
else:
81+
task_type = 'worker'
82+
task_index = workers.index(current_host)
83+
84+
tf_config['task'] = {'index': task_index, 'type': task_type}
85+
return tf_config
86+
87+
88+
def _env_vars_with_tf_config(env, ps_task):
    """Return the environment variables of ``env`` with a ``TF_CONFIG`` entry added.

    Args:
        env: Training environment; ``to_env_vars()``, ``hosts`` and ``current_host``
            are read from it.
        ps_task (bool): Forwarded to ``_build_tf_config``; True when the variables are
            meant for a parameter server process.

    Returns:
        The mapping returned by ``env.to_env_vars()`` with ``TF_CONFIG`` set to the
        JSON-encoded cluster configuration.
    """
    variables = env.to_env_vars()
    cluster_config = _build_tf_config(
        hosts=env.hosts, current_host=env.current_host, ps_task=ps_task)
    variables['TF_CONFIG'] = json.dumps(cluster_config)
    return variables
95+
96+
97+
def _run_ps(env):
    """Start the user module as a parameter server task.

    The module is run with a ``TF_CONFIG`` built for a ps task and with ``wait=False``
    (presumably so the call returns without blocking on the process; confirm against
    ``framework.modules.run_module``).

    Args:
        env: Training environment describing the module and the cluster.

    Returns:
        Whatever ``framework.modules.run_module`` returns.
    """
    ps_env_vars = _env_vars_with_tf_config(env, ps_task=True)
    return framework.modules.run_module(
        env.module_dir,
        env.to_cmd_args(),
        ps_env_vars,
        env.module_name,
        wait=False)
101+
102+
103+
def _run_worker(env, install_module=False):
    """Run the user module as a master/worker task (``TF_CONFIG`` built with ps_task=False).

    Args:
        env: Training environment describing the module and the cluster.
        install_module (bool): When True the module is run through
            ``framework.modules.run_module``; otherwise the environment variables are
            written with ``framework.modules.write_env_vars`` and the module is run with
            ``framework.modules.run`` (default: False).

    Returns:
        The result of ``framework.modules.run_module`` when ``install_module`` is True,
        otherwise None.
    """
    worker_env_vars = _env_vars_with_tf_config(env, ps_task=False)

    if not install_module:
        framework.modules.write_env_vars(worker_env_vars)
        framework.modules.run(env.module_name, env.to_cmd_args(), worker_env_vars)
        return None

    return framework.modules.run_module(
        env.module_dir, env.to_cmd_args(), worker_env_vars, env.module_name)
111+
112+
113+
def _wait_until_master_is_down(master):
    """Block until ``curl`` against port 2222 of ``master`` fails.

    The master is polled every 10 seconds; the function returns as soon as the ``curl``
    command exits with a non-zero status.

    Args:
        master (str): Host name of the master.
    """
    import os  # local import: only os.devnull is needed, and only in this function

    # curl's output is irrelevant, so it is discarded. check_call never reads from
    # pipes, so passing subprocess.PIPE here could block curl once a pipe buffer
    # fills up (see the warning in the subprocess documentation).
    with open(os.devnull, 'w') as devnull:
        while True:
            try:
                subprocess.check_call(
                    ['curl', '{}:2222'.format(master)], stdout=devnull, stderr=devnull)
            except subprocess.CalledProcessError:
                logger.info('master {} is down, stopping parameter server'.format(master))
                return
            logger.info('master {} is still up, waiting for it to exit'.format(master))
            time.sleep(10)
123+
124+
23125
def train(env):
    """Get training job environment from env and run the training job.

    With more than one host and the ``sagemaker_parameter_server_enabled`` framework
    parameter set, a parameter server process and a worker process are launched on this
    host; hosts other than the master then wait for the master to go down before
    returning. Otherwise the user module is run directly.

    Args:
        env (sagemaker_containers.beta.framework.env.TrainingEnv): Instance of TrainingEnv class
    """
    use_parameter_server = env.additional_framework_parameters.get(
        SAGEMAKER_PARAMETER_SERVER_ENABLED, False)

    if not (len(env.hosts) > 1 and use_parameter_server):
        framework.modules.run_module(env.module_dir, env.to_cmd_args(),
                                     env.to_env_vars(), env.module_name)
        return

    logger.info('Running distributed training job with parameter servers')

    logger.info('Launching parameter server process')
    _run_ps(env)

    logger.info('Launching worker process')
    _run_worker(env, install_module=False)

    # Non-master hosts stay alive until the master can no longer be reached.
    if not _is_host_master(env.hosts, env.current_host):
        _wait_until_master_is_down(env.hosts[0])
26147

27148

28149
def main():
150+
"""Training entry point
151+
"""
29152
hyperparameters = framework.env.read_hyperparameters()
30153
env = framework.training_env(hyperparameters=hyperparameters)
31154
logger.setLevel(env.log_level)

test/conftest.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@ def pytest_addoption(parser):
3333
parser.addoption('--docker-base-name', default='preprod-tensorflow')
3434
parser.addoption('--tag', default=None)
3535
parser.addoption('--region', default='us-west-2')
36-
parser.addoption('--framework-version', default='1.10.0')
36+
parser.addoption('--framework-version', default='1.11.0')
3737
parser.addoption('--processor', default='cpu', choices=['gpu', 'cpu'])
3838
parser.addoption('--py-version', default='3', choices=['2', '3'])
3939
parser.addoption('--account-id', default='142577830533')

0 commit comments

Comments
 (0)