Skip to content

Commit e4cc41e

Browse files
authored
Add Dockerfile for TF 1.14 (#221)
* Add Dockerfile for TF 1.14
* Update packages to latest
* Update openmpi to 4.0
1 parent a7a61bf commit e4cc41e

File tree

2 files changed

+304
-0
lines changed

2 files changed

+304
-0
lines changed

docker/1.14.0/Dockerfile.cpu

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
# Base image for the CPU variant of the TensorFlow 1.14 container
# (this file is docker/1.14.0/Dockerfile.cpu).
FROM ubuntu:16.04
2+
3+
LABEL maintainer="Amazon AI"
4+
5+
# System packages: build toolchain (including GCC/G++ 4.9), SSH client and
# server, download tools, editors and zlib headers. Packages are listed
# alphabetically; the apt package index is removed in the same layer.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        g++-4.9 \
        gcc-4.9 \
        gcc-4.9-base \
        git \
        openssh-client \
        openssh-server \
        software-properties-common \
        vim \
        wget \
        zlib1g-dev \
 && rm -rf /var/lib/apt/lists/*
20+
21+
# Build Open MPI 4.0.0 from the upstream source tarball and install it.
# The scratch directory /tmp/openmpi is deleted in the same layer so the
# sources and build tree do not end up in the image.
RUN mkdir /tmp/openmpi \
 && cd /tmp/openmpi \
 && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.0.tar.gz \
 && tar zxf openmpi-4.0.0.tar.gz \
 && cd openmpi-4.0.0 \
 && ./configure --enable-orterun-prefix-by-default \
 && make -j "$(nproc)" all \
 && make install \
 && ldconfig \
 && rm -rf /tmp/openmpi
32+
33+
# Replace mpirun with a wrapper that always passes --allow-run-as-root.
# The real binary is kept as mpirun.real. The wrapper uses `exec` so the
# bash process is replaced by mpirun.real: no extra shell stays in the process
# tree, and signals sent to the wrapper's PID are received by mpirun itself.
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
    echo '#!/bin/bash' > /usr/local/bin/mpirun && \
    echo 'exec mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
    chmod a+x /usr/local/bin/mpirun
38+
39+
# Append system-wide Open MPI MCA defaults: binding policy "none" and
# mapping policy "slot". Each parameter is written on its own line.
RUN printf '%s\n' \
        'hwloc_base_binding_policy = none' \
        'rmaps_base_mapping_policy = slot' \
        >> /usr/local/etc/openmpi-mca-params.conf
41+
42+
# Put the Open MPI directories on the library and executable search paths.
#
# The previous LD_LIBRARY_PATH value is appended only when it is non-empty.
# An unconditional ":$LD_LIBRARY_PATH" leaves a trailing ':' when the base
# image does not define the variable, and the dynamic loader treats that empty
# entry as the current working directory.
#
# NOTE(review): Open MPI is configured in this file without --prefix and its
# mpirun is found in /usr/local/bin, so it presumably installs under
# /usr/local rather than /usr/local/openmpi — confirm these entries are needed.
ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} \
    PATH=/usr/local/openmpi/bin/:$PATH
45+
46+
# Downgrade the pam_loginuid session rule for sshd from "required" to
# "optional"; otherwise the user is kicked off right after logging in.
RUN sed -i \
        's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' \
        /etc/pam.d/sshd
48+
49+
# Generate a passphrase-less RSA key pair for root, authorize its public key
# for root logins, and disable strict host-key checking for all hosts.
# NOTE(review): the private key is baked into the image, so every container
# started from it shares the same key — presumably intentional for
# container-to-container MPI over SSH; confirm.
RUN mkdir -p /root/.ssh/ /var/run/sshd \
 && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
 && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
 && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
55+
56+
# Intel MKL / OpenMP runtime settings used with TensorFlow. Background:
# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
ENV KMP_AFFINITY=granularity=fine,compact,1,0 \
    KMP_BLOCKTIME=1 \
    KMP_SETTINGS=0
60+
61+
WORKDIR /

# Build arguments selecting the Python toolchain used by the steps below.
#   PYTHON         - interpreter command that gets symlinked as "python"
#   PYTHON_PIP     - not referenced by any instruction visible in this file
#   PIP            - pip command used for every package installation
#   PYTHON_VERSION - CPython release downloaded from python.org and built
ARG PYTHON=python3
ARG PYTHON_PIP=python3-pip
ARG PIP=pip3
ARG PYTHON_VERSION=3.6.6

# Build CPython from source and install it under /usr/local.
#
# The development headers (readline, ncurses, OpenSSL, SQLite, Tk, gdbm, bz2)
# are installed BEFORE ./configure runs, so a single configure/make/install
# pass sees them. Previously the interpreter was configured, compiled and
# installed without the headers and then compiled and installed a second time
# after they were added.
#
# The apt package index and the source tree are removed in this same layer so
# they do not add to the image size. /usr/bin/pip is linked to the new pip3.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libbz2-dev \
        libc6-dev \
        libgdbm-dev \
        libncursesw5-dev \
        libreadline-gplv2-dev \
        libsqlite3-dev \
        libssl-dev \
        tk-dev && \
    rm -rf /var/lib/apt/lists/* && \
    wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz && \
    tar -xf Python-$PYTHON_VERSION.tgz && \
    cd Python-$PYTHON_VERSION && \
    ./configure && \
    make -j "$(nproc)" && \
    make install && \
    cd .. && \
    rm -rf Python-$PYTHON_VERSION* && \
    ln -s /usr/local/bin/pip3 /usr/bin/pip
74+
75+
# Python runtime settings: no bytecode files written on import, unbuffered
# standard streams, UTF-8 I/O encoding, and a C.UTF-8 locale.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8
76+
77+
# Name of the SageMaker TensorFlow container source distribution in the build
# context. It is copied into the working directory here and pip-installed
# (then deleted) by the package-installation step further down.
ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz
78+
COPY $framework_support_installable .
79+
# URL of the TensorFlow 1.14.0 CPU wheel (cp36) installed further down.
ARG TF_URL="https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.14/AmazonLinux/cpu/final/tensorflow-1.14.0-cp36-cp36m-linux_x86_64.whl"
80+
81+
# Register the currently installed gcc/g++ (resolved through their symlinks)
# with update-alternatives at priority 100, for both the plain and the
# x86_64-linux-gnu- prefixed names, so they can be recovered later.
RUN for tool in gcc g++; do \
        default_bin="$(readlink -f "$(which "${tool}")")" && \
        update-alternatives --install "/usr/bin/${tool}" "${tool}" "${default_bin}" 100 && \
        update-alternatives --install "/usr/bin/x86_64-linux-gnu-${tool}" "x86_64-linux-gnu-${tool}" "${default_bin}" 100 \
        || exit 1; \
    done
# Pin GCC/G++ 4.9 at priority 200 so it is the compiler selected while
# building against TensorFlow, PyTorch, and MXNet with horovod.
RUN for tool in gcc g++; do \
        update-alternatives --install "/usr/bin/${tool}" "${tool}" "/usr/bin/${tool}-4.9" 200 && \
        update-alternatives --install "/usr/bin/x86_64-linux-gnu-${tool}" "x86_64-linux-gnu-${tool}" "/usr/bin/${tool}-4.9" 200 \
        || exit 1; \
    done
91+
92+
# Upgrade pip and setuptools before any other Python package is installed.
RUN ${PIP} install --no-cache-dir --upgrade pip setuptools

# Some TF tools expect a "python" binary, so expose ${PYTHON} under that name.
RUN ln -s "$(which ${PYTHON})" /usr/local/bin/python
96+
97+
# Python packages, installed in four steps inside one layer:
#   1. the pinned scientific / AWS dependencies;
#   2. TensorFlow (from TF_URL) together with Horovod, force-reinstalled last
#      so that earlier steps cannot overwrite the library version;
#   3. the SageMaker framework support package copied in earlier, whose
#      archive is deleted afterwards;
#   4. removal of the "markdown" package.
RUN ${PIP} install --no-cache-dir -U \
        numpy==1.16.4 \
        scipy==1.2.2 \
        scikit-learn==0.20.3 \
        pandas==0.24.2 \
        Pillow==6.1.0 \
        h5py==2.9.0 \
        keras_applications==1.0.8 \
        keras_preprocessing==1.1.0 \
        keras==2.2.4 \
        requests==2.22.0 \
        awscli==1.16.196 \
        mpi4py==3.0.2 \
        "sagemaker-tensorflow>=1.13,<1.14" \
 && ${PIP} install --force-reinstall --no-cache-dir -U \
        ${TF_URL} \
        horovod==0.16.4 \
 && ${PIP} install --no-cache-dir -U $framework_support_installable \
 && rm -f $framework_support_installable \
 && ${PIP} uninstall -y --no-cache-dir markdown
120+
121+
# Drop the GCC/G++ 4.9 alternatives again, for both the plain and the
# x86_64-linux-gnu- prefixed names, now that the native builds are done.
RUN for tool in gcc g++; do \
        update-alternatives --remove "${tool}" "/usr/bin/${tool}-4.9" && \
        update-alternatives --remove "x86_64-linux-gnu-${tool}" "/usr/bin/${tool}-4.9" \
        || exit 1; \
    done
126+
127+
128+
# Module:function reference for the training entry point.
# NOTE(review): presumably read by the SageMaker training toolkit at start-up
# to locate the framework's main() — confirm against the toolkit.
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main

# Default command: a bash shell. The absolute path is used because the
# relative "bin/bash" only resolves while the working directory is "/".
CMD ["/bin/bash"]

docker/1.14.0/Dockerfile.gpu

Lines changed: 174 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,174 @@
1+
# Base image for the GPU variant of the TensorFlow 1.14 container
# (this file is docker/1.14.0/Dockerfile.gpu): CUDA 10.0 "base" on Ubuntu 16.04.
FROM nvidia/cuda:10.0-base-ubuntu16.04
2+
3+
LABEL maintainer="Amazon AI"
4+
5+
# System packages for the GPU image, listed alphabetically: CUDA 10.0
# development libraries, cuDNN 7.5.1, NCCL 2.4.7, GCC/G++ 4.9, SSH and
# general build tooling.
#
# libnccl is kept even though TensorFlow no longer requires it; the original
# note attributes the remaining dependency to Open MPI, and Horovod is built
# further down with HOROVOD_GPU_ALLREDUCE=NCCL.
#
# TensorRT is installed in two steps: installing
# nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 adds a new apt list
# that contains the libnvinfer library, so another 'apt-get update' is needed
# to retrieve that list before libnvinfer5 itself can be installed.
# libnvinfer-dev is not installed because nothing here builds against TensorRT.
# The plugin and parser libraries are removed afterwards, the apt package
# index is cleaned up, and the sshd runtime directory is created.
RUN apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
        build-essential \
        ca-certificates \
        cuda-command-line-tools-10-0 \
        cuda-cublas-dev-10-0 \
        cuda-cudart-dev-10-0 \
        cuda-cufft-dev-10-0 \
        cuda-curand-dev-10-0 \
        cuda-cusolver-dev-10-0 \
        cuda-cusparse-dev-10-0 \
        curl \
        g++-4.9 \
        gcc-4.9 \
        gcc-4.9-base \
        git \
        libcudnn7=7.5.1.10-1+cuda10.0 \
        libfreetype6-dev \
        libgomp1 \
        libhdf5-serial-dev \
        libnccl-dev=2.4.7-1+cuda10.0 \
        libnccl2=2.4.7-1+cuda10.0 \
        libpng12-dev \
        libzmq3-dev \
        openssh-client \
        openssh-server \
        vim \
        wget \
        zlib1g-dev \
 && apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
        nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
 && apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
        libnvinfer5=5.0.2-1+cuda10.0 \
 && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
 && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
 && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \
 && rm -rf /var/lib/apt/lists/* \
 && mkdir -p /var/run/sshd
49+
50+
###########################################################################
51+
# Horovod & its dependencies
52+
###########################################################################
53+
54+
# Build Open MPI 4.0.0 from the upstream source tarball and install it.
# The scratch directory /tmp/openmpi is deleted in the same layer so the
# sources and build tree do not end up in the image.
RUN mkdir /tmp/openmpi \
 && cd /tmp/openmpi \
 && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.0.tar.gz \
 && tar zxf openmpi-4.0.0.tar.gz \
 && cd openmpi-4.0.0 \
 && ./configure --enable-orterun-prefix-by-default \
 && make -j "$(nproc)" all \
 && make install \
 && ldconfig \
 && rm -rf /tmp/openmpi
65+
66+
# Build arguments selecting the Python toolchain used by the steps below.
#   PYTHON         - interpreter command that gets symlinked as "python"
#   PYTHON_PIP     - not referenced by any instruction visible in this file
#   PIP            - pip command used for every package installation
#   PYTHON_VERSION - CPython release downloaded from python.org and built
ARG PYTHON=python3
ARG PYTHON_PIP=python3-pip
ARG PIP=pip3
ARG PYTHON_VERSION=3.6.6

# Build CPython from source and install it under /usr/local.
#
# The development headers (readline, ncurses, OpenSSL, SQLite, Tk, gdbm, bz2)
# are installed BEFORE ./configure runs, so a single configure/make/install
# pass sees them. Previously the interpreter was configured, compiled and
# installed without the headers and then compiled and installed a second time
# after they were added.
#
# The apt package index and the source tree are removed in this same layer so
# they do not add to the image size. /usr/bin/pip is linked to the new pip3.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        libbz2-dev \
        libc6-dev \
        libgdbm-dev \
        libncursesw5-dev \
        libreadline-gplv2-dev \
        libsqlite3-dev \
        libssl-dev \
        tk-dev && \
    rm -rf /var/lib/apt/lists/* && \
    wget https://www.python.org/ftp/python/$PYTHON_VERSION/Python-$PYTHON_VERSION.tgz && \
    tar -xf Python-$PYTHON_VERSION.tgz && \
    cd Python-$PYTHON_VERSION && \
    ./configure && \
    make -j "$(nproc)" && \
    make install && \
    cd .. && \
    rm -rf Python-$PYTHON_VERSION* && \
    ln -s /usr/local/bin/pip3 /usr/bin/pip
77+
78+
# Replace mpirun with a wrapper that always passes --allow-run-as-root.
# The real binary is kept as mpirun.real. The wrapper uses `exec` so the
# bash process is replaced by mpirun.real: no extra shell stays in the process
# tree, and signals sent to the wrapper's PID are received by mpirun itself.
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
    echo '#!/bin/bash' > /usr/local/bin/mpirun && \
    echo 'exec mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
    chmod a+x /usr/local/bin/mpirun
83+
84+
# Configure Open MPI defaults equivalent to the command-line options
#   --bind-to none --map-by slot
# by appending the matching MCA parameters to the system-wide params file.
# NOTE(review): no btl_tcp_if_exclude parameter is written here; confirm that
# excluding lo/docker0 is intentionally left to the caller.
RUN printf '%s\n' \
        'hwloc_base_binding_policy = none' \
        'rmaps_base_mapping_policy = slot' \
        >> /usr/local/etc/openmpi-mca-params.conf
88+
89+
# Default NCCL settings: append NCCL_DEBUG=INFO to /etc/nccl.conf.
RUN printf 'NCCL_DEBUG=INFO\n' >> /etc/nccl.conf
91+
92+
# Search paths for Open MPI and the NVIDIA tools.
#
# The previous LD_LIBRARY_PATH value is appended only when it is non-empty, so
# that an undefined variable cannot leave a trailing ':' (an empty entry that
# the dynamic loader treats as the current working directory).
# PATH ends up as: NVIDIA bin, then Open MPI bin, then the inherited PATH —
# the same order the two separate PATH instructions produced.
#
# NOTE(review): Open MPI is configured in this file without --prefix and its
# mpirun is found in /usr/local/bin, so it presumably installs under
# /usr/local rather than /usr/local/openmpi — confirm these entries are needed.
ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH} \
    PATH=/usr/local/nvidia/bin:/usr/local/openmpi/bin/:$PATH
95+
96+
# Make sure the sshd runtime directory exists, then downgrade the
# pam_loginuid session rule for sshd from "required" to "optional";
# otherwise the user is kicked off right after logging in.
RUN mkdir -p /var/run/sshd \
 && sed -i \
        's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' \
        /etc/pam.d/sshd
98+
99+
# Generate a passphrase-less RSA key pair for root, authorize its public key
# for root logins, and disable strict host-key checking for all hosts.
# NOTE(review): the private key is baked into the image, so every container
# started from it shares the same key — presumably intentional for
# container-to-container MPI over SSH; confirm.
RUN mkdir -p /root/.ssh/ \
 && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
 && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
 && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
104+
105+
###########################################################################
106+
# Python runtime settings: no .pyc/.pyo files written when source modules are
# imported, unbuffered standard streams, UTF-8 I/O encoding, and a C.UTF-8
# locale.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8

WORKDIR /
110+
111+
# URL of the TensorFlow 1.14.0 GPU wheel (cp36) installed further down.
ARG TF_URL="https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.14/AmazonLinux/gpu/final/tensorflow-1.14.0-cp36-cp36m-linux_x86_64.whl"

# Upgrade pip and setuptools before any other Python package is installed.
RUN ${PIP} install --no-cache-dir --upgrade pip setuptools

# Some TF tools expect a "python" binary, so expose ${PYTHON} under that name.
RUN ln -s "$(which ${PYTHON})" /usr/local/bin/python

# SageMaker TensorFlow container source distribution from the build context;
# copied into the working directory here and pip-installed further down.
ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz
COPY ${framework_support_installable} .
120+
121+
# Python packages, installed in four steps inside one layer:
#   1. the pinned scientific / AWS dependencies;
#   2. TensorFlow (from TF_URL), force-reinstalled last so that earlier steps
#      cannot overwrite the library version;
#   3. the SageMaker framework support package copied in earlier, whose
#      archive is deleted afterwards;
#   4. removal of the "markdown" package.
RUN ${PIP} install --no-cache-dir -U \
        numpy==1.16.4 \
        scipy==1.2.2 \
        scikit-learn==0.20.3 \
        pandas==0.24.2 \
        Pillow==6.1.0 \
        h5py==2.9.0 \
        keras_applications==1.0.8 \
        keras_preprocessing==1.1.0 \
        requests==2.22.0 \
        keras==2.2.4 \
        awscli==1.16.196 \
        mpi4py==3.0.2 \
        "sagemaker-tensorflow>=1.13,<1.14" \
 && ${PIP} install --force-reinstall --no-cache-dir -U ${TF_URL} \
 && ${PIP} install --no-cache-dir -U $framework_support_installable \
 && rm -f $framework_support_installable \
 && ${PIP} uninstall -y --no-cache-dir markdown
142+
143+
144+
# Register the currently installed gcc/g++ (resolved through their symlinks)
# with update-alternatives at priority 100, for both the plain and the
# x86_64-linux-gnu- prefixed names, so they can be recovered later.
RUN for tool in gcc g++; do \
        default_bin="$(readlink -f "$(which "${tool}")")" && \
        update-alternatives --install "/usr/bin/${tool}" "${tool}" "${default_bin}" 100 && \
        update-alternatives --install "/usr/bin/x86_64-linux-gnu-${tool}" "x86_64-linux-gnu-${tool}" "${default_bin}" 100 \
        || exit 1; \
    done
# Pin GCC/G++ 4.9 at priority 200 so it is the compiler selected while
# building against TensorFlow, PyTorch, and MXNet with horovod.
RUN for tool in gcc g++; do \
        update-alternatives --install "/usr/bin/${tool}" "${tool}" "/usr/bin/${tool}-4.9" 200 && \
        update-alternatives --install "/usr/bin/x86_64-linux-gnu-${tool}" "x86_64-linux-gnu-${tool}" "/usr/bin/${tool}-4.9" 200 \
        || exit 1; \
    done
154+
155+
156+
# Install Horovod 0.16.4 with NCCL allreduce and TensorFlow support.
# The CUDA stub libraries are registered with ldconfig only for the duration
# of the build; the closing plain `ldconfig` refreshes the linker cache again
# without the stubs directory argument.
RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs \
 && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 \
        ${PIP} install --no-cache-dir horovod==0.16.4 \
 && ldconfig
160+
161+
# Drop the GCC/G++ 4.9 alternatives again, for both the plain and the
# x86_64-linux-gnu- prefixed names, now that the native builds are done.
RUN for tool in gcc g++; do \
        update-alternatives --remove "${tool}" "/usr/bin/${tool}-4.9" && \
        update-alternatives --remove "x86_64-linux-gnu-${tool}" "/usr/bin/${tool}-4.9" \
        || exit 1; \
    done
166+
167+
# Let OpenSSH connect to other containers without asking for confirmation:
# rewrite the system-wide client config with every existing
# StrictHostKeyChecking line filtered out and a single "no" setting appended.
RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new \
 && echo " StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
 && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
171+
172+
# Module:function reference for the training entry point.
# NOTE(review): presumably read by the SageMaker training toolkit at start-up
# to locate the framework's main() — confirm against the toolkit.
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main

# Default command: a bash shell. The absolute path is used because the
# relative "bin/bash" only resolves while the working directory is "/".
CMD ["/bin/bash"]

0 commit comments

Comments
 (0)