-
Notifications
You must be signed in to change notification settings - Fork 162
Upgrade to TensorFlow 1.13.1 #184
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
d36f6ab
3bdbc19
f205906
8f03d7b
253b228
b7b4f57
4a46317
24a209c
d479da1
7097b94
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,92 @@ | ||
# Base image: Ubuntu 16.04.
FROM ubuntu:16.04

LABEL maintainer="Amazon AI"

# Base OS packages (compiler toolchain, CA certificates, curl, SSH client and
# server), followed by registration of the deadsnakes PPA — presumably the
# source of the python3.6 packages installed later; confirm.
# The apt package lists are removed in the same layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        curl \
        openssh-client \
        openssh-server \
        software-properties-common && \
    add-apt-repository -y ppa:deadsnakes/ppa && \
    rm -rf /var/lib/apt/lists/*
|
||
# Build Open MPI 3.1.2 from the upstream source tarball and install it.
# configure is run without --prefix; later steps expect mpirun at
# /usr/local/bin/mpirun. The build tree under /tmp is deleted in the same layer.
RUN mkdir /tmp/openmpi && \
    cd /tmp/openmpi && \
    curl -fSsL -o openmpi-3.1.2.tar.gz \
        https://www.open-mpi.org/software/ompi/v3.1/downloads/openmpi-3.1.2.tar.gz && \
    tar -xzf openmpi-3.1.2.tar.gz && \
    cd openmpi-3.1.2 && \
    ./configure --enable-orterun-prefix-by-default && \
    make -j "$(nproc)" all && \
    make install && \
    ldconfig && \
    rm -rf /tmp/openmpi
|
||
# Replace mpirun with a two-line bash wrapper that always passes
# --allow-run-as-root to the real binary (kept as mpirun.real) and forwards
# every argument unchanged.
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
    printf '%s\n' \
        '#!/bin/bash' \
        'mpirun.real --allow-run-as-root "$@"' \
        > /usr/local/bin/mpirun && \
    chmod a+x /usr/local/bin/mpirun
|
||
# Append two defaults to the system-wide Open MPI MCA parameter file:
# binding policy "none" and mapping policy "slot".
RUN { \
        echo "hwloc_base_binding_policy = none"; \
        echo "rmaps_base_mapping_policy = slot"; \
    } >> /usr/local/etc/openmpi-mca-params.conf
|
||
# Put the Open MPI directories on the library and executable search paths.
#
# LD_LIBRARY_PATH is not set anywhere earlier in this Dockerfile (and
# presumably not by the ubuntu:16.04 base image — confirm), so a plain
# ":$LD_LIBRARY_PATH" suffix expands to a trailing ":" — an empty entry that
# the dynamic loader treats as the current working directory. The
# ${VAR:+...} form appends the previous value only when one exists.
#
# NOTE(review): Open MPI is configured above without --prefix and mpirun is
# taken from /usr/local/bin, so /usr/local/openmpi may not exist in this
# image — confirm whether these two entries are still needed.
ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}

ENV PATH=/usr/local/openmpi/bin/:$PATH
|
||
# SSH login fix: downgrade pam_loginuid for sshd from "required" to
# "optional". Without it the user is kicked off right after logging in.
RUN sed -i \
        's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' \
        /etc/pam.d/sshd
|
||
# Set up passwordless root SSH:
#   - create root's .ssh directory and the sshd runtime directory,
#   - generate an RSA key pair with an empty passphrase,
#   - authorize that public key for root,
#   - disable strict host key checking for all hosts.
# The key pair is generated at build time, so it is part of the image and is
# shared by every container started from it.
RUN mkdir -p /root/.ssh/ /var/run/sshd && \
    ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \
    cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \
    printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
|
||
# Environment variables for MKL, one per line.
# For more about MKL with TensorFlow see:
# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
ENV KMP_AFFINITY=granularity=fine,compact,1,0 \
    KMP_BLOCKTIME=1 \
    KMP_SETTINGS=0
|
||
WORKDIR /

# Build arguments:
#   py_version                     3 selects python3.6; any other value (or
#                                  none) selects python2.7
#   framework_installable          build-context path of the TensorFlow wheel
#   framework_support_installable  build-context path of the
#                                  sagemaker_tensorflow_container source archive
ARG py_version
ARG framework_installable
ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz

# Install the development package of the selected interpreter and make it the
# default "python" in both /usr/bin and /usr/local/bin.
#
# py_version is quoted and given an explicit fallback: with the unquoted form,
# a missing build arg turned the test into the malformed "[ -eq 3 ]", which
# printed a shell error and only fell through to python2.7 by accident. The
# branch taken for every input is unchanged.
RUN if [ "${py_version:-2}" -eq 3 ]; then PYTHON_VERSION=python3.6; else PYTHON_VERSION=python2.7; fi && \
    apt-get update && \
    apt-get install -y --no-install-recommends --allow-unauthenticated "${PYTHON_VERSION}-dev" && \
    ln -s -f "/usr/bin/${PYTHON_VERSION}" /usr/bin/python && \
    ln -s -f "/usr/bin/${PYTHON_VERSION}" /usr/local/bin/python && \
    rm -rf /var/lib/apt/lists/*
|
||
# Python runtime defaults: no bytecode files, unbuffered output, UTF-8 for
# Python I/O and for the process locale.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8
|
||
# Bootstrap pip 18.1 with the PyPA get-pip.py script, then delete the script.
RUN curl -fSsL -o get-pip.py https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py --disable-pip-version-check --no-cache-dir "pip==18.1" && \
    rm get-pip.py
|
||
# Stage the TensorFlow wheel (under a fixed file name) and the SageMaker
# container support package in the working directory.
COPY $framework_installable tensorflow-1.13.1-py2.py3-none-any.whl
COPY $framework_support_installable .

# Python packages, in this order:
#   1. keras, mpi4py and sagemaker-tensorflow,
#   2. the TensorFlow wheel together with Horovod (force-reinstalled),
#   3. the SageMaker container support package,
# followed by removal of the staged archives and of markdown/tensorboard.
# NOTE(review): a reviewer asked why mpi4py is needed when Open MPI is already
# installed — confirm.
# NOTE(review): horovod is installed without a version pin — TODO confirm the
# intended version.
RUN pip install --no-cache-dir -U \
        keras==2.2.4 \
        mpi4py==3.0.1 \
        "sagemaker-tensorflow>=1.13,<1.14" \
    # TensorFlow is installed separately, at the end, so that its version is
    # not overwritten by the packages above.
    && pip install --force-reinstall --no-cache-dir -U \
        tensorflow-1.13.1-py2.py3-none-any.whl \
        horovod \
    && pip install --no-cache-dir -U $framework_support_installable \
    && rm -f tensorflow-1.13.1-py2.py3-none-any.whl \
    && rm -f $framework_support_installable \
    && pip uninstall -y --no-cache-dir \
        markdown \
        tensorboard
|
||
# Sets SAGEMAKER_TRAINING_MODULE to the container's training entry point
# ("module:function" reference), using the key=value ENV form.
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
# Base image: NVIDIA CUDA 10.0 "base" flavor on Ubuntu 16.04.
FROM nvidia/cuda:10.0-base-ubuntu16.04

LABEL maintainer="Amazon AI"

# Install add-apt-repository (from software-properties-common) and register
# the deadsnakes PPA — presumably the source of the python3.6 packages
# installed later; confirm. The apt lists are removed in the same layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends --allow-unauthenticated \
        software-properties-common && \
    add-apt-repository -y ppa:deadsnakes/ppa && \
    rm -rf /var/lib/apt/lists/*
|
||
# CUDA 10.0 development libraries, cuDNN, NCCL, the TensorRT runtime and
# general build/SSH tooling, installed in a single layer (packages listed
# alphabetically).
RUN apt-get update && \
    apt-get install -y --no-install-recommends --allow-unauthenticated \
        build-essential \
        ca-certificates \
        cuda-command-line-tools-10-0 \
        cuda-cublas-dev-10-0 \
        cuda-cudart-dev-10-0 \
        cuda-cufft-dev-10-0 \
        cuda-curand-dev-10-0 \
        cuda-cusolver-dev-10-0 \
        cuda-cusparse-dev-10-0 \
        curl \
        libcudnn7=7.4.1.5-1+cuda10.0 \
        libfreetype6-dev \
        libhdf5-serial-dev \
        # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it.
        # NOTE(review): reviewers asked for libnccl2 and libnccl-dev to be pinned
        # to explicit versions, since Open MPI is pinned — TODO choose versions.
        libnccl-dev \
        libnccl2 \
        libpng12-dev \
        libzmq3-dev \
        openssh-client \
        openssh-server \
        wget && \
    # Installing nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 adds a
    # new apt list that contains the libnvinfer library, so another
    # 'apt-get update' is needed to retrieve that list before the library
    # itself can be installed.
    # libnvinfer-dev is not installed because nothing here builds against
    # TensorRT. (The earlier note that the runtime package lacks the
    # libnvinfer.a static library was written for libnvinfer4 — unverified for
    # libnvinfer5.)
    apt-get update && \
    apt-get install -y --no-install-recommends --allow-unauthenticated \
        nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 && \
    apt-get update && \
    apt-get install -y --no-install-recommends --allow-unauthenticated \
        libnvinfer5=5.0.2-1+cuda10.0 && \
    # Remove the TensorRT plugin and parser libraries, then the apt lists.
    rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* && \
    rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* && \
    rm /usr/lib/x86_64-linux-gnu/libnvparsers* && \
    rm -rf /var/lib/apt/lists/*
|
||
###########################################################################
# Horovod & its dependencies
###########################################################################

# Build Open MPI 3.1.2 from the upstream source tarball and install it.
# configure is run without --prefix; later steps expect mpirun at
# /usr/local/bin/mpirun. The build tree under /tmp is deleted in the same layer.
RUN mkdir /tmp/openmpi && \
    cd /tmp/openmpi && \
    curl -fSsL -o openmpi-3.1.2.tar.gz \
        https://www.open-mpi.org/software/ompi/v3.1/downloads/openmpi-3.1.2.tar.gz && \
    tar -xzf openmpi-3.1.2.tar.gz && \
    cd openmpi-3.1.2 && \
    ./configure --enable-orterun-prefix-by-default && \
    make -j "$(nproc)" all && \
    make install && \
    ldconfig && \
    rm -rf /tmp/openmpi
|
||
# Build arguments:
#   py_version                     3 selects python3.6; any other value (or
#                                  none) selects python2.7
#   framework_installable          build-context path of the TensorFlow wheel
#   framework_support_installable  build-context path of the
#                                  sagemaker_tensorflow_container source archive
ARG py_version
ARG framework_installable
ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz

# Install the development package of the selected interpreter and make it the
# default /usr/bin/python.
#
# py_version is quoted and given an explicit fallback: with the unquoted form,
# a missing build arg turned the test into the malformed "[ -eq 3 ]", which
# printed a shell error and only fell through to python2.7 by accident. The
# branch taken for every input is unchanged.
RUN if [ "${py_version:-2}" -eq 3 ]; then PYTHON_VERSION=python3.6; else PYTHON_VERSION=python2.7; fi && \
    apt-get update && \
    apt-get install -y --no-install-recommends --allow-unauthenticated "${PYTHON_VERSION}-dev" && \
    ln -s -f "/usr/bin/${PYTHON_VERSION}" /usr/bin/python && \
    rm -rf /var/lib/apt/lists/*
|
||
# Replace mpirun with a two-line bash wrapper that always passes
# --allow-run-as-root to the real binary (kept as mpirun.real) and forwards
# every argument unchanged.
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
    printf '%s\n' \
        '#!/bin/bash' \
        'mpirun.real --allow-run-as-root "$@"' \
        > /usr/local/bin/mpirun && \
    chmod a+x /usr/local/bin/mpirun
|
||
# Configure Open MPI defaults equivalent to: --bind-to none --map-by slot
# Only these two parameters are written; no btl_tcp_if_exclude setting is
# added by this step.
RUN { \
        echo "hwloc_base_binding_policy = none"; \
        echo "rmaps_base_mapping_policy = slot"; \
    } >> /usr/local/etc/openmpi-mca-params.conf
|
||
# Default NCCL parameters: append NCCL_DEBUG=INFO to /etc/nccl.conf.
RUN printf '%s\n' 'NCCL_DEBUG=INFO' >> /etc/nccl.conf
|
||
# Search paths: Open MPI libraries are prepended to LD_LIBRARY_PATH; the
# NVIDIA and Open MPI bin directories are prepended to PATH, NVIDIA first.
# NOTE(review): Open MPI is configured above without --prefix and mpirun is
# taken from /usr/local/bin, so /usr/local/openmpi may not exist in this
# image — confirm whether those entries are still needed.
ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
ENV PATH=/usr/local/nvidia/bin:/usr/local/openmpi/bin/:$PATH
|
||
# SSH login fix: create the sshd runtime directory and downgrade pam_loginuid
# for sshd from "required" to "optional". Without it the user is kicked off
# right after logging in.
RUN mkdir -p /var/run/sshd && \
    sed -i \
        's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' \
        /etc/pam.d/sshd
|
||
# Set up passwordless root SSH:
#   - create root's .ssh directory,
#   - generate an RSA key pair with an empty passphrase,
#   - authorize that public key for root,
#   - disable strict host key checking for all hosts.
# The key pair is generated at build time, so it is part of the image and is
# shared by every container started from it.
RUN mkdir -p /root/.ssh/ && \
    ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \
    cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \
    printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
|
||
###########################################################################
# Python runtime defaults:
#   PYTHONDONTWRITEBYTECODE  Python won't try to write .pyc or .pyo files on
#                            the import of source modules
#   PYTHONUNBUFFERED         unbuffered output
#   PYTHONIOENCODING, LANG, LC_ALL
#                            UTF-8 for Python I/O and the process locale
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8
|
||
# Bootstrap pip 18.1 with the PyPA get-pip.py script, then delete the script.
RUN curl -fSsL -o get-pip.py https://bootstrap.pypa.io/get-pip.py && \
    python get-pip.py --disable-pip-version-check --no-cache-dir "pip==18.1" && \
    rm get-pip.py
|
||
WORKDIR /

# Stage the TensorFlow wheel (under a fixed file name) and the SageMaker
# container support package in the working directory.
COPY $framework_installable tensorflow-1.13.1-py2.py3-none-any.whl
COPY $framework_support_installable .

# Python packages: keras, mpi4py, the SageMaker container support package and
# sagemaker-tensorflow first; then the TensorFlow wheel, force-reinstalled on
# its own. The staged archives and markdown/tensorboard are removed afterwards.
# NOTE(review): a reviewer asked why mpi4py is needed when Open MPI is already
# installed — confirm.
RUN pip install --no-cache-dir -U \
        keras==2.2.4 \
        mpi4py==3.0.1 \
        $framework_support_installable \
        "sagemaker-tensorflow>=1.13,<1.14" && \
    # TensorFlow is installed separately, at the end, so that its version is
    # not overwritten by the packages above.
    pip install --force-reinstall --no-cache-dir -U \
        tensorflow-1.13.1-py2.py3-none-any.whl && \
    rm -f tensorflow-1.13.1-py2.py3-none-any.whl && \
    rm -f $framework_support_installable && \
    pip uninstall -y --no-cache-dir \
        markdown \
        tensorboard
|
||
# Install Horovod with NCCL allreduce and TensorFlow support enabled.
# The CUDA stub library directory is registered with the dynamic linker only
# for the duration of the install; the final plain ldconfig rebuilds the
# linker cache without it.
# NOTE(review): horovod is installed without a version pin — TODO confirm the
# intended version.
RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs && \
    env HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 \
        pip install --no-cache-dir horovod && \
    ldconfig
|
||
# Sets SAGEMAKER_TRAINING_MODULE to the container's training entry point
# ("module:function" reference), using the key=value ENV form.
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we install openmpi v4.0?