aws · nadiaya · Dec 30, 2019 · Dec 30, 2019
diff --git a/docker/1.15.0/py2/Dockerfile.cpu b/docker/1.15.0/py2/Dockerfile.cpu
@@ -0,0 +1,129 @@
+FROM ubuntu:18.04
+
+LABEL maintainer="Amazon AI"
+
+# Prevent the docker build from being stopped by requests for user interaction
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+# Set environment variables for MKL
+# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
+ENV KMP_AFFINITY=granularity=fine,compact,1,0
+ENV KMP_BLOCKTIME=1
+ENV KMP_SETTINGS=0
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# See http://bugs.python.org/issue19846
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+# Specify the location of module that contains the training logic for SageMaker
+# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
+
+# Define framework-related package sources
+ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_container*.tar.gz
+ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/cpu/final/tensorflow-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl
+
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends \
+    build-essential \
+    ca-certificates \
+    curl \
+    git \
+    openssh-client \
+    openssh-server \
+    software-properties-common \
+    vim \
+    wget \
+    zlib1g-dev \
+ && rm -rf /var/lib/apt/lists/*
+
+# Install Open MPI
+RUN mkdir /tmp/openmpi \
+ && cd /tmp/openmpi \
+ && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
+ && tar zxf openmpi-4.0.1.tar.gz \
+ && cd openmpi-4.0.1 \
+ && ./configure --enable-orterun-prefix-by-default \
+ && make -j $(nproc) all \
+ && make install \
+ && ldconfig \
+ && rm -rf /tmp/openmpi
+
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
+ && echo '#!/bin/bash' > /usr/local/bin/mpirun \
+ && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
+ && chmod a+x /usr/local/bin/mpirun
+
+RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
+ && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
+
+ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
+ENV PATH=/usr/local/openmpi/bin/:$PATH
+
+# SSH login fix. Otherwise user is kicked off after login
+RUN sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
+
+# Create SSH key.
+RUN mkdir -p /root/.ssh/ \
+ && mkdir -p /var/run/sshd \
+ && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
+ && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
+ && printf "Host *\n  StrictHostKeyChecking no\n" >> /root/.ssh/config
+
+WORKDIR /
+
+RUN apt-get update \
+ && apt-get install -y \
+    python python-pip \
+ && rm -rf /var/lib/apt/lists/*
+
+COPY $FRAMEWORK_SUPPORT_INSTALLABLE .
+
+RUN pip --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which python) /usr/local/bin/python
+
+RUN pip install --no-cache-dir -U \
+    numpy==1.16.5 \
+    scipy==1.2.2 \
+    scikit-learn==0.20.3 \
+    pandas==0.24.2 \
+    Pillow==6.2.1 \
+    h5py==2.9.0 \
+    keras_applications==1.0.8 \
+    keras_preprocessing==1.1.0 \
+    requests==2.22.0 \
+    keras==2.3.1 \
+    # botocore requires python-dateutil<2.8.1
+    "python-dateutil<2.8.1" \
+    awscli==1.16.296 \
+    mpi4py==3.0.2 \
+    "cryptography>=2.3" \
+    "sagemaker-tensorflow>=1.15,<1.16" \
+    # Install TensorFlow separately at the end to avoid its library version being overwritten
+ && pip install --force-reinstall --no-cache-dir -U \
+    ${TF_URL} \
+ && pip install --no-cache-dir -U \
+    $FRAMEWORK_SUPPORT_INSTALLABLE \
+ && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE \
+ && pip install --no-cache-dir -U \
+    # awscli requires PyYAML<5.2
+    "PyYAML<5.2" \
+    horovod==0.18.2
+
+COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
+COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
+
+RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \
+ && chmod +x /usr/local/bin/deep_learning_container.py
+
+RUN curl -fSsL https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt
+
+ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
+CMD ["/bin/bash"]
diff --git a/docker/1.15.0/py2/Dockerfile.gpu b/docker/1.15.0/py2/Dockerfile.gpu
@@ -0,0 +1,171 @@
+# Nvidia does not publish a TensorRT Runtime library for Ubuntu 18.04 with Cuda 10.1 support, so we stick with cuda 10.0.
+# https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/
+FROM nvidia/cuda:10.0-base-ubuntu18.04
+
+LABEL maintainer="Amazon AI"
+
+# Prevent the docker build from being stopped by requests for user interaction
+ENV DEBIAN_FRONTEND=noninteractive
+ENV DEBCONF_NONINTERACTIVE_SEEN=true
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+ENV PYTHONDONTWRITEBYTECODE=1
+ENV PYTHONUNBUFFERED=1
+# See http://bugs.python.org/issue19846
+ENV PYTHONIOENCODING=UTF-8
+ENV LANG=C.UTF-8
+ENV LC_ALL=C.UTF-8
+# Specify the location of module that contains the training logic for SageMaker
+# https://docs.aws.amazon.com/sagemaker/latest/dg/docker-container-environmental-variables-entrypoint.html
+ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
+
+# Define framework-related package sources
+ARG FRAMEWORK_SUPPORT_INSTALLABLE=sagemaker_tensorflow_container*.tar.gz
+ARG TF_URL=https://tensorflow-aws.s3-us-west-2.amazonaws.com/1.15/AmazonLinux/gpu/final/tensorflow_gpu-1.15.0-cp27-cp27mu-manylinux2010_x86_64.whl
+
+RUN apt-get update \
+ && apt-get install -y --no-install-recommends --allow-unauthenticated \
+    ca-certificates \
+    cuda-command-line-tools-10-0 \
+    cuda-cublas-dev-10-0 \
+    cuda-cudart-dev-10-0 \
+    cuda-cufft-dev-10-0 \
+    cuda-curand-dev-10-0 \
+    cuda-cusolver-dev-10-0 \
+    cuda-cusparse-dev-10-0 \
+    curl \
+    libcudnn7=7.5.1.10-1+cuda10.0 \
+    # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it
+    libnccl2=2.4.7-1+cuda10.0 \
+    libgomp1 \
+    libnccl-dev=2.4.7-1+cuda10.0 \
+    libfreetype6-dev \
+    libhdf5-serial-dev \
+    libpng-dev \
+    libzmq3-dev \
+    git \
+    wget \
+    vim \
+    build-essential \
+    openssh-client \
+    openssh-server \
+    zlib1g-dev \
+    # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0
+    # adds a new list which contains libnvinfer library, so it needs another
+    # 'apt-get update' to retrieve that list before it can actually install the library.
+    # We don't install libnvinfer-dev since we don't need to build against TensorRT,
+    # and libnvinfer4 doesn't contain libnvinfer.a static library.
+ && apt-get update \
+ && apt-get install -y --no-install-recommends --allow-unauthenticated  \
+    nvinfer-runtime-trt-repo-ubuntu1804-5.0.2-ga-cuda10.0 \
+ && apt-get update \
+ && apt-get install -y --no-install-recommends --allow-unauthenticated  \
+    libnvinfer5=5.0.2-1+cuda10.0 \
+ && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
+ && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
+ && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \
+ && rm -rf /var/lib/apt/lists/* \
+ && mkdir -p /var/run/sshd
+
+# Install Open MPI
+RUN mkdir /tmp/openmpi \
+ && cd /tmp/openmpi \
+ && curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.1.tar.gz \
+ && tar zxf openmpi-4.0.1.tar.gz \
+ && cd openmpi-4.0.1 \
+ && ./configure --enable-orterun-prefix-by-default \
+ && make -j $(nproc) all \
+ && make install \
+ && ldconfig \
+ && rm -rf /tmp/openmpi
+
+RUN apt-get update \
+ && apt-get install -y \
+    python python-pip \
+ && rm -rf /var/lib/apt/lists/*
+
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
+ && echo '#!/bin/bash' > /usr/local/bin/mpirun \
+ && echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun \
+ && chmod a+x /usr/local/bin/mpirun
+
+# Configure OpenMPI to run with good defaults:
+#   --bind-to none --map-by slot (note: --mca btl_tcp_if_exclude lo,docker0 is not configured below)
+RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf \
+ && echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
+
+# Set default NCCL parameters
+RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf
+
+ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
+ENV PATH=/usr/local/openmpi/bin/:$PATH
+ENV PATH=/usr/local/nvidia/bin:$PATH
+
+# SSH login fix. Otherwise user is kicked off after login
+RUN mkdir -p /var/run/sshd \
+ && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
+
+# Create SSH key.
+RUN mkdir -p /root/.ssh/ \
+ && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
+ && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
+ && printf "Host *\n  StrictHostKeyChecking no\n" >> /root/.ssh/config
+
+WORKDIR /
+
+RUN pip --no-cache-dir install --upgrade \
+    pip \
+    setuptools
+
+# Some TF tools expect a "python" binary
+RUN ln -s $(which python) /usr/local/bin/python
+
+COPY $FRAMEWORK_SUPPORT_INSTALLABLE .
+
+RUN pip install --no-cache-dir -U \
+    numpy==1.16.5 \
+    scipy==1.2.2 \
+    scikit-learn==0.20.3 \
+    pandas==0.24.2 \
+    Pillow==6.2.1 \
+    h5py==2.9.0 \
+    keras_applications==1.0.8 \
+    keras_preprocessing==1.1.0 \
+    requests==2.22.0 \
+    keras==2.3.1 \
+    # botocore requires python-dateutil<2.8.1
+    "python-dateutil<2.8.1" \
+    awscli==1.16.296 \
+    mpi4py==3.0.2 \
+    "cryptography>=2.3" \
+    "sagemaker-tensorflow>=1.15,<1.16" \
+    # Install TensorFlow separately at the end to avoid its library version being overwritten
+ && pip install --force-reinstall --no-cache-dir -U \
+    ${TF_URL} \
+ && pip install --no-cache-dir -U \
+    $FRAMEWORK_SUPPORT_INSTALLABLE \
+ && rm -f $FRAMEWORK_SUPPORT_INSTALLABLE
+
+# Install Horovod, temporarily using CUDA stubs
+RUN ldconfig /usr/local/cuda/targets/x86_64-linux/lib/stubs \
+ && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir \
+    # awscli requires PyYAML<5.2
+    "PyYAML<5.2" \
+    horovod==0.18.2 \
+ && ldconfig
+
+# Allow OpenSSH to talk to containers without asking for confirmation
+RUN grep -v StrictHostKeyChecking /etc/ssh/ssh_config > /etc/ssh/ssh_config.new \
+ && echo "    StrictHostKeyChecking no" >> /etc/ssh/ssh_config.new \
+ && mv /etc/ssh/ssh_config.new /etc/ssh/ssh_config
+
+COPY dockerd-entrypoint.py /usr/local/bin/dockerd-entrypoint.py
+COPY deep_learning_container.py /usr/local/bin/deep_learning_container.py
+
+RUN chmod +x /usr/local/bin/dockerd-entrypoint.py \
+ && chmod +x /usr/local/bin/deep_learning_container.py
+
+RUN curl -fSsL https://aws-dlc-licenses.s3.amazonaws.com/tensorflow/license.txt -o /license.txt
+
+ENTRYPOINT ["python", "/usr/local/bin/dockerd-entrypoint.py"]
+CMD ["/bin/bash"]
diff --git a/docker/1.15.0/py2/dockerd-entrypoint.py b/docker/1.15.0/py2/dockerd-entrypoint.py
@@ -0,0 +1,31 @@
+# Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import os.path
+import shlex
+import subprocess
+import sys
+
+# When /opt/ml/input/config does not exist, launch deep_learning_container.py
+# in the background with its output discarded. Shell syntax such as "&>/dev/null"
+# or "&" has no effect in an argv list (it is passed to the script as literal
+# arguments), so use Popen with explicit redirection instead.
+if not os.path.exists("/opt/ml/input/config"):
+    with open(os.devnull, 'w') as devnull:
+        subprocess.Popen(['python', '/usr/local/bin/deep_learning_container.py'],
+                         stdout=devnull, stderr=devnull)
+
+# Execute the command given as arguments to this script; check_call raises
+# CalledProcessError if it exits with a non-zero status.
+subprocess.check_call(shlex.split(' '.join(sys.argv[1:])))