add sageamker container script for tf gpu 1.13.1

rahul003 · rahul003 · commit e234f4bc9117 · 2019-08-15T01:21:49.000Z
diff --git a/bin/sagemaker_containers/tf/1.13.1.sh b/bin/sagemaker_containers/tf/1.13.1.sh
@@ -0,0 +1,30 @@
+#!/usr/bin/env bash
+# run from tornasole_core folder
+set -ex
+
+export TF_BINARY_LOCATION=https://files.pythonhosted.org/packages/77/63/a9fa76de8dffe7455304c4ed635be4aa9c0bacef6e0633d87d5f54530c5c/tensorflow-1.13.1-cp36-cp36m-manylinux1_x86_64.whl
+export TF_BINARY_NAME=tensorflow-1.13.1-cp36-cp36m-manylinux1_x86_64.whl
+export TORNASOLE_BINARY_PATH=s3://tornasole-binaries-use1/tornasole_tensorflow/py3/tornasole-0.3-py2.py3-none-any.whl
+export TORNASOLE_BINARY_NAME=tornasole-0.3-py2.py3-none-any.whl
+export ECR_REPO_NAME=tornasole-preprod-tf-1.13.1-gpu
+export ECR_TAG_NAME=latest
+
+pushd .
+cd ~
+git clone https://github.com/aws/sagemaker-tensorflow-container.git
+cd sagemaker-tensorflow-container
+git checkout script-mode
+python setup.py sdist
+popd
+cp ~/sagemaker-tensorflow-container/dist/sagemaker_tensorflow_container-*.tar.gz bin/sagemaker_tf_container/
+cd bin/sagemaker_tf_container/
+aws s3 cp $TORNASOLE_BINARY_PATH .
+curl -O $TF_BINARY_LOCATION
+$(aws ecr get-login --no-include-email --region us-east-1)
+docker build -t $ECR_REPO_NAME --build-arg py_version=3 \
+    --build-arg framework_installable=$TF_BINARY_NAME \
+    --build-arg framework_support_installable=sagemaker_tensorflow_container-*.tar.gz \
+    --build-arg tornasole_framework_installable=$TORNASOLE_BINARY_NAME \
+    -f Dockerfile.gpu .
+docker tag $ECR_REPO_NAME:$ECR_TAG_NAME 072677473360.dkr.ecr.us-east-1.amazonaws.com/$ECR_REPO_NAME:$ECR_TAG_NAME
+docker push 072677473360.dkr.ecr.us-east-1.amazonaws.com/$ECR_REPO_NAME:$ECR_TAG_NAME
diff --git a/bin/sagemaker_containers/tf/Dockerfile.gpu b/bin/sagemaker_containers/tf/Dockerfile.gpu
@@ -0,0 +1,137 @@
+FROM nvidia/cuda:10.0-base-ubuntu16.04
+
+LABEL maintainer="Amazon AI"
+
+RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
+        software-properties-common && \
+    add-apt-repository ppa:deadsnakes/ppa -y && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated \
+        ca-certificates \
+        cuda-command-line-tools-10-0 \
+        cuda-cublas-dev-10-0 \
+        cuda-cudart-dev-10-0 \
+        cuda-cufft-dev-10-0 \
+        cuda-curand-dev-10-0 \
+        cuda-cusolver-dev-10-0 \
+        cuda-cusparse-dev-10-0 \
+        curl \
+        libcudnn7=7.4.1.5-1+cuda10.0 \
+        # TensorFlow doesn't require libnccl anymore but Open MPI still depends on it
+        libnccl2 \
+        libnccl-dev \
+        libfreetype6-dev \
+        libhdf5-serial-dev \
+        libpng12-dev \
+        libzmq3-dev \
+        wget \
+        openssh-client \
+        openssh-server \
+        build-essential && \
+    # The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0
+    # adds a new list which contains libnvinfer library, so it needs another
+    # 'apt-get update' to retrieve that list before it can actually install the
+    # library.
+    # We don't install libnvinfer-dev since we don't need to build against TensorRT,
+    # and libnvinfer4 doesn't contain libnvinfer.a static library.
+    apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated  \
+        nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 && \
+    apt-get update && apt-get install -y --no-install-recommends --allow-unauthenticated  \
+        libnvinfer5=5.0.2-1+cuda10.0 && \
+    rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* && \
+    rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* && \
+    rm /usr/lib/x86_64-linux-gnu/libnvparsers* && \
+    rm -rf /var/lib/apt/lists/*
+
+###########################################################################
+# Horovod & its dependencies
+###########################################################################
+
+# Install Open MPI
+RUN mkdir /tmp/openmpi && \
+    cd /tmp/openmpi && \
+    curl -fSsL -O https://www.open-mpi.org/software/ompi/v3.1/downloads/openmpi-3.1.2.tar.gz && \
+    tar zxf openmpi-3.1.2.tar.gz && \
+    cd openmpi-3.1.2 && \
+    ./configure --enable-orterun-prefix-by-default && \
+    make -j $(nproc) all && \
+    make install && \
+    ldconfig && \
+    rm -rf /tmp/openmpi
+
+ARG py_version
+ARG framework_installable
+ARG framework_support_installable
+
+RUN if [ $py_version -eq 3 ]; then PYTHON_VERSION=python3.6; else PYTHON_VERSION=python2.7; fi && \
+        apt-get update && apt-get install -y --no-install-recommends $PYTHON_VERSION-dev --allow-unauthenticated  && \
+        ln -s -f /usr/bin/$PYTHON_VERSION /usr/bin/python && \
+    rm -rf /var/lib/apt/lists/*
+
+# Create a wrapper for OpenMPI to allow running as root by default
+RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real && \
+    echo '#!/bin/bash' > /usr/local/bin/mpirun && \
+    echo 'mpirun.real --allow-run-as-root "$@"' >> /usr/local/bin/mpirun && \
+    chmod a+x /usr/local/bin/mpirun
+
+# Configure OpenMPI to run good defaults:
+#   --bind-to none --map-by slot --mca btl_tcp_if_exclude lo,docker0
+RUN echo "hwloc_base_binding_policy = none" >> /usr/local/etc/openmpi-mca-params.conf && \
+    echo "rmaps_base_mapping_policy = slot" >> /usr/local/etc/openmpi-mca-params.conf
+
+# Set default NCCL parameters
+RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf
+
+ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH
+ENV PATH /usr/local/openmpi/bin/:$PATH
+ENV PATH=/usr/local/nvidia/bin:$PATH
+
+# SSH login fix. Otherwise user is kicked off after login
+RUN mkdir -p /var/run/sshd && sed 's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' -i /etc/pam.d/sshd
+
+# Create SSH key.
+RUN mkdir -p /root/.ssh/ && \
+  ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa && \
+  cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \
+  printf "Host *\n  StrictHostKeyChecking no\n" >> /root/.ssh/config
+
+###########################################################################
+# Python won’t try to write .pyc or .pyo files on the import of source modules
+ENV PYTHONDONTWRITEBYTECODE=1 PYTHONUNBUFFERED=1 PYTHONIOENCODING=UTF-8 LANG=C.UTF-8 LC_ALL=C.UTF-8
+
+RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py && \
+    python get-pip.py --disable-pip-version-check --no-cache-dir "pip==18.1" && \
+    rm get-pip.py
+
+WORKDIR /
+
+COPY $framework_installable tensorflow-1.13.1-py2.py3-none-any.whl
+COPY $framework_support_installable .
+
+RUN pip install --no-cache-dir -U \
+    keras==2.2.4 \
+    mpi4py==3.0.1 \
+    $framework_support_installable \
+    "sagemaker-tensorflow>=1.13,<1.14" \
+    # Let's install TensorFlow separately in the end to avoid
+    # the library version to be overwritten
+    && pip install --force-reinstall --no-cache-dir -U tensorflow-1.13.1-py2.py3-none-any.whl \
+    \
+    && rm -f tensorflow-1.13.1-py2.py3-none-any.whl  \
+    && rm -f $framework_support_installable \
+    && pip uninstall -y --no-cache-dir \
+    markdown \
+    tensorboard
+
+# Install Horovod, temporarily using CUDA stubs
+RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs && \
+    HOROVOD_GPU_ALLREDUCE=NCCL  HOROVOD_WITH_TENSORFLOW=1 pip install --no-cache-dir horovod && \
+    ldconfig
+
+ARG tornasole_framework_installable
+COPY $tornasole_framework_installable .
+
+RUN pip install $tornasole_framework_installable
+
+ENV SAGEMAKER_TRAINING_MODULE sagemaker_tensorflow_container.training:main