1
# Base image: NVIDIA's CUDA 10.0 "base" image on Ubuntu 16.04. The CUDA
# development libraries, cuDNN, NCCL and TensorRT are added by later steps.
FROM nvidia/cuda:10.0-base-ubuntu16.04

LABEL maintainer="Amazon AI"
4
+
5
# Register the deadsnakes PPA. add-apt-repository is provided by
# software-properties-common, so that package is installed first.
# The apt lists are removed in the same layer to keep the image small.
RUN apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
        software-properties-common \
 && add-apt-repository -y ppa:deadsnakes/ppa \
 && rm -rf /var/lib/apt/lists/*
9
+
10
# Install the CUDA 10.0 development libraries, cuDNN 7.4.1, NCCL, the SSH
# client/server and the compiler toolchain, followed by the TensorRT 5 runtime.
#
# libnccl2 / libnccl-dev: the original note here reads "TensorFlow doesn't
# require libnccl anymore but Open MPI still depends on it"; the Horovod
# install later in this file is also built with HOROVOD_GPU_ALLREDUCE=NCCL.
#
# TensorRT: installing nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0
# adds a new apt source list which contains the libnvinfer library, so another
# 'apt-get update' is needed to retrieve that list before libnvinfer5 can
# actually be installed. No 'apt-get update' is needed before installing the
# repo package itself: the lists were refreshed at the top of this same layer.
# libnvinfer-dev is not installed since nothing here builds against TensorRT,
# and the runtime package doesn't contain the libnvinfer.a static library.
#
# The plugin, Caffe-parser and parser shared libraries are deleted afterwards
# (presumably unused by TensorFlow and removed to save space - confirm).
RUN apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
        build-essential \
        ca-certificates \
        cuda-command-line-tools-10-0 \
        cuda-cublas-dev-10-0 \
        cuda-cudart-dev-10-0 \
        cuda-cufft-dev-10-0 \
        cuda-curand-dev-10-0 \
        cuda-cusolver-dev-10-0 \
        cuda-cusparse-dev-10-0 \
        curl \
        libcudnn7=7.4.1.5-1+cuda10.0 \
        libfreetype6-dev \
        libhdf5-serial-dev \
        libnccl-dev \
        libnccl2 \
        libpng12-dev \
        libzmq3-dev \
        openssh-client \
        openssh-server \
        wget \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
        nvinfer-runtime-trt-repo-ubuntu1604-5.0.2-ga-cuda10.0 \
 && apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
        libnvinfer5=5.0.2-1+cuda10.0 \
 && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
 && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
 && rm /usr/lib/x86_64-linux-gnu/libnvparsers* \
 && rm -rf /var/lib/apt/lists/*
46
+
47
+ ###########################################################################
48
+ # Horovod & its dependencies
49
+ ###########################################################################
50
+
51
# Build and install Open MPI 3.1.2 from the upstream source tarball.
# configure is run without --prefix, so the default install prefix is used;
# --enable-orterun-prefix-by-default makes mpirun/orterun behave as if
# --prefix had been passed on its command line.
# The scratch directory is deleted in the same layer once 'make install' and
# 'ldconfig' have completed.
RUN mkdir /tmp/openmpi \
 && cd /tmp/openmpi \
 && curl -fSsL -O https://www.open-mpi.org/software/ompi/v3.1/downloads/openmpi-3.1.2.tar.gz \
 && tar -xzf openmpi-3.1.2.tar.gz \
 && cd openmpi-3.1.2 \
 && ./configure --enable-orterun-prefix-by-default \
 && make -j $(nproc) all \
 && make install \
 && ldconfig \
 && rm -rf /tmp/openmpi
62
+
63
# Build arguments (supplied with --build-arg):
#   py_version                    - 3 selects python3.6; any other value
#                                   (including unset) selects python2.7.
#   framework_installable         - build-context path of the TensorFlow wheel
#                                   that is copied into the image.
#   framework_support_installable - build-context path of the SageMaker
#                                   TensorFlow container sdist that is copied
#                                   into the image and pip-installed.
ARG py_version
ARG framework_installable
ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz
66
+
67
# Install the interpreter selected by the py_version build argument
# (3 -> python3.6, anything else -> python2.7) together with its headers, and
# make it the default `python` on PATH.
# Every variable expansion is quoted so that an unset or unusual py_version
# cannot produce a malformed `[` test or be word-split by the shell.
RUN if [ "${py_version}" -eq 3 ]; then PYTHON_VERSION=python3.6; else PYTHON_VERSION=python2.7; fi \
 && apt-get update \
 && apt-get install -y --no-install-recommends --allow-unauthenticated \
        "${PYTHON_VERSION}-dev" \
 && ln -s -f "/usr/bin/${PYTHON_VERSION}" /usr/bin/python \
 && rm -rf /var/lib/apt/lists/*
71
+
72
# Replace mpirun with a wrapper that always passes --allow-run-as-root, so
# Open MPI can be launched by the root user without extra flags.
# The real launcher is kept as mpirun.real. The wrapper uses `exec` so that it
# is replaced by mpirun.real instead of staying around as an extra bash
# process; signals sent to the wrapper's PID therefore reach mpirun directly.
RUN mv /usr/local/bin/mpirun /usr/local/bin/mpirun.real \
 && printf '%s\n' \
        '#!/bin/bash' \
        'exec mpirun.real --allow-run-as-root "$@"' \
    > /usr/local/bin/mpirun \
 && chmod a+x /usr/local/bin/mpirun
77
+
78
# Give Open MPI sensible defaults through its system-wide MCA parameter file,
# the equivalent of passing:
#     --bind-to none --map-by slot
# Only these two policies are written here; no btl_tcp_if_exclude setting is
# added by this step.
RUN printf '%s\n' \
        "hwloc_base_binding_policy = none" \
        "rmaps_base_mapping_policy = slot" \
    >> /usr/local/etc/openmpi-mca-params.conf
82
+
83
# Default NCCL parameters: append NCCL_DEBUG=INFO to the system-wide
# /etc/nccl.conf.
RUN printf '%s\n' "NCCL_DEBUG=INFO" >> /etc/nccl.conf
85
+
86
# Runtime search paths. Written as a single key=value ENV (the space-separated
# `ENV PATH <value>` form is deprecated). The resulting values are identical to
# prepending /usr/local/openmpi/bin/ and then /usr/local/nvidia/bin to PATH in
# two separate steps, because both $PATH and $LD_LIBRARY_PATH below expand to
# the values they had before this instruction.
# NOTE(review): the Open MPI build in this file runs ./configure without
# --prefix, and its mpirun is referenced under /usr/local/bin, so the
# /usr/local/openmpi entries look vestigial - confirm before removing them.
ENV LD_LIBRARY_PATH=/usr/local/openmpi/lib:$LD_LIBRARY_PATH \
    PATH=/usr/local/nvidia/bin:/usr/local/openmpi/bin/:$PATH
89
+
90
# SSH login fix: without it the user is kicked off right after logging in.
# Creates sshd's runtime directory and downgrades the pam_loginuid session
# module from "required" to "optional" in sshd's PAM configuration.
RUN mkdir -p /var/run/sshd \
 && sed -i \
        's@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g' \
        /etc/pam.d/sshd
92
+
93
# Generate a passphrase-less RSA key pair for root, authorise that same key for
# incoming logins, and turn off host-key checking for every host.
# NOTE(review): the private key is created at build time and is therefore
# shared by every container started from this image - presumably intentional
# so that MPI ranks in sibling containers can SSH to each other; confirm.
RUN mkdir -p /root/.ssh/ \
 && ssh-keygen -q -t rsa -N '' -f /root/.ssh/id_rsa \
 && cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
 && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
98
+
99
+ ###########################################################################
100
# Python runtime and locale settings:
#   PYTHONDONTWRITEBYTECODE - Python won't try to write .pyc or .pyo files on
#                             the import of source modules
#   PYTHONUNBUFFERED        - stdout/stderr are not buffered
#   PYTHONIOENCODING        - stdin/stdout/stderr use UTF-8
#   LANG / LC_ALL           - C.UTF-8 locale
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PYTHONIOENCODING=UTF-8 \
    LANG=C.UTF-8 \
    LC_ALL=C.UTF-8
102
+
103
# Bootstrap pip 18.1 for whichever interpreter /usr/bin/python points to.
# The unversioned https://bootstrap.pypa.io/get-pip.py only supports currently
# maintained Python releases and refuses to run on the interpreters this image
# installs (python2.7 / python3.6), so the matching per-version bootstrap
# script (https://bootstrap.pypa.io/pip/<major>.<minor>/get-pip.py) is fetched
# instead. The script is removed again in the same layer.
RUN PY_SERIES="$(python -c 'import sys; print("%d.%d" % sys.version_info[:2])')" \
 && curl -fSsL -o get-pip.py "https://bootstrap.pypa.io/pip/${PY_SERIES}/get-pip.py" \
 && python get-pip.py --disable-pip-version-check --no-cache-dir "pip==18.1" \
 && rm get-pip.py
106
+
107
# Stage the two installables in the filesystem root. The TensorFlow wheel
# named by the framework_installable build argument is always stored under the
# fixed name tensorflow-1.13.1-py2.py3-none-any.whl; the support package keeps
# its own file name.
WORKDIR /

COPY ${framework_installable} tensorflow-1.13.1-py2.py3-none-any.whl
COPY ${framework_support_installable} .
111
+
112
# Install the Python dependencies, then TensorFlow itself.
# TensorFlow is installed separately, last and with --force-reinstall, to avoid
# its library versions being overwritten by the earlier installs.
# The staged wheel and support package are deleted afterwards, and the markdown
# and tensorboard packages are uninstalled.
RUN pip install --no-cache-dir -U \
        keras==2.2.4 \
        mpi4py==3.0.1 \
        ${framework_support_installable} \
        "sagemaker-tensorflow>=1.13,<1.14" \
 && pip install --force-reinstall --no-cache-dir -U \
        tensorflow-1.13.1-py2.py3-none-any.whl \
 && rm -f tensorflow-1.13.1-py2.py3-none-any.whl \
 && rm -f ${framework_support_installable} \
 && pip uninstall -y --no-cache-dir \
        markdown \
        tensorboard
126
+
127
# Install Horovod with NCCL allreduce and TensorFlow support.
# The CUDA stub libraries are registered with the dynamic linker only for the
# duration of the build; the trailing plain `ldconfig` rebuilds the linker
# cache without the stubs directory.
# NOTE(review): horovod is not pinned to a version, so rebuilding this image
# can pick up a different release - consider pinning.
RUN ldconfig /usr/local/cuda-10.0/targets/x86_64-linux/lib/stubs \
 && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_WITH_TENSORFLOW=1 \
        pip install --no-cache-dir horovod \
 && ldconfig
131
+
132
# Sets SAGEMAKER_TRAINING_MODULE to the "module:function" string
# sagemaker_tensorflow_container.training:main (presumably the entry point the
# SageMaker training toolkit invokes - confirm against the toolkit).
# Uses key=value form; the space-separated `ENV key value` form is deprecated.
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
0 commit comments