-
Notifications
You must be signed in to change notification settings - Fork 162
Add distributed training support #98
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
4865666
4ff119a
10be00e
d0812a1
269f85a
2ecb65b
5738e0f
386b3de
398e579
74e29a8
f4b7a82
16a8aad
bb25524
cd94ca9
74670e6
8bb1aac
accc678
93339fa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
# Base image for the CPU build: stock Ubuntu 16.04.
FROM ubuntu:16.04

# MAINTAINER is deprecated in the Dockerfile reference; a LABEL carries the
# same ownership metadata.
LABEL maintainer="Amazon AI"
|
||
# Build arguments:
#   framework_installable          required; artifact that is COPY'd into the
#                                  image and pip-installed further down.
#   framework_support_installable  support package artifact; defaults to the
#                                  2.0.0 source distribution.
#   py_version                     required; compared against 3 further down to
#                                  choose python3.6 over the distro `python`.
ARG framework_installable
ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz
ARG py_version

# Fail the build early when a required argument is missing or empty.
# The values are quoted so `test` stays well-formed even if an argument
# contains whitespace or starts with a dash.
RUN test -n "$framework_installable" \
 && test -n "$py_version"
|
||
# Install software-properties-common, register the deadsnakes PPA, and remove
# the apt package lists again within the same layer.
# NOTE(review): the PPA is presumably the source of the python3.6 package
# installed later - confirm.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        software-properties-common \
 && add-apt-repository -y ppa:deadsnakes/ppa \
 && rm -rf /var/lib/apt/lists/*
|
||
# Install the OS packages the image needs plus the Python interpreter chosen
# by py_version:
#   py_version == 3  -> python3.6, with /usr/bin/python re-pointed at it
#   anything else    -> the distribution's default `python` package
#
# $buildDeps is deliberately left unquoted so the shell splits it into
# individual package names. The apt package lists are fetched once and
# removed as the very last step, so the interpreter install no longer leaves
# a second copy of them behind in this layer.
RUN buildDeps=" \
        ca-certificates \
        curl \
        nginx \
    " \
 && apt-get update \
 && apt-get install -y --no-install-recommends $buildDeps \
 && if [ "$py_version" -eq 3 ]; then \
        apt-get install -y --no-install-recommends python3.6 \
     && ln -s -f /usr/bin/python3.6 /usr/bin/python; \
    else \
        apt-get install -y --no-install-recommends python; \
    fi \
 && rm -rf /var/lib/apt/lists/*
|
||
# PYTHONDONTWRITEBYTECODE=1: Python won't try to write .pyc or .pyo files on
# the import of source modules.
# PYTHONUNBUFFERED=1: per the Python documentation, stdout and stderr are left
# unbuffered.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1
|
||
# Bootstrap pip 18.1 with get-pip.py, print the installed version, then strip
# test directories and compiled bytecode (*.pyc / *.pyo) from /usr/local and
# delete the bootstrap script.
#
# Every step is chained with && so that a failed download or a failed pip
# install aborts the build. With `;` separators only the exit status of the
# final `rm` decided whether the layer succeeded.
RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py \
 && python get-pip.py \
        --disable-pip-version-check \
        --no-cache-dir \
        "pip==18.1" \
 && pip --version \
 && find /usr/local -depth \
        \( \
            \( -type d -a \( -name test -o -name tests \) \) \
            -o \
            \( -type f -a \( -name '*.pyc' -o -name '*.pyo' \) \) \
        \) -exec rm -rf '{}' + \
 && rm get-pip.py
|
||
# Environment variables for MKL.
# TODO: investigate the right value for OMP_NUM_THREADS
# For more about MKL with TensorFlow see:
# https://www.tensorflow.org/performance/performance_guide#tensorflow_with_intel%C2%AE_mkl_dnn
ENV KMP_AFFINITY=granularity=fine,compact,1,0 \
    KMP_BLOCKTIME=1 \
    KMP_SETTINGS=0
|
||
WORKDIR /

# COPY places each artifact in the working directory under its base name,
# dropping any leading directories of the build-arg path.
COPY $framework_installable .
COPY $framework_support_installable .

# Install the framework, the SageMaker support package and
# sagemaker-tensorflow, delete the copied artifacts, then uninstall markdown
# and tensorboard.
#
# The artifacts are addressed by base name (the same approach the GPU
# Dockerfile uses) so that a build argument such as dist/<file> still resolves
# to the file COPY placed in the working directory. The variables stay
# unquoted, as before, so existing word-splitting behaviour is unchanged.
RUN framework_installable_local=$(basename $framework_installable) \
 && framework_support_installable_local=$(basename $framework_support_installable) \
 && pip install --no-cache-dir \
        $framework_installable_local \
        $framework_support_installable_local \
        "sagemaker-tensorflow>=1.11,<1.12" \
 && rm -f $framework_installable_local \
 && rm -f $framework_support_installable_local \
 && pip uninstall -y --no-cache-dir \
        markdown \
        tensorboard
|
||
# Training entry point in "<module>:<function>" form.
# NOTE(review): presumably read by the SageMaker container tooling at start-up
# - confirm against the consumer.
# Uses the key=value form; the space-separated ENV form is legacy syntax.
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,89 @@ | ||
# Base image for the GPU build: NVIDIA CUDA 9.0 "base" flavour on Ubuntu 16.04.
FROM nvidia/cuda:9.0-base-ubuntu16.04

# MAINTAINER is deprecated in the Dockerfile reference; a LABEL carries the
# same ownership metadata.
LABEL maintainer="Amazon AI"
|
||
# Build arguments:
#   framework_installable          required; artifact that is COPY'd into the
#                                  image and pip-installed further down.
#   framework_support_installable  support package artifact; defaults to the
#                                  2.0.0 source distribution.
#   py_version                     required; compared against 3 further down to
#                                  choose python3.6 over the distro `python`.
ARG framework_installable
ARG framework_support_installable=sagemaker_tensorflow_container-2.0.0.tar.gz
ARG py_version

# Fail the build early when a required argument is missing or empty.
# The values are quoted so `test` stays well-formed even if an argument
# contains whitespace or starts with a dash.
RUN test -n "$framework_installable" \
 && test -n "$py_version"
|
||
# Install software-properties-common, register the deadsnakes PPA, and remove
# the apt package lists again within the same layer.
# NOTE(review): the PPA is presumably the source of the python3.6 package
# installed later - confirm.
RUN apt-get update \
 && apt-get install -y --no-install-recommends \
        software-properties-common \
 && add-apt-repository -y ppa:deadsnakes/ppa \
 && rm -rf /var/lib/apt/lists/*
|
||
# Version pins used by the apt install step that follows:
#   NCCL_VERSION         -> libnccl2=<version>
#   CUDNN_VERSION        -> libcudnn7=<version>
#   TF_TENSORRT_VERSION  -> libnvinfer4=<version>-1+cuda9.0
ENV NCCL_VERSION=2.3.5-2+cuda9.0 \
    CUDNN_VERSION=7.3.1.20-1+cuda9.0 \
    TF_TENSORRT_VERSION=4.1.2
|
||
# Install the CUDA 9.0 libraries, cuDNN, NCCL, libnvinfer4 (TensorRT) and the
# Python interpreter chosen by py_version:
#   py_version == 3  -> python3.6, with /usr/bin/python re-pointed at it
#   anything else    -> the distribution's default `python` package
#
# The 'apt-get install' of nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0
# adds a new list which contains the libnvinfer library, so it needs another
# 'apt-get update' to retrieve that list before it can actually install the
# library.
# We don't install libnvinfer-dev since we don't need to build against TensorRT,
# and libnvinfer4 doesn't contain the libnvinfer.a static library.
#
# $buildDeps is deliberately left unquoted so the shell splits it into
# individual package names. The interpreter is installed before the apt
# clean-up so that the package lists are not downloaded a second time and
# left behind in this layer.
# NOTE(review): the three libnv* removals at the end presumably drop TensorRT
# components the image does not use - confirm.
RUN buildDeps=" \
        ca-certificates \
        cuda-command-line-tools-9-0 \
        cuda-cublas-dev-9-0 \
        cuda-cudart-dev-9-0 \
        cuda-cufft-dev-9-0 \
        cuda-curand-dev-9-0 \
        cuda-cusolver-dev-9-0 \
        cuda-cusparse-dev-9-0 \
        curl \
        libcudnn7=${CUDNN_VERSION} \
        libnccl2=${NCCL_VERSION} \
        libgomp1 \
    " \
 && apt-get update \
 && apt-get install -y --no-install-recommends $buildDeps \
 && apt-get install -y --no-install-recommends \
        nvinfer-runtime-trt-repo-ubuntu1604-4.0.1-ga-cuda9.0 \
 && apt-get update \
 && apt-get install -y --no-install-recommends \
        libnvinfer4=${TF_TENSORRT_VERSION}-1+cuda9.0 \
 && if [ "$py_version" -eq 3 ]; then \
        apt-get install -y --no-install-recommends python3.6 \
     && ln -s -f /usr/bin/python3.6 /usr/bin/python; \
    else \
        apt-get install -y --no-install-recommends python; \
    fi \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/* \
 && rm /usr/lib/x86_64-linux-gnu/libnvinfer_plugin* \
 && rm /usr/lib/x86_64-linux-gnu/libnvcaffe_parser* \
 && rm /usr/lib/x86_64-linux-gnu/libnvparsers*
|
||
# PYTHONDONTWRITEBYTECODE=1: Python won't try to write .pyc or .pyo files on
# the import of source modules.
# PYTHONUNBUFFERED=1: per the Python documentation, stdout and stderr are left
# unbuffered.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1
|
||
# Bootstrap pip 18.1 with get-pip.py, print the installed version, then strip
# test directories and compiled bytecode (*.pyc / *.pyo) from /usr/local and
# delete the bootstrap script.
#
# Every step is chained with && so that a failed download or a failed pip
# install aborts the build. With `;` separators only the exit status of the
# final `rm` decided whether the layer succeeded.
RUN curl -fSsL -O https://bootstrap.pypa.io/get-pip.py \
 && python get-pip.py \
        --disable-pip-version-check \
        --no-cache-dir \
        "pip==18.1" \
 && pip --version \
 && find /usr/local -depth \
        \( \
            \( -type d -a \( -name test -o -name tests \) \) \
            -o \
            \( -type f -a \( -name '*.pyc' -o -name '*.pyo' \) \) \
        \) -exec rm -rf '{}' + \
 && rm get-pip.py
|
||
WORKDIR /root

# COPY places each artifact in the working directory under its base name,
# dropping any leading directories of the build-arg path.
COPY $framework_installable .
COPY $framework_support_installable .

# Install the framework, the SageMaker support package and
# sagemaker-tensorflow, then delete the copied artifacts.
#
# basename maps each build-arg path to the file name COPY produced above.
# Every pip invocation passes --no-cache-dir (spelled out in full) so that no
# pip download cache is kept in the layer; previously only the first install
# disabled the cache.
RUN framework_installable_local=$(basename $framework_installable) \
 && framework_support_installable_local=$(basename $framework_support_installable) \
 && pip install --no-cache-dir --upgrade $framework_installable_local \
 && pip install --no-cache-dir $framework_support_installable_local \
 && pip install --no-cache-dir "sagemaker-tensorflow>=1.11,<1.12" \
 && rm $framework_installable_local \
 && rm $framework_support_installable_local
|
||
# Training entry point in "<module>:<function>" form.
# NOTE(review): presumably read by the SageMaker container tooling at start-up
# - confirm against the consumer.
# Uses the key=value form; the space-separated ENV form is legacy syntax.
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_tensorflow_container.training:main
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You can use a linter like https://github.com/hadolint to help you here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I have run this locally; the only thing that comes up now is the version-pinning warning. I will add it to the build in the next PR. |
Uh oh!
There was an error while loading. Please reload this page.