Upgrade to PyTorch 1.8.1, TensorFlow 2.5.0 and Rapids 21.08

rosbo · rosbo · commit b39b8120ae9e · 2021-08-26T21:22:47.000Z
And CUDA 11.2

http://b/181966788
diff --git a/Dockerfile b/Dockerfile
@@ -1,5 +1,5 @@
 ARG BASE_TAG=m78
-ARG TENSORFLOW_VERSION=2.4.1
+ARG TENSORFLOW_VERSION=2.5.0
 
 FROM gcr.io/deeplearning-platform-release/base-cpu:${BASE_TAG}
 
@@ -47,7 +47,7 @@ RUN conda config --add channels conda-forge && \
     conda install cartopy=0.19 imagemagick=7.0 pyproj==3.1.0 pysal==2.1.0 && \
     /tmp/clean-layer.sh
 
-RUN pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \
+RUN pip install torch==1.8.1+cpu torchvision==0.9.1+cpu torchaudio==0.8.1 torchtext==0.9.1 -f https://download.pytorch.org/whl/torch_stable.html && \
     /tmp/clean-layer.sh
 
 RUN pip install seaborn python-dateutil dask python-igraph && \
@@ -60,9 +60,9 @@ RUN pip install seaborn python-dateutil dask python-igraph && \
     /tmp/clean-layer.sh
 
 RUN pip install tensorflow==${TENSORFLOW_VERSION} && \
-    pip install tensorflow-gcs-config==2.4.0 && \
-    pip install tensorflow-addons==0.12.1 && \
-    pip install tensorflow_probability==0.12.2 && \
+    pip install tensorflow-gcs-config==${TENSORFLOW_VERSION} && \
+    pip install tensorflow-addons==0.13.0 && \
+    pip install tensorflow_probability==0.13.0 && \
     /tmp/clean-layer.sh
 
 RUN apt-get install -y libfreetype6-dev && \
@@ -330,8 +330,7 @@ RUN pip install bleach && \
     pip install widgetsnbextension && \
     pip install pyarrow && \
     pip install feather-format && \
-    # fastai >= 2.3.1 upgrades pytorch/torchvision. upgrade of pytorch will be handled in b/181966788
-    pip install fastai==2.2.7 && \
+    pip install fastai && \
     pip install allennlp && \
     # https://b.corp.google.com/issues/184685619#comment9: 3.9.0 is causing a major performance degradation with spacy 2.3.5
     pip install importlib-metadata==3.4.0 && \
diff --git a/gpu.Dockerfile b/gpu.Dockerfile
@@ -1,6 +1,6 @@
 ARG BASE_TAG=staging
 
-FROM nvidia/cuda:11.0-cudnn8-devel-ubuntu18.04 AS nvidia
+FROM nvidia/cuda:11.2.2-cudnn8-devel-ubuntu18.04 AS nvidia
 FROM gcr.io/kaggle-images/python:${BASE_TAG}
 
 ADD clean-layer.sh  /tmp/clean-layer.sh
@@ -13,7 +13,7 @@ COPY --from=nvidia /etc/apt/trusted.gpg /etc/apt/trusted.gpg.d/cuda.gpg
 RUN sed -i 's/deb https:\/\/developer.download.nvidia.com/deb http:\/\/developer.download.nvidia.com/' /etc/apt/sources.list.d/*.list
 
 ENV CUDA_MAJOR_VERSION=11
-ENV CUDA_MINOR_VERSION=0
+ENV CUDA_MINOR_VERSION=2
 ENV CUDA_VERSION=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION
 LABEL com.nvidia.volumes.needed="nvidia_driver"
 LABEL com.nvidia.cuda.version="${CUDA_VERSION}"
@@ -27,7 +27,9 @@ ENV LD_LIBRARY_PATH_NO_STUBS="/usr/local/nvidia/lib64:/usr/local/cuda/lib64:$LD_
 ENV LD_LIBRARY_PATH="/usr/local/nvidia/lib64:/usr/local/cuda/lib64:/usr/local/cuda/lib64/stubs:$LD_LIBRARY_PATH"
 ENV NVIDIA_VISIBLE_DEVICES=all
 ENV NVIDIA_DRIVER_CAPABILITIES=compute,utility
-ENV NVIDIA_REQUIRE_CUDA="cuda>=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION"
+# With CUDA Enhanced Compatibility, applications compiled with CUDA 11.1 can be run on the driver associated with CUDA 11.0 (i.e. R450), so only the CUDA major version is required below.
+# See: https://docs.nvidia.com/deploy/cuda-compatibility/index.html#existing-apps-minor-versions
+ENV NVIDIA_REQUIRE_CUDA="cuda>=$CUDA_MAJOR_VERSION"
 RUN apt-get update && apt-get install -y --no-install-recommends \
       cuda-cupti-$CUDA_VERSION \
       cuda-cudart-$CUDA_VERSION \
@@ -37,10 +39,10 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
       cuda-nvml-dev-$CUDA_VERSION \
       cuda-minimal-build-$CUDA_VERSION \
       cuda-command-line-tools-$CUDA_VERSION \
-      libcudnn8=8.0.4.30-1+cuda$CUDA_VERSION \
-      libcudnn8-dev=8.0.4.30-1+cuda$CUDA_VERSION \
-      libnccl2=2.7.8-1+cuda$CUDA_VERSION \
-      libnccl-dev=2.7.8-1+cuda$CUDA_VERSION && \
+      libcudnn8=8.1.1.33-1+cuda$CUDA_VERSION \
+      libcudnn8-dev=8.1.1.33-1+cuda$CUDA_VERSION \
+      libnccl2=2.8.4-1+cuda$CUDA_VERSION \
+      libnccl-dev=2.8.4-1+cuda$CUDA_VERSION && \
     ln -s /usr/local/cuda-$CUDA_VERSION /usr/local/cuda && \
     ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1 && \
     /tmp/clean-layer.sh
@@ -55,15 +57,18 @@ RUN apt-get install -y ocl-icd-libopencl1 clinfo libboost-all-dev && \
 # the remaining pip commands: https://www.anaconda.com/using-pip-in-a-conda-environment/
 # However, because this image is based on the CPU image, this isn't possible but better
 # to put them at the top of this file to minize conflicts.
-RUN conda install cudf=21.06 cuml=21.06 cudatoolkit=$CUDA_VERSION && \
+RUN conda install cudf=21.08 cuml=21.08 cudatoolkit=$CUDA_VERSION && \
     /tmp/clean-layer.sh
 
 # Install Pytorch and torchvision with GPU support.
-# Note: torchtext and torchaudio do not require a separate GPU package.
-RUN pip install torch==1.7.1+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchvision==0.8.2+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION -f https://download.pytorch.org/whl/torch_stable.html && \
+# Note: torchtext and torchaudio do not require a separate package.
+# Replace `cu111` with `cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION` once a build for CUDA 11.2 is released.
+# Introduced in CUDA 11.1, CUDA Enhanced Compatibility leverages semantic versioning across components in the CUDA Toolkit: an application can be built for one CUDA minor release (such as 11.1) and work across all future minor releases within the major family (such as 11.x).
+# See: https://docs.nvidia.com/deploy/cuda-compatibility/index.html#overview
+RUN pip install torch==1.8.1+cu111 torchvision==0.9.1+cu111 -f https://download.pytorch.org/whl/torch_stable.html && \
     /tmp/clean-layer.sh
 
-# Install LightGBM with GPU
+# Install LightGBM with GPU support
 RUN pip uninstall -y lightgbm && \
     cd /usr/local/src && \
     git clone --recursive https://github.com/microsoft/LightGBM && \
@@ -79,7 +84,8 @@ RUN pip uninstall -y lightgbm && \
     /tmp/clean-layer.sh
 
 # Install JAX (Keep JAX version in sync with CPU image)
-RUN pip install jax==0.2.16 jaxlib==0.1.68+cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION -f https://storage.googleapis.com/jax-releases/jax_releases.html && \
+# TODO(b/181966788) Replace `cuda111` with `cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION` once a jaxlib build for that CUDA version is released.
+RUN pip install jax==0.2.16 jaxlib==0.1.68+cuda111 -f https://storage.googleapis.com/jax-releases/jax_releases.html && \
     /tmp/clean-layer.sh
 
 # Reinstall packages with a separate version for GPU support.
@@ -90,9 +96,8 @@ RUN pip uninstall -y mxnet && \
 # Install GPU-only packages
 RUN pip install pycuda && \
     pip install pynvrtc && \
-    # b/190622765 latest version is causing issue. nnabla fixed it in https://github.com/sony/nnabla/issues/892, waiting for new release before we can remove this pin.
-    pip install pynvml==8.0.4 && \
-    pip install nnabla-ext-cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \
+    # TODO(b/181966788) Replace `110` with `$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION` once a new version of nnabla-ext-cuda is out.
+    pip install nnabla-ext-cuda110 && \
     /tmp/clean-layer.sh
 
 # Re-add TensorBoard Jupyter extension patch