
Upgrade to TensorFlow 2.9 #1182


Merged: 48 commits, Feb 9, 2023

Commits
932bdeb  Upgrade to TensorFlow 2.9 (rosbo, Jun 9, 2022)
019234e  Upgrade PyTorch to 1.12 (rosbo, Jul 6, 2022)
9fda507  Set CUDA_MINOR_VERSION to 3 (rosbo, Aug 3, 2022)
4bd4496  Add conda libs to LD_LIBRARY_PATH (rosbo, Aug 3, 2022)
4896dad  Use DLVM image m95 (djherbis, Aug 10, 2022)
49aafb7  The tag name is m95_release for some reason (djherbis, Aug 10, 2022)
c8649e5  Merge branch 'main' into upgrade-tf2.9 (djherbis, Aug 10, 2022)
776a50a  Reorder FROM & ENV so that build works correctly (djherbis, Aug 10, 2022)
d674a42  They fixed the tag to m95 (djherbis, Aug 15, 2022)
8356a51  Use LIBRARY_PATH instead of LD_LIBRARY_PATH for linking (djherbis, Aug 16, 2022)
855fe8f  We need both LIBRARY_PATH and LD_LIBRARY_PATH (djherbis, Aug 16, 2022)
85e1c42  LIBRARY_PATH & LD_LIBRARY_PATH for linking (djherbis, Aug 16, 2022)
fb69472  ncurses.h is required to install torch audio (djherbis, Aug 16, 2022)
74046c2  Bump cudf/cuml to 21.12 (djherbis, Aug 16, 2022)
ba6160c  Update Dockerfile.tmpl (djherbis, Aug 16, 2022)
df511c3  remove extra $ (djherbis, Aug 16, 2022)
a86e055  update tensorflow decision forest and addons (djherbis, Aug 17, 2022)
3df3b6d  Update config.txt (djherbis, Sep 19, 2022)
6bb31b0  fix pytorch audio and drop allennlp and dlib (djherbis, Sep 19, 2022)
54a0a32  fix cpu build, rm nnabla (djherbis, Sep 20, 2022)
d6986db  gpu build timed out, increase build time to 12hr for now (djherbis, Sep 22, 2022)
0254f61  Update Jenkinsfile (djherbis, Sep 26, 2022)
6ce9512  Trying m97 (djherbis, Sep 30, 2022)
3409a95  Install rapids ai using mamba which is *incredibly* faster (djherbis, Sep 30, 2022)
0aed512  Allow mamba to find a compatible Rapids version (djherbis, Sep 30, 2022)
ef84269  Trying m96 with mamba (djherbis, Oct 3, 2022)
9471f17  implicit started using a Matrix type which was breaking this check (djherbis, Oct 3, 2022)
613042e  Handle optional to_numpy (djherbis, Oct 4, 2022)
fbf984e  fix brackets (djherbis, Oct 4, 2022)
c72a84a  Merge branch 'main' into upgrade-tf2.9 (djherbis, Oct 25, 2022)
29abf1a  Make torchtext & torchaudio load libomp (djherbis, Nov 7, 2022)
8f31a85  Update Dockerfile.tmpl (djherbis, Nov 8, 2022)
87d59c9  Update Dockerfile.tmpl (djherbis, Nov 9, 2022)
abaa7f0  force cudatoolkit 11.2.2 (djherbis, Nov 21, 2022)
962daca  Use mamba for all conda installs (djherbis, Nov 22, 2022)
6732bc3  Undo force cudatoolkit, causes inconsistent env (djherbis, Nov 22, 2022)
2ec8ae9  Use mamba & include cuda upgrades in build (djherbis, Nov 22, 2022)
0910998  Disable rapidsai until compatible with tf cudatoolkit (djherbis, Jan 30, 2023)
3dfb0b0  Merge branch 'main' into upgrade-tf2.9 (djherbis, Jan 30, 2023)
902a5ff  Update Dockerfile.tmpl (djherbis, Jan 30, 2023)
f9a5136  Update Dockerfile.tmpl (djherbis, Jan 30, 2023)
537bd6e  Update config.txt (djherbis, Jan 30, 2023)
f52379a  Fix gitextensions bug (djherbis, Jan 31, 2023)
d527f3e  try 2.11 (djherbis, Feb 6, 2023)
3aee977  Update config.txt (djherbis, Feb 7, 2023)
5982053  Update Dockerfile.tmpl (djherbis, Feb 7, 2023)
c654412  Update Dockerfile.tmpl (djherbis, Feb 8, 2023)
8f810af  remove test_repids.py (djherbis, Feb 8, 2023)
64 changes: 45 additions & 19 deletions Dockerfile.tmpl
@@ -12,6 +12,15 @@ ARG TORCHVISION_VERSION
FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl
FROM gcr.io/kaggle-images/python-torch-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${TORCH_VERSION} AS torch_whl
FROM ${BASE_IMAGE_REPO}/${GPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
{{ else }}
FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
{{ end }}

# Ensures shared libraries installed with conda can be found by the dynamic link loader.
ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib"
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib"

{{ if eq .Accelerator "gpu" }}
ARG CUDA_MAJOR_VERSION
ARG CUDA_MINOR_VERSION
ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
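Note on the two ENV lines added above: the commit history settled on keeping both ("Use LIBRARY_PATH instead of LD_LIBRARY_PATH for linking", then "We need both LIBRARY_PATH and LD_LIBRARY_PATH") because they serve different stages. LIBRARY_PATH is searched by gcc/ld when wheels are compiled inside the image, while LD_LIBRARY_PATH is searched by the dynamic loader when the resulting binaries run. A minimal sketch of a check one could run inside the built image; the libtorchaudio path is taken from the patchelf step later in this file, and the expected output is an assumption, not part of this PR:

```bash
# Illustrative check only, not part of the Dockerfile.
echo "$LIBRARY_PATH"       # consulted by gcc/ld while pip builds wheels in the image
echo "$LD_LIBRARY_PATH"    # consulted by the dynamic loader when binaries run
# A shared object living under /opt/conda/lib should resolve at run time:
ldd /opt/conda/lib/python3.7/site-packages/torchaudio/lib/libtorchaudio.so \
  | grep 'not found' || echo 'all shared dependencies resolved'
```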
@@ -22,11 +31,10 @@ ENV PATH=/opt/bin:${PATH}
ENV LD_LIBRARY_PATH_NO_STUBS="$LD_LIBRARY_PATH"
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs"
RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
{{ else }}
FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
{{ end }}

# Keep these variables in sync if base image is updated.
ENV TENSORFLOW_VERSION=2.6.4
ENV TENSORFLOW_VERSION=2.9.2

# We need to redefine the ARG here to get the ARG value defined above the FROM instruction.
# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
@@ -76,33 +84,42 @@ ENV PROJ_LIB=/opt/conda/share/proj
# the remaining pip commands: https://www.anaconda.com/using-pip-in-a-conda-environment/
RUN conda config --add channels nvidia && \
conda config --add channels rapidsai && \
conda install -c conda-forge mamba && \
# Base image channel order: conda-forge (highest priority), defaults.
# End state: rapidsai (highest priority), nvidia, conda-forge, defaults.
conda install mkl cartopy=0.19 imagemagick=7.1 pyproj==3.1.0 && \
mamba install mkl cartopy=0.19 imagemagick=7.1 pyproj==3.1.0 && \
/tmp/clean-layer.sh

{{ if eq .Accelerator "gpu" }}

# b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version.
RUN pip uninstall -y pyarrow && \
conda install cudf=21.10 cuml=21.10 cudatoolkit=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \
/tmp/clean-layer.sh
{{ end }}
# b/267180053: RapidsAI (cudf/cuml) are not compatible with the latest tensorflow cudatoolkit version.
# RUN pip uninstall -y pyarrow && \
# mamba install -y cudf cuml && \
# /tmp/clean-layer.sh
# {{ end }}

# Install implicit
{{ if eq .Accelerator "gpu" }}
RUN conda install implicit implicit-proc=*=gpu && \
RUN mamba install implicit implicit-proc=*=gpu && \
/tmp/clean-layer.sh
{{ else }}
RUN conda install implicit && \
RUN mamba install implicit && \
/tmp/clean-layer.sh
{{ end}}

# Install PyTorch
{{ if eq .Accelerator "gpu" }}
COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/
RUN conda install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} && \
RUN mamba install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} && \
pip install /tmp/torch/*.whl && \
# b/255757999 openmp (libomp.so) is a dependency of libtorchtext and libtorchaudio, but
# the from-source builds don't seem to link it in properly. Forcing the dependency here
# ensures libomp is loaded whenever these libraries are loaded.
mamba install -y openmp && \
pip install patchelf && \
patchelf --add-needed libomp.so /opt/conda/lib/python3.7/site-packages/torchtext/lib/libtorchtext.so && \
patchelf --add-needed libomp.so /opt/conda/lib/python3.7/site-packages/torchaudio/lib/libtorchaudio.so && \
rm -rf /tmp/torch && \
/tmp/clean-layer.sh
{{ else }}
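On the libomp workaround in the GPU branch above: the torchtext and torchaudio extension libraries built from source call into OpenMP without recording libomp.so as a dependency, so `patchelf --add-needed` writes an explicit NEEDED entry into each .so (b/255757999). A sketch of how the result could be verified, assuming readelf (binutils) is available in the image; the expected output lines are illustrative:

```bash
# Illustrative check only: confirm libomp.so is now a declared dependency of the
# patched extension libraries (paths copied from the patchelf commands above).
for lib in /opt/conda/lib/python3.7/site-packages/torchtext/lib/libtorchtext.so \
           /opt/conda/lib/python3.7/site-packages/torchaudio/lib/libtorchaudio.so; do
  readelf -d "$lib" | grep NEEDED | grep -q libomp.so \
    && echo "ok: $lib declares libomp.so" \
    || echo "missing: $lib does not declare libomp.so"
done
```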
@@ -141,7 +158,8 @@ RUN pip install jax[cpu] && \

# Install mxnet
{{ if eq .Accelerator "gpu" }}
RUN pip install mxnet-cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \
# No specific package for 11.3 minor versions, using 11.2 instead.
RUN pip install mxnet-cu112 && \
/tmp/clean-layer.sh
{{ else }}
RUN pip install mxnet && \
@@ -160,10 +178,11 @@ RUN pip install spacy && \
# Install GPU specific packages
{{ if eq .Accelerator "gpu" }}
# Install GPU-only packages
# No specific package for nnabla-ext-cuda 11.x minor versions.
RUN pip install pycuda \
pynvrtc \
pynvml \
nnabla-ext-cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \
nnabla-ext-cuda${CUDA_MAJOR_VERSION}0 && \
/tmp/clean-layer.sh
{{ end }}

@@ -176,9 +195,9 @@ RUN pip install pysal \
# Use `conda install -c h2oai h2o` once Python 3.7 version is released to conda.
apt-get install -y default-jre-headless && \
pip install -f https://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o \
tensorflow-gcs-config==2.6.0 \
tensorflow-addons==0.14.0 \
tensorflow_decision_forests==0.2.0 && \
"tensorflow-gcs-config<=${TENSORFLOW_VERSION}" \
tensorflow-addons==0.17.1 \
tensorflow_decision_forests==0.2.7 && \
/tmp/clean-layer.sh

RUN apt-get install -y libfreetype6-dev && \
@@ -393,6 +412,8 @@ RUN pip install cython \
mlcrate && \
/tmp/clean-layer.sh


# Fix qgrid by pinning ipywidgets https://github.com/quantopian/qgrid/issues/376
RUN pip install bleach \
certifi \
cycler \
@@ -402,7 +423,7 @@ RUN pip install bleach \
ipykernel \
ipython \
ipython-genutils \
ipywidgets \
ipywidgets==7.7.1 \
isoweek \
jedi \
jsonschema \
@@ -459,6 +480,10 @@ RUN pip install bleach \
#
###########

# dlib has a libmkl incompatibility:
# test_dlib_face_detector (test_dlib.TestDLib) ... INTEL MKL ERROR: /opt/conda/bin/../lib/libmkl_avx512.so.2: undefined symbol: mkl_sparse_optimize_bsr_trsm_i8.
# Intel MKL FATAL ERROR: Cannot load libmkl_avx512.so.2 or libmkl_def.so.2.
# nnabla breaks protobuf compatibility.
RUN pip install flashtext \
wandb \
# b/214080882 blake3 0.3.0 is not compatible with vaex.
@@ -505,10 +530,8 @@ RUN pip install flashtext \
transformers \
# b/232247930 >= 2.2.0 requires pyarrow >= 6.0.0 which conflicts with dependencies for rapidsai 0.21.*
datasets==2.1.0 \
dlib \
kaggle-environments \
geopandas \
nnabla \
vowpalwabbit \
pydub \
pydegensac \
@@ -600,6 +623,9 @@ RUN jupyter-nbextension disable nb_conda --py --sys-prefix && \
jupyter-serverextension disable nb_conda --py --sys-prefix && \
python -m nb_conda_kernels.install --disable

# Force only one libcusolver
RUN rm /opt/conda/bin/../lib/libcusolver.so.11 && ln -s /usr/local/cuda/lib64/libcusolver.so.11 /opt/conda/bin/../lib/libcusolver.so.11

# Set backend for matplotlib
ENV MPLBACKEND "agg"

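On the "Force only one libcusolver" step above: the conda environment and the CUDA toolkit under /usr/local/cuda can each provide a libcusolver.so.11, so the step removes conda's copy and symlinks the toolkit's build to make every consumer resolve the same library. A small post-build sanity check might look like the following sketch; the exact resolved path depends on the base image layout and is an assumption:

```bash
# Illustrative check only: the conda lib dir should now defer to the CUDA toolkit.
readlink -f /opt/conda/lib/libcusolver.so.11   # expect a file under /usr/local/cuda/lib64/
python -c "import tensorflow as tf; print(tf.config.list_physical_devices('GPU'))"
```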
2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -106,7 +106,7 @@ pipeline {
stages {
stage('Build GPU Image') {
options {
timeout(time: 180, unit: 'MINUTES')
timeout(time: 4324, unit: 'MINUTES')
}
steps {
sh '''#!/bin/bash
17 changes: 8 additions & 9 deletions config.txt
@@ -1,12 +1,11 @@
BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release
BASE_IMAGE_TAG=m94
CPU_BASE_IMAGE_NAME=tf2-cpu.2-6
GPU_BASE_IMAGE_NAME=tf2-gpu.2-6
BASE_IMAGE_TAG=m96
CPU_BASE_IMAGE_NAME=tf2-cpu.2-9
GPU_BASE_IMAGE_NAME=tf2-gpu.2-9
LIGHTGBM_VERSION=3.3.2
TORCH_VERSION=1.11.0
# TODO(b/215031404#comment4) Remove zlib sed command after upgrade to >= 0.11.1
TORCHAUDIO_VERSION=0.11.0
TORCHTEXT_VERSION=0.12.0
TORCHVISION_VERSION=0.12.0
TORCH_VERSION=1.12.0
TORCHAUDIO_VERSION=0.12.0
TORCHTEXT_VERSION=0.13.0
TORCHVISION_VERSION=0.13.0
CUDA_MAJOR_VERSION=11
CUDA_MINOR_VERSION=0
CUDA_MINOR_VERSION=3
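The torch-family pins above move in lockstep: PyTorch 1.12 pairs with torchaudio 0.12, torchtext 0.13, and torchvision 0.13, so bumping one without the others breaks the source builds in packages/torch.Dockerfile. A quick way to confirm the versions that actually land in the image, offered as a sketch rather than part of the build:

```bash
# Illustrative only: report the installed torch-family versions for comparison
# against config.txt.
python - <<'EOF'
import torch, torchaudio, torchtext, torchvision
print("torch       ", torch.__version__)
print("torchaudio  ", torchaudio.__version__)
print("torchtext   ", torchtext.__version__)
print("torchvision ", torchvision.__version__)
EOF
```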
28 changes: 19 additions & 9 deletions packages/torch.Dockerfile
@@ -12,23 +12,30 @@ ARG CUDA_MINOR_VERSION
# TORCHVISION_VERSION is mandatory
RUN test -n "$TORCHVISION_VERSION"

# Use mamba to speed up conda installs
RUN conda install -c conda-forge mamba

# Build instructions: https://github.com/pytorch/pytorch#from-source
RUN conda install astunparse numpy ninja pyyaml mkl mkl-include setuptools==59.5.0 cmake cffi typing_extensions future six requests dataclasses
RUN conda install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}
RUN mamba install astunparse numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses
RUN mamba install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}

# By default, it uses the version from version.txt which includes the `a0` (alpha zero) suffix and part of the git hash.
# This causes dependency conflicts like these: https://paste.googleplex.com/4786486378496000
ENV PYTORCH_BUILD_VERSION=$PACKAGE_VERSION
ENV PYTORCH_BUILD_NUMBER=1

# Ensures shared libraries installed with conda can be found by the dynamic link loader.
# For PyTorch, we specifically need mkl.
ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib"
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib"
ENV TORCH_CUDA_ARCH_LIST="3.7;6.0;7.0+PTX;7.5+PTX"
ENV FORCE_CUDA=1
RUN cd /usr/local/src && \
git clone --recursive https://github.com/pytorch/pytorch && \
cd pytorch && \
git checkout tags/v$PACKAGE_VERSION && \
git submodule sync && \
git submodule update --init --recursive --jobs 0 && \
git submodule update --init --recursive --jobs 1 && \
python setup.py bdist_wheel

# Install torch which is required before we can build other torch* packages.
@@ -38,14 +45,17 @@ RUN pip install /usr/local/src/pytorch/dist/*.whl
# Instructions: https://github.com/pytorch/audio#from-source
# See comment above for PYTORCH_BUILD_VERSION.
ENV BUILD_VERSION=$TORCHAUDIO_VERSION
RUN cd /usr/local/src && \
RUN sudo apt-get update && \
# ncurses.h is required for this install
sudo apt-get install libncurses-dev && \
# Fixing the build: https://github.com/pytorch/audio/issues/666#issuecomment-635928685
mamba install -c conda-forge ncurses && \
cd /usr/local/src && \
git clone https://github.com/pytorch/audio && \
cd audio && \
git checkout tags/v$TORCHAUDIO_VERSION && \
git submodule sync && \
git submodule update --init --recursive --jobs 0 && \
# TODO(b/215031404#comment4) Remove after upgrade next release (0.11.1)
sed -i s?https://zlib.net/zlib-1.2.11.tar.gz?https://sourceforge.net/projects/libpng/files/zlib/1.2.11/zlib-1.2.11.tar.gz? third_party/zlib/CMakeLists.txt && \
git submodule update --init --recursive --jobs 1 && \
python setup.py bdist_wheel

# Build torchtext
@@ -57,7 +67,7 @@ RUN cd /usr/local/src && \
cd text && \
git checkout tags/v$TORCHTEXT_VERSION && \
git submodule sync && \
git submodule update --init --recursive --jobs 0 && \
git submodule update --init --recursive --jobs 1 && \
python setup.py bdist_wheel

# Build torchvision.
@@ -81,4 +91,4 @@ COPY --from=builder /usr/local/src/text/dist/*.whl /tmp/whl
COPY --from=builder /usr/local/src/vision/dist/*.whl /tmp/whl

# Print out the built .whl file.
RUN ls -lh /tmp/whl/
RUN ls -lh /tmp/whl/
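On PYTORCH_BUILD_VERSION above: without the override, setup.py takes the version from version.txt, which carries an a0 (alpha zero) suffix plus part of the git hash, and the resulting wheel then conflicts with downstream version pins. Exporting PYTORCH_BUILD_VERSION and PYTORCH_BUILD_NUMBER makes the wheel advertise the intended release version instead. A sketch of the effect; the exact wheel filenames are assumptions:

```bash
# Illustrative only, run from the pytorch checkout used in the builder stage.
# Without the override the wheel name carries the alpha/git suffix, e.g.
#   torch-1.12.0a0+git<sha>-cp37-cp37m-linux_x86_64.whl   (hypothetical)
export PYTORCH_BUILD_VERSION=$PACKAGE_VERSION PYTORCH_BUILD_NUMBER=1
python setup.py bdist_wheel
ls dist/                      # expect torch-<PACKAGE_VERSION>-cp37-*.whl
```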
15 changes: 0 additions & 15 deletions tests/test_allennlp.py

This file was deleted.

14 changes: 0 additions & 14 deletions tests/test_dlib.py

This file was deleted.

10 changes: 8 additions & 2 deletions tests/test_implicit.py
@@ -28,5 +28,11 @@ def test_model(self):
model.fit(counts, show_progress=False)
rows, cols = model.item_factors, model.user_factors

assert not np.isnan(np.sum(cols))
assert not np.isnan(np.sum(rows))
assert not np.isnan(np.sum(tonumpy(cols)))
assert not np.isnan(np.sum(tonumpy(rows)))


def tonumpy(x):
if hasattr(x, 'to_numpy'):
return x.to_numpy()
return x
28 changes: 0 additions & 28 deletions tests/test_nnabla.py

This file was deleted.

22 changes: 0 additions & 22 deletions tests/test_rapids.py

This file was deleted.