Skip to content

Commit 739e1b0

Browse files
authored
Merge pull request #1182 from Kaggle/upgrade-tf2.9
Upgrade to TensorFlow 2.9
2 parents eca5485 + 8f810af commit 739e1b0

File tree

9 files changed

+81
-119
lines changed

9 files changed

+81
-119
lines changed

Dockerfile.tmpl

Lines changed: 45 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,15 @@ ARG TORCHVISION_VERSION
1212
FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl
1313
FROM gcr.io/kaggle-images/python-torch-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${TORCH_VERSION} AS torch_whl
1414
FROM ${BASE_IMAGE_REPO}/${GPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
15+
{{ else }}
16+
FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
17+
{{ end }}
18+
19+
# Ensures shared libraries installed with conda can be found by the dynamic link loader.
20+
ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib"
21+
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib"
22+
23+
{{ if eq .Accelerator "gpu" }}
1524
ARG CUDA_MAJOR_VERSION
1625
ARG CUDA_MINOR_VERSION
1726
ENV CUDA_MAJOR_VERSION=${CUDA_MAJOR_VERSION}
@@ -22,11 +31,10 @@ ENV PATH=/opt/bin:${PATH}
2231
ENV LD_LIBRARY_PATH_NO_STUBS="$LD_LIBRARY_PATH"
2332
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/cuda/lib64/stubs"
2433
RUN ln -s /usr/local/cuda/lib64/stubs/libcuda.so /usr/local/cuda/lib64/stubs/libcuda.so.1
25-
{{ else }}
26-
FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
2734
{{ end }}
35+
2836
# Keep these variables in sync if base image is updated.
29-
ENV TENSORFLOW_VERSION=2.6.4
37+
ENV TENSORFLOW_VERSION=2.9.2
3038

3139
# We need to redefine the ARG here to get the ARG value defined above the FROM instruction.
3240
# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
@@ -76,33 +84,42 @@ ENV PROJ_LIB=/opt/conda/share/proj
7684
# the remaining pip commands: https://www.anaconda.com/using-pip-in-a-conda-environment/
7785
RUN conda config --add channels nvidia && \
7886
conda config --add channels rapidsai && \
87+
conda install -c conda-forge mamba && \
7988
# Base image channel order: conda-forge (highest priority), defaults.
8089
# End state: rapidsai (highest priority), nvidia, conda-forge, defaults.
81-
conda install mkl cartopy=0.19 imagemagick=7.1 pyproj==3.1.0 && \
90+
mamba install mkl cartopy=0.19 imagemagick=7.1 pyproj==3.1.0 && \
8291
/tmp/clean-layer.sh
8392

8493
{{ if eq .Accelerator "gpu" }}
8594

8695
# b/232247930: uninstall pyarrow to avoid double installation with the GPU specific version.
87-
RUN pip uninstall -y pyarrow && \
88-
conda install cudf=21.10 cuml=21.10 cudatoolkit=$CUDA_MAJOR_VERSION.$CUDA_MINOR_VERSION && \
89-
/tmp/clean-layer.sh
90-
{{ end }}
96+
# b/267180053: RapidsAI (cudf/cuml) are not compatible with the latest tensorflow cudatoolkit version.
97+
# RUN pip uninstall -y pyarrow && \
98+
# mamba install -y cudf cuml && \
99+
# /tmp/clean-layer.sh
100+
# {{ end }}
91101

92102
# Install implicit
93103
{{ if eq .Accelerator "gpu" }}
94-
RUN conda install implicit implicit-proc=*=gpu && \
104+
RUN mamba install implicit implicit-proc=*=gpu && \
95105
/tmp/clean-layer.sh
96106
{{ else }}
97-
RUN conda install implicit && \
107+
RUN mamba install implicit && \
98108
/tmp/clean-layer.sh
99109
{{ end}}
100110

101111
# Install PyTorch
102112
{{ if eq .Accelerator "gpu" }}
103113
COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/
104-
RUN conda install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} && \
114+
RUN mamba install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION} && \
105115
pip install /tmp/torch/*.whl && \
116+
# b/255757999 openmp (libomp.so) is a dependency of libtorchtext and libtorchaudio but
117+
# the built-from-source versions don't seem to properly link it in. This forces the dep
118+
# which makes sure that libomp is loaded when these libraries are loaded.
119+
mamba install -y openmp && \
120+
pip install patchelf && \
121+
patchelf --add-needed libomp.so /opt/conda/lib/python3.7/site-packages/torchtext/lib/libtorchtext.so && \
122+
patchelf --add-needed libomp.so /opt/conda/lib/python3.7/site-packages/torchaudio/lib/libtorchaudio.so && \
106123
rm -rf /tmp/torch && \
107124
/tmp/clean-layer.sh
108125
{{ else }}
@@ -141,7 +158,8 @@ RUN pip install jax[cpu] && \
141158

142159
# Install mxnet
143160
{{ if eq .Accelerator "gpu" }}
144-
RUN pip install mxnet-cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \
161+
# No specific package for 11.3 minor versions, using 11.2 instead.
162+
RUN pip install mxnet-cu112 && \
145163
/tmp/clean-layer.sh
146164
{{ else }}
147165
RUN pip install mxnet && \
@@ -160,10 +178,11 @@ RUN pip install spacy && \
160178
# Install GPU specific packages
161179
{{ if eq .Accelerator "gpu" }}
162180
# Install GPU-only packages
181+
# No specific package for nnabla-ext-cuda 11.x minor versions.
163182
RUN pip install pycuda \
164183
pynvrtc \
165184
pynvml \
166-
nnabla-ext-cuda$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION && \
185+
nnabla-ext-cuda${CUDA_MAJOR_VERSION}0 && \
167186
/tmp/clean-layer.sh
168187
{{ end }}
169188

@@ -176,9 +195,9 @@ RUN pip install pysal \
176195
# Use `conda install -c h2oai h2o` once Python 3.7 version is released to conda.
177196
apt-get install -y default-jre-headless && \
178197
pip install -f https://h2o-release.s3.amazonaws.com/h2o/latest_stable_Py.html h2o \
179-
tensorflow-gcs-config==2.6.0 \
180-
tensorflow-addons==0.14.0 \
181-
tensorflow_decision_forests==0.2.0 && \
198+
"tensorflow-gcs-config<=${TENSORFLOW_VERSION}" \
199+
tensorflow-addons==0.17.1 \
200+
tensorflow_decision_forests==0.2.7 && \
182201
/tmp/clean-layer.sh
183202

184203
RUN apt-get install -y libfreetype6-dev && \
@@ -393,6 +412,8 @@ RUN pip install cython \
393412
mlcrate && \
394413
/tmp/clean-layer.sh
395414

415+
416+
# Fix qgrid by pinning ipywidgets https://github.com/quantopian/qgrid/issues/376
396417
RUN pip install bleach \
397418
certifi \
398419
cycler \
@@ -402,7 +423,7 @@ RUN pip install bleach \
402423
ipykernel \
403424
ipython \
404425
ipython-genutils \
405-
ipywidgets \
426+
ipywidgets==7.7.1 \
406427
isoweek \
407428
jedi \
408429
jsonschema \
@@ -459,6 +480,10 @@ RUN pip install bleach \
459480
#
460481
###########
461482

483+
# dlib has a libmkl incompatibility:
484+
# test_dlib_face_detector (test_dlib.TestDLib) ... INTEL MKL ERROR: /opt/conda/bin/../lib/libmkl_avx512.so.2: undefined symbol: mkl_sparse_optimize_bsr_trsm_i8.
485+
# Intel MKL FATAL ERROR: Cannot load libmkl_avx512.so.2 or libmkl_def.so.2.
486+
# nnabla breaks protobuf compatibility:
462487
RUN pip install flashtext \
463488
wandb \
464489
# b/214080882 blake3 0.3.0 is not compatible with vaex.
@@ -505,10 +530,8 @@ RUN pip install flashtext \
505530
transformers \
506531
# b/232247930 >= 2.2.0 requires pyarrow >= 6.0.0 which conflicts with dependencies for rapidsai 0.21.*
507532
datasets==2.1.0 \
508-
dlib \
509533
kaggle-environments \
510534
geopandas \
511-
nnabla \
512535
vowpalwabbit \
513536
pydub \
514537
pydegensac \
@@ -600,6 +623,9 @@ RUN jupyter-nbextension disable nb_conda --py --sys-prefix && \
600623
jupyter-serverextension disable nb_conda --py --sys-prefix && \
601624
python -m nb_conda_kernels.install --disable
602625

626+
# Force only one libcusolver
627+
RUN rm /opt/conda/bin/../lib/libcusolver.so.11 && ln -s /usr/local/cuda/lib64/libcusolver.so.11 /opt/conda/bin/../lib/libcusolver.so.11
628+
603629
# Set backend for matplotlib
604630
ENV MPLBACKEND "agg"
605631

Jenkinsfile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ pipeline {
106106
stages {
107107
stage('Build GPU Image') {
108108
options {
109-
timeout(time: 180, unit: 'MINUTES')
109+
timeout(time: 4324, unit: 'MINUTES')
110110
}
111111
steps {
112112
sh '''#!/bin/bash

config.txt

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,11 @@
11
BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release
2-
BASE_IMAGE_TAG=m94
3-
CPU_BASE_IMAGE_NAME=tf2-cpu.2-6
4-
GPU_BASE_IMAGE_NAME=tf2-gpu.2-6
2+
BASE_IMAGE_TAG=m96
3+
CPU_BASE_IMAGE_NAME=tf2-cpu.2-9
4+
GPU_BASE_IMAGE_NAME=tf2-gpu.2-9
55
LIGHTGBM_VERSION=3.3.2
6-
TORCH_VERSION=1.11.0
7-
# TODO(b/215031404#comment4) Remove zlib sed command after upgrade to >= 0.11.1
8-
TORCHAUDIO_VERSION=0.11.0
9-
TORCHTEXT_VERSION=0.12.0
10-
TORCHVISION_VERSION=0.12.0
6+
TORCH_VERSION=1.12.0
7+
TORCHAUDIO_VERSION=0.12.0
8+
TORCHTEXT_VERSION=0.13.0
9+
TORCHVISION_VERSION=0.13.0
1110
CUDA_MAJOR_VERSION=11
12-
CUDA_MINOR_VERSION=0
11+
CUDA_MINOR_VERSION=3

packages/torch.Dockerfile

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,23 +12,30 @@ ARG CUDA_MINOR_VERSION
1212
# TORCHVISION_VERSION is mandatory
1313
RUN test -n "$TORCHVISION_VERSION"
1414

15+
# Use mamba to speed up conda installs
16+
RUN conda install -c conda-forge mamba
17+
1518
# Build instructions: https://github.com/pytorch/pytorch#from-source
16-
RUN conda install astunparse numpy ninja pyyaml mkl mkl-include setuptools==59.5.0 cmake cffi typing_extensions future six requests dataclasses
17-
RUN conda install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}
19+
RUN mamba install astunparse numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses
20+
RUN mamba install -c pytorch magma-cuda${CUDA_MAJOR_VERSION}${CUDA_MINOR_VERSION}
1821

1922
# By default, it uses the version from version.txt which includes the `a0` (alpha zero) suffix and part of the git hash.
2023
# This causes dependency conflicts like these: https://paste.googleplex.com/4786486378496000
2124
ENV PYTORCH_BUILD_VERSION=$PACKAGE_VERSION
2225
ENV PYTORCH_BUILD_NUMBER=1
2326

27+
# Ensures shared libraries installed with conda can be found by the dynamic link loader.
28+
# For PyTorch, we need specifically mkl.
29+
ENV LIBRARY_PATH="$LIBRARY_PATH:/opt/conda/lib"
30+
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/opt/conda/lib"
2431
ENV TORCH_CUDA_ARCH_LIST="3.7;6.0;7.0+PTX;7.5+PTX"
2532
ENV FORCE_CUDA=1
2633
RUN cd /usr/local/src && \
2734
git clone --recursive https://github.com/pytorch/pytorch && \
2835
cd pytorch && \
2936
git checkout tags/v$PACKAGE_VERSION && \
3037
git submodule sync && \
31-
git submodule update --init --recursive --jobs 0 && \
38+
git submodule update --init --recursive --jobs 1 && \
3239
python setup.py bdist_wheel
3340

3441
# Install torch which is required before we can build other torch* packages.
@@ -38,14 +45,17 @@ RUN pip install /usr/local/src/pytorch/dist/*.whl
3845
# Instructions: https://github.com/pytorch/audio#from-source
3946
# See comment above for PYTORCH_BUILD_VERSION.
4047
ENV BUILD_VERSION=$TORCHAUDIO_VERSION
41-
RUN cd /usr/local/src && \
48+
RUN sudo apt-get update && \
49+
# ncurses.h is required for this install
50+
sudo apt-get install libncurses-dev && \
51+
# Fixing the build: https://github.com/pytorch/audio/issues/666#issuecomment-635928685
52+
mamba install -c conda-forge ncurses && \
53+
cd /usr/local/src && \
4254
git clone https://github.com/pytorch/audio && \
4355
cd audio && \
4456
git checkout tags/v$TORCHAUDIO_VERSION && \
4557
git submodule sync && \
46-
git submodule update --init --recursive --jobs 0 && \
47-
# TODO(b/215031404#comment4) Remove after upgrade next release (0.11.1)
48-
sed -i s?https://zlib.net/zlib-1.2.11.tar.gz?https://sourceforge.net/projects/libpng/files/zlib/1.2.11/zlib-1.2.11.tar.gz? third_party/zlib/CMakeLists.txt && \
58+
git submodule update --init --recursive --jobs 1 && \
4959
python setup.py bdist_wheel
5060

5161
# Build torchtext
@@ -57,7 +67,7 @@ RUN cd /usr/local/src && \
5767
cd text && \
5868
git checkout tags/v$TORCHTEXT_VERSION && \
5969
git submodule sync && \
60-
git submodule update --init --recursive --jobs 0 && \
70+
git submodule update --init --recursive --jobs 1 && \
6171
python setup.py bdist_wheel
6272

6373
# Build torchvision.
@@ -81,4 +91,4 @@ COPY --from=builder /usr/local/src/text/dist/*.whl /tmp/whl
8191
COPY --from=builder /usr/local/src/vision/dist/*.whl /tmp/whl
8292

8393
# Print out the built .whl file.
84-
RUN ls -lh /tmp/whl/
94+
RUN ls -lh /tmp/whl/

tests/test_allennlp.py

Lines changed: 0 additions & 15 deletions
This file was deleted.

tests/test_dlib.py

Lines changed: 0 additions & 14 deletions
This file was deleted.

tests/test_implicit.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,5 +28,11 @@ def test_model(self):
2828
model.fit(counts, show_progress=False)
2929
rows, cols = model.item_factors, model.user_factors
3030

31-
assert not np.isnan(np.sum(cols))
32-
assert not np.isnan(np.sum(rows))
31+
assert not np.isnan(np.sum(tonumpy(cols)))
32+
assert not np.isnan(np.sum(tonumpy(rows)))
33+
34+
35+
def tonumpy(x):
36+
if hasattr(x, 'to_numpy'):
37+
return x.to_numpy()
38+
return x

tests/test_nnabla.py

Lines changed: 0 additions & 28 deletions
This file was deleted.

tests/test_rapids.py

Lines changed: 0 additions & 22 deletions
This file was deleted.

0 commit comments

Comments
 (0)