Use Colab as a base image. (#1444)

djherbis · web-flow · commit 66bac486a2c5 · 2024-11-27T17:05:29.000-05:00
This commit makes several major changes:
- Colab is the base image
- uv is the main package install tool
- leveraging requirements.txt instead of many separate installs
- stop building and installing tensorflow/torch/lightgbm/jax since those
  are managed by the Colab base image now

To decide which packages to install explicitly, I:
- looked at which packages are in the Colab base image
- looked at which packages were in the Kaggle image
- looked at which packages were explicitly mentioned in the Kaggle Dockerfile

This may still take a few iterations to get all the right parts in the
image, but this should hopefully make the image much more manageable.

http://b/365782129
diff --git a/Dockerfile.tmpl b/Dockerfile.tmpl
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -21,66 +21,6 @@ pipeline {
   }
 
   stages {
-    stage('Pre-build Packages from Source') {
-      parallel {
-        stage('torch') {
-          options {
-            timeout(time: 300, unit: 'MINUTES')
-          }
-          steps {
-            sh '''#!/bin/bash
-              set -exo pipefail
-              source config.txt
-              cd packages/
-              ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \
-                --package torch \
-                --version $TORCH_VERSION \
-                --build-arg TORCHAUDIO_VERSION=$TORCHAUDIO_VERSION \
-                --build-arg TORCHVISION_VERSION=$TORCHVISION_VERSION \
-                --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \
-                --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \
-                --push
-            '''
-          }
-        }
-        stage('lightgbm') {
-          options {
-            timeout(time: 10, unit: 'MINUTES')
-          }
-          steps {
-            sh '''#!/bin/bash
-              set -exo pipefail
-              source config.txt
-              cd packages/
-              ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \
-                --package lightgbm \
-                --version $LIGHTGBM_VERSION \
-                --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \
-                --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \
-                --push
-            '''
-          }
-        }
-        stage('jaxlib') {
-          options {
-            timeout(time: 300, unit: 'MINUTES')
-          }
-          steps {
-            sh '''#!/bin/bash
-              set -exo pipefail
-              source config.txt
-              cd packages/
-              ./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \
-                --package jaxlib \
-                --version $JAX_VERSION \
-                --build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \
-                --build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \
-                --push
-            '''
-          }
-        }
-      }
-    }
     stage('Build/Test/Diff') {
       parallel {
         stage('CPU') {
diff --git a/clean-layer.sh b/clean-layer.sh
@@ -19,6 +19,4 @@ apt-get clean
 # Ensures the current working directory won't be deleted
 cd /usr/local/src/
 # Delete source files used for building binaries
-rm -rf /usr/local/src/*
-# Delete conda downloaded tarballs
-conda clean -y --tarballs
+rm -rf /usr/local/src/*
diff --git a/config.txt b/config.txt
@@ -1,11 +1,2 @@
-BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release
-BASE_IMAGE_TAG=m122
-CPU_BASE_IMAGE_NAME=tf2-cpu.2-16.py310
-GPU_BASE_IMAGE_NAME=tf2-gpu.2-16.py310
-LIGHTGBM_VERSION=4.2.0
-TORCH_VERSION=2.4.0
-TORCHAUDIO_VERSION=2.4.0
-TORCHVISION_VERSION=0.19.0
-JAX_VERSION=0.4.26
 CUDA_MAJOR_VERSION=12
-CUDA_MINOR_VERSION=3
+CUDA_MINOR_VERSION=2
diff --git a/kaggle_requirements.txt b/kaggle_requirements.txt
@@ -0,0 +1,139 @@
+altair>=5.4.0
+Babel
+Boruta
+Cartopy
+ImageHash
+Janome
+PyArabic
+PyUpSet
+Pympler
+Rtree
+shapely<2
+SimpleITK
+TPOT
+Theano
+Wand
+annoy
+arrow
+bayesian-optimization
+boto3
+catboost
+category-encoders
+cesium
+comm
+cytoolz
+dask-expr
+datasets
+datashader
+deap
+dipy
+docker
+easyocr
+eli5
+emoji
+fasttext
+featuretools
+fiona
+fury
+fuzzywuzzy
+geojson
+# geopandas > v0.14.4 breaks learn tools
+geopandas==v0.14.4
+google-cloud-aiplatform
+# google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1
+google-cloud-automl==1.0.1
+# b/315753846: Unpin translate package.
+google-cloud-translate==3.12.1
+google-cloud-videointelligence
+google-cloud-vision
+gpxpy
+h2o
+haversine
+hep-ml
+igraph
+ipympl
+ipywidgets==8.1.5
+isoweek
+jedi
+# b/276358430: fix Jupyter lsp freezing up the jupyter server
+jupyter-lsp==1.5.1
+# b/333854354: pin jupyter-server to version 2.12.5; later versions break LSP.
+jupyter_server==2.12.5
+jupyterlab
+jupyterlab-lsp
+kaggle-environments
+kagglehub>=0.3.4
+# Keras 3.6 broke test_keras.py > test_train > keras.datasets.mnist.load_data():
+# See https://github.com/keras-team/keras/commit/dcefb139863505d166dd1325066f329b3033d45a
+keras<3.6
+keras-cv
+keras-nlp
+keras-tuner
+kornia
+langid
+leven
+# b/328788268: libpysal 4.10 seems to fail with "module 'shapely' has no attribute 'Geometry'. Did you mean: 'geometry'"
+libpysal<=4.9.2
+lime
+line_profiler
+mamba
+mlcrate
+mne
+mpld3
+nbdev
+nilearn
+olefile
+onnx
+openslide-bin
+openslide-python
+optuna
+pandas-profiling
+pandasql
+papermill
+path
+path.py
+pdf2image
+plotly-express
+preprocessing
+pudb
+pyLDAvis
+pycryptodome
+pydegensac
+pydicom
+pydub
+pyemd
+pyexcel-ods
+pymc3
+pymongo
+pypdf
+pytesseract
+python-lsp-server
+pytorch-ignite
+pytorch-lightning
+qgrid
+qtconsole
+ray
+rgf-python
+s3fs
+scikit-learn-intelex
+scikit-multilearn
+scikit-optimize
+scikit-plot
+scikit-surprise
+git+https://github.com/facebookresearch/segment-anything.git
+shap
+squarify
+tensorflow-cloud
+tensorflow-io
+tensorflow-text
+tensorflow_decision_forests
+timm
+torchinfo
+torchmetrics
+tsfresh
+vtk
+wandb
+wavio
+xgboost==2.0.3
+xvfbwrapper
+ydata-profiling
diff --git a/test b/test
@@ -3,7 +3,7 @@ set -e
 
 IMAGE_TAG='kaggle/python-build'
 IMAGE_TAG_OVERRIDE=''
-ADDITONAL_OPTS=''
+ADDITONAL_OPTS='--runtime runc ' # Use the CPU runtime by default
 PATTERN='test*.py'
 
 usage() {
@@ -69,8 +69,6 @@ readonly ADDITONAL_OPTS
 readonly PATTERN
 
 set -x
-docker run --rm --net=none -v /tmp/python-build:/tmp/python-build "$IMAGE_TAG" rm -rf /tmp/python-build/*
-docker rm jupyter_test || true
 mkdir -p /tmp/python-build/tmp
 mkdir -p /tmp/python-build/devshm
 mkdir -p /tmp/python-build/working
@@ -97,6 +95,9 @@ fi
 # Note about `--hostname localhost` (b/158137436)
 # hostname defaults to the container name which fails DNS name
 # resolution with --net=none (required to keep tests hermetic). See details in bug.
+#
+# Note about CLOUDSDK_CONFIG=/tmp/.config/gcloud
+# We use the /tmp dir since the filesystem is --read-only and we need writable space for gcloud configs.
 docker run --rm -t --read-only --net=none \
     -e HOME=/tmp -e KAGGLE_DATA_PROXY_TOKEN=test-key \
     -e KAGGLE_USER_SECRETS_TOKEN_KEY=test-secrets-key \
@@ -105,6 +106,7 @@ docker run --rm -t --read-only --net=none \
     -e KAGGLE_DATA_PROXY_PROJECT=test \
     -e TF_FORCE_GPU_ALLOW_GROWTH=true \
     -e XLA_PYTHON_CLIENT_PREALLOCATE=false \
+    -e CLOUDSDK_CONFIG=/tmp/.config/gcloud \
     --hostname localhost \
     --shm-size=2g \
     -v $PWD:/input:ro -v /tmp/python-build/working:/working \
diff --git a/tests/test_cuml.py b/tests/test_cuml.py
@@ -6,6 +6,7 @@
 class TestCuml(unittest.TestCase):
     @gpu_test
     @p100_exempt # b/342143152: cuML(>=24.4v) is inompatible with p100 GPUs.
+    @unittest.skip("b/381287748 cuML is not installed in Colab.")
     def test_pca_fit_transform(self):
         import unittest
         import numpy as np
diff --git a/tests/test_fastai.py b/tests/test_fastai.py
@@ -27,8 +27,9 @@ def test_tabular(self):
             "/input/tests/data/train.csv",
             cont_names=["pixel"+str(i) for i in range(784)],
             y_names='label',
-            procs=[FillMissing, Categorify, Normalize])       
+            procs=[FillMissing, Categorify, Normalize])
         learn = tabular_learner(dls, layers=[200, 100])
-        learn.fit_one_cycle(n_epoch=1)
+        with learn.no_bar():
+            learn.fit_one_cycle(n_epoch=1)
         
-        self.assertGreater(learn.smooth_loss, 0)
+            self.assertGreater(learn.smooth_loss, 0)
diff --git a/tests/test_lightgbm.py b/tests/test_lightgbm.py
@@ -34,7 +34,9 @@ def test_cpu(self):
 
         self.assertEqual(1, gbm.best_iteration)
 
+    # TODO(b/381256047): Colab needs to install GPU-enabled lightgbm.
     @gpu_test
+    @unittest.skip("Skipping this test until b/381256047 is resolved.")
     def test_gpu(self):
         lgb_train, lgb_eval = self.load_datasets()