Skip to content

Commit 66bac48

Browse files
authored
Use Colab as a base image. (#1444)
This change makes a number of major changes: - Colab is the base image - uv is the main package install tool - leveraging requirements.txt instead of many separate installs - stop building and installing tensorflow/torch/lightgbm/jax since those are managed by the Colab base image now. In order to decide what packages to explicitly install I: - looked at what packages are in the Colab base image - looked at what packages were in the Kaggle image - looked at what packages were explicitly mentioned in Kaggle Dockerfile This may still take a few iterations to get all the right parts in the image, but this should hopefully make the image much more manageable. http://b/365782129
1 parent 03c832e commit 66bac48

File tree

9 files changed

+228
-598
lines changed

9 files changed

+228
-598
lines changed

Dockerfile.tmpl

Lines changed: 75 additions & 519 deletions
Large diffs are not rendered by default.

Jenkinsfile

Lines changed: 0 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -21,66 +21,6 @@ pipeline {
2121
}
2222

2323
stages {
24-
stage('Pre-build Packages from Source') {
25-
parallel {
26-
stage('torch') {
27-
options {
28-
timeout(time: 300, unit: 'MINUTES')
29-
}
30-
steps {
31-
sh '''#!/bin/bash
32-
set -exo pipefail
33-
source config.txt
34-
cd packages/
35-
./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \
36-
--package torch \
37-
--version $TORCH_VERSION \
38-
--build-arg TORCHAUDIO_VERSION=$TORCHAUDIO_VERSION \
39-
--build-arg TORCHVISION_VERSION=$TORCHVISION_VERSION \
40-
--build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \
41-
--build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \
42-
--push
43-
'''
44-
}
45-
}
46-
stage('lightgbm') {
47-
options {
48-
timeout(time: 10, unit: 'MINUTES')
49-
}
50-
steps {
51-
sh '''#!/bin/bash
52-
set -exo pipefail
53-
source config.txt
54-
cd packages/
55-
./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \
56-
--package lightgbm \
57-
--version $LIGHTGBM_VERSION \
58-
--build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \
59-
--build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \
60-
--push
61-
'''
62-
}
63-
}
64-
stage('jaxlib') {
65-
options {
66-
timeout(time: 300, unit: 'MINUTES')
67-
}
68-
steps {
69-
sh '''#!/bin/bash
70-
set -exo pipefail
71-
source config.txt
72-
cd packages/
73-
./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \
74-
--package jaxlib \
75-
--version $JAX_VERSION \
76-
--build-arg CUDA_MAJOR_VERSION=$CUDA_MAJOR_VERSION \
77-
--build-arg CUDA_MINOR_VERSION=$CUDA_MINOR_VERSION \
78-
--push
79-
'''
80-
}
81-
}
82-
}
83-
}
8424
stage('Build/Test/Diff') {
8525
parallel {
8626
stage('CPU') {

clean-layer.sh

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,4 @@ apt-get clean
1919
# Ensures the current working directory won't be deleted
2020
cd /usr/local/src/
2121
# Delete source files used for building binaries
22-
rm -rf /usr/local/src/*
23-
# Delete conda downloaded tarballs
24-
conda clean -y --tarballs
22+
rm -rf /usr/local/src/*

config.txt

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,2 @@
1-
BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release
2-
BASE_IMAGE_TAG=m122
3-
CPU_BASE_IMAGE_NAME=tf2-cpu.2-16.py310
4-
GPU_BASE_IMAGE_NAME=tf2-gpu.2-16.py310
5-
LIGHTGBM_VERSION=4.2.0
6-
TORCH_VERSION=2.4.0
7-
TORCHAUDIO_VERSION=2.4.0
8-
TORCHVISION_VERSION=0.19.0
9-
JAX_VERSION=0.4.26
101
CUDA_MAJOR_VERSION=12
11-
CUDA_MINOR_VERSION=3
2+
CUDA_MINOR_VERSION=2

kaggle_requirements.txt

Lines changed: 139 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,139 @@
1+
altair>=5.4.0
2+
Babel
3+
Boruta
4+
Cartopy
5+
ImageHash
6+
Janome
7+
PyArabic
8+
PyUpSet
9+
Pympler
10+
Rtree
11+
shapely<2
12+
SimpleITK
13+
TPOT
14+
Theano
15+
Wand
16+
annoy
17+
arrow
18+
bayesian-optimization
19+
boto3
20+
catboost
21+
category-encoders
22+
cesium
23+
comm
24+
cytoolz
25+
dask-expr
26+
datasets
27+
datashader
28+
deap
29+
dipy
30+
docker
31+
easyocr
32+
eli5
33+
emoji
34+
fasttext
35+
featuretools
36+
fiona
37+
fury
38+
fuzzywuzzy
39+
geojson
40+
# geopandas > v0.14.4 breaks learn tools
41+
geopandas==v0.14.4
42+
google-cloud-aiplatform
43+
# google-cloud-automl 2.0.0 introduced incompatible API changes, need to pin to 1.0.1
44+
google-cloud-automl==1.0.1
45+
# b/315753846: Unpin translate package.
46+
google-cloud-translate==3.12.1
47+
google-cloud-videointelligence
48+
google-cloud-vision
49+
gpxpy
50+
h2o
51+
haversine
52+
hep-ml
53+
igraph
54+
ipympl
55+
ipywidgets==8.1.5
56+
isoweek
57+
jedi
58+
# b/276358430: fix Jupyter lsp freezing up the jupyter server
59+
jupyter-lsp==1.5.1
60+
# b/333854354: pin jupyter-server to version 2.12.5; later versions break LSP (b/333854354)
61+
jupyter_server==2.12.5
62+
jupyterlab
63+
jupyterlab-lsp
64+
kaggle-environments
65+
kagglehub>=0.3.4
66+
# Keras 3.6 broke test_keras.py > test_train > keras.datasets.mnist.load_data():
67+
# See https://github.com/keras-team/keras/commit/dcefb139863505d166dd1325066f329b3033d45a
68+
keras<3.6
69+
keras-cv
70+
keras-nlp
71+
keras-tuner
72+
kornia
73+
langid
74+
leven
75+
# b/328788268: libpysal 4.10 seems to fail with "module 'shapely' has no attribute 'Geometry'. Did you mean: 'geometry'"
76+
libpysal<=4.9.2
77+
lime
78+
line_profiler
79+
mamba
80+
mlcrate
81+
mne
82+
mpld3
83+
nbdev
84+
nilearn
85+
olefile
86+
onnx
87+
openslide-bin
88+
openslide-python
89+
optuna
90+
pandas-profiling
91+
pandasql
92+
papermill
93+
path
94+
path.py
95+
pdf2image
96+
plotly-express
97+
preprocessing
98+
pudb
99+
pyLDAvis
100+
pycryptodome
101+
pydegensac
102+
pydicom
103+
pydub
104+
pyemd
105+
pyexcel-ods
106+
pymc3
107+
pymongo
108+
pypdf
109+
pytesseract
110+
python-lsp-server
111+
pytorch-ignite
112+
pytorch-lightning
113+
qgrid
114+
qtconsole
115+
ray
116+
rgf-python
117+
s3fs
118+
scikit-learn-intelex
119+
scikit-multilearn
120+
scikit-optimize
121+
scikit-plot
122+
scikit-surprise
123+
git+https://github.com/facebookresearch/segment-anything.git
124+
shap
125+
squarify
126+
tensorflow-cloud
127+
tensorflow-io
128+
tensorflow-text
129+
tensorflow_decision_forests
130+
timm
131+
torchinfo
132+
torchmetrics
133+
tsfresh
134+
vtk
135+
wandb
136+
wavio
137+
xgboost==2.0.3
138+
xvfbwrapper
139+
ydata-profiling

test

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ set -e
33

44
IMAGE_TAG='kaggle/python-build'
55
IMAGE_TAG_OVERRIDE=''
6-
ADDITONAL_OPTS=''
6+
ADDITONAL_OPTS='--runtime runc ' # Use the CPU runtime by default
77
PATTERN='test*.py'
88

99
usage() {
@@ -69,8 +69,6 @@ readonly ADDITONAL_OPTS
6969
readonly PATTERN
7070

7171
set -x
72-
docker run --rm --net=none -v /tmp/python-build:/tmp/python-build "$IMAGE_TAG" rm -rf /tmp/python-build/*
73-
docker rm jupyter_test || true
7472
mkdir -p /tmp/python-build/tmp
7573
mkdir -p /tmp/python-build/devshm
7674
mkdir -p /tmp/python-build/working
@@ -97,6 +95,9 @@ fi
9795
# Note about `--hostname localhost` (b/158137436)
9896
# hostname defaults to the container name which fails DNS name
9997
# resolution with --net=none (required to keep tests hermetic). See details in bug.
98+
#
99+
# Note about CLOUDSDK_CONFIG=/tmp/.config/gcloud
100+
# We use the /tmp dir since the filesystem is --read-only and we need writable space for gcloud configs.
100101
docker run --rm -t --read-only --net=none \
101102
-e HOME=/tmp -e KAGGLE_DATA_PROXY_TOKEN=test-key \
102103
-e KAGGLE_USER_SECRETS_TOKEN_KEY=test-secrets-key \
@@ -105,6 +106,7 @@ docker run --rm -t --read-only --net=none \
105106
-e KAGGLE_DATA_PROXY_PROJECT=test \
106107
-e TF_FORCE_GPU_ALLOW_GROWTH=true \
107108
-e XLA_PYTHON_CLIENT_PREALLOCATE=false \
109+
-e CLOUDSDK_CONFIG=/tmp/.config/gcloud \
108110
--hostname localhost \
109111
--shm-size=2g \
110112
-v $PWD:/input:ro -v /tmp/python-build/working:/working \

tests/test_cuml.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
class TestCuml(unittest.TestCase):
77
@gpu_test
88
@p100_exempt # b/342143152: cuML(>=24.4v) is incompatible with p100 GPUs.
9+
@unittest.skip("b/381287748 cuML is not installed in Colab.")
910
def test_pca_fit_transform(self):
1011
import unittest
1112
import numpy as np

tests/test_fastai.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,9 @@ def test_tabular(self):
2727
"/input/tests/data/train.csv",
2828
cont_names=["pixel"+str(i) for i in range(784)],
2929
y_names='label',
30-
procs=[FillMissing, Categorify, Normalize])
30+
procs=[FillMissing, Categorify, Normalize])
3131
learn = tabular_learner(dls, layers=[200, 100])
32-
learn.fit_one_cycle(n_epoch=1)
32+
with learn.no_bar():
33+
learn.fit_one_cycle(n_epoch=1)
3334

34-
self.assertGreater(learn.smooth_loss, 0)
35+
self.assertGreater(learn.smooth_loss, 0)

tests/test_lightgbm.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,9 @@ def test_cpu(self):
3434

3535
self.assertEqual(1, gbm.best_iteration)
3636

37+
# TODO(b/381256047): Colab needs to install GPU-enabled lightgbm.
3738
@gpu_test
39+
@unittest.skip("Skipping this test until b/381256047 is resolved.")
3840
def test_gpu(self):
3941
lgb_train, lgb_eval = self.load_datasets()
4042

0 commit comments

Comments
 (0)