Skip to content

Commit 688a828

Browse files
committed
Build torch and lightgbm from source.
Fixes #984, #1059 Introduced a new "architecture" to easily build packages from source in the main build only if needed (i.e. if the base image or the package version has changed). This enables us to: - Upgrade PyTorch, which doesn't have a wheel for 1.9.1 and CUDA 11. This prevented us from upgrading torch for ~6 months. - Move the lightgbm gpu source build to this architecture to shave ~3 minutes off the build time. http://b/181966788
1 parent 7cf514a commit 688a828

File tree

8 files changed

+302
-19
lines changed

8 files changed

+302
-19
lines changed

Dockerfile.tmpl

Lines changed: 27 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,29 @@
1+
ARG BASE_IMAGE_REPO
2+
ARG BASE_IMAGE_TAG
3+
ARG CPU_BASE_IMAGE_NAME
4+
ARG GPU_BASE_IMAGE_NAME
5+
ARG LIGHTGBM_VERSION
6+
ARG TORCH_VERSION
7+
ARG TORCHVISION_VERSION
8+
19
{{ if eq .Accelerator "gpu" }}
2-
FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80
10+
FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl
11+
FROM gcr.io/kaggle-images/python-torch-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${TORCH_VERSION} AS torch_whl
12+
FROM ${BASE_IMAGE_REPO}/${GPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
313
ENV CUDA_MAJOR_VERSION=11
414
ENV CUDA_MINOR_VERSION=0
515
{{ else }}
6-
FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-6:m80
16+
FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
717
{{ end }}
818
# Keep these variables in sync if base image is updated.
919
ENV TENSORFLOW_VERSION=2.6.0
20+
21+
# We need to redefine the ARG here to get the ARG value defined above the FROM instruction.
22+
# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
23+
ARG LIGHTGBM_VERSION
24+
ARG TORCH_VERSION
25+
ARG TORCHVISION_VERSION
26+
1027
# Disable pesky logs like: KMP_AFFINITY: pid 6121 tid 6121 thread 0 bound to OS proc set 0
1128
# See: https://stackoverflow.com/questions/57385766/disable-tensorflow-log-information
1229
ENV KMP_WARNINGS=0
@@ -52,29 +69,24 @@ RUN conda install cudf=21.08 cuml=21.08 cudatoolkit=$CUDA_MAJOR_VERSION.$CUDA_MI
5269

5370
# Install PyTorch
5471
{{ if eq .Accelerator "gpu" }}
55-
RUN pip install torch==1.7.1+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchvision==0.8.2+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \
72+
COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/
73+
RUN pip install /tmp/torch/*.whl torchaudio==0.9.1 torchtext==0.10.1 -f https://download.pytorch.org/whl/torch_stable.html && \
74+
rm -rf /tmp/torch && \
5675
/tmp/clean-layer.sh
5776
{{ else }}
58-
RUN pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \
77+
RUN pip install torch==$TORCH_VERSION+cpu torchvision==$TORCHVISION_VERSION+cpu torchaudio==0.9.1 torchtext==0.10.1 -f https://download.pytorch.org/whl/torch_stable.html && \
5978
/tmp/clean-layer.sh
6079
{{ end }}
6180

6281
# Install LightGBM
63-
ENV LIGHTGBM_VERSION=3.2.1
6482
{{ if eq .Accelerator "gpu" }}
83+
COPY --from=lightgbm_whl /tmp/whl/*.whl /tmp/lightgbm/
6584
# Install OpenCL (required by LightGBM GPU version)
6685
RUN apt-get install -y ocl-icd-libopencl1 clinfo && \
6786
mkdir -p /etc/OpenCL/vendors && \
6887
echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \
69-
cd /usr/local/src && \
70-
git clone --recursive https://github.com/microsoft/LightGBM && \
71-
cd LightGBM && \
72-
git checkout tags/v$LIGHTGBM_VERSION && \
73-
mkdir build && cd build && \
74-
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. && \
75-
make -j$(nproc) && \
76-
cd /usr/local/src/LightGBM/python-package && \
77-
python setup.py install --precompile && \
88+
pip install /tmp/lightgbm/*.whl && \
89+
rm -rf /tmp/lightgbm && \
7890
/tmp/clean-layer.sh
7991
{{ else }}
8092
RUN pip install lightgbm==$LIGHTGBM_VERSION && \
@@ -386,8 +398,7 @@ RUN pip install bleach && \
386398
pip install widgetsnbextension && \
387399
pip install pyarrow && \
388400
pip install feather-format && \
389-
# fastai >= 2.3.1 upgrades pytorch/torchvision. upgrade of pytorch will be handled in b/181966788
390-
pip install fastai==2.2.7 && \
401+
pip install fastai && \
391402
pip install allennlp && \
392403
# https://b.corp.google.com/issues/184685619#comment9: 3.9.0 is causing a major performance degradation with spacy 2.3.5
393404
pip install importlib-metadata==3.4.0 && \

Jenkinsfile

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,36 @@ pipeline {
3434
'''
3535
}
3636
}
37+
stage('Pre-build Packages from Source') {
38+
parallel {
39+
stage('torch') {
40+
options {
41+
timeout(time: 120, unit: 'MINUTES')
42+
}
43+
steps {
44+
sh '''#!/bin/bash
45+
set -exo pipefail
46+
source config.txt
47+
cd packages/
48+
./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG --package torch --version $TORCH_VERSION --build-arg TORCHVISION_VERSION=$TORCHVISION_VERSION --push
49+
'''
50+
}
51+
}
52+
stage('lightgbm') {
53+
options {
54+
timeout(time: 10, unit: 'MINUTES')
55+
}
56+
steps {
57+
sh '''#!/bin/bash
58+
set -exo pipefail
59+
source config.txt
60+
cd packages/
61+
./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG --package lightgbm --version $LIGHTGBM_VERSION --push
62+
'''
63+
}
64+
}
65+
}
66+
}
3767
stage('Build/Test/Diff') {
3868
parallel {
3969
stage('CPU') {
@@ -79,7 +109,7 @@ pipeline {
79109
}
80110
stage('GPU') {
81111
agent { label 'ephemeral-linux-gpu' }
82-
stages {
112+
stages {
83113
stage('Build GPU Image') {
84114
options {
85115
timeout(time: 120, unit: 'MINUTES')

build

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,18 @@ done
4747
BUILD_ARGS+=" --build-arg GIT_COMMIT=$(git rev-parse HEAD)"
4848
BUILD_ARGS+=" --build-arg BUILD_DATE=$(date '+%Y%m%d-%H%M%S')"
4949

50+
# Read build args (one KEY=VALUE per line) from config.txt; blank lines and `#` comments are skipped.
51+
SRCDIR=$(dirname "${BASH_SOURCE[0]}")
52+
while read -r l || [[ -n "$l" ]]; do
53+
[[ -z "$l" || "$l" == \#* ]] || BUILD_ARGS+=" --build-arg $l"
54+
done < "${SRCDIR}/config.txt"
55+
5056
readonly CACHE_FLAG
5157
readonly DOCKERFILE
5258
readonly ACCELERATOR
5359
readonly IMAGE_TAG
5460
readonly BUILD_ARGS
5561

56-
57-
SRCDIR=$(dirname "${BASH_SOURCE[0]}")
5862
DOCKERFILE_OUTDIR="${SRCDIR}/.generated"
5963
mkdir -p $DOCKERFILE_OUTDIR
6064
DOCKERFILE_PATH="$DOCKERFILE_OUTDIR/$DOCKERFILE"

config.txt

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release
2+
BASE_IMAGE_TAG=m80
3+
CPU_BASE_IMAGE_NAME=tf2-cpu.2-6
4+
GPU_BASE_IMAGE_NAME=tf2-gpu.2-6
5+
LIGHTGBM_VERSION=3.2.1
6+
TORCH_VERSION=1.9.1
7+
TORCHVISION_VERSION=0.10.1

packages/README.md

Whitespace-only changes.

packages/build_package

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
#!/bin/bash
2+
set -e
3+
4+
# Print the command-line help text to stdout.
usage() {
5+
cat << EOF
6+
Usage: $0 [OPTIONS]
7+
Build a new package ".whl".
8+
9+
Options:
10+
-p, --package PACKAGE Package to build (e.g. lightgbm).
11+
-v, --version VERSION Package version to build.
12+
-t, --base-image IMAGE Base image to build from (e.g. gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80).
13+
-c, --use-cache Use layer cache when building a new image.
14+
-f, --force-rebuild Rebuild the image regardless of whether it already exists on GCR.
15+
-u, --push Push image to GCR.
16+
--build-arg ARG=VALUE Build arguments to pass to the docker build command.
17+
EOF
18+
}
19+
20+
PACKAGE=''
21+
PACKAGE_VERSION=''
22+
BASE_IMAGE=''
23+
DOCKERFILE=''
24+
CACHE_FLAG='--no-cache'
25+
FORCE_REBUILD=false
26+
PUSH_TO_GCR=false
27+
BUILD_ARGS=''
28+
29+
while :; do
30+
case "$1" in
31+
-h|--help)
32+
usage
33+
exit
34+
;;
35+
-p|--package) # selects the package and derives its <package>.Dockerfile
36+
if [[ -z $2 ]]; then
37+
usage
38+
printf 'ERROR: No PACKAGE specified after the %s flag.\n' "$1" >&2
39+
exit 1
40+
fi
41+
PACKAGE=$2
42+
DOCKERFILE="${PACKAGE}.Dockerfile"
43+
shift # skip the flag value
44+
;;
45+
-v|--version)
46+
if [[ -z $2 ]]; then
47+
usage
48+
printf 'ERROR: No VERSION specified after the %s flag.\n' "$1" >&2
49+
exit 1
50+
fi
51+
PACKAGE_VERSION=$2
52+
shift # skip the flag value
53+
;;
54+
-b|-t|--base-image) # accept both short spellings (-b and -t)
55+
if [[ -z $2 ]]; then
56+
usage
57+
printf 'ERROR: No IMAGE specified after the %s flag.\n' "$1" >&2
58+
exit 1
59+
fi
60+
BASE_IMAGE=$2
61+
shift # skip the flag value
62+
;;
63+
-c|--use-cache)
64+
CACHE_FLAG=''
65+
;;
66+
-f|--force-rebuild)
67+
FORCE_REBUILD=true
68+
;;
69+
-u|--push)
70+
PUSH_TO_GCR=true
71+
;;
72+
--build-arg)
73+
if [[ -z $2 ]]; then
74+
usage
75+
printf 'ERROR: No ARG=VALUE specified after the %s flag.\n' "$1" >&2
76+
exit 1
77+
fi
78+
BUILD_ARGS+=" $1 $2"
79+
shift # skip the flag value
80+
;;
81+
-?*)
82+
usage
83+
printf 'ERROR: Unknown option: %s\n' "$1" >&2
84+
exit 1
85+
;;
86+
*)
87+
break
88+
esac
89+
90+
shift
91+
done
92+
93+
readonly PACKAGE
94+
readonly PACKAGE_VERSION
95+
readonly BASE_IMAGE
96+
readonly DOCKERFILE
97+
readonly CACHE_FLAG
98+
readonly FORCE_REBUILD
99+
100+
SRCDIR=$(dirname "${BASH_SOURCE[0]}")
101+
DOCKERFILE_PATH="$SRCDIR/$DOCKERFILE"
102+
103+
# Required-flag checks: report on stderr, like the option parser does, and exit non-zero.
if [[ -z "$PACKAGE_VERSION" ]]; then
104+
printf 'ERROR: missing --version flag.\n' >&2
105+
exit 1
106+
fi
107+
108+
if [[ -z "$BASE_IMAGE" ]]; then
109+
printf 'ERROR: missing --base-image flag.\n' >&2
110+
exit 1
111+
fi
112+
113+
if [[ -z "$DOCKERFILE" ]]; then
114+
printf 'ERROR: missing --package flag.\n' >&2
115+
exit 1
116+
fi
117+
118+
# Strip the repository prefix (everything up to the last `/`), e.g. keep only `tf2-gpu.2-6:m80` in `gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80`
119+
TAG=${BASE_IMAGE##*/}
120+
# Replace the `:` in `tf2-gpu.2-6:m80` by `-`
121+
TAG=${TAG/:/-}
122+
# Append the package version
123+
TAG=$TAG-$PACKAGE_VERSION
124+
# Add the gcr repo.
125+
TAG=gcr.io/kaggle-images/python-$PACKAGE-whl:$TAG
126+
127+
SHOULD_BUILD=true
128+
if ! $FORCE_REBUILD; then
129+
echo "Checking if $TAG exists..."
130+
docker pull $TAG && SHOULD_BUILD=false
131+
fi
132+
133+
if $SHOULD_BUILD; then
134+
echo "Building $TAG..."
135+
docker build --rm --pull $BUILD_ARGS \
136+
$CACHE_FLAG \
137+
-t $TAG \
138+
-f "$DOCKERFILE_PATH" \
139+
--build-arg BASE_IMAGE=$BASE_IMAGE \
140+
--build-arg PACKAGE_VERSION=$PACKAGE_VERSION \
141+
$SRCDIR
142+
143+
if $PUSH_TO_GCR; then
144+
echo "Pushing $TAG to GCR..."
145+
docker push $TAG
146+
fi
147+
else
148+
echo "Skipping build. $TAG already exists."
149+
echo "Use --force-rebuild if you want to build a new version anyway."
150+
fi

packages/lightgbm.Dockerfile

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
ARG BASE_IMAGE
2+
3+
FROM ${BASE_IMAGE} AS builder
4+
5+
ARG PACKAGE_VERSION
6+
7+
# Build instructions: https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#build-lightgbm
8+
RUN apt-get update && \
9+
apt-get install -y build-essential cmake libboost-dev libboost-system-dev libboost-filesystem-dev ocl-icd-libopencl1 clinfo
10+
11+
RUN cd /usr/local/src && \
12+
git clone --recursive https://github.com/microsoft/LightGBM && \
13+
cd LightGBM && \
14+
git checkout tags/v$PACKAGE_VERSION && \
15+
mkdir build && cd build && \
16+
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. && \
17+
make -j$(nproc) && \
18+
cd /usr/local/src/LightGBM/python-package && \
19+
python setup.py bdist_wheel
20+
21+
# Using multi-stage builds to ensure the output image is very small (base tag pinned for reproducible rebuilds)
22+
# See: https://docs.docker.com/develop/develop-images/multistage-build/
23+
FROM alpine:3.14
24+
25+
RUN mkdir -p /tmp/whl/
26+
COPY --from=builder /usr/local/src/LightGBM/python-package/dist/*.whl /tmp/whl/
27+
28+
# Print out the built .whl file.
29+
RUN ls -lh /tmp/whl/

packages/torch.Dockerfile

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
ARG BASE_IMAGE
2+
3+
FROM ${BASE_IMAGE} AS builder
4+
5+
ARG PACKAGE_VERSION
6+
ARG TORCHVISION_VERSION
7+
8+
# TORCHVISION_VERSION is mandatory
9+
RUN test -n "$TORCHVISION_VERSION"
10+
11+
# Build instructions: https://github.com/pytorch/pytorch#from-source
12+
RUN conda install astunparse numpy ninja pyyaml mkl mkl-include setuptools cmake cffi typing_extensions future six requests dataclasses
13+
14+
# By default, it uses the version from version.txt which includes the `a0` (alpha zero) suffix and part of the git hash.
15+
# This causes dependency conflicts like these: https://paste.googleplex.com/4786486378496000
16+
ENV PYTORCH_BUILD_VERSION=$PACKAGE_VERSION
17+
ENV PYTORCH_BUILD_NUMBER=1
18+
19+
ENV TORCH_CUDA_ARCH_LIST="3.7;6.0;7.0+PTX;7.5+PTX"
20+
ENV FORCE_CUDA=1
21+
RUN cd /usr/local/src && \
22+
git clone --recursive https://github.com/pytorch/pytorch && \
23+
cd pytorch && \
24+
git checkout tags/v$PACKAGE_VERSION && \
25+
git submodule sync && \
26+
git submodule update --init --recursive --jobs 0 && \
27+
python setup.py bdist_wheel
28+
29+
# Build torchvision.
30+
# Instructions: https://github.com/pytorch/vision/tree/main#installation
31+
32+
# Install torch which is required before we can build torchvision.
33+
RUN pip install /usr/local/src/pytorch/dist/*.whl
34+
35+
# See comment above for PYTORCH_BUILD_VERSION.
36+
ENV BUILD_VERSION=$TORCHVISION_VERSION
37+
RUN cd /usr/local/src && \
38+
git clone --recursive https://github.com/pytorch/vision && \
39+
cd vision && \
40+
git checkout tags/v$TORCHVISION_VERSION && \
41+
python setup.py bdist_wheel
42+
43+
# Using multi-stage builds to ensure the output image is very small (base tag pinned for reproducible rebuilds)
44+
# See: https://docs.docker.com/develop/develop-images/multistage-build/
45+
FROM alpine:3.14
46+
47+
RUN mkdir -p /tmp/whl/
48+
COPY --from=builder /usr/local/src/pytorch/dist/*.whl /tmp/whl/
49+
COPY --from=builder /usr/local/src/vision/dist/*.whl /tmp/whl/
50+
51+
# Print out the built .whl files.
52+
RUN ls -lh /tmp/whl/

0 commit comments

Comments
 (0)