Skip to content

Build torch and lightgbm from source. #1083

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Oct 5, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 34 additions & 16 deletions Dockerfile.tmpl
Original file line number Diff line number Diff line change
@@ -1,12 +1,33 @@
ARG BASE_IMAGE_REPO
ARG BASE_IMAGE_TAG
ARG CPU_BASE_IMAGE_NAME
ARG GPU_BASE_IMAGE_NAME
ARG LIGHTGBM_VERSION
ARG TORCH_VERSION
ARG TORCHAUDIO_VERSION
ARG TORCHTEXT_VERSION
ARG TORCHVISION_VERSION

{{ if eq .Accelerator "gpu" }}
FROM gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80
FROM gcr.io/kaggle-images/python-lightgbm-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${LIGHTGBM_VERSION} AS lightgbm_whl
FROM gcr.io/kaggle-images/python-torch-whl:${GPU_BASE_IMAGE_NAME}-${BASE_IMAGE_TAG}-${TORCH_VERSION} AS torch_whl
FROM ${BASE_IMAGE_REPO}/${GPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
ENV CUDA_MAJOR_VERSION=11
ENV CUDA_MINOR_VERSION=0
{{ else }}
FROM gcr.io/deeplearning-platform-release/tf2-cpu.2-6:m80
FROM ${BASE_IMAGE_REPO}/${CPU_BASE_IMAGE_NAME}:${BASE_IMAGE_TAG}
{{ end }}
# Keep these variables in sync if base image is updated.
ENV TENSORFLOW_VERSION=2.6.0

# We need to redefine the ARG here to get the ARG value defined above the FROM instruction.
# See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
ARG LIGHTGBM_VERSION
ARG TORCH_VERSION
ARG TORCHAUDIO_VERSION
ARG TORCHTEXT_VERSION
ARG TORCHVISION_VERSION

# Disable pesky logs like: KMP_AFFINITY: pid 6121 tid 6121 thread 0 bound to OS proc set 0
# See: https://stackoverflow.com/questions/57385766/disable-tensorflow-log-information
ENV KMP_WARNINGS=0
Expand All @@ -15,6 +36,9 @@ ADD clean-layer.sh /tmp/clean-layer.sh
ADD patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl
ADD patches/template_conf.json /opt/kaggle/conf.json

# Adds the libcuda.so to LD_LIBRARY_PATH which is necessary for the GPU mxnet package.
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/cuda/compat

{{ if eq .Accelerator "gpu" }}
# b/200968891 Keeps horovod once torch is upgraded.
RUN pip uninstall -y horovod && \
Expand Down Expand Up @@ -52,29 +76,24 @@ RUN conda install cudf=21.08 cuml=21.08 cudatoolkit=$CUDA_MAJOR_VERSION.$CUDA_MI

# Install PyTorch
{{ if eq .Accelerator "gpu" }}
RUN pip install torch==1.7.1+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchvision==0.8.2+cu$CUDA_MAJOR_VERSION$CUDA_MINOR_VERSION torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \
COPY --from=torch_whl /tmp/whl/*.whl /tmp/torch/
RUN pip install /tmp/torch/*.whl && \
rm -rf /tmp/torch && \
/tmp/clean-layer.sh
{{ else }}
RUN pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 torchtext==0.8.1 -f https://download.pytorch.org/whl/torch_stable.html && \
RUN pip install torch==$TORCH_VERSION+cpu torchvision==$TORCHVISION_VERSION+cpu torchaudio==$TORCHAUDIO_VERSION torchtext==$TORCHTEXT_VERSION -f https://download.pytorch.org/whl/torch_stable.html && \
/tmp/clean-layer.sh
{{ end }}

# Install LightGBM
ENV LIGHTGBM_VERSION=3.2.1
{{ if eq .Accelerator "gpu" }}
COPY --from=lightgbm_whl /tmp/whl/*.whl /tmp/lightgbm/
# Install OpenCL (required by LightGBM GPU version)
RUN apt-get install -y ocl-icd-libopencl1 clinfo && \
mkdir -p /etc/OpenCL/vendors && \
echo "libnvidia-opencl.so.1" > /etc/OpenCL/vendors/nvidia.icd && \
cd /usr/local/src && \
git clone --recursive https://github.com/microsoft/LightGBM && \
cd LightGBM && \
git checkout tags/v$LIGHTGBM_VERSION && \
mkdir build && cd build && \
cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. && \
make -j$(nproc) && \
cd /usr/local/src/LightGBM/python-package && \
python setup.py install --precompile && \
pip install /tmp/lightgbm/*.whl && \
rm -rf /tmp/lightgbm && \
/tmp/clean-layer.sh
{{ else }}
RUN pip install lightgbm==$LIGHTGBM_VERSION && \
Expand Down Expand Up @@ -386,8 +405,7 @@ RUN pip install bleach && \
pip install widgetsnbextension && \
pip install pyarrow && \
pip install feather-format && \
# fastai >= 2.3.1 upgrades pytorch/torchvision. upgrade of pytorch will be handled in b/181966788
pip install fastai==2.2.7 && \
pip install fastai && \
pip install allennlp && \
# https://b.corp.google.com/issues/184685619#comment9: 3.9.0 is causing a major performance degradation with spacy 2.3.5
pip install importlib-metadata==3.4.0 && \
Expand Down
38 changes: 37 additions & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,42 @@ pipeline {
'''
}
}
// Builds the torch and lightgbm wheel images before the main image build.
// Each sub-stage sources config.txt for the base image coordinates and package versions,
// then runs packages/build_package with --push.
stage('Pre-build Packages from Source') {
// The two package builds are independent of each other, so they run side by side.
parallel {
// torch: its version is passed via --version; the matching torchaudio/torchtext/torchvision
// versions are forwarded as docker build args. Allowed up to 180 minutes.
stage('torch') {
options {
timeout(time: 180, unit: 'MINUTES')
}
steps {
sh '''#!/bin/bash
set -exo pipefail
source config.txt
cd packages/
./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG \
--package torch \
--version $TORCH_VERSION \
--build-arg TORCHAUDIO_VERSION=$TORCHAUDIO_VERSION \
--build-arg TORCHTEXT_VERSION=$TORCHTEXT_VERSION \
--build-arg TORCHVISION_VERSION=$TORCHVISION_VERSION \
--push
'''
}
}
// lightgbm: built against the same GPU base image as torch. Allowed up to 10 minutes.
stage('lightgbm') {
options {
timeout(time: 10, unit: 'MINUTES')
}
steps {
sh '''#!/bin/bash
set -exo pipefail
source config.txt
cd packages/
./build_package --base-image $BASE_IMAGE_REPO/$GPU_BASE_IMAGE_NAME:$BASE_IMAGE_TAG --package lightgbm --version $LIGHTGBM_VERSION --push
'''
}
}
}
}
stage('Build/Test/Diff') {
parallel {
stage('CPU') {
Expand Down Expand Up @@ -79,7 +115,7 @@ pipeline {
}
stage('GPU') {
agent { label 'ephemeral-linux-gpu' }
stages {
stages {
stage('Build GPU Image') {
options {
timeout(time: 120, unit: 'MINUTES')
Expand Down
8 changes: 6 additions & 2 deletions build
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,18 @@ done
BUILD_ARGS+=" --build-arg GIT_COMMIT=$(git rev-parse HEAD)"
BUILD_ARGS+=" --build-arg BUILD_DATE=$(date '+%Y%m%d-%H%M%S')"

# Read build args from config.txt file: one KEY=VALUE entry per line.
# Lines are read verbatim (no glob expansion of their content); blank lines and
# lines starting with '#' are skipped. The `|| [[ -n ... ]]` keeps a final line
# that has no trailing newline.
SRCDIR=$(dirname "${BASH_SOURCE[0]}")
while read -r config_line || [[ -n "$config_line" ]]; do
    if [[ -z "$config_line" || "$config_line" == \#* ]]; then
        continue
    fi
    BUILD_ARGS+=" --build-arg $config_line"
done < "${SRCDIR}/config.txt"

readonly CACHE_FLAG
readonly DOCKERFILE
readonly ACCELERATOR
readonly IMAGE_TAG
readonly BUILD_ARGS


SRCDIR=$(dirname "${BASH_SOURCE[0]}")
DOCKERFILE_OUTDIR="${SRCDIR}/.generated"
mkdir -p $DOCKERFILE_OUTDIR
DOCKERFILE_PATH="$DOCKERFILE_OUTDIR/$DOCKERFILE"
Expand Down
9 changes: 9 additions & 0 deletions config.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
BASE_IMAGE_REPO=gcr.io/deeplearning-platform-release
BASE_IMAGE_TAG=m80
CPU_BASE_IMAGE_NAME=tf2-cpu.2-6
GPU_BASE_IMAGE_NAME=tf2-gpu.2-6
LIGHTGBM_VERSION=3.2.1
TORCH_VERSION=1.9.1
TORCHAUDIO_VERSION=0.9.1
TORCHTEXT_VERSION=0.10.1
TORCHVISION_VERSION=0.10.1
Empty file added packages/README.md
Empty file.
150 changes: 150 additions & 0 deletions packages/build_package
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#!/bin/bash
set -e

# Print the command-line help text to stdout.
# The flags listed here must match the ones handled by the option parser in this script.
usage() {
cat << EOF
Usage: $0 [OPTIONS]
Build a new package ".whl".

Options:
    -h, --help              Show this help text and exit.
    -p, --package PACKAGE   Package to build (e.g. lightgbm).
    -v, --version VERSION   Package version to build.
    -t, --base-image IMAGE  Base image to build from (e.g. gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80).
    -c, --use-cache         Use layer cache when building a new image.
    -f, --force-rebuild     Rebuild the image regardless of whether it already exists on GCR.
    -u, --push              Push image to GCR.
    --build-arg ARG=VALUE   Build arguments to pass to the docker build command.
EOF
}

# Defaults for every option; the flag parser below overwrites them.
PACKAGE=''
PACKAGE_VERSION=''
BASE_IMAGE=''
DOCKERFILE=''
CACHE_FLAG='--no-cache'
FORCE_REBUILD=false
PUSH_TO_GCR=false
BUILD_ARGS=''

# Consume flags one at a time until the first argument that is not a flag
# (or until the arguments run out).
while :; do
    case "$1" in
        -h|--help)
            usage
            exit
            ;;
        -p|--package)
            if [[ -z $2 ]]; then
                usage
                printf 'ERROR: No PACKAGE specified after the %s flag.\n' "$1" >&2
                exit 1
            fi
            PACKAGE=$2
            # Each package is built from its own "<package>.Dockerfile".
            DOCKERFILE="${PACKAGE}.Dockerfile"
            shift # skip the flag value
            ;;
        -v|--version)
            if [[ -z $2 ]]; then
                usage
                printf 'ERROR: No VERSION specified after the %s flag.\n' "$1" >&2
                exit 1
            fi
            PACKAGE_VERSION=$2
            shift # skip the flag value
            ;;
        # `-b` is the short form shown in the help text; `-t` is kept for backward compatibility.
        -b|-t|--base-image)
            if [[ -z $2 ]]; then
                usage
                printf 'ERROR: No IMAGE specified after the %s flag.\n' "$1" >&2
                exit 1
            fi
            BASE_IMAGE=$2
            shift # skip the flag value
            ;;
        -c|--use-cache)
            # An empty flag means docker build falls back to its default (cache enabled).
            CACHE_FLAG=''
            ;;
        -f|--force-rebuild)
            FORCE_REBUILD=true
            ;;
        -u|--push)
            PUSH_TO_GCR=true
            ;;
        --build-arg)
            if [[ -z $2 ]]; then
                usage
                printf 'ERROR: No ARG=VALUE specified after the %s flag.\n' "$1" >&2
                exit 1
            fi
            # Forwarded verbatim ("--build-arg ARG=VALUE") to docker build.
            BUILD_ARGS+=" $1 $2"
            shift # skip the flag value
            ;;
        -?*)
            usage
            printf 'ERROR: Unknown option: %s\n' "$1" >&2
            exit 1
            ;;
        *)
            break
            ;;
    esac

    shift
done

# Freeze the parsed options so the rest of the script cannot alter them by accident.
readonly PACKAGE
readonly PACKAGE_VERSION
readonly BASE_IMAGE
readonly DOCKERFILE
readonly CACHE_FLAG
readonly FORCE_REBUILD
readonly PUSH_TO_GCR
readonly BUILD_ARGS

# The package Dockerfiles live next to this script.
SRCDIR=$(dirname "${BASH_SOURCE[0]}")
DOCKERFILE_PATH="$SRCDIR/$DOCKERFILE"

# --version, --base-image and --package are all mandatory.
# Errors go to stderr, like the errors reported by the flag parser.
if [[ -z "$PACKAGE_VERSION" ]]; then
    printf 'ERROR: missing --version flag.\n' >&2
    exit 1
fi

if [[ -z "$BASE_IMAGE" ]]; then
    printf 'ERROR: missing --base-image flag.\n' >&2
    exit 1
fi

# DOCKERFILE is only set when --package was given.
if [[ -z "$DOCKERFILE" ]]; then
    printf 'ERROR: missing --package flag.\n' >&2
    exit 1
fi

# Derive the wheel image tag from the base image and the package version, e.g.
#   gcr.io/deeplearning-platform-release/tf2-gpu.2-6:m80 + lightgbm 3.2.1
#   -> gcr.io/kaggle-images/python-lightgbm-whl:tf2-gpu.2-6-m80-3.2.1
#
# Keep only `tf2-gpu.2-6:m80`: drop everything up to the last `/`, whichever repo the base image comes from.
TAG=${BASE_IMAGE##*/}
# Replace the `:` in `tf2-gpu.2-6:m80` by `-`
TAG=${TAG/:/-}
# Append the package version
TAG=$TAG-$PACKAGE_VERSION
# Add the gcr repo.
TAG=gcr.io/kaggle-images/python-$PACKAGE-whl:$TAG

# Build only when forced or when the image cannot be pulled from the registry.
SHOULD_BUILD=true
if ! $FORCE_REBUILD; then
    echo "Checking if $TAG exists..."
    # A successful pull means the image already exists; a failed pull falls through to a build.
    docker pull "$TAG" && SHOULD_BUILD=false
fi

if $SHOULD_BUILD; then
    echo "Building $TAG..."
    # BUILD_ARGS and CACHE_FLAG are intentionally unquoted: they hold zero or more
    # whitespace-separated docker flags and must be word-split.
    docker build --rm --pull $BUILD_ARGS \
        $CACHE_FLAG \
        -t "$TAG" \
        -f "$DOCKERFILE_PATH" \
        --build-arg BASE_IMAGE="$BASE_IMAGE" \
        --build-arg PACKAGE_VERSION="$PACKAGE_VERSION" \
        "$SRCDIR"

    # Pushing only happens for images built in this run.
    if $PUSH_TO_GCR; then
        echo "Pushing $TAG to GCR..."
        docker push "$TAG"
    fi
else
    echo "Skipping build. $TAG already exists."
    echo "Use --force-rebuild if you want to build a new version anyway."
fi
29 changes: 29 additions & 0 deletions packages/lightgbm.Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Image to compile in; packages/build_package passes it with `--build-arg BASE_IMAGE=...`.
ARG BASE_IMAGE

FROM ${BASE_IMAGE} AS builder

# LightGBM release to build, without the leading "v" (the tag name is `v$PACKAGE_VERSION`).
ARG PACKAGE_VERSION

# Build instructions: https://lightgbm.readthedocs.io/en/latest/GPU-Tutorial.html#build-lightgbm
RUN apt-get update && \
    apt-get install -y build-essential cmake libboost-dev libboost-system-dev libboost-filesystem-dev ocl-icd-libopencl1 clinfo

# Clone directly at the release tag so the submodules fetched by --recursive are the ones
# pinned by that release. A `git checkout` of the tag after a recursive clone would leave
# the submodules at the commits pinned by the default branch.
# NOTE(review): the in-image build this replaces ran `setup.py install --precompile`;
# confirm that `bdist_wheel` packages the GPU library compiled by cmake/make here.
RUN cd /usr/local/src && \
    git clone --recursive --branch "v${PACKAGE_VERSION}" https://github.com/microsoft/LightGBM && \
    cd LightGBM && \
    mkdir build && cd build && \
    cmake -DUSE_GPU=1 -DOpenCL_LIBRARY=/usr/local/cuda/lib64/libOpenCL.so -DOpenCL_INCLUDE_DIR=/usr/local/cuda/include/ .. && \
    make -j"$(nproc)" && \
    cd /usr/local/src/LightGBM/python-package && \
    python setup.py bdist_wheel

# Using multi-stage builds to ensure the output image is very small
# See: https://docs.docker.com/develop/develop-images/multistage-build/
# The base is pinned (not `latest`) so rebuilding the same package version yields the same image.
FROM alpine:3.14

# The trailing slash marks the destination as a directory, which COPY requires when the
# wildcard matches more than one file. COPY creates the directory if it is missing.
COPY --from=builder /usr/local/src/LightGBM/python-package/dist/*.whl /tmp/whl/

# Print out the built .whl file.
RUN ls -lh /tmp/whl/
Loading