Skip to content

Commit f4d00d4

Browse files
authored
Build Kaggle TPU image (#1116)
TODO: (1) Currently fails on Cloud Build but works locally on Cloud Top: https://pantheon.corp.google.com/cloud-build/builds;region=global/904c778d-45c4-4646-9183-f10b18a4910a?project=kkb-infra. This may be caused by insufficient RAM and requires more investigation. (2) TensorFlow-related libraries (tensorflow-addons, tensorflow-gcs-config) will need to be recompiled against the new TPU binary. (3) Install TPU support for JAX & PyTorch. http://b/152075195
1 parent 564389b commit f4d00d4

File tree

4 files changed

+102
-0
lines changed

4 files changed

+102
-0
lines changed

tpu/Dockerfile

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
# Kaggle TPU image: adds the TPU runtime library (libtpu.so) and a TensorFlow
# wheel built with TPU support on top of the Kaggle Python base image.
#
# These build args have no defaults; cloudbuild.yaml passes each of them with
# --build-arg. They are declared before the first FROM so that they can be
# used in the FROM lines below.
ARG BASE_IMAGE_TAG
ARG LIBTPU_IMAGE_TAG
ARG TENSORFLOW_WHL_IMAGE_TAG

# Source-only stages: nothing is run in them, they only feed the
# COPY --from instructions in the final stage.
FROM gcr.io/cloud-tpu-v2-images/libtpu:${LIBTPU_IMAGE_TAG} AS libtpu
FROM gcr.io/kaggle-images/python-tpu-tensorflow-whl:${TENSORFLOW_WHL_IMAGE_TAG} AS tensorflow_whl

# Final image.
FROM gcr.io/kaggle-images/python:${BASE_IMAGE_TAG}

COPY --from=libtpu /libtpu.so /lib

COPY --from=tensorflow_whl /tmp/tensorflow_pkg/tensorflow*.whl /tmp/tensorflow_pkg/
# --no-cache-dir keeps pip's download/build cache out of the image layer.
# NOTE: the rm below tidies the final filesystem, but the wheel is still stored
# in the COPY layer above, so it does not reduce the total image size.
RUN pip install --no-cache-dir /tmp/tensorflow_pkg/tensorflow*.whl && \
    rm -rf /tmp/tensorflow_pkg

tpu/README.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# Build the Kaggle TPU image
2+
3+
NOTE: Building a new Kaggle TPU image can only be done by members of the Kaggle team.
4+
5+
1. Set the `_BASE_IMAGE_TAG` and `_LIBTPU_IMAGE_TAG` substitutions in [cloudbuild.yaml](cloudbuild.yaml) to the desired versions.
6+
1. From this directory (`tpu/`), submit the build to Google Cloud Build by running:
7+
```
8+
gcloud builds submit --async
9+
```

tpu/cloudbuild.yaml

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
# Cloud Build pipeline for the Kaggle TPU image.
# Two docker builds run in sequence: the TensorFlow wheel image first, then the
# TPU image that copies the wheel out of it.
steps:
  # Builds tensorflow.Dockerfile and tags the result with the build id.
  - id: "build-tensorflow-whl"
    name: "gcr.io/cloud-builders/docker"
    args:
      - "build"
      - "--rm"
      - "--tag=gcr.io/kaggle-images/python-tpu-tensorflow-whl:$BUILD_ID"
      - "--file=tensorflow.Dockerfile"
      - "--build-arg=BASE_IMAGE_TAG=$_BASE_IMAGE_TAG"
      - "."

  # TODO(b/152075195): Build JAX & PyTorch TPU enabled packages.

  # Builds Dockerfile; TENSORFLOW_WHL_IMAGE_TAG is set to this build's id so the
  # wheel image tagged by the previous step is the one that gets used.
  - id: "build-tpu-image"
    waitFor:
      - "build-tensorflow-whl"
    name: "gcr.io/cloud-builders/docker"
    args:
      - "build"
      - "--rm"
      - "--tag=gcr.io/kaggle-images/python-tpu:$BUILD_ID"
      - "--file=Dockerfile"
      - "--build-arg=BASE_IMAGE_TAG=$_BASE_IMAGE_TAG"
      - "--build-arg=LIBTPU_IMAGE_TAG=$_LIBTPU_IMAGE_TAG"
      - "--build-arg=TENSORFLOW_WHL_IMAGE_TAG=$BUILD_ID"
      - "."

options:
  machineType: E2_HIGHCPU_32
  diskSizeGb: 1000

# 24 hours.
timeout: 86400s

# Default values for the user-defined substitutions referenced above.
substitutions:
  _BASE_IMAGE_TAG: v107
  _LIBTPU_IMAGE_TAG: libtpu_1.1.0_RC00

# Images listed here are the two tagged by the build steps.
images:
  - "gcr.io/kaggle-images/python-tpu:$BUILD_ID"
  - "gcr.io/kaggle-images/python-tpu-tensorflow-whl:$BUILD_ID"

tags:
  - "python"
  - "tpu"

tpu/tensorflow.Dockerfile

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Builds a TensorFlow wheel from source with TPU support enabled, then copies
# only the wheel into a small final image under /tmp/tensorflow_pkg/.
#
# BASE_IMAGE_TAG has no default and must be supplied with --build-arg.
ARG BASE_IMAGE_TAG

FROM gcr.io/kaggle-images/python:${BASE_IMAGE_TAG} AS builder

# Use Bazelisk to ensure the proper bazel version is used.
# It is installed under the name `bazel` so the build commands below pick it up.
RUN wget --no-verbose -O /usr/local/bin/bazel \
        "https://github.com/bazelbuild/bazelisk/releases/download/v1.11.0/bazelisk-linux-amd64" && \
    chmod u+x /usr/local/bin/bazel

# Fetch TensorFlow & install dependencies.
# NOTE(review): TENSORFLOW_VERSION is not declared in this file; it is
# presumably exported by the base image -- confirm. The guard on the first line
# fails the step with a clear message instead of attempting `git checkout tags/v`.
WORKDIR /usr/local/src
RUN : "${TENSORFLOW_VERSION:?TENSORFLOW_VERSION must be set in the build environment}" && \
    git clone https://github.com/tensorflow/tensorflow && \
    git -C tensorflow checkout "tags/v${TENSORFLOW_VERSION}" && \
    # TODO(rosbo): Is it really needed?
    pip install --no-cache-dir --no-deps keras_applications keras_preprocessing

# Build the pip-package builder target with TPU support enabled.
WORKDIR /usr/local/src/tensorflow
# ./configure reads its answers from stdin; feeding it /dev/null gives it empty
# input instead of waiting on a terminal.
# The --local_ram_resources value is quoted so the shell never treats the `*`
# as a filename glob.
RUN ./configure < /dev/null && \
    bazel build \
        --config=opt \
        --distinct_host_configuration=true \
        --define=framework_shared_object=true \
        --define=with_tpu_support=true \
        --copt=-DLIBTPU_ON_GCE \
        //tensorflow/tools/pip_package:build_pip_package \
        "--local_ram_resources=HOST_RAM*.5"

# Package the build output as a wheel in /tmp/tensorflow_pkg.
RUN bazel-bin/tensorflow/tools/pip_package/build_pip_package /tmp/tensorflow_pkg

# TODO(b/152075195): Will likely need to install custom build for TFA & tensorflow-gcs-config

# Use multi-stage builds to minimize image output size.
# The base tag is pinned (rather than :latest) so rebuilds are reproducible;
# bump it deliberately.
FROM alpine:3.15
COPY --from=builder /tmp/tensorflow_pkg/tensorflow*.whl /tmp/tensorflow_pkg/

0 commit comments

Comments
 (0)