TPU Image with python3.8

djherbis · djherbis · commit 0528d4696edb · 2022-09-16T18:28:18.000Z
TensorFlow for TPU VMs is not supported on Python 3.7, so we need a brand-new
image built from python:3.8 instead of one based on the prior Kaggle images.

http://b/213335159
diff --git a/Jenkinsfile b/Jenkinsfile
@@ -56,22 +56,6 @@ pipeline {
             '''
           }
         }
-        stage('tensorflow TPU') {
-          options {
-            timeout(time: 240, unit: 'MINUTES')
-          }
-          steps {
-            sh '''#!/bin/bash
-              set -exo pipefail
-              source tpu/config.txt
-              cd packages/
-              ./build_package --base-image gcr.io/kaggle-images/python:${BASE_IMAGE_TAG} \
-                --package tpu-tensorflow \
-                --version $TENSORFLOW_VERSION \
-                --push
-            '''
-          }
-        }
       }
     }
     stage('Build/Test/Diff') {
@@ -171,23 +155,12 @@ pipeline {
           stages {
             stage('Build Tensorflow TPU Image') {
               options {
-                timeout(time: 20, unit: 'MINUTES')
+                timeout(time: 60, unit: 'MINUTES')
               }
               steps {
                 sh '''#!/bin/bash
                   set -exo pipefail
 
-                  # Login to docker to get access to gcr.io/cloud-tpu-v2-images/libtpu
-                  # SA: jenkins-test@kaggle-playground-170215.iam.gserviceaccount.com
-                  # To grant access to a SA, start a TPU VM with that SA once.
-                  # Disable echo to avoid printing sensitive tokens:
-                  set +x
-                  METADATA=http://metadata.google.internal/computeMetadata/v1
-                  SVC_ACCT=$METADATA/instance/service-accounts/default
-                  ACCESS_TOKEN=$(/usr/bin/curl -s -H 'Metadata-Flavor: Google' $SVC_ACCT/token | cut -d'"' -f 4)
-                  docker login --username oauth2accesstoken --password $ACCESS_TOKEN https://gcr.io
-                  set -x
-
                   ./tpu/build | ts
                   ./push --tpu ${PRETEST_TAG}
                 '''
diff --git a/packages/tpu-tensorflow.Dockerfile b/packages/tpu-tensorflow.Dockerfile
diff --git a/tpu/Dockerfile b/tpu/Dockerfile
@@ -1,43 +1,60 @@
-ARG BASE_IMAGE_TAG
-ARG TENSORFLOW_VERSION
-
-FROM gcr.io/kaggle-images/python-tpu-tensorflow-whl:python-${BASE_IMAGE_TAG}-${TENSORFLOW_VERSION} AS tensorflow_whl
-FROM gcr.io/kaggle-images/python:${BASE_IMAGE_TAG}
+FROM python:3.8
 
 # We need to define the ARG here to get the ARG below the FROM statement to access it within this build context
 # See: https://docs.docker.com/engine/reference/builder/#understand-how-arg-and-from-interact
 ARG TORCH_VERSION
+ARG TENSORFLOW_VERSION
 
 ENV ISTPUVM=1
 
-COPY --from=tensorflow_whl /tmp/tensorflow_pkg/tensorflow*.whl /tmp/tensorflow_pkg/
-RUN pip install /tmp/tensorflow_pkg/tensorflow*.whl && \
-    rm -rf /tmp/tensorflow_pkg && \
-    /tmp/clean-layer.sh
+COPY patches/nbconvert-extensions.tpl /opt/kaggle/nbconvert-extensions.tpl
+COPY patches/template_conf.json /opt/kaggle/conf.json
+
+# Tensorflow install (curl --fail: an HTTP error must fail the build, not be saved as libtpu.so):
+RUN pip install --no-cache-dir https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/tensorflow/tf-${TENSORFLOW_VERSION}/tensorflow-${TENSORFLOW_VERSION}-cp38-cp38-linux_x86_64.whl && \
+    curl --fail --output /lib/libtpu.so https://storage.googleapis.com/cloud-tpu-tpuvm-artifacts/libtpu/1.3.0/libtpu.so
 
 # LIBTPU installed here:
-ENV DEFAULT_LIBTPU=/opt/conda/lib/python3.7/site-packages/libtpu/libtpu.so
-ENV PYTORCH_LIBTPU=/opt/conda/lib/python3.7/site-packages/libtpu/torch-libtpu.so
-ENV JAX_LIBTPU=/opt/conda/lib/python3.7/site-packages/libtpu/jax-libtpu.so
+ENV PIP_LIBTPU=/usr/local/lib/python3.8/site-packages/libtpu/libtpu.so
+ENV DEFAULT_LIBTPU=/lib/libtpu.so
+ENV PYTORCH_LIBTPU=/lib/torch-libtpu.so
+ENV JAX_LIBTPU=/lib/jax-libtpu.so
+
+# https://cloud.google.com/tpu/docs/jax-quickstart-tpu-vm#install_jax_on_your_cloud_tpu_vm
+RUN pip install --no-cache-dir "jax[tpu]==0.3.10" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
+
+RUN cp $PIP_LIBTPU $JAX_LIBTPU
 
 # https://cloud.google.com/tpu/docs/pytorch-xla-ug-tpu-vm#changing_pytorch_version
-RUN pip uninstall -y torch && \
-    pip install torch==${TORCH_VERSION} && \
-    # The URL doesn't include patch version. i.e. must use 1.11 instead of 1.11.0
-    pip install torch_xla[tpuvm] -f https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-${TORCH_VERSION%.*}-cp37-cp37m-linux_x86_64.whl && \
-    cp $DEFAULT_LIBTPU $PYTORCH_LIBTPU && \
-    /tmp/clean-layer.sh
+RUN pip install --no-cache-dir torch==${TORCH_VERSION}
+
+# The wheel URL uses only major.minor (e.g. 1.11, not 1.11.0), so ${TORCH_VERSION%.*} strips the patch component.
+RUN pip install --no-cache-dir "torch_xla[tpuvm]" -f https://storage.googleapis.com/tpu-pytorch/wheels/tpuvm/torch_xla-${TORCH_VERSION%.*}-cp38-cp38-linux_x86_64.whl
 
-# https://cloud.google.com/tpu/docs/jax-quickstart-tpu-vm#install_jax_on_your_cloud_tpu_vm
-RUN pip install "jax[tpu]>=0.2.16" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html && \
-    cp $DEFAULT_LIBTPU $JAX_LIBTPU && \
-    /tmp/clean-layer.sh
+RUN cp $PIP_LIBTPU $PYTORCH_LIBTPU
 
 # Monkey-patch TF, JAX & PYTORCH to load the correct libtpu.so when they are imported:
-RUN sed -i "s|^\(\(.*\)libtpu.configure_library_path.*\)|\1\n\2os.environ['TPU_LIBRARY_PATH'] = '${PYTORCH_LIBTPU}'|" /opt/conda/lib/python3.7/site-packages/torch_xla/__init__.py && \
-    sed -i "s|^\(\(.*\)libtpu.configure_library_path.*\)|\1\n\2os.environ['TPU_LIBRARY_PATH'] = '${JAX_LIBTPU}'|" /opt/conda/lib/python3.7/site-packages/jax/_src/cloud_tpu_init.py && \
-    sed -i "1s/^/from jax._src.cloud_tpu_init import cloud_tpu_init\ncloud_tpu_init()\n/" /opt/conda/lib/python3.7/site-packages/tensorflow/__init__.py
+RUN sed -i "s|^\(\(.*\)libtpu.configure_library_path.*\)|\1\n\2os.environ['TPU_LIBRARY_PATH'] = '${PYTORCH_LIBTPU}'|" /usr/local/lib/python3.8/site-packages/torch_xla/__init__.py && \
+    sed -i "s|^\(\(.*\)libtpu.configure_library_path.*\)|\1\n\2os.environ['TPU_LIBRARY_PATH'] = '${JAX_LIBTPU}'|" /usr/local/lib/python3.8/site-packages/jax/_src/cloud_tpu_init.py
+
+# Packages needed by the Notebook editor:
+RUN pip install --no-cache-dir papermill jupyterlab "python-lsp-server[all]" jupyterlab-lsp
 
 # Set these env vars so that they don't produce errs calling the metadata server to load them:
 ENV TPU_ACCELERATOR_TYPE=v3-8
-ENV TPU_PROCESS_ADDRESSES=local
+ENV TPU_PROCESS_ADDRESSES=local
+
+# Metadata
+ARG GIT_COMMIT=unknown
+ARG BUILD_DATE=unknown
+
+LABEL git-commit=$GIT_COMMIT
+LABEL build-date=$BUILD_DATE
+ENV GIT_COMMIT=${GIT_COMMIT}
+ENV BUILD_DATE=${BUILD_DATE}
+
+LABEL tensorflow-version=$TENSORFLOW_VERSION
+LABEL kaggle-lang=python
+
+# Correlate current release with the git hash inside the kernel editor by running `!cat /etc/git_commit`.
+RUN echo "$GIT_COMMIT" > /etc/git_commit && echo "$BUILD_DATE" > /etc/build_date
diff --git a/tpu/config.txt b/tpu/config.txt
@@ -1,4 +1,2 @@
-# TODO(b/213335159): Use ci-pretest for BASE_IMAGE_TAG once stable.
-BASE_IMAGE_TAG=v115
-TENSORFLOW_VERSION=2.8.0
+TENSORFLOW_VERSION=2.9.1
 TORCH_VERSION=1.11.0