     # (matrix.host-platform == 'win-64' && 'windows-amd64-cpu8') }}
     outputs:
       BUILD_CTK_VER: ${{ steps.pass_env.outputs.CUDA_VERSION }}
+    defaults:
+      run:
+        shell: bash --noprofile --norc -xeuo pipefail {0}
     steps:
       - name: Checkout ${{ github.event.repository.name }}
         uses: actions/checkout@v4
         uses: ilammy/msvc-dev-cmd@v1

       - name: Set environment variables
-        shell: bash --noprofile --norc -xeuo pipefail {0}
         run: |
           PYTHON_VERSION_FORMATTED=$(echo '${{ matrix.python-version }}' | tr -d '.')
           if [[ "${{ matrix.host-platform }}" == linux* ]]; then
@@ -75,14 +77,25 @@ jobs:
           fi

           echo "PARALLEL_LEVEL=$(nproc)" >> $GITHUB_ENV
-          echo "CUDA_CORE_ARTIFACT_NAME=cuda-core-python${PYTHON_VERSION_FORMATTED}-${{ matrix.host-platform }}-${{ github.sha }}" >> $GITHUB_ENV
+          CUDA_CORE_ARTIFACT_BASENAME="cuda-core-python${PYTHON_VERSION_FORMATTED}-${{ matrix.host-platform }}"
+          echo "CUDA_CORE_ARTIFACT_BASENAME=${CUDA_CORE_ARTIFACT_BASENAME}" >> $GITHUB_ENV
+          echo "CUDA_CORE_ARTIFACT_NAME=${CUDA_CORE_ARTIFACT_BASENAME}-${{ github.sha }}" >> $GITHUB_ENV
           echo "CUDA_CORE_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_core/dist")" >> $GITHUB_ENV
-          echo "CUDA_BINDINGS_ARTIFACT_NAME=cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${{ matrix.cuda-version }}-${{ matrix.host-platform }}-${{ github.sha }}" >> $GITHUB_ENV
+          CUDA_BINDINGS_ARTIFACT_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${{ matrix.cuda-version }}-${{ matrix.host-platform }}"
+          echo "CUDA_BINDINGS_ARTIFACT_BASENAME=${CUDA_BINDINGS_ARTIFACT_BASENAME}" >> $GITHUB_ENV
+          echo "CUDA_BINDINGS_ARTIFACT_NAME=${CUDA_BINDINGS_ARTIFACT_BASENAME}-${{ github.sha }}" >> $GITHUB_ENV
           echo "CUDA_BINDINGS_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_bindings/dist")" >> $GITHUB_ENV
           echo "CIBW_BUILD=${CIBW_BUILD}" >> $GITHUB_ENV
-
+
+          # When the CI is run due to merging to main, we want it to populate GHA Cache not Artifacts,
+          # so that CI workflows running on every branch have a fallback to use.
+          if [[ "${{ github.ref_name }}" == main ]]; then
+            echo "USE_CACHE=1" >> $GITHUB_ENV
+          else
+            echo "USE_CACHE=0" >> $GITHUB_ENV
+          fi
+
       - name: Dump environment
-        shell: bash --noprofile --norc -xeuo pipefail {0}
         run: |
           env

           output-dir: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}

       - name: List the cuda.core artifacts directory
-        shell: bash --noprofile --norc -xeuo pipefail {0}
         run: |
           if [[ "${{ matrix.host-platform }}" == win* ]]; then
             export CHOWN=chown
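The USE_CACHE switch introduced above works because anything appended to $GITHUB_ENV in one step becomes an env.* value for every later step in the same job, which is exactly what the new if: conditions below test. A minimal sketch of that hand-off, with illustrative step names and echo placeholders:

    steps:
      - name: Decide where build outputs go
        run: |
          if [[ "${{ github.ref_name }}" == main ]]; then
            echo "USE_CACHE=1" >> $GITHUB_ENV
          else
            echo "USE_CACHE=0" >> $GITHUB_ENV
          fi
      - name: Branch/PR path (GHA Artifacts)
        if: ${{ env.USE_CACHE == '0' }}
        run: echo "upload artifacts here"
      - name: Main path (GHA Cache)
        if: ${{ env.USE_CACHE == '1' }}
        run: echo "save to cache here"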
@@ -108,19 +120,40 @@ jobs:
           ls -lahR ${{ env.CUDA_CORE_ARTIFACTS_DIR }}

       - name: Check cuda.core wheel
-        shell: bash --noprofile --norc -xeuo pipefail {0}
         run: |
           pip install twine
           twine check ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl

       - name: Upload cuda.core build artifacts
+        if: ${{ env.USE_CACHE == '0' }}
         uses: actions/upload-artifact@v4
         with:
           name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
           path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}/*.whl
           if-no-files-found: error
           overwrite: 'true'

+      - name: Prepare cuda.core cache
+        if: ${{ env.USE_CACHE == '1' }}
+        run: |
+          if [[ "${{ env.USE_CACHE }}" == 1 ]]; then
+            # this file is uploaded to GHA Cache
+            tar -c -f "${{ env.CUDA_CORE_ARTIFACT_BASENAME }}.tar.gz" -C "${{ env.CUDA_CORE_ARTIFACTS_DIR }}" .
+            du -h "${{ env.CUDA_CORE_ARTIFACT_BASENAME }}.tar.gz"
+            # check if the previous runs from the same PR have populated the cache, if so need to clean it up
+            CACHE_KEY=${{ env.CUDA_CORE_ARTIFACT_NAME }}
+            if [ $(gh cache list | grep $CACHE_KEY | wc -l) == "1" ]; then
+              gh cache delete $CACHE_KEY
+            fi
+          fi
+
+      - name: Cache cuda.core build artifacts
+        if: ${{ env.USE_CACHE == '1' }}
+        uses: actions/cache/save@v4
+        with:
+          key: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
+          path: ${{ env.CUDA_CORE_ARTIFACT_BASENAME }}.tar.gz
+
       - name: Set up mini CTK
         uses: ./.github/actions/fetch_ctk
         continue-on-error: false
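The delete-before-save logic in the "Prepare cuda.core cache" step above exists because a GHA cache entry is immutable once a key has been saved; without the cleanup, a re-run of the same commit would keep serving the stale archive. A condensed sketch of the pattern, assuming the gh CLI is authenticated (for example via a GH_TOKEN environment variable, which this excerpt does not show) and using illustrative names:

    - name: Refresh a cache entry keyed by commit
      env:
        GH_TOKEN: ${{ github.token }}   # assumption: token allowed to manage Actions caches
      run: |
        CACHE_KEY="example-artifact-${{ github.sha }}"   # illustrative key
        # delete any entry a previous run of this same commit left behind
        if [ "$(gh cache list | grep -c "$CACHE_KEY")" != "0" ]; then
          gh cache delete "$CACHE_KEY"
        fi
        tar -czf example-artifact.tar.gz -C dist .        # illustrative payload

    - name: Save the refreshed entry
      uses: actions/cache/save@v4
      with:
        key: example-artifact-${{ github.sha }}
        path: example-artifact.tar.gz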
@@ -146,7 +179,6 @@ jobs:
           output-dir: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}

       - name: List the cuda.bindings artifacts directory
-        shell: bash --noprofile --norc -xeuo pipefail {0}
         run: |
           if [[ "${{ matrix.host-platform }}" == win* ]]; then
             export CHOWN=chown
@@ -158,18 +190,39 @@ jobs:

       # TODO: enable this after NVIDIA/cuda-python#297 is resolved
       # - name: Check cuda.bindings wheel
-      #   shell: bash --noprofile --norc -xeuo pipefail {0}
       #   run: |
       #     twine check ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl

+      - name: Prepare cuda.bindings cache
+        if: ${{ env.USE_CACHE == '1' }}
+        run: |
+          if [[ "${{ env.USE_CACHE }}" == 1 ]]; then
+            # this file is uploaded to GHA Cache
+            tar -c -f "${{ env.CUDA_BINDINGS_ARTIFACT_BASENAME }}.tar.gz" -C "${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}" .
+            du -h "${{ env.CUDA_BINDINGS_ARTIFACT_BASENAME }}.tar.gz"
+            # check if the previous runs from the same PR have populated the cache, if so need to clean it up
+            CACHE_KEY=${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
+            if [ $(gh cache list | grep $CACHE_KEY | wc -l) == "1" ]; then
+              gh cache delete $CACHE_KEY
+            fi
+          fi
+
       - name: Upload cuda.bindings build artifacts
+        if: ${{ env.USE_CACHE == '0' }}
         uses: actions/upload-artifact@v4
         with:
           name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
           path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}/*.whl
           if-no-files-found: error
           overwrite: 'true'

+      - name: Cache cuda.bindings build artifacts
+        if: ${{ env.USE_CACHE == '1' }}
+        uses: actions/cache/save@v4
+        with:
+          key: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
+          path: ${{ env.CUDA_BINDINGS_ARTIFACT_BASENAME }}.tar.gz
+
       - name: Pass environment variables to the next runner
         id: pass_env
         run: |
@@ -205,7 +258,7 @@ jobs:
             runner: H100
     name: Test (${{ matrix.host-platform }}, Python ${{ matrix.python-version }}, CUDA ${{ matrix.cuda-version }}, Runner ${{ matrix.runner }})
     # The build stage could fail but we want the CI to keep moving.
-    if: ${{ github.repository_owner == 'nvidia' && always() }}
+    if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
     permissions:
       id-token: write # This is required for configure-aws-credentials
       contents: read # This is required for actions/checkout
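The condition change above is behavioral, not cosmetic: with always() the job runs even after the workflow has been cancelled, while !cancelled() still lets it run when an upstream job failed (the stated goal of keeping CI moving) but skips it once someone cancels the run. A small, self-contained illustration with made-up job names:

    jobs:
      upstream:
        runs-on: ubuntu-latest
        steps:
          - run: exit 1   # simulate a build failure
      runs-even-if-cancelled:
        needs: upstream
        if: ${{ always() }}
        runs-on: ubuntu-latest
        steps:
          - run: echo "runs on success, failure, or cancellation of the run"
      skipped-once-cancelled:
        needs: upstream
        if: ${{ !cancelled() }}
        runs-on: ubuntu-latest
        steps:
          - run: echo "runs after an upstream failure, but not after a cancel"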
@@ -221,9 +274,11 @@ jobs:
         NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }}
     needs:
       - build
+    defaults:
+      run:
+        shell: bash --noprofile --norc -xeuo pipefail {0}
     steps:
       - name: Ensure GPU is working
-        shell: bash --noprofile --norc -xeuo pipefail {0}
         run: nvidia-smi

       - name: Checkout ${{ github.event.repository.name }}
@@ -232,7 +287,6 @@ jobs:
           fetch-depth: 0

       - name: Set environment variables
-        shell: bash --noprofile --norc -xeuo pipefail {0}
         run: |
           PYTHON_VERSION_FORMATTED=$(echo '${{ matrix.python-version }}' | tr -d '.')
           if [[ "${{ matrix.host-platform }}" == linux* ]]; then
@@ -251,32 +305,83 @@ jobs:
           fi

           # make outputs from the previous job as env vars
-          echo "CUDA_CORE_ARTIFACT_NAME=cuda-core-python${PYTHON_VERSION_FORMATTED}-${{ matrix.host-platform }}-${{ github.sha }}" >> $GITHUB_ENV
+          CUDA_CORE_ARTIFACT_BASENAME="cuda-core-python${PYTHON_VERSION_FORMATTED}-${{ matrix.host-platform }}"
+          echo "CUDA_CORE_ARTIFACT_BASENAME=${CUDA_CORE_ARTIFACT_BASENAME}" >> $GITHUB_ENV
+          echo "CUDA_CORE_ARTIFACT_NAME=${CUDA_CORE_ARTIFACT_BASENAME}-${{ github.sha }}" >> $GITHUB_ENV
           echo "CUDA_CORE_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_core/dist")" >> $GITHUB_ENV
-          echo "CUDA_BINDINGS_ARTIFACT_NAME=cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${{ needs.build.outputs.BUILD_CTK_VER }}-${{ matrix.host-platform }}-${{ github.sha }}" >> $GITHUB_ENV
+          CUDA_BINDINGS_ARTIFACT_BASENAME="cuda-bindings-python${PYTHON_VERSION_FORMATTED}-cuda${{ matrix.cuda-version }}-${{ matrix.host-platform }}"
+          echo "CUDA_BINDINGS_ARTIFACT_BASENAME=${CUDA_BINDINGS_ARTIFACT_BASENAME}" >> $GITHUB_ENV
+          echo "CUDA_BINDINGS_ARTIFACT_NAME=${CUDA_BINDINGS_ARTIFACT_BASENAME}-${{ github.sha }}" >> $GITHUB_ENV
           echo "CUDA_BINDINGS_ARTIFACTS_DIR=$(realpath "$REPO_DIR/cuda_bindings/dist")" >> $GITHUB_ENV
           echo "SKIP_CUDA_BINDINGS_TEST=${SKIP_CUDA_BINDINGS_TEST}" >> $GITHUB_ENV

+      # We'll try GHA Artifacts first, and then fall back to GHA Cache
       - name: Download cuda.bindings build artifacts
+        id: cuda-bindings-download
         uses: actions/download-artifact@v4
         with:
           name: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
           path: ${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}

+      - name: Restore cuda.bindings cache
+        if: ${{ failure() && steps.cuda-bindings-download.conclusion == 'failure' }}
+        id: cuda-bindings-cache
+        uses: actions/cache/restore@v4
+        with:
+          key: ${{ env.CUDA_BINDINGS_ARTIFACT_NAME }}
+          path: ${{ env.CUDA_BINDINGS_ARTIFACT_BASENAME }}.tar.gz
+          restore-keys: ${{ env.CUDA_BINDINGS_ARTIFACT_BASENAME }}
+          fail-on-cache-miss: true
+
+      - name: Report cache restore status (hit)
+        if: ${{ steps.cuda-bindings-cache.conclusion != 'skipped' &&
+                steps.cuda-bindings-cache.outputs.cache-hit == 'true' }}
+        run: |
+          echo "cache is found"
+          CACHE_DIR="${{ env.CUDA_BINDINGS_ARTIFACTS_DIR }}"
+          CACHE_ARCHIVE="${{ env.CUDA_BINDINGS_ARTIFACT_BASENAME }}.tar.gz"
+          ls -l $CACHE_ARCHIVE
+          mkdir -p $CACHE_DIR
+          du -h $CACHE_ARCHIVE &&
+          tar -x -f $CACHE_ARCHIVE -C $CACHE_DIR &&
+          rm -f $CACHE_ARCHIVE || echo "WARNING: cache could not be retrieved."
+
       - name: Display structure of downloaded cuda.bindings artifacts
-        shell: bash --noprofile --norc -xeuo pipefail {0}
         run: |
           pwd
           ls -lahR $CUDA_BINDINGS_ARTIFACTS_DIR

       - name: Download cuda.core build artifacts
+        id: cuda-core-download
         uses: actions/download-artifact@v4
         with:
           name: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
           path: ${{ env.CUDA_CORE_ARTIFACTS_DIR }}

+      - name: Restore cuda.core cache
+        if: ${{ failure() && steps.cuda-core-download.conclusion == 'failure' }}
+        id: cuda-core-cache
+        uses: actions/cache/restore@v4
+        with:
+          key: ${{ env.CUDA_CORE_ARTIFACT_NAME }}
+          path: ${{ env.CUDA_CORE_ARTIFACT_BASENAME }}.tar.gz
+          restore-keys: ${{ env.CUDA_CORE_ARTIFACT_BASENAME }}
+          fail-on-cache-miss: true
+
+      - name: Report cache restore status (hit)
+        if: ${{ steps.cuda-core-cache.conclusion != 'skipped' &&
+                steps.cuda-core-cache.outputs.cache-hit == 'true' }}
+        run: |
+          echo "cache is found"
+          CACHE_DIR="${{ env.CUDA_CORE_ARTIFACTS_DIR }}"
+          CACHE_ARCHIVE="${{ env.CUDA_CORE_ARTIFACT_BASENAME }}.tar.gz"
+          ls -l $CACHE_ARCHIVE
+          mkdir -p $CACHE_DIR
+          du -h $CACHE_ARCHIVE &&
+          tar -x -f $CACHE_ARCHIVE -C $CACHE_DIR &&
+          rm -f $CACHE_ARCHIVE || echo "WARNING: cache could not be retrieved."
+
       - name: Display structure of downloaded cuda.core build artifacts
-        shell: bash --noprofile --norc -xeuo pipefail {0}
         run: |
           pwd
           ls -lahR $CUDA_CORE_ARTIFACTS_DIR
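The two download/restore pairs above share one pattern that is easier to see in isolation: try actions/download-artifact first, and only when that step's conclusion is 'failure' fall back to actions/cache/restore, where restore-keys performs a prefix match on the basename so a branch run can pick up the newest cache saved from main even though the commit-keyed entry does not exist. A condensed sketch with illustrative names:

    - name: Download build artifacts
      id: download
      uses: actions/download-artifact@v4
      with:
        name: example-artifact-${{ github.sha }}
        path: dist

    - name: Fall back to the cache populated from main
      if: ${{ failure() && steps.download.conclusion == 'failure' }}
      uses: actions/cache/restore@v4
      with:
        key: example-artifact-${{ github.sha }}
        restore-keys: example-artifact    # prefix match; newest matching entry wins
        path: example-artifact.tar.gz
        fail-on-cache-miss: true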
@@ -295,7 +400,6 @@ jobs:

       - name: Run cuda.bindings tests
         if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0' }}
-        shell: bash --noprofile --norc -xeuo pipefail {0}
         run: |
           ls $CUDA_PATH

@@ -311,7 +415,6 @@ jobs:
           popd

       - name: Run cuda.core tests
-        shell: bash --noprofile --norc -xeuo pipefail {0}
         run: |
           if [[ ${{ matrix.python-version }} == "3.13" ]]; then
             # TODO: remove this hack once cuda-python has a cp313 build
@@ -336,7 +439,7 @@ jobs:
   doc:
     name: Docs
     # The build stage could fail but we want the CI to keep moving.
-    if: ${{ github.repository_owner == 'nvidia' && always() }}
+    if: ${{ github.repository_owner == 'nvidia' && !cancelled() }}
     # Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages
     permissions:
       id-token: write