pytorch
diff --git a/.github/scripts/extract_benchmark_results.py b/.github/scripts/extract_benchmark_results.py
Lines changed: 1 addition & 1 deletion
diff --git a/.github/workflows/android-perf.yml b/.github/workflows/android-perf.yml
Lines changed: 63 additions & 17 deletions
diff --git a/.github/workflows/apple-perf.yml b/.github/workflows/apple-perf.yml
Lines changed: 61 additions & 15 deletions
diff --git a/.github/workflows/upload-android-test-specs.yml b/.github/workflows/upload-android-test-specs.yml
Lines changed: 0 additions & 94 deletions
@@ -451,7 +451,7 @@ def main() -> None:
             continue
 
         output_dir = os.path.join(args.output_dir, schema)
-        os.mkdir(output_dir)
+        os.makedirs(output_dir, exist_ok=True)
 
         output_file = os.path.basename(args.artifacts)
         with open(f"{output_dir}/{output_file}", "w") as f:
 
@@ -3,6 +3,16 @@ name: android-perf
 on:
   schedule:
     - cron: 0 0 * * *
+  pull_request:
+    paths:
+      - .github/workflows/android-perf.yml
+      - extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
+  push:
+    branches:
+      - main
+    paths:
+      - .github/workflows/android-perf.yml
+      - extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml.j2
   # Note: GitHub has an upper limit of 10 inputs
   workflow_dispatch:
     inputs:
@@ -30,10 +40,6 @@ on:
         description: The list of configs used the benchmark
         required: false
         type: string
-      test_spec:
-        description: The test spec to drive the test on AWS devices
-        required: false
-        type: string
   workflow_call:
     inputs:
       models:
@@ -60,10 +66,6 @@ on:
         description: The list of configs used the benchmark
         required: false
         type: string
-      test_spec:
-        description: The test spec to drive the test on AWS devices
-        required: false
-        type: string
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
@@ -84,9 +86,9 @@ jobs:
           # Separate default values from the workflow dispatch. To ensure defaults are accessible
           # during scheduled runs and to provide flexibility for different defaults between
           # on-demand and periodic benchmarking.
-          CRON_DEFAULT_MODELS: "stories110M,dl3,mv3,mv2,ic4,ic3,vit"
-          CRON_DEFAULT_DEVICES: "samsung_galaxy_s22"
-          CRON_DEFAULT_DELEGATES: "xnnpack,qnn"
+          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'stories110M,dl3,mv3,mv2,ic4,ic3,vit' || 'stories110M' }}
+          CRON_DEFAULT_DEVICES: samsung_galaxy_s22
+          CRON_DEFAULT_DELEGATES: ${{ github.event_name == 'schedule' && 'xnnpack,qnn' || 'xnnpack' }}
         run: |
           set -ex
           MODELS="${{ inputs.models }}"
@@ -125,6 +127,43 @@ jobs:
           echo "devices=$(echo "$MAPPED_ARNS_JSON" | jq -c .)" >> $GITHUB_OUTPUT
           echo "delegates=$(echo $DELEGATES | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT
 
+  prepare-test-specs:
+    runs-on: linux.2xlarge
+    needs: set-parameters
+    strategy:
+      matrix:
+          model: ${{ fromJson(needs.set-parameters.outputs.models) }}
+          delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Prepare the spec
+        shell: bash
+        working-directory: extension/benchmark/android/benchmark
+        run: |
+          set -eux
+
+          # URL where model.zip for this model/delegate pair is expected on S3; it is not produced by this job (presumably uploaded by the export-models job - confirm)
+          # We could render the template with a real jinja script, but there is only one variable
+          # ({{ model_path }}), so just substitute it with sed; note that -i rewrites the checked-out .j2 file in place before it is copied
+          sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' android-llm-device-farm-test-spec.yml.j2
+          cp android-llm-device-farm-test-spec.yml.j2 android-llm-device-farm-test-spec.yml
+
+          # Print the rendered test spec to the job log for debugging
+          cat android-llm-device-farm-test-spec.yml
+
+      - name: Upload the spec
+        uses: seemethere/upload-artifact-s3@v5
+        with:
+          s3-bucket: gha-artifacts
+          s3-prefix: |
+            ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}
+          retention-days: 1
+          if-no-files-found: error
+          path: extension/benchmark/android/benchmark/android-llm-device-farm-test-spec.yml
+
   export-models:
     name: export-models
     uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
@@ -170,9 +209,18 @@ jobs:
                 echo "Unsupported delegate ${{ matrix.delegate }}"
                 exit 1
             fi
-            PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}"
+            PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh \
+              -model "${{ matrix.model }}" \
+              -build_tool "${BUILD_MODE}" \
+              -dtype "${DTYPE}" \
+              -mode "${DELEGATE_CONFIG}" \
+              -upload "${ARTIFACTS_DIR_NAME}"
         else
-            PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh "${{ matrix.model }}" "${BUILD_MODE}" "${{ matrix.delegate }}" "${ARTIFACTS_DIR_NAME}"
+            PYTHON_EXECUTABLE=python bash .ci/scripts/test_model.sh \
+              "${{ matrix.model }}" \
+              "${BUILD_MODE}" \
+              "${{ matrix.delegate }}" \
+              "${ARTIFACTS_DIR_NAME}"
         fi
         echo "::endgroup::"
 
@@ -212,6 +260,7 @@ jobs:
     uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main
     needs:
       - set-parameters
+      - prepare-test-specs
       - build-benchmark-app
       - export-models
     strategy:
@@ -231,10 +280,7 @@ jobs:
       device-pool-arn: ${{ matrix.device }}
       android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug.apk
       android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/minibench/app-debug-androidTest.apk
-      # NB: Need to set the default spec here so that it works for periodic too
-      test-spec: ${{ inputs.test_spec || 'https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml' }}
-      # Uploaded to S3 from the previous job
-      extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip
+      test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/android-llm-device-farm-test-spec.yml
 
   upload-benchmark-results:
     needs:
 
@@ -3,6 +3,16 @@ name: apple-perf
 on:
   schedule:
     - cron: 0 1 * * *
+  pull_request:
+    paths:
+      - .github/workflows/apple-perf.yml
+      - extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2
+  push:
+    branches:
+      - main
+    paths:
+      - .github/workflows/apple-perf.yml
+      - extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml.j2
   # Note: GitHub has an upper limit of 10 inputs
   workflow_dispatch:
     inputs:
@@ -25,10 +35,6 @@ on:
         description: The list of configs used the benchmark
         required: false
         type: string
-      test_spec:
-        description: The test spec to drive the test on AWS devices
-        required: false
-        type: string
   workflow_call:
     inputs:
       models:
@@ -50,10 +56,6 @@ on:
         description: The list of configs used the benchmark
         required: false
         type: string
-      test_spec:
-        description: The test spec to drive the test on AWS devices
-        required: false
-        type: string
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
@@ -74,9 +76,9 @@ jobs:
           # Separate default values from the workflow dispatch. To ensure defaults are accessible
           # during scheduled runs and to provide flexibility for different defaults between
           # on-demand and periodic benchmarking.
-          CRON_DEFAULT_MODELS: "stories110M,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l"
-          CRON_DEFAULT_DEVICES: "apple_iphone_15"
-          CRON_DEFAULT_DELEGATES: "xnnpack,coreml,mps"
+          CRON_DEFAULT_MODELS: ${{ github.event_name == 'schedule' && 'stories110M,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l' || 'stories110M' }}
+          CRON_DEFAULT_DEVICES: apple_iphone_15
+          CRON_DEFAULT_DELEGATES: ${{ github.event_name == 'schedule' && 'xnnpack,coreml,mps' || 'xnnpack' }}
         run: |
           set -ex
           MODELS="${{ inputs.models }}"
@@ -114,6 +116,41 @@ jobs:
           echo "devices=$(echo "$MAPPED_ARNS_JSON" | jq -c .)" >> $GITHUB_OUTPUT
           echo "delegates=$(echo $DELEGATES | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT
 
+  prepare-test-specs:
+    runs-on: linux.2xlarge
+    needs: set-parameters
+    strategy:
+      matrix:
+          model: ${{ fromJson(needs.set-parameters.outputs.models) }}
+          delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
+      fail-fast: false
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Prepare the spec
+        shell: bash
+        working-directory: extension/benchmark/apple/Benchmark
+        run: |
+          set -eux
+          # URL where model.zip for this model/delegate pair is expected on S3; it is not produced by this job (presumably uploaded by the export-models job - confirm)
+          MODEL_PATH="https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip"
+          # We could render the template with a real jinja script, but there is only one variable
+          # ({{ model_path }}), so just substitute it with sed; note that -i rewrites the checked-out .j2 file in place before it is copied
+          sed -i -e 's,{{ model_path }},'"${MODEL_PATH}"',g' default-ios-device-farm-appium-test-spec.yml.j2
+          cp default-ios-device-farm-appium-test-spec.yml.j2 default-ios-device-farm-appium-test-spec.yml
+          # Print the rendered test spec to the job log for debugging
+          cat default-ios-device-farm-appium-test-spec.yml
+
+      - name: Upload the spec
+        uses: seemethere/upload-artifact-s3@v5
+        with:
+          s3-bucket: gha-artifacts
+          s3-prefix: |
+            ${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}
+          retention-days: 1
+          if-no-files-found: error
+          path: extension/benchmark/apple/Benchmark/default-ios-device-farm-appium-test-spec.yml
+
   export-models:
     name: export-models
     uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
@@ -173,10 +210,19 @@ jobs:
             DELEGATE_CONFIG="mps"
           fi
           PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
-            bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}"
+            bash .ci/scripts/test_llama.sh \
+              -model "${{ matrix.model }}" \
+              -build_tool "${BUILD_MODE}" \
+              -dtype "${DTYPE}" \
+              -mode "${DELEGATE_CONFIG}" \
+              -upload "${ARTIFACTS_DIR_NAME}"
         else
           PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
-            bash .ci/scripts/test_model.sh "${{ matrix.model }}" "${BUILD_MODE}" "${{ matrix.delegate }}" "${ARTIFACTS_DIR_NAME}"
+            bash .ci/scripts/test_model.sh \
+              "${{ matrix.model }}" \
+              "${BUILD_MODE}" \
+              "${{ matrix.delegate }}" \
+              "${ARTIFACTS_DIR_NAME}"
         fi
         echo "::endgroup::"
 
@@ -282,6 +328,7 @@ jobs:
     if: always()
     needs:
       - set-parameters
+      - prepare-test-specs
       - upload-benchmark-app
       - export-models
     permissions:
@@ -307,8 +354,7 @@ jobs:
       # Uploaded to S3 from the previous job
       ios-ipa-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.ipa
       ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.xctestrun.zip
-      test-spec: ${{ inputs.test_spec || 'https://ossci-ios.s3.amazonaws.com/executorch/default-ios-device-farm-appium-test-spec.yml' }}
-      extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip
+      test-spec: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/default-ios-device-farm-appium-test-spec.yml
 
   upload-benchmark-results:
     needs: