Skip to content

Commit 39454e2

Browse files
authored
Merge branch 'main' into split_pass
2 parents 3b530b5 + 5d151d0 commit 39454e2

File tree

712 files changed

+16983
-8975
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

712 files changed

+16983
-8975
lines changed

.ci/scripts/gather_test_models.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
# This one causes timeout on smaller runner, the root cause is unclear (T161064121)
2828
"dl3": "linux.12xlarge",
2929
"emformer_join": "linux.12xlarge",
30+
"emformer_predict": "linux.12xlarge",
3031
}
3132
}
3233

@@ -35,9 +36,11 @@
3536
# Just some examples on how custom timeout can be set
3637
"linux": {
3738
"mobilebert": 90,
39+
"emformer_predict": 360,
3840
},
3941
"macos": {
4042
"mobilebert": 90,
43+
"emformer_predict": 360,
4144
},
4245
}
4346

@@ -84,7 +87,11 @@ def model_should_run_on_event(model: str, event: str) -> bool:
8487
"""
8588
if event == "pull_request":
8689
return model in ["mv3", "vit"]
87-
return True
90+
elif event == "push":
91+
# 'emformer_predict' is running super slow. Only run it periodically
92+
return model not in ["emformer_predict"]
93+
else:
94+
return True
8895

8996

9097
def model_should_run_on_target_os(model: str, target_os: str) -> bool:

.ci/scripts/test_llama.sh

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ MODEL_NAME=$1 # stories110M.pt
1313
BUILD_TOOL=$2 # buck2 or cmake
1414
DTYPE=$3 # fp16 or fp32
1515
MODE=${4:-"xnnpack+custom"} # portable or xnnpack+custom or xnnpack+custom+qe
16+
UPLOAD_DIR=${5:-}
1617
if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
1718
echo "Expecting at least 4 positional arguments"
1819
echo "Usage: [...]"
@@ -126,6 +127,15 @@ cleanup_files() {
126127
rm params.json
127128
}
128129

130+
prepare_artifacts_upload() {
131+
if [ -n "$UPLOAD_DIR" ]; then
132+
echo "Preparing for uploading generated artifacts"
133+
zip -j model.zip "${EXPORTED_MODEL_NAME}" tokenizer.bin
134+
mkdir -p "${UPLOAD_DIR}"
135+
mv model.zip "${UPLOAD_DIR}"
136+
fi
137+
}
138+
129139
# Download and create artifacts.
130140
PARAMS="params.json"
131141
touch "${PARAMS}"
@@ -205,6 +215,7 @@ if [[ "${RESULT}" == "${EXPECTED_PREFIX}"* ]]; then
205215
echo "Actual result: ${RESULT}"
206216
echo "Success"
207217

218+
prepare_artifacts_upload
208219
cleanup_files
209220
else
210221
echo "Expected result prefix: ${EXPECTED_PREFIX}"

.github/workflows/android-perf.yml

Lines changed: 273 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,273 @@
1+
name: android-perf
2+
3+
on:
4+
schedule:
5+
- cron: 0 0 * * *
6+
# Note: GitHub has an upper limit of 10 inputs
7+
workflow_dispatch:
8+
inputs:
9+
models:
10+
description: Models to be benchmarked
11+
required: false
12+
type: string
13+
default: stories110M
14+
devices:
15+
description: Target devices to run benchmark
16+
required: false
17+
type: string
18+
default: samsung_galaxy_s2x
19+
delegates:
20+
description: Backend delegates
21+
required: false
22+
type: string
23+
default: xnnpack
24+
threadpool:
25+
description: Run with threadpool?
26+
required: false
27+
type: boolean
28+
default: false
29+
benchmark_configs:
30+
description: The list of configs used in the benchmark
31+
required: false
32+
type: string
33+
test_spec:
34+
description: The test spec to drive the test on AWS devices
35+
required: false
36+
type: string
37+
default: https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml
38+
workflow_call:
39+
inputs:
40+
models:
41+
description: Models to be benchmarked
42+
required: false
43+
type: string
44+
default: stories110M
45+
devices:
46+
description: Target devices to run benchmark
47+
required: false
48+
type: string
49+
default: samsung_galaxy_s2x
50+
delegates:
51+
description: Backend delegates
52+
required: false
53+
type: string
54+
default: xnnpack
55+
threadpool:
56+
description: Run with threadpool?
57+
required: false
58+
type: boolean
59+
default: false
60+
benchmark_configs:
61+
description: The list of configs used in the benchmark
62+
required: false
63+
type: string
64+
test_spec:
65+
description: The test spec to drive the test on AWS devices
66+
required: false
67+
type: string
68+
default: https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml
69+
70+
concurrency:
71+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
72+
cancel-in-progress: true
73+
74+
jobs:
75+
set-parameters:
76+
runs-on: linux.2xlarge
77+
outputs:
78+
models: ${{ steps.set-parameters.outputs.models }}
79+
devices: ${{ steps.set-parameters.outputs.devices }}
80+
delegates: ${{ steps.set-parameters.outputs.delegates }}
81+
steps:
82+
- name: Set parameters
83+
id: set-parameters
84+
shell: bash
85+
env:
86+
# Separate default values from the workflow dispatch. To ensure defaults are accessible
87+
# during scheduled runs and to provide flexibility for different defaults between
88+
# on-demand and periodic benchmarking.
89+
CRON_DEFAULT_MODELS: "stories110M"
90+
CRON_DEFAULT_DEVICES: "samsung_galaxy_s2x"
91+
CRON_DEFAULT_DELEGATES: "xnnpack"
92+
run: |
93+
set -ex
94+
MODELS="${{ inputs.models }}"
95+
if [ -z "$MODELS" ]; then
96+
MODELS="$CRON_DEFAULT_MODELS"
97+
fi
98+
DEVICES="${{ inputs.devices }}"
99+
if [ -z "$DEVICES" ]; then
100+
DEVICES="$CRON_DEFAULT_DEVICES"
101+
fi
102+
DELEGATES="${{ inputs.delegates }}"
103+
if [ -z "$DELEGATES" ]; then
104+
DELEGATES="$CRON_DEFAULT_DELEGATES"
105+
fi
106+
107+
# Mapping devices to their corresponding device-pool-arn
108+
declare -A DEVICE_POOL_ARNS
109+
DEVICE_POOL_ARNS[samsung_galaxy_s2x]="arn:aws:devicefarm:us-west-2:308535385114:devicepool:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/e59f866a-30aa-4aa1-87b7-4510e5820dfa"
110+
111+
# Resolve device names with their corresponding ARNs
112+
if [[ ! $(echo "$DEVICES" | jq empty 2>/dev/null) ]]; then
113+
DEVICES=$(echo "$DEVICES" | jq -Rc 'split(",")')
114+
fi
115+
declare -a MAPPED_ARNS=()
116+
for DEVICE in $(echo "$DEVICES" | jq -r '.[]'); do
117+
if [[ -z "${DEVICE_POOL_ARNS[$DEVICE]}" ]]; then
118+
echo "Error: No ARN found for device '$DEVICE'. Abort." >&2
119+
exit 1
120+
fi
121+
MAPPED_ARNS+=("${DEVICE_POOL_ARNS[$DEVICE]}")
122+
done
123+
124+
echo "models=$(echo $MODELS | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT
125+
MAPPED_ARNS_JSON=$(printf '%s\n' "${MAPPED_ARNS[@]}" | jq -R . | jq -s .)
126+
echo "devices=$(echo "$MAPPED_ARNS_JSON" | jq -c .)" >> $GITHUB_OUTPUT
127+
echo "delegates=$(echo $DELEGATES | jq -Rc 'split(",")')" >> $GITHUB_OUTPUT
128+
129+
export-models:
130+
name: export-models
131+
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
132+
needs: set-parameters
133+
strategy:
134+
matrix:
135+
model: ${{ fromJson(needs.set-parameters.outputs.models) }}
136+
delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
137+
fail-fast: false
138+
with:
139+
runner: linux.2xlarge
140+
docker-image: executorch-ubuntu-22.04-clang12
141+
submodules: 'true'
142+
timeout: 60
143+
upload-artifact: android-models
144+
script: |
145+
# The generic Linux job chooses to use base env, not the one setup by the image
146+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
147+
conda activate "${CONDA_ENV}"
148+
149+
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
150+
echo "Exporting model: ${{ matrix.model }}"
151+
export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded/${{ matrix.model }}_${{ matrix.delegate }}
152+
153+
# TODO(T197546696): Note that the following scripts/steps only work for llama. It's expected to fail for other models+delegates.
154+
# Install requirements for export_llama
155+
PYTHON_EXECUTABLE=python bash examples/models/llama2/install_requirements.sh
156+
# Test llama2
157+
PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama.sh "${{ matrix.model }}.pt" "cmake" "fp32" "xnnpack+custom+qe" "${ARTIFACTS_DIR_NAME}"\
158+
159+
# Upload models to S3. The artifacts are needed not only by the device farm but also TorchChat
160+
upload-models:
161+
needs: export-models
162+
runs-on: linux.2xlarge
163+
steps:
164+
- name: Download the models from GitHub
165+
uses: actions/download-artifact@v3
166+
with:
167+
# The name here needs to match the name of the upload-artifact parameter
168+
name: android-models
169+
path: ${{ runner.temp }}/artifacts/
170+
171+
- name: Verify the models
172+
shell: bash
173+
working-directory: ${{ runner.temp }}/artifacts/
174+
run: |
175+
ls -lah ./
176+
177+
- name: Upload the models to S3
178+
uses: seemethere/upload-artifact-s3@v5
179+
with:
180+
s3-bucket: gha-artifacts
181+
s3-prefix: |
182+
${{ github.repository }}/${{ github.run_id }}/artifact
183+
retention-days: 1
184+
if-no-files-found: ignore
185+
path: ${{ runner.temp }}/artifacts/
186+
187+
build-llm-demo:
188+
name: build-llm-demo
189+
uses: pytorch/test-infra/.github/workflows/linux_job.yml@main
190+
needs: set-parameters
191+
strategy:
192+
matrix:
193+
tokenizer: [bpe]
194+
with:
195+
runner: linux.2xlarge
196+
docker-image: executorch-ubuntu-22.04-clang12-android
197+
submodules: 'true'
198+
ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
199+
timeout: 90
200+
upload-artifact: android-apps
201+
script: |
202+
set -eux
203+
204+
# The generic Linux job chooses to use base env, not the one setup by the image
205+
CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
206+
conda activate "${CONDA_ENV}"
207+
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh cmake
208+
export ARTIFACTS_DIR_NAME=artifacts-to-be-uploaded
209+
210+
# TODO: This needs to be replaced with a generic loader .apk
211+
# Build LLM Demo for Android
212+
bash build/build_android_llm_demo.sh ${{ matrix.tokenizer }} ${ARTIFACTS_DIR_NAME}
213+
214+
# Upload artifacts to S3. The artifacts are needed not only by the device farm but also TorchChat
215+
upload-android-apps:
216+
needs: build-llm-demo
217+
runs-on: linux.2xlarge
218+
steps:
219+
- name: Download the apps from GitHub
220+
uses: actions/download-artifact@v3
221+
with:
222+
# The name here needs to match the name of the upload-artifact parameter
223+
name: android-apps
224+
path: ${{ runner.temp }}/artifacts/
225+
226+
- name: Verify the apps
227+
shell: bash
228+
working-directory: ${{ runner.temp }}/artifacts/
229+
run: |
230+
ls -lah ./
231+
232+
- name: Upload the apps to S3
233+
uses: seemethere/upload-artifact-s3@v5
234+
with:
235+
s3-bucket: gha-artifacts
236+
s3-prefix: |
237+
${{ github.repository }}/${{ github.run_id }}/artifact
238+
retention-days: 14
239+
if-no-files-found: ignore
240+
path: ${{ runner.temp }}/artifacts/
241+
242+
# Let's see how expensive this job is, we might want to tone it down by running it periodically
243+
benchmark-on-device:
244+
permissions:
245+
id-token: write
246+
contents: read
247+
uses: pytorch/test-infra/.github/workflows/mobile_job.yml@main
248+
needs:
249+
- set-parameters
250+
- upload-models
251+
- upload-android-apps
252+
strategy:
253+
matrix:
254+
model: ${{ fromJson(needs.set-parameters.outputs.models) }}
255+
delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
256+
device: ${{ fromJson(needs.set-parameters.outputs.devices) }}
257+
with:
258+
device-type: android
259+
runner: linux.2xlarge
260+
test-infra-ref: ''
261+
# This is the ARN of ExecuTorch project on AWS
262+
project-arn: arn:aws:devicefarm:us-west-2:308535385114:project:02a2cf0f-6d9b-45ee-ba1a-a086587469e6
263+
device-pool-arn: ${{ matrix.device }}
264+
# Uploaded to S3 from the previous job, the name of the app comes from the project itself.
265+
# Unlike models there are limited numbers of build flavor for apps, and the model controls whether it should build with bpe/tiktoken tokenizer.
266+
# It's okay to build all possible apps with all possible flavors in job "build-llm-demo". However, in this job, once a model is given, there is only
267+
# one app+flavor that could load and run the model.
268+
# TODO: Hard code llm_demo_bpe for now in this job.
269+
android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_bpe/app-debug.apk
270+
android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_bpe/app-debug-androidTest.apk
271+
test-spec: ${{ inputs.test_spec }}
272+
# Uploaded to S3 from the previous job
273+
extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/${{ matrix.model }}_${{ matrix.delegate }}/model.zip

.github/workflows/android.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -170,8 +170,7 @@ jobs:
170170
# Uploaded to S3 from the previous job, the name of the app comes from the project itself
171171
android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_${{ matrix.tokenizer }}/app-debug.apk
172172
android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/llm_demo_${{ matrix.tokenizer }}/app-debug-androidTest.apk
173-
# The test spec can be downloaded from https://ossci-assets.s3.amazonaws.com/android-llama2-device-farm-test-spec.yml
174-
test-spec: arn:aws:devicefarm:us-west-2:308535385114:upload:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/abd86868-fa63-467e-a5c7-218194665a77
173+
test-spec: https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml
175174
# Among the input, this is the biggest file, so it is cached on AWS to make the test faster. Note that the file is deleted by AWS after 30
176175
# days and the job will automatically re-upload the file when that happens.
177176
extra-data: https://ossci-assets.s3.amazonaws.com/executorch-android-llama2-7b-0717.zip

.github/workflows/pull.yml

Lines changed: 4 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -205,27 +205,13 @@ jobs:
205205
206206
PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"
207207
208+
# install pybind
209+
bash install_requirements.sh --pybind xnnpack
210+
208211
# install Llava requirements
209212
bash examples/models/llama2/install_requirements.sh
210213
bash examples/models/llava/install_requirements.sh
211214
212-
# run export_llava.sh
213-
python examples/models/llava/export_llava.py --use-sdpa-with-kv-cache --pte-name llava_custom_sdpa.pte
214-
215-
# verify file exists
216-
if [ ! -f "llava_custom_sdpa.pte" ]; then
217-
echo "llava_custom_sdpa.pte not found!"
218-
exit 1
219-
fi
220-
221-
python examples/models/llava/export_llava.py --no-use-sdpa-with-kv-cache --pte-name llava.pte
222-
223-
# verify file exists
224-
if [ ! -f "llava.pte" ]; then
225-
echo "llava.pte not found!"
226-
exit 1
227-
fi
228-
229215
# run python unittest
230216
python -m unittest examples.models.llava.test.test_llava
231217
@@ -337,7 +323,7 @@ jobs:
337323
size=${arr[4]}
338324
# threshold=48120 on devserver with gcc11.4
339325
# todo(lfq): update once binary size is below 50kb.
340-
threshold="51768"
326+
threshold="51784"
341327
if [[ "$size" -le "$threshold" ]]; then
342328
echo "Success $size <= $threshold"
343329
else

0 commit comments

Comments
 (0)