Commit 3c5118a
Merge remote-tracking branch 'origin/main' into mtk-6
2 parents: b06fd66 + aa852cc

501 files changed: +7553 −6038 lines

.ci/scripts/test_llama.sh
Lines changed: 1 addition & 1 deletion

@@ -171,7 +171,7 @@ else
 fi

 # Check dtype.
-EXPORTED_MODEL_NAME="llama2"
+EXPORTED_MODEL_NAME="tinyllama_${MODE}_${DTYPE}"
 if [[ "${DTYPE}" == "fp16" ]]; then
   EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}_h"
 elif [[ "${DTYPE}" == "bf16" ]]; then

.ci/scripts/test_model.sh
Lines changed: 25 additions & 8 deletions

@@ -155,30 +155,24 @@ test_model_with_qnn() {

   if [[ "${MODEL_NAME}" == "dl3" ]]; then
     EXPORT_SCRIPT=deeplab_v3
-    EXPORTED_MODEL_NAME=dlv3_qnn.pte
   elif [[ "${MODEL_NAME}" == "mv3" ]]; then
     EXPORT_SCRIPT=mobilenet_v3
-    EXPORTED_MODEL_NAME=mv3_qnn.pte
   elif [[ "${MODEL_NAME}" == "mv2" ]]; then
     EXPORT_SCRIPT=mobilenet_v2
-    EXPORTED_MODEL_NAME=mv2_qnn.pte
   elif [[ "${MODEL_NAME}" == "ic4" ]]; then
     EXPORT_SCRIPT=inception_v4
-    EXPORTED_MODEL_NAME=ic4_qnn.pte
   elif [[ "${MODEL_NAME}" == "ic3" ]]; then
     EXPORT_SCRIPT=inception_v3
-    EXPORTED_MODEL_NAME=ic3_qnn.pte
   elif [[ "${MODEL_NAME}" == "vit" ]]; then
     EXPORT_SCRIPT=torchvision_vit
-    EXPORTED_MODEL_NAME=vit_qnn.pte
   fi

   # Use SM8450 for S22, SM8550 for S23, and SM8560 for S24
   # TODO(guangyang): Make QNN chipset matches the target device
   QNN_CHIPSET=SM8450

   "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only
-  EXPORTED_MODEL=./${EXPORT_SCRIPT}/${EXPORTED_MODEL_NAME}
+  EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "${MODEL_NAME}*.pte" -print -quit)
 }

 test_model_with_coreml() {
@@ -187,7 +181,24 @@ test_model_with_coreml() {
     exit 1
   fi

-  "${PYTHON_EXECUTABLE}" -m examples.apple.coreml.scripts.export --model_name="${MODEL_NAME}"
+  DTYPE=float16
+
+  "${PYTHON_EXECUTABLE}" -m examples.apple.coreml.scripts.export --model_name="${MODEL_NAME}" --compute_precision "${DTYPE}"
+  EXPORTED_MODEL=$(find "." -type f -name "${MODEL_NAME}*.pte" -print -quit)
+  # TODO:
+  if [ -n "$EXPORTED_MODEL" ]; then
+    EXPORTED_MODEL_WITH_DTYPE="${EXPORTED_MODEL%.pte}_${DTYPE}.pte"
+    mv "$EXPORTED_MODEL" "$EXPORTED_MODEL_WITH_DTYPE"
+    EXPORTED_MODEL="$EXPORTED_MODEL_WITH_DTYPE"
+    echo "Renamed file path: $EXPORTED_MODEL"
+  else
+    echo "No .pte file found"
+    exit 1
+  fi
+}
+
+test_model_with_mps() {
+  "${PYTHON_EXECUTABLE}" -m examples.apple.mps.scripts.mps_example --model_name="${MODEL_NAME}" --use_fp16
   EXPORTED_MODEL=$(find "." -type f -name "${MODEL_NAME}*.pte" -print -quit)
 }
@@ -206,6 +217,12 @@ elif [[ "${BACKEND}" == "coreml" ]]; then
   if [[ $? -eq 0 ]]; then
     prepare_artifacts_upload
   fi
+elif [[ "${BACKEND}" == "mps" ]]; then
+  echo "Testing ${MODEL_NAME} with mps..."
+  test_model_with_mps
+  if [[ $? -eq 0 ]]; then
+    prepare_artifacts_upload
+  fi
 elif [[ "${BACKEND}" == "xnnpack" ]]; then
   echo "Testing ${MODEL_NAME} with xnnpack..."
   WITH_QUANTIZATION=true
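Instead of hard-coding one artifact name per backend, the script now picks up the first .pte file whose name starts with the model name. A rough Python equivalent of the `find ... -print -quit` calls (a sketch for illustration only; `find_first_pte` is not part of the commit):

from pathlib import Path
from typing import Optional

def find_first_pte(root: str, model_name: str) -> Optional[str]:
    """Return the first .pte file under `root` whose name starts with `model_name`."""
    for path in Path(root).rglob(f"{model_name}*.pte"):
        if path.is_file():
            return str(path)
    return None

# e.g. find_first_pte("./deeplab_v3", "dl3") might return "deeplab_v3/dl3_qnn.pte"
# (hypothetical layout); None means no exported artifact was found.
print(find_first_pte(".", "mv3"))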

.github/scripts/extract_benchmark_results.py
Lines changed: 151 additions & 16 deletions

@@ -14,7 +14,7 @@
 from argparse import Action, ArgumentParser, Namespace
 from io import BytesIO
 from logging import info, warning
-from typing import Any, List, Optional
+from typing import Any, Dict, List, Optional
 from urllib import error, request

@@ -24,6 +24,15 @@
 BENCHMARK_RESULTS_FILENAME = "benchmark_results.json"
 ARTIFACTS_FILENAME_REGEX = re.compile(r"(android|ios)-artifacts-(?P<job_id>\d+).json")

+# iOS-related regexes and variables
+IOS_TEST_SPEC_REGEX = re.compile(
+    r"Test Case\s+'-\[(?P<test_class>\w+)\s+(?P<test_name>\w+)\]'\s+measured\s+\[(?P<metric>.+)\]\s+average:\s+(?P<value>[\d\.]+),"
+)
+IOS_TEST_NAME_REGEX = re.compile(
+    r"test_(?P<method>forward|load|generate)_(?P<model_name>\w+)_pte.*iOS_(?P<ios_ver>\w+)_iPhone(?P<iphone_ver>\w+)"
+)
+IOS_MODEL_NAME_REGEX = re.compile(r"(?P<model>[^_]+)_(?P<backend>\w+)_(?P<dtype>\w+)")
+

 class ValidateArtifacts(Action):
     def __call__(
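These three patterns drive the iOS parsing added below. For a sense of what IOS_TEST_SPEC_REGEX matches, a small self-contained check (the console line is made up for illustration, not a real xcresult record):

import re

IOS_TEST_SPEC_REGEX = re.compile(
    r"Test Case\s+'-\[(?P<test_class>\w+)\s+(?P<test_name>\w+)\]'\s+measured\s+\[(?P<metric>.+)\]\s+average:\s+(?P<value>[\d\.]+),"
)

# A made-up line in the shape the regex expects
line = (
    "Test Case '-[Benchmark test_load_llama2_pte_iOS_17_2_1_iPhone15_4]' "
    "measured [Clock Monotonic Time, s] average: 0.231, "
    "relative standard deviation: 1.2%"
)

m = IOS_TEST_SPEC_REGEX.search(line)
assert m is not None
print(m.group("test_name"))  # test_load_llama2_pte_iOS_17_2_1_iPhone15_4
print(m.group("metric"))     # Clock Monotonic Time, s
print(m.group("value"))      # 0.231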
@@ -135,6 +144,130 @@ def extract_android_benchmark_results(
     return []


+def initialize_ios_metadata(test_name: str) -> Dict[str, any]:
+    """
+    Extract the benchmark metadata from the test name, for example:
+        test_forward_llama2_pte_iOS_17_2_1_iPhone15_4
+        test_load_resnet50_xnnpack_q8_pte_iOS_17_2_1_iPhone15_4
+    """
+    m = IOS_TEST_NAME_REGEX.match(test_name)
+    if not m:
+        return {}
+
+    method = m.group("method")
+    model_name = m.group("model_name")
+    ios_ver = m.group("ios_ver").replace("_", ".")
+    iphone_ver = m.group("iphone_ver").replace("_", ".")
+
+    # NB: This looks brittle, but unless we can return iOS benchmark results in JSON
+    # format by the test, the mapping is needed to match with Android test
+    if method == "load":
+        metric = "model_load_time(ms)"
+    elif method == "forward":
+        metric = (
+            "generate_time(ms)"
+            if "llama" in model_name
+            else "avg_inference_latency(ms)"
+        )
+    elif method == "generate":
+        metric = "token_per_sec"
+
+    backend = ""
+    quantization = "unknown"
+
+    m = IOS_MODEL_NAME_REGEX.match(model_name)
+    if m:
+        backend = m.group("backend")
+        quantization = m.group("dtype")
+        model_name = m.group("model")
+
+    return {
+        "benchmarkModel": {
+            "backend": backend,
+            "quantization": quantization,
+            "name": model_name,
+        },
+        "deviceInfo": {
+            "arch": f"iPhone {iphone_ver}",
+            "device": f"iPhone {iphone_ver}",
+            "os": f"iOS {ios_ver}",
+            "availMem": 0,
+            "totalMem": 0,
+        },
+        "metric": metric,
+        # These fields will be populated later by extract_ios_metric
+        "actualValue": 0,
+        "targetValue": 0,
+    }
+
+
+def extract_ios_metric(
+    benchmark_result: Dict[str, Any],
+    test_name: str,
+    metric_name: str,
+    metric_value: float,
+) -> Dict[str, Any]:
+    """
+    Map the metric name from iOS xcresult to the benchmark result
+    """
+    if metric_name == "Clock Monotonic Time, s":
+        # The benchmark value is in ms
+        benchmark_result["actualValue"] = metric_value * 1000
+    elif metric_name == "Tokens Per Second, t/s":
+        benchmark_result["actualValue"] = metric_value
+
+    return benchmark_result
+
+
+def extract_ios_benchmark_results(
+    job_name: str, artifact_type: str, artifact_s3_url: str
+) -> List:
+    """
+    The benchmark results from iOS are currently from xcresult, which could either
+    be parsed from CUSTOMER_ARTIFACT or get from the test spec output. The latter
+    is probably easier to process
+    """
+    if artifact_type != "TESTSPEC_OUTPUT":
+        return []
+
+    try:
+        benchmark_results = []
+
+        with request.urlopen(artifact_s3_url) as data:
+            current_test_name = ""
+            current_record = {}
+
+            for line in data.read().decode("utf8").splitlines():
+                s = IOS_TEST_SPEC_REGEX.search(line)
+                if not s:
+                    continue
+
+                test_class = s.group("test_class")
+                test_name = s.group("test_name")
+                metric_name = s.group("metric")
+                metric_value = float(s.group("value"))
+
+                if test_name != current_test_name:
+                    if current_record:
+                        # Save the benchmark result in the same format used by Android
+                        benchmark_results.append(current_record.copy())
+
+                    current_test_name = test_name
+                    current_record = initialize_ios_metadata(current_test_name)
+
+                current_record = extract_ios_metric(
+                    current_record, test_name, metric_name, metric_value
+                )
+
+            benchmark_results.append(current_record.copy())
+
+        return benchmark_results
+
+    except error.HTTPError:
+        warning(f"Fail to {artifact_type} {artifact_s3_url}")
+        return []
+
+
 def extract_job_id(artifacts_filename: str) -> int:
     """
     Extract the job id from the artifacts filename
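A hypothetical usage sketch of the two helpers above (assuming the module is importable as below; the expected values follow from the regexes and the metric mapping):

# Assumes the module can be imported like this (path hypothetical):
from extract_benchmark_results import extract_ios_metric, initialize_ios_metadata

# A test name in the shape IOS_TEST_NAME_REGEX expects
record = initialize_ios_metadata("test_load_resnet50_xnnpack_q8_pte_iOS_17_2_1_iPhone15_4")
# Per the code above:
#   record["benchmarkModel"] == {"backend": "xnnpack", "quantization": "q8", "name": "resnet50"}
#   record["deviceInfo"]["os"] == "iOS 17.2.1"
#   record["metric"] == "model_load_time(ms)"

# "Clock Monotonic Time" arrives in seconds and is stored in milliseconds:
record = extract_ios_metric(
    record,
    test_name="test_load_resnet50_xnnpack_q8_pte_iOS_17_2_1_iPhone15_4",
    metric_name="Clock Monotonic Time, s",
    metric_value=0.231,
)
print(record["actualValue"])  # ~231.0 (ms)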
@@ -222,23 +355,25 @@ def main() -> None:
             benchmark_results = extract_android_benchmark_results(
                 job_name, artifact_type, artifact_s3_url
             )
-            if benchmark_results:
-                benchmark_results = transform(
-                    app_type,
-                    benchmark_results,
-                    args.repo,
-                    args.head_branch,
-                    args.workflow_name,
-                    args.workflow_run_id,
-                    args.workflow_run_attempt,
-                    job_name,
-                    extract_job_id(args.artifacts),
-                )
-                all_benchmark_results.extend(benchmark_results)

         if app_type == "IOS_APP":
-            # TODO (huydhn): Implement the logic for iOS next
-            pass
+            benchmark_results = extract_ios_benchmark_results(
+                job_name, artifact_type, artifact_s3_url
+            )
+
+        if benchmark_results:
+            benchmark_results = transform(
+                app_type,
+                benchmark_results,
+                args.repo,
+                args.head_branch,
+                args.workflow_name,
+                args.workflow_run_id,
+                args.workflow_run_attempt,
+                job_name,
+                extract_job_id(args.artifacts),
+            )
+            all_benchmark_results.extend(benchmark_results)

     if all_benchmark_results:
         output_file = os.path.basename(args.artifacts)

.github/workflows/android-perf.yml
Lines changed: 2 additions & 1 deletion

@@ -135,7 +135,7 @@ jobs:
         delegate: ${{ fromJson(needs.set-parameters.outputs.delegates) }}
       fail-fast: false
     with:
-      runner: linux.2xlarge
+      runner: linux.4xlarge
      docker-image: executorch-ubuntu-22.04-clang12-android
      submodules: 'true'
      timeout: 60
@@ -205,6 +205,7 @@ jobs:

   # Let's see how expensive this job is, we might want to tone it down by running it periodically
   benchmark-on-device:
+    if: always()
     permissions:
       id-token: write
       contents: read

.github/workflows/apple-perf.yml
Lines changed: 79 additions & 1 deletion

@@ -76,7 +76,7 @@ jobs:
           # on-demand and periodic benchmarking.
           CRON_DEFAULT_MODELS: "stories110M,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l"
           CRON_DEFAULT_DEVICES: "apple_iphone_15"
-          CRON_DEFAULT_DELEGATES: "xnnpack,coreml"
+          CRON_DEFAULT_DELEGATES: "xnnpack,coreml,mps"
         run: |
           set -ex
           MODELS="${{ inputs.models }}"
@@ -169,6 +169,8 @@ jobs:
             DELEGATE_CONFIG="xnnpack+custom+qe"
           elif [[ ${{ matrix.delegate }} == "coreml" ]]; then
             DELEGATE_CONFIG="coreml"
+          elif [[ ${{ matrix.delegate }} == "mps" ]]; then
+            DELEGATE_CONFIG="mps"
           fi
           PYTHON_EXECUTABLE=python ${CONDA_RUN} --no-capture-output \
             bash .ci/scripts/test_llama.sh "${{ matrix.model }}" "${BUILD_MODE}" "${DTYPE}" "${DELEGATE_CONFIG}" "${ARTIFACTS_DIR_NAME}"
@@ -277,6 +279,7 @@ jobs:
           path: ${{ runner.temp }}/artifacts/

   benchmark-on-device:
+    if: always()
     needs:
       - set-parameters
       - upload-benchmark-app
@@ -306,3 +309,78 @@ jobs:
       ios-xctestrun-zip: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/Benchmark.xctestrun.zip
       test-spec: ${{ inputs.test_spec || 'https://ossci-ios.s3.amazonaws.com/executorch/default-ios-device-farm-appium-test-spec.yml' }}
       extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip
+
+  upload-benchmark-results:
+    needs:
+      - benchmark-on-device
+    if: always()
+    runs-on: linux.2xlarge
+    environment: upload-benchmark-results
+    permissions:
+      id-token: write
+      contents: read
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          submodules: false
+
+      - name: Authenticate with AWS
+        uses: aws-actions/configure-aws-credentials@v4
+        with:
+          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
+          # The max duration enforced by the server side
+          role-duration-seconds: 18000
+          aws-region: us-east-1
+
+      - name: Setup conda
+        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
+        with:
+          python-version: '3.10'
+
+      - name: Download the list of artifacts from S3
+        env:
+          ARTIFACTS_S3_DIR: s3://gha-artifacts/device_farm/${{ github.run_id }}/${{ github.run_attempt }}/artifacts/
+        shell: bash
+        run: |
+          set -eux
+          ${CONDA_RUN} python -mpip install awscli==1.32.18
+
+          mkdir -p artifacts
+          pushd artifacts
+          ${CONDA_RUN} aws s3 sync "${ARTIFACTS_S3_DIR}" .
+          popd
+
+          ls -lah artifacts
+
+      - name: Extract the benchmark results JSON
+        shell: bash
+        run: |
+          set -eux
+
+          mkdir -p benchmark-results
+
+          for ARTIFACTS_BY_JOB in artifacts/*.json; do
+            [ -f "${ARTIFACTS_BY_JOB}" ] || break
+            echo "${ARTIFACTS_BY_JOB}"
+            ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \
+              --artifacts "${ARTIFACTS_BY_JOB}" \
+              --output-dir benchmark-results \
+              --repo ${{ github.repository }} \
+              --head-branch ${{ github.head_ref || github.ref_name }} \
+              --workflow-name "${{ github.workflow }}" \
+              --workflow-run-id ${{ github.run_id }} \
+              --workflow-run-attempt ${{ github.run_attempt }}
+          done
+
+          ls -lah benchmark-results
+
+          for BENCHMARK_RESULTS in benchmark-results/*.json; do
+            cat "${BENCHMARK_RESULTS}"
+            echo
+          done
+
+      - name: Upload the benchmark results
+        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
+        with:
+          benchmark-results-dir: 'benchmark-results'
+          dry-run: false
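For local debugging, the "Extract the benchmark results JSON" step above boils down to something like the following Python sketch (the paths and the --repo/--head-branch/--workflow-* values are placeholders; the CLI flags are the ones the workflow passes):

import pathlib
import subprocess

output_dir = pathlib.Path("benchmark-results")
output_dir.mkdir(exist_ok=True)

# One JSON file per device-farm job, as synced from S3 into ./artifacts
for artifacts_by_job in sorted(pathlib.Path("artifacts").glob("*.json")):
    subprocess.run(
        [
            "python", ".github/scripts/extract_benchmark_results.py",
            "--artifacts", str(artifacts_by_job),
            "--output-dir", str(output_dir),
            "--repo", "pytorch/executorch",  # placeholder
            "--head-branch", "main",         # placeholder
            "--workflow-name", "apple-perf", # placeholder
            "--workflow-run-id", "1",        # placeholder
            "--workflow-run-attempt", "1",   # placeholder
        ],
        check=True,
    )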
