
Commit 4651d65

huydhn authored and facebook-github-bot committed
Upload Android benchmark results to OSS benchmark database (#5808)
Summary: This PR adds a job to upload Android benchmark results to the benchmark database. It transforms the `benchmark_results.json` file slightly to fit into the current schema. We are going to have a better schema soon (https://fburl.com/gdoc/ossgtvte), but landing this first unblocks the work on building the dashboard before the launch; updating the schema can be done later.

* The job processes what it finds, so if one model fails, the rest will still be uploaded.
* I will follow up with another PR for iOS later. There is no need to wait for the TPS metric there; we'll upload what is available first.

There are still some TODOs pending:

* pytorch/test-infra#5742
* pytorch-labs/pytorch-gha-infra#483

But the structure of the CI job is ready to review.

Pull Request resolved: #5808

Reviewed By: guangy10, kirklandsign

Differential Revision: D63869876

Pulled By: huydhn

fbshipit-source-id: f9bf85c9599fafbfcc300d47e4307230c46b16db
1 parent 2f9f94a commit 4651d65
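To make the schema mapping concrete, here is a minimal sketch of what one transformed record could look like. The field names are taken from the transform() function in the diff below; the model, metric, device, and GitHub run values are purely hypothetical.

# Hypothetical entry from benchmark_results.json produced on the device (values are illustrative only).
android_result = {
    "benchmarkModel": {"name": "llama2", "backend": "xnnpack", "quantization": "8da4w"},
    "metric": "token_per_sec",
    "actualValue": 12.3,
    "targetValue": 0,
    "deviceInfo": {"device": "Samsung", "os": "Android 13"},
}

# Roughly what transform() emits for that entry into the current oss_ci_benchmark_v2 schema.
# The GitHub metadata (repo, workflow_id, run_attempt, job_id) comes from the CI run itself,
# and the device name is overwritten with the Device Farm job name.
database_row = {
    "repo": "pytorch/executorch",
    "head_branch": "main",
    "workflow_id": 123456789,
    "run_attempt": 1,
    "job_id": 987654321,
    "name": "llama2 xnnpack",
    "dtype": "8da4w",
    "metric": "token_per_sec",
    "actual": 12.3,
    "target": 0,
    "device": "Samsung Galaxy S22 5G",
    "arch": "Android 13",
    "filename": "android-perf",
    "test_name": "ANDROID_APP",
    "runner": "Samsung Galaxy S22 5G",
}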

File tree: 2 files changed, +321 −0 lines

.github/scripts/extract_benchmark_results.py

Lines changed: 246 additions & 0 deletions
@@ -0,0 +1,246 @@
#!/usr/bin/env python3
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import json
import logging
import os
import re
import zipfile
from argparse import Action, ArgumentParser, Namespace
from io import BytesIO
from logging import info, warning
from typing import Any, List, Optional
from urllib import error, request


logging.basicConfig(level=logging.INFO)


BENCHMARK_RESULTS_FILENAME = "benchmark_results.json"
ARTIFACTS_FILENAME_REGEX = re.compile(r"(android|ios)-artifacts-(?P<job_id>\d+).json")


class ValidateArtifacts(Action):
    def __call__(
        self,
        parser: ArgumentParser,
        namespace: Namespace,
        values: Any,
        option_string: Optional[str] = None,
    ) -> None:
        if os.path.isfile(values) and values.endswith(".json"):
            setattr(namespace, self.dest, values)
            return

        parser.error(f"{values} is not a valid JSON file (*.json)")


class ValidateOutputDir(Action):
    def __call__(
        self,
        parser: ArgumentParser,
        namespace: Namespace,
        values: Any,
        option_string: Optional[str] = None,
    ) -> None:
        if os.path.isdir(values):
            setattr(namespace, self.dest, values)
            return

        parser.error(f"{values} is not a valid directory")


def parse_args() -> Any:
    parser = ArgumentParser("extract benchmark results from AWS Device Farm artifacts")
    parser.add_argument(
        "--artifacts",
        type=str,
        required=True,
        action=ValidateArtifacts,
        help="the list of artifacts from AWS in JSON format",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        action=ValidateOutputDir,
        help="the directory to keep the benchmark results",
    )
    parser.add_argument(
        "--repo",
        type=str,
        required=True,
        help="which GitHub repo this workflow run belongs to",
    )
    parser.add_argument(
        "--head-branch",
        type=str,
        required=True,
        help="the head branch that runs",
    )
    parser.add_argument(
        "--workflow-name",
        type=str,
        required=True,
        help="the name of the benchmark workflow",
    )
    parser.add_argument(
        "--workflow-run-id",
        type=int,
        required=True,
        help="the id of the benchmark workflow",
    )
    parser.add_argument(
        "--workflow-run-attempt",
        type=int,
        required=True,
        help="which retry of the workflow this is",
    )

    return parser.parse_args()


def extract_android_benchmark_results(
    job_name: str, artifact_type: str, artifact_s3_url: str
) -> List:
    """
    The benchmark results from Android are already stored in the CUSTOMER_ARTIFACT
    artifact, so we just need to download and extract them.

    Return the list of benchmark results.
    """
    if artifact_type != "CUSTOMER_ARTIFACT":
        return []

    try:
        with request.urlopen(artifact_s3_url) as data:
            with zipfile.ZipFile(BytesIO(data.read())) as customer_artifact:
                for name in customer_artifact.namelist():
                    if BENCHMARK_RESULTS_FILENAME in name:
                        return json.loads(customer_artifact.read(name))

    except error.HTTPError:
        warning(f"Failed to download {artifact_type} from {artifact_s3_url}")

    return []


def extract_job_id(artifacts_filename: str) -> int:
    """
    Extract the job id from the artifacts filename
    """
    m = ARTIFACTS_FILENAME_REGEX.match(os.path.basename(artifacts_filename))
    if not m:
        return 0
    return int(m.group("job_id"))


def transform(
    app_type: str,
    benchmark_results: List,
    repo: str,
    head_branch: str,
    workflow_name: str,
    workflow_run_id: int,
    workflow_run_attempt: int,
    job_name: str,
    job_id: int,
) -> List:
    """
    Transform the benchmark results into the format writable into the benchmark database
    """
    # Overwrite the device name here with the job name as it has more information about
    # the device, i.e. Samsung Galaxy S22 5G instead of just Samsung
    for r in benchmark_results:
        r["deviceInfo"]["device"] = job_name

    # TODO (huydhn): This is the current schema of the database oss_ci_benchmark_v2,
    # and I'm trying to fit ET benchmark results into it, which is kind of awkward.
    # However, the schema is going to be updated soon
    return [
        {
            # GH-info to identify where the benchmark is run
            "repo": repo,
            "head_branch": head_branch,
            "workflow_id": workflow_run_id,
            "run_attempt": workflow_run_attempt,
            "job_id": job_id,
            # The model
            "name": f"{r['benchmarkModel']['name']} {r['benchmarkModel'].get('backend', '')}".strip(),
            "dtype": (
                r["benchmarkModel"]["quantization"]
                if r["benchmarkModel"]["quantization"]
                else "unknown"
            ),
            # The metric value
            "metric": r["metric"],
            "actual": r["actualValue"],
            "target": r["targetValue"],
            # The device
            "device": r["deviceInfo"]["device"],
            "arch": r["deviceInfo"].get("os", ""),
            # Not used here, just set it to something unique
            "filename": workflow_name,
            "test_name": app_type,
            "runner": job_name,
        }
        for r in benchmark_results
    ]


def main() -> None:
    args = parse_args()

    # Across all devices
    all_benchmark_results = []

    with open(args.artifacts) as f:
        for artifact in json.load(f):
            app_type = artifact.get("app_type", "")
            # We expect this to be set to either ANDROID_APP or IOS_APP
            if not app_type or app_type not in ["ANDROID_APP", "IOS_APP"]:
                info(
                    f"App type {app_type} is not recognized in artifact {json.dumps(artifact)}"
                )
                continue

            job_name = artifact["job_name"]
            artifact_type = artifact["type"]
            artifact_s3_url = artifact["s3_url"]

            if app_type == "ANDROID_APP":
                benchmark_results = extract_android_benchmark_results(
                    job_name, artifact_type, artifact_s3_url
                )
                if benchmark_results:
                    benchmark_results = transform(
                        app_type,
                        benchmark_results,
                        args.repo,
                        args.head_branch,
                        args.workflow_name,
                        args.workflow_run_id,
                        args.workflow_run_attempt,
                        job_name,
                        extract_job_id(args.artifacts),
                    )
                    all_benchmark_results.extend(benchmark_results)

            if app_type == "IOS_APP":
                # TODO (huydhn): Implement the logic for iOS next
                pass

    if all_benchmark_results:
        output_file = os.path.basename(args.artifacts)
        with open(f"{args.output_dir}/{output_file}", "w") as f:
            json.dump(all_benchmark_results, f)


if __name__ == "__main__":
    main()
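For local testing, a minimal usage sketch of the script above. The artifacts file name, its contents, and the CLI values below are hypothetical; the field names app_type, job_name, type, and s3_url are the ones main() reads, and the flags mirror parse_args().

import json

# Hypothetical list of AWS Device Farm artifacts; the job id embedded in the
# file name is what extract_job_id() parses out.
artifacts = [
    {
        "app_type": "ANDROID_APP",
        "job_name": "Samsung Galaxy S22 5G",
        "type": "CUSTOMER_ARTIFACT",
        "s3_url": "https://example.s3.amazonaws.com/Customer%20Artifacts.zip",
    }
]
with open("android-artifacts-12345.json", "w") as f:
    json.dump(artifacts, f)

# The script can then be run on that file, mirroring the workflow step below:
#   python .github/scripts/extract_benchmark_results.py \
#     --artifacts android-artifacts-12345.json \
#     --output-dir benchmark-results \
#     --repo pytorch/executorch \
#     --head-branch main \
#     --workflow-name android-perf \
#     --workflow-run-id 123456789 \
#     --workflow-run-attempt 1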

.github/workflows/android-perf.yml

Lines changed: 75 additions & 0 deletions
@@ -234,3 +234,78 @@ jobs:
      test-spec: ${{ inputs.test_spec || 'https://ossci-android.s3.amazonaws.com/executorch/android-llm-device-farm-test-spec.yml' }}
      # Uploaded to S3 from the previous job
      extra-data: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifacts/${{ matrix.model }}_${{ matrix.delegate }}/model.zip

  upload-benchmark-results:
    needs:
      - benchmark-on-device
    if: always()
    runs-on: linux.2xlarge
    environment: upload-benchmark-results
    permissions:
      id-token: write
      contents: read
    steps:
      - uses: actions/checkout@v3
        with:
          submodules: false

      - name: Authenticate with AWS
        uses: aws-actions/configure-aws-credentials@v4
        with:
          role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_upload-benchmark-results
          # The max duration enforced by the server side
          role-duration-seconds: 18000
          aws-region: us-east-1

      - name: Setup conda
        uses: pytorch/test-infra/.github/actions/setup-miniconda@main
        with:
          python-version: '3.10'

      - name: Download the list of artifacts from S3
        env:
          ARTIFACTS_S3_DIR: s3://gha-artifacts/device_farm/${{ github.run_id }}/${{ github.run_attempt }}/artifacts/
        shell: bash
        run: |
          set -eux
          ${CONDA_RUN} python -mpip install awscli==1.32.18

          mkdir -p artifacts
          pushd artifacts
          ${CONDA_RUN} aws s3 sync "${ARTIFACTS_S3_DIR}" .
          popd

          ls -lah artifacts

      - name: Extract the benchmark results JSON
        shell: bash
        run: |
          set -eux

          mkdir -p benchmark-results

          for ARTIFACTS_BY_JOB in artifacts/*.json; do
            [ -f "${ARTIFACTS_BY_JOB}" ] || break
            echo "${ARTIFACTS_BY_JOB}"
            ${CONDA_RUN} python .github/scripts/extract_benchmark_results.py \
              --artifacts "${ARTIFACTS_BY_JOB}" \
              --output-dir benchmark-results \
              --repo ${{ github.repository }} \
              --head-branch ${{ github.head_ref || github.ref_name }} \
              --workflow-name ${{ github.workflow }} \
              --workflow-run-id ${{ github.run_id }} \
              --workflow-run-attempt ${{ github.run_attempt }}
          done

          ls -lah benchmark-results

          for BENCHMARK_RESULTS in benchmark-results/*.json; do
            cat "${BENCHMARK_RESULTS}"
            echo
          done

      - name: Upload the benchmark results
        uses: pytorch/test-infra/.github/actions/upload-benchmark-results@main
        with:
          benchmark-results-dir: 'benchmark-results'
          dry-run: false
