Skip to content

Commit 0d1644f

Browse files
huydhn authored and facebook-github-bot committed
Define generic Android benchmark metric structure (#5332)
Summary: To be able to display the benchmark results, we need the following information: 1. About the model * Name, i.e. `mv2` * The backend it uses, i.e. `xnnpack` * The quantization (dtype) applied, i.e. `q8` 2. About the metric * Name, i.e. `token_per_sec`. Note that this needs to be flexible to cover future metrics * Value * An optional target (so that we can highlight regression if it happens) 3. More metadata * The device name, i.e. `samsung` * The device model and its Android version * More can be included here I codified these fields in a new `BenchmarkMetric` class, so that the benchmark results can be expressed as a list of different metrics in the result JSON. NB: Atm, the information about the model is extracted from its name, i.e. `NAME_BACKEND_QUANTIZATION.pte`, but it's better to get it from the file itself instead. Achieving this needs a bit more research. ### Testing https://github.com/pytorch/executorch/actions/runs/10843580072 * The JSON for `llama2`: ``` [ { "actual": 247, "arch": "SM-S901U1 / 12", "benchmarkModel": { "backend": "", "name": "llama2", "quantization": "" }, "device": "samsung", "metric": "model_load_time(ms)", "target": 0 }, { "actual": 367, "arch": "SM-S901U1 / 12", "benchmarkModel": { "backend": "", "name": "llama2", "quantization": "" }, "device": "samsung", "metric": "generate_time(ms)", "target": 0 }, { "actual": 342.69662, "arch": "SM-S901U1 / 12", "benchmarkModel": { "backend": "", "name": "llama2", "quantization": "" }, "device": "samsung", "metric": "token_per_sec", "target": 0 } ] ``` * The JSON for `mv2_xnnpack_q8`. I keep the average latency here as the final number to show later on the dashboard. 
``` [ { "actual": 91.1, "arch": "SM-S908U1 / 12", "benchmarkModel": { "backend": "xnnpack", "name": "mv2", "quantization": "q8" }, "device": "samsung", "metric": "avg_inference_latency(ms)", "target": 0 } ] ``` Pull Request resolved: #5332 Reviewed By: guangy10, kirklandsign Differential Revision: D62624549 Pulled By: huydhn fbshipit-source-id: 5c1a605c1012396ff904c148e9a99967c83321f6
1 parent 6d1a573 commit 0d1644f

File tree

4 files changed

+254
-39
lines changed

4 files changed

+254
-39
lines changed

examples/demo-apps/android/LlamaDemo/app/src/main/java/com/example/executorchllamademo/LlmBenchmarkRunner.java

Lines changed: 107 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,9 @@
99
package com.example.executorchllamademo;
1010

1111
import android.app.Activity;
12+
import android.app.ActivityManager;
1213
import android.content.Intent;
14+
import android.os.Build;
1315
import android.os.Bundle;
1416
import android.util.Log;
1517
import android.widget.TextView;
@@ -18,7 +20,11 @@
1820
import java.io.File;
1921
import java.io.FileWriter;
2022
import java.io.IOException;
23+
import java.util.ArrayList;
2124
import java.util.Arrays;
25+
import java.util.List;
26+
import java.util.regex.Matcher;
27+
import java.util.regex.Pattern;
2228

2329
public class LlmBenchmarkRunner extends Activity implements ModelRunnerCallback {
2430
ModelRunner mModelRunner;
@@ -50,19 +56,21 @@ protected void onCreate(Bundle savedInstanceState) {
5056
}
5157

5258
mStatsDump = new StatsDump();
59+
mStatsDump.modelName = model.getName().replace(".pte", "");
5360
mModelRunner = new ModelRunner(model.getPath(), tokenizerPath, temperature, this);
54-
mStatsDump.loadStart = System.currentTimeMillis();
61+
mStatsDump.loadStart = System.nanoTime();
5562
}
5663

5764
@Override
5865
public void onModelLoaded(int status) {
59-
mStatsDump.loadEnd = System.currentTimeMillis();
66+
mStatsDump.loadEnd = System.nanoTime();
67+
mStatsDump.loadStatus = status;
6068
if (status != 0) {
6169
Log.e("LlmBenchmarkRunner", "Loaded failed: " + status);
6270
onGenerationStopped();
6371
return;
6472
}
65-
mStatsDump.generateStart = System.currentTimeMillis();
73+
mStatsDump.generateStart = System.nanoTime();
6674
mModelRunner.generate(mPrompt);
6775
}
6876

@@ -81,36 +89,122 @@ public void onStats(String stats) {
8189

8290
@Override
8391
public void onGenerationStopped() {
84-
mStatsDump.generateEnd = System.currentTimeMillis();
92+
mStatsDump.generateEnd = System.nanoTime();
8593
runOnUiThread(
8694
() -> {
8795
mTextView.append(mStatsDump.toString());
8896
});
8997

90-
// TODO (huydhn): Remove txt files here once the JSON format is ready
91-
try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) {
92-
writer.write(mStatsDump.toString());
93-
} catch (IOException e) {
94-
e.printStackTrace();
95-
}
98+
final BenchmarkMetric.BenchmarkModel benchmarkModel =
99+
BenchmarkMetric.extractBackendAndQuantization(mStatsDump.modelName);
100+
final List<BenchmarkMetric> results = new ArrayList<>();
101+
// The list of metrics we have atm includes:
102+
// Load status
103+
results.add(new BenchmarkMetric(benchmarkModel, "load_status", mStatsDump.loadStatus, 0));
104+
// Model load time
105+
results.add(
106+
new BenchmarkMetric(
107+
benchmarkModel,
108+
"model_load_time(ns)",
109+
mStatsDump.loadEnd - mStatsDump.loadStart,
110+
0.0f));
111+
// LLM generate time
112+
results.add(
113+
new BenchmarkMetric(
114+
benchmarkModel,
115+
"generate_time(ns)",
116+
mStatsDump.generateEnd - mStatsDump.generateStart,
117+
0.0f));
118+
// Token per second
119+
results.add(
120+
new BenchmarkMetric(benchmarkModel, "token_per_sec", extractTPS(mStatsDump.tokens), 0.0f));
96121

97-
// TODO (huydhn): Figure out on what the final JSON results looks like, we need something
98-
// with the same number of fields as https://github.com/pytorch/pytorch/pull/135042
99122
try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) {
100123
Gson gson = new Gson();
101-
writer.write(gson.toJson(mStatsDump));
124+
writer.write(gson.toJson(results));
102125
} catch (IOException e) {
103126
e.printStackTrace();
104127
}
105128
}
129+
130+
private double extractTPS(final String tokens) {
131+
final Matcher m = Pattern.compile("\\d+\\.?\\d*").matcher(tokens);
132+
if (m.find()) {
133+
return Double.parseDouble(m.group());
134+
} else {
135+
return 0.0f;
136+
}
137+
}
138+
}
139+
140+
class BenchmarkMetric {
141+
public static class BenchmarkModel {
142+
// The model name, i.e. stories110M
143+
String name;
144+
String backend;
145+
String quantization;
146+
147+
public BenchmarkModel(final String name, final String backend, final String quantization) {
148+
this.name = name;
149+
this.backend = backend;
150+
this.quantization = quantization;
151+
}
152+
}
153+
154+
BenchmarkModel benchmarkModel;
155+
156+
// The metric name, i.e. TPS
157+
String metric;
158+
159+
// The actual value and the option target value
160+
double actualValue;
161+
double targetValue;
162+
163+
public static class DeviceInfo {
164+
// Let's see which information we want to include here
165+
final String device = Build.BRAND;
166+
// The phone model and Android release version
167+
final String arch = Build.MODEL;
168+
final String os = "Android " + Build.VERSION.RELEASE;
169+
final long totalMem = new ActivityManager.MemoryInfo().totalMem;
170+
final long availMem = new ActivityManager.MemoryInfo().availMem;
171+
}
172+
173+
DeviceInfo deviceInfo = new DeviceInfo();
174+
175+
public BenchmarkMetric(
176+
final BenchmarkModel benchmarkModel,
177+
final String metric,
178+
final double actualValue,
179+
final double targetValue) {
180+
this.benchmarkModel = benchmarkModel;
181+
this.metric = metric;
182+
this.actualValue = actualValue;
183+
this.targetValue = targetValue;
184+
}
185+
186+
// TODO (huydhn): Figure out a way to extract the backend and quantization information from
187+
// the .pte model itself instead of parsing its name
188+
public static BenchmarkMetric.BenchmarkModel extractBackendAndQuantization(final String model) {
189+
final Matcher m =
190+
Pattern.compile("(?<name>\\w+)_(?<backend>\\w+)_(?<quantization>\\w+)").matcher(model);
191+
if (m.matches()) {
192+
return new BenchmarkMetric.BenchmarkModel(
193+
m.group("name"), m.group("backend"), m.group("quantization"));
194+
} else {
195+
return new BenchmarkMetric.BenchmarkModel(model, "", "");
196+
}
197+
}
106198
}
107199

108200
class StatsDump {
201+
int loadStatus;
109202
long loadStart;
110203
long loadEnd;
111204
long generateStart;
112205
long generateEnd;
113206
String tokens;
207+
String modelName;
114208

115209
@NonNull
116210
@Override

extension/android/benchmark/app/src/main/java/org/pytorch/minibench/BenchmarkActivity.java

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -47,34 +47,49 @@ protected void onCreate(Bundle savedInstanceState) {
4747
// TODO: Format the string with a parsable format
4848
Stats stats = new Stats();
4949

50+
// Record the time it takes to load the model and the forward method
51+
stats.loadStart = System.nanoTime();
5052
Module module = Module.load(model.getPath());
53+
stats.errorCode = module.loadMethod("forward");
54+
stats.loadEnd = System.nanoTime();
55+
5156
for (int i = 0; i < numIter; i++) {
52-
long start = System.currentTimeMillis();
57+
long start = System.nanoTime();
5358
module.forward();
54-
long forwardMs = System.currentTimeMillis() - start;
59+
long forwardMs = System.nanoTime() - start;
5560
stats.latency.add(forwardMs);
5661
}
57-
stats.errorCode = module.loadMethod("forward");
5862

59-
// TODO (huydhn): Remove txt files here once the JSON format is ready
60-
try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.txt")) {
61-
writer.write(stats.toString());
62-
} catch (IOException e) {
63-
e.printStackTrace();
64-
}
63+
final BenchmarkMetric.BenchmarkModel benchmarkModel =
64+
BenchmarkMetric.extractBackendAndQuantization(model.getName().replace(".pte", ""));
65+
final List<BenchmarkMetric> results = new ArrayList<>();
66+
// The list of metrics we have atm includes:
67+
// Avg inference latency after N iterations
68+
results.add(
69+
new BenchmarkMetric(
70+
benchmarkModel,
71+
"avg_inference_latency(ns)",
72+
stats.latency.stream().mapToDouble(l -> l).average().orElse(0.0f),
73+
0.0f));
74+
// Model load time
75+
results.add(
76+
new BenchmarkMetric(
77+
benchmarkModel, "model_load_time(ns)", stats.loadEnd - stats.loadStart, 0.0f));
78+
// Load status
79+
results.add(new BenchmarkMetric(benchmarkModel, "load_status", stats.errorCode, 0));
6580

66-
// TODO (huydhn): Figure out on what the final JSON results looks like, we need something
67-
// with the same number of fields as https://github.com/pytorch/pytorch/pull/135042
6881
try (FileWriter writer = new FileWriter(getFilesDir() + "/benchmark_results.json")) {
6982
Gson gson = new Gson();
70-
writer.write(gson.toJson(stats));
83+
writer.write(gson.toJson(results));
7184
} catch (IOException e) {
7285
e.printStackTrace();
7386
}
7487
}
7588
}
7689

7790
class Stats {
91+
long loadStart;
92+
long loadEnd;
7893
List<Long> latency = new ArrayList<>();
7994
int errorCode = 0;
8095

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
package org.pytorch.minibench;
10+
11+
import android.app.ActivityManager;
12+
import android.os.Build;
13+
import java.util.regex.Matcher;
14+
import java.util.regex.Pattern;
15+
16+
class BenchmarkMetric {
17+
public static class BenchmarkModel {
18+
// The model name, i.e. stories110M
19+
String name;
20+
String backend;
21+
String quantization;
22+
23+
public BenchmarkModel(final String name, final String backend, final String quantization) {
24+
this.name = name;
25+
this.backend = backend;
26+
this.quantization = quantization;
27+
}
28+
}
29+
30+
BenchmarkModel benchmarkModel;
31+
32+
// The metric name, i.e. TPS
33+
String metric;
34+
35+
// The actual value and the option target value
36+
double actualValue;
37+
double targetValue;
38+
39+
public static class DeviceInfo {
40+
// Let's see which information we want to include here
41+
final String device = Build.BRAND;
42+
// The phone model and Android release version
43+
final String arch = Build.MODEL;
44+
final String os = "Android " + Build.VERSION.RELEASE;
45+
final long totalMem = new ActivityManager.MemoryInfo().totalMem;
46+
final long availMem = new ActivityManager.MemoryInfo().availMem;
47+
}
48+
49+
DeviceInfo deviceInfo = new DeviceInfo();
50+
51+
public BenchmarkMetric(
52+
final BenchmarkModel benchmarkModel,
53+
final String metric,
54+
final double actualValue,
55+
final double targetValue) {
56+
this.benchmarkModel = benchmarkModel;
57+
this.metric = metric;
58+
this.actualValue = actualValue;
59+
this.targetValue = targetValue;
60+
}
61+
62+
// TODO (huydhn): Figure out a way to extract the backend and quantization information from
63+
// the .pte model itself instead of parsing its name
64+
public static BenchmarkMetric.BenchmarkModel extractBackendAndQuantization(final String model) {
65+
final Matcher m =
66+
Pattern.compile("(?<name>\\w+)_(?<backend>\\w+)_(?<quantization>\\w+)").matcher(model);
67+
if (m.matches()) {
68+
return new BenchmarkMetric.BenchmarkModel(
69+
m.group("name"), m.group("backend"), m.group("quantization"));
70+
} else {
71+
return new BenchmarkMetric.BenchmarkModel(model, "", "");
72+
}
73+
}
74+
}

0 commit comments

Comments
 (0)