Qualcomm AI Engine Direct - Meta CI for Mobilebert, W2L, and Llama (#8616)

winskuo-quic · Zonglin Peng · commit ee2180ebd362 · 2025-03-06T11:09:02.000-08:00
* Qualcomm AI Engine Direct - Meta CI for Mobilebert and W2L

* variable update
diff --git a/.ci/scripts/test_model.sh b/.ci/scripts/test_model.sh
@@ -164,6 +164,7 @@ test_model_with_qnn() {
   export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
   export PYTHONPATH=$EXECUTORCH_ROOT/..
 
+  EXTRA_FLAGS=""
   if [[ "${MODEL_NAME}" == "dl3" ]]; then
     EXPORT_SCRIPT=deeplab_v3
   elif [[ "${MODEL_NAME}" == "mv3" ]]; then
@@ -176,6 +177,12 @@ test_model_with_qnn() {
     EXPORT_SCRIPT=inception_v3
   elif [[ "${MODEL_NAME}" == "vit" ]]; then
     EXPORT_SCRIPT=torchvision_vit
+  elif [[ "${MODEL_NAME}" == "mb" ]]; then
+    EXPORT_SCRIPT=mobilebert_fine_tune
+    EXTRA_FLAGS="--num_epochs 1"
+    pip install scikit-learn
+  elif [[ "${MODEL_NAME}" == "w2l" ]]; then
+    EXPORT_SCRIPT=wav2letter
   elif [[ "${MODEL_NAME}" == "edsr" ]]; then
     EXPORT_SCRIPT=edsr
     # Additional deps for edsr
@@ -189,7 +196,7 @@ test_model_with_qnn() {
   # TODO(guangyang): Make QNN chipset matches the target device
   QNN_CHIPSET=SM8450
 
-  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only
+  "${PYTHON_EXECUTABLE}" -m examples.qualcomm.scripts.${EXPORT_SCRIPT} -b ${CMAKE_OUTPUT_DIR} -m ${QNN_CHIPSET} --compile_only $EXTRA_FLAGS
   EXPORTED_MODEL=$(find "./${EXPORT_SCRIPT}" -type f -name "${MODEL_NAME}*.pte" -print -quit)
 }
 
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
@@ -311,7 +311,7 @@ jobs:
     strategy:
       matrix:
         dtype: [fp32]
-        model: [dl3, mv3, mv2, ic4, ic3, vit]
+        model: [dl3, mv3, mv2, ic4, ic3, vit, mb, w2l]
       fail-fast: false
     with:
       runner: linux.2xlarge
diff --git a/backends/qualcomm/tests/test_qnn_delegate.py b/backends/qualcomm/tests/test_qnn_delegate.py
@@ -73,7 +73,7 @@
 from executorch.examples.models.mobilenet_v3 import MV3Model
 from executorch.examples.models.torchvision_vit.model import TorchVisionViTModel
 
-# from executorch.examples.models.wav2letter import Wav2LetterModel
+from executorch.examples.models.wav2letter import Wav2LetterModel
 from executorch.exir import to_edge
 from executorch.exir.backend.backend_api import disable_validation
 from executorch.exir.passes import PassManager
@@ -907,8 +907,7 @@ def test_qnn_backend_example_models(self):
             # Fail during lowering Reopen once resolved
             # MobileBertModelExample(),
             # TorchVisionViTModel(),
-            # Encountered undefined symbol in mainline. Reopen once resolved.
-            # Wav2LetterModel(),
+            Wav2LetterModel(),
         ]
         expected_partitions = [
             1,
@@ -917,8 +916,8 @@ def test_qnn_backend_example_models(self):
             1,
             1,
             1,
-            1,
-            1,
+            # 1,
+            # 1,
             1,
         ]
         # TODO: Due to trigger maximum recursion depth exceeded, need to check it.
@@ -1962,12 +1961,11 @@ def test_qnn_backend_example_models(self):
                 QCOM_ANNOTATION: (),
                 QCOM_QUANT_DTYPE: QuantDtype.use_8a8w,
             },
-            # Encountered undefined symbol in mainline. Reopen once resolved.
-            # {
-            #     QCOM_MODULE: Wav2LetterModel(),
-            #     QCOM_ANNOTATION: (),
-            #     QCOM_QUANT_DTYPE: QuantDtype.use_8a8w,
-            # },
+            {
+                QCOM_MODULE: Wav2LetterModel(),
+                QCOM_ANNOTATION: (),
+                QCOM_QUANT_DTYPE: QuantDtype.use_8a8w,
+            },
         ]
         expected_partitions = [
             1,
@@ -1979,7 +1977,7 @@ def test_qnn_backend_example_models(self):
             # For MobileBertModelExample
             # 1,
             1,
-            # 1, For Wav2LetterModel
+            1,
         ]
         # TODO: Due to trigger maximum recursion depth exceeded, need to check it.
         disable_validation()
diff --git a/examples/qualcomm/oss_scripts/llama/llama.py b/examples/qualcomm/oss_scripts/llama/llama.py
@@ -843,6 +843,7 @@ def post_process():
     )
 
     runner_cmd = ""
+    performance_output_path = "outputs/inference_speed.txt"
     if args.enable_x86_64:
         # x86 emulator is intended for CI and not performance. Check only the first few tokens.
         seq_len = min(seq_len, 16)
@@ -862,6 +863,7 @@ def post_process():
                 f"--model_path {pte_path}",
                 f"--seq_len {seq_len}",
                 f"--output_path {args.artifact}/outputs/outputs.txt",
+                f"--performance_output_path {performance_output_path}",
                 f"--kv_updater ShiftPointer",
                 runner_args,
             ]
@@ -882,6 +884,7 @@ def post_process():
                 f"--model_path {pte_filename}.pte",
                 f"--seq_len {seq_len}",
                 "--output_path outputs/outputs.txt",
+                f"--performance_output_path {performance_output_path}",
                 f"--kv_updater {'SmartMask' if args.kv_updater == smart_mask_updater else 'ShiftPointer'}",
                 runner_args,
             ]
@@ -905,7 +908,7 @@ def post_process():
         adb.pull(output_path=args.artifact, callback=post_process)
     if args.ip and args.port != -1:
         inference_speed = 0
-        with open(f"{args.artifact}/outputs/inference_speed.txt", "r") as f:
+        with open(f"{args.artifact}/{performance_output_path}", "r") as f:
             inference_speed = float(f.read())
 
         pte_size = os.path.getsize(pte_path)
diff --git a/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp b/examples/qualcomm/oss_scripts/llama/qnn_llama_runner.cpp
@@ -30,6 +30,10 @@ DEFINE_string(
     output_path,
     "outputs.txt",
     "Executorch inference data output path.");
+DEFINE_string(
+    performance_output_path,
+    "inference_speed.txt",
+    "Records inference speed. For CI purpose.");
 DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff.");
 DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt.");
 DEFINE_string(
@@ -63,6 +67,7 @@ int main(int argc, char** argv) {
   example::Runner runner(
       {FLAGS_model_path},
       FLAGS_tokenizer_path.c_str(),
+      FLAGS_performance_output_path.c_str(),
       FLAGS_logits_scale,
       FLAGS_logits_offset,
       FLAGS_temperature,
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama/runner/runner.cpp
@@ -34,13 +34,16 @@ namespace example {
 
 namespace {
 static constexpr auto kTopp = 0.9f;
-void printReport(const Runner::Stats& stats);
+void printReport(
+    const Runner::Stats& stats,
+    const std::string& performance_output_path);
 std::string statsToJsonString(const Runner::Stats& stats);
 } // namespace
 
 Runner::Runner(
     const std::vector<std::string>& models_path,
     const std::string& tokenizer_path,
+    const std::string& performance_output_path,
     const float logits_scale,
     const int32_t logits_offset,
     const float temperature,
@@ -49,6 +52,7 @@ Runner::Runner(
     : n_bos_(1),
       n_eos_(1),
       tokenizer_path_(tokenizer_path),
+      performance_output_path_(performance_output_path),
       logits_scale_(logits_scale),
       logits_offset_(logits_offset),
       temperature_(temperature),
@@ -437,7 +441,7 @@ Error Runner::generate(
 
   stats_.num_prompt_tokens = num_prompt_tokens;
   stats_.num_generated_tokens = pos - num_prompt_tokens;
-  printReport(stats_);
+  printReport(stats_, performance_output_path_);
   if (stats_callback) {
     stats_callback(stats_);
   }
@@ -446,7 +450,9 @@ Error Runner::generate(
 }
 
 namespace {
-void printReport(const Runner::Stats& stats) {
+void printReport(
+    const Runner::Stats& stats,
+    const std::string& performance_output_path) {
   printf("PyTorchObserver %s\n", statsToJsonString(stats).c_str());
 
   ET_LOG(
@@ -507,7 +513,8 @@ void printReport(const Runner::Stats& stats) {
 
   // For now, we just print the total inference time for CI, can save more info
   // in future if needed.
-  std::ofstream outfile("outputs/inference_speed.txt");
+
+  std::ofstream outfile(performance_output_path.c_str());
   if (outfile.is_open()) {
     double num_tok = (stats.num_generated_tokens) /
         (double)(stats.inference_end_ms - stats.inference_start_ms) *
diff --git a/examples/qualcomm/oss_scripts/llama/runner/runner.h b/examples/qualcomm/oss_scripts/llama/runner/runner.h
@@ -29,6 +29,7 @@ class Runner {
   explicit Runner(
       const std::vector<std::string>& models_path,
       const std::string& tokenizer_path,
+      const std::string& performance_output_path_,
       const float logits_scale,
       const int32_t logits_offset,
       const float temperature,
@@ -101,6 +102,7 @@ class Runner {
   const int32_t n_eos_;
   std::vector<std::shared_ptr<executorch::extension::Module>> modules_;
   std::string tokenizer_path_;
+  std::string performance_output_path_;
   float logits_scale_;
   int32_t logits_offset_;
   float temperature_;
diff --git a/examples/qualcomm/scripts/mobilebert_fine_tune.py b/examples/qualcomm/scripts/mobilebert_fine_tune.py
@@ -169,7 +169,7 @@ def get_fine_tuned_mobilebert(artifacts_dir, pretrained_weight, batch_size):
     dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
     dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)
 
-    epochs = 5
+    epochs = args.num_epochs
     dataloader_train = DataLoader(
         dataset_train,
         sampler=RandomSampler(dataset_train),
@@ -366,6 +366,13 @@ def calibrator(gm):
         type=str,
     )
 
+    parser.add_argument(
+        "--num_epochs",
+        help="If no pretrained weights are provided, set number of epochs to train the model",
+        default=5,
+        type=int,
+    )
+
     parser.add_argument(
         "-F",
         "--use_fp16",
diff --git a/examples/qualcomm/scripts/wav2letter.py b/examples/qualcomm/scripts/wav2letter.py
@@ -5,6 +5,7 @@
 # LICENSE file in the root directory of this source tree.
 
 import json
+import logging
 import os
 import sys
 from multiprocessing.connection import Client
@@ -111,7 +112,12 @@ def main(args):
     # target labels " abcdefghijklmnopqrstuvwxyz'*"
     instance.vocab_size = 29
     model = instance.get_eager_model().eval()
-    model.load_state_dict(torch.load(args.pretrained_weight, weights_only=True))
+    if args.pretrained_weight:
+        model.load_state_dict(torch.load(args.pretrained_weight, weights_only=True))
+    else:
+        logging.warning(
+            "It is strongly recommended to provide pretrained weights, otherwise accuracy will be bad. This option is here mainly for CI purpose to ensure compile is successful."
+        )
 
     # convert conv1d to conv2d in nn.Module level will only introduce 2 permute
     # nodes around input & output, which is more quantization friendly.
@@ -128,9 +134,15 @@ def main(args):
 
     # retrieve dataset, will take some time to download
     data_num = 100
-    inputs, targets, input_list = get_dataset(
-        data_size=data_num, artifact_dir=args.artifact
-    )
+    if args.compile_only:
+        inputs = [(torch.rand(1, 1, 700, 1),)]
+        logging.warning(
+            "With compile_only, accuracy will be bad due to insufficient datasets for quantization."
+        )
+    else:
+        inputs, targets, input_list = get_dataset(
+            data_size=data_num, artifact_dir=args.artifact
+        )
     pte_filename = "w2l_qnn"
     build_executorch_binary(
         model,
@@ -212,7 +224,7 @@ def main(args):
         ),
         default=None,
         type=str,
-        required=True,
+        required=False,
     )
 
     args = parser.parse_args()