Qualcomm AI Engine Direct - Support Llama3 QAIHub #4789

Merged · 1 commit · Aug 20, 2024
3 changes: 2 additions & 1 deletion backends/qualcomm/runtime/QnnManager.cpp
@@ -332,7 +332,8 @@ Error QnnManager::AllocateTensor() {
const std::string& tensor_name = tensor_wrapper->GetName();
// this is required for identifying the shared buffer mechanism
// the info might be missing if the context binary came from qnn_converter
if (tensor_name.find("output_") == std::string::npos) {
if (options_->is_from_context_binary() &&
tensor_name.find("output_") == std::string::npos) {
tensor_wrapper->SetName("output_" + tensor_name);
}
if (IsTensorDump()) {
43 changes: 42 additions & 1 deletion backends/qualcomm/tests/test_qnn_delegate.py
@@ -1923,7 +1923,7 @@ def test_llama2_7b(self):
prompt = "Explain the rules of baseball"
cmds = [
"python",
f"{self.executorch_root}/examples/qualcomm/qaihub_scripts/llama2/qaihub_llama2_7b.py",
f"{self.executorch_root}/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py",
"--artifact",
self.artifact_dir,
"--build_folder",
@@ -1957,6 +1957,47 @@ def test_llama2_7b(self):
model_out = msg["result"]
self.assertTrue(model_out.startswith(prompt))

def test_llama3_8b(self):
if not self.required_envs():
self.skipTest("missing required envs")

prompt = "Explain the rules of baseball"
cmds = [
"python",
f"{self.executorch_root}/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py",
"--artifact",
self.artifact_dir,
"--build_folder",
self.build_folder,
"--device",
self.device,
"--model",
self.model,
"--tokenizer_model",
f"{self.artifact_dir}/tokenizer.model",
"--context_binaries",
f"{self.artifact_dir}",
"--ip",
self.ip,
"--port",
str(self.port),
"--prompt",
f"{prompt}",
]
if self.host:
cmds.extend(["--host", self.host])

p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
with Listener((self.ip, self.port)) as listener:
conn = listener.accept()
p.communicate()
msg = json.loads(conn.recv())
if "Error" in msg:
self.fail(msg["Error"])
else:
model_out = msg["result"]
self.assertTrue(model_out.startswith(prompt))


class TestExampleScript(TestQNN):
def required_envs(self, conditions=None) -> bool:
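For context, the new `test_llama3_8b` test launches the example script as a subprocess and waits for it to report results over a `multiprocessing.connection` channel: the test listens on `--ip`/`--port`, and the script connects back and sends a JSON payload. Below is a minimal sketch of that handshake; the `report_result` helper is an assumption about what the script side does and is not code from this PR.

```python
# Sketch of the test <-> example-script handshake used by test_llama3_8b.
# report_result is a hypothetical stand-in for the example script's reporting
# code; wait_for_result mirrors the listener logic in the test itself.
import json
from multiprocessing.connection import Client, Listener

def report_result(ip: str, port: int, result: str) -> None:
    # Script side (assumed): connect back to the test and send a JSON string.
    with Client((ip, port)) as conn:
        conn.send(json.dumps({"result": result}))

def wait_for_result(ip: str, port: int) -> dict:
    # Test side: accept one connection and decode the JSON payload it sends.
    with Listener((ip, port)) as listener:
        conn = listener.accept()
        return json.loads(conn.recv())
```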
8 changes: 7 additions & 1 deletion backends/qualcomm/utils/utils.py
@@ -121,6 +121,7 @@ def replace_linear(module: torch.nn.Module):

def canonicalize_program(
exported_program: ExportedProgram | List[LoweredBackendModule],
custom_buffer_size=None,
):
# check if the user specified multi_contexts
# this is a generic approach in case multiple backends exist
@@ -140,7 +141,12 @@ def process_exported_program(prog):
return max_sf_buf_size, module_map

def process_lowered_module(module):
return len(module.processed_bytes), {
spill_fill_size = (
len(module.processed_bytes)
if custom_buffer_size is None
else custom_buffer_size
)
return spill_fill_size, {
module: convert_to_option(module.compile_specs[0].value)
}

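The `canonicalize_program` change above lets a caller override the spill-fill buffer size instead of always deriving it from the lowered module's payload. Here is a minimal sketch of that selection logic under a hypothetical helper name (`pick_spill_fill_size`); the real logic lives inside `process_lowered_module`.

```python
# Sketch of the spill-fill size selection added to canonicalize_program.
# pick_spill_fill_size is a hypothetical helper, not part of the backend API.
from typing import Optional

def pick_spill_fill_size(
    processed_bytes: bytes, custom_buffer_size: Optional[int] = None
) -> int:
    # Default to the size of the lowered module's payload unless the caller
    # supplies an explicit buffer size (useful for large, sharded models).
    if custom_buffer_size is None:
        return len(processed_bytes)
    return custom_buffer_size
```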
8 changes: 4 additions & 4 deletions examples/qualcomm/CMakeLists.txt
@@ -69,15 +69,15 @@ target_include_directories(

# build qnn_executor_runner
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/executor_runner
${CMAKE_CURRENT_SOURCE_DIR}/executor_runner
)

# build qnn_llama_runner
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/oss_scripts/llama2
${CMAKE_CURRENT_SOURCE_DIR}/oss_scripts/llama2
)

# build qaihub_llama2_7b_runner
# build qaihub_llama2_7b_runner and qaihub_llama3_8b_runner
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/qaihub_scripts/llama2
${CMAKE_CURRENT_SOURCE_DIR}/qaihub_scripts/llama
)
5 changes: 2 additions & 3 deletions examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp
@@ -9,10 +9,9 @@
/**
* @file
*
* This tool can run ExecuTorch model files with Qualcomm AI Engine Direct
* and the portable kernels.
* This tool can run ExecuTorch model files with Qualcomm AI Engine Direct.
*
* User could specify arguments like desired input data, iterations, etc.
* User could specify arguments like desired prompt, temperature, etc.
*/

#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
94 changes: 94 additions & 0 deletions examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt
@@ -0,0 +1,94 @@
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# preprocess qaihub runner src files for llama2 and llama3
set(_qaihub_llama_runner__srcs ${_llama_runner__srcs})
list(TRANSFORM _qaihub_llama_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/")
list(FILTER _qaihub_llama_runner__srcs EXCLUDE REGEX ".*(/runner/).*")
list(PREPEND _qaihub_llama_runner__srcs
${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/runner.h
${CMAKE_CURRENT_LIST_DIR}/runner/io_memory.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/io_memory.h
)


# preprocess qaihub llama2 7b runner src files
set(_qaihub_llama2_7b_runner__srcs ${_qaihub_llama_runner__srcs})

list(PREPEND _qaihub_llama2_7b_runner__srcs
${CMAKE_CURRENT_LIST_DIR}/llama2/qaihub_llama2_7b_runner.cpp
)

# build qaihub llama2 7b runner
add_executable(qaihub_llama2_7b_runner ${_qaihub_llama2_7b_runner__srcs})
target_include_directories(qaihub_llama2_7b_runner
PUBLIC ${_common_include_directories}
)
target_link_libraries(qaihub_llama2_7b_runner
qnn_executorch_backend
executorch_no_prim_ops
extension_data_loader
extension_module
gflags
)
target_compile_options(qaihub_llama2_7b_runner
PUBLIC ${_common_compile_options}
)


# preprocess qaihub llama3 8b runner src files
set(_qaihub_llama3_8b_runner__srcs ${_qaihub_llama_runner__srcs})

list(PREPEND _qaihub_llama3_8b_runner__srcs
${CMAKE_CURRENT_LIST_DIR}/llama3/qaihub_llama3_8b_runner.cpp
)

# Add a compile option to differentiate llama2 from llama3 logic
list(APPEND _common_compile_options -DQAIHUB_LLAMA3_RUNNER)

# find RE2 for tokenizer
set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/third-party/abseil-cpp
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/third-party/re2
${CMAKE_CURRENT_BINARY_DIR}/re2
)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})


list(APPEND _qaihub_llama3_8b_runner__srcs
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp
)
list(APPEND _qaihub_llama3_8b_runner__srcs
${CMAKE_CURRENT_SOURCE_DIR}/../../../models/llama2/tokenizer/llama_tiktoken.cpp
)
set(_preprocessor_flag -DET_USE_TIKTOKEN)


# build qaihub llama3 8b runner
add_executable(qaihub_llama3_8b_runner ${_qaihub_llama3_8b_runner__srcs})
target_include_directories(qaihub_llama3_8b_runner
PUBLIC ${_common_include_directories}
)

target_link_libraries(qaihub_llama3_8b_runner
qnn_executorch_backend
executorch_no_prim_ops
extension_data_loader
extension_module
gflags
re2::re2
)
target_compile_options(qaihub_llama3_8b_runner
PUBLIC ${_common_compile_options}
)
52 changes: 52 additions & 0 deletions examples/qualcomm/qaihub_scripts/llama/README.md
@@ -0,0 +1,52 @@
# Summary

## Overview
This file provides instructions for running LLAMA2 and LLAMA3 with different parameters via the Qualcomm HTP backend. The following settings support Llama-2-7b-chat-hf and Llama-3-8b-chat-hf.

Please check corresponding section for more information.

## Llama-2-7b-chat-hf
This example demonstrates how to run Llama-2-7b-chat-hf on mobile via the Qualcomm HTP backend. The model was precompiled into context binaries by [Qualcomm AI HUB](https://aihub.qualcomm.com/).
Note that the pre-compiled context binaries cannot be further fine-tuned for other downstream tasks.

### Instructions
#### Step 1: Setup
1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch.
2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend.

#### Step 2: Prepare Model
1. Create an account at https://aihub.qualcomm.com/
2. Follow the instructions at https://huggingface.co/qualcomm/Llama-v2-7B-Chat to export context binaries (this will take some time to finish)

```bash
# tokenizer.model: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/tokenizer.model
# tokenizer.bin:
python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
```

#### Step 3: Run default examples
```bash
# AIHUB_CONTEXT_BINARIES: ${PATH_TO_AIHUB_WORKSPACE}/build/llama_v2_7b_chat_quantized
python examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py -a ${ARTIFACTS} -b cmake-out-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_bin tokenizer.bin --prompt "What is Python?"
```

## Llama-3-8b-chat-hf
This example demonstrates how to run Llama-3-8b-chat-hf on mobile via the Qualcomm HTP backend. The model was precompiled into context binaries by [Qualcomm AI HUB](https://aihub.qualcomm.com/).
Note that the pre-compiled context binaries cannot be further fine-tuned for other downstream tasks. This example script has been tested on a device with 16GB of RAM and verified to work.

### Instructions
#### Step 1: Setup
1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch.
2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend.

#### Step 2: Prepare Model
1. Create an account at https://aihub.qualcomm.com/
2. Follow the instructions at https://huggingface.co/qualcomm/Llama-v3-8B-Chat to export context binaries (this will take some time to finish)
3. For the Llama 3 tokenizer, refer to https://github.com/meta-llama/llama-models/blob/main/README.md for instructions on how to download tokenizer.model.


#### Step 3: Run default examples
```bash
# AIHUB_CONTEXT_BINARIES: ${PATH_TO_AIHUB_WORKSPACE}/build/llama_v3_8b_chat_quantized
python examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py -a ${ARTIFACTS} -b cmake-out-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_model tokenizer.model --prompt "What is baseball?"
```
examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py
@@ -55,6 +55,11 @@ def main(args):
is_from_context_binary=True,
)

pte_name = (
"qaihub_llama2_7b_prompt"
if args.use_prompt_processor
else "qaihub_llama2_7b_token"
)
if args.pre_gen_pte is None:
# create custom operators as context loader
bundle_programs = [
@@ -69,7 +74,7 @@
# setup spill-fill buffer for relieving runtime memory usage
canonicalize_program(lowered_modules)
# export pte files
pte_name, pte_files = "qaihub_llama7b", []
pte_files = []
for i in range(len(target_names)):
print(f"pte {i} generating...")
memory_planning_pass = MemoryPlanningPass(
@@ -90,7 +95,6 @@
lowered_modules.pop(0)
gc.collect()
else:
pte_name = "qaihub_llama7b"
pte_files = [f"{args.pre_gen_pte}/{pte_name}_{i}.pte" for i in range(4)]

if args.compile_only:
@@ -109,12 +113,6 @@ def get_logit_encoding(path_to_last_shard: str):
qnn_mgr.Destroy()
return encoding.data["scale"].item(), encoding.data["offset"].item()

# setup required paths accordingly
# qnn_sdk : QNN SDK path setup in environment variable
# artifact_path : path where artifacts were built
# pte_path : path where executorch binary was stored
# device_id : serial number of android device
# workspace : folder for storing artifacts on android device
adb = SimpleADB(
qnn_sdk=os.getenv("QNN_SDK_ROOT"),
build_path=args.build_folder,
@@ -123,7 +121,7 @@ def get_logit_encoding(path_to_last_shard: str):
device_id=args.device,
host_id=args.host,
soc_model=args.model,
runner="examples/qualcomm/qaihub_scripts/llama2/qaihub_llama2_7b_runner",
runner="examples/qualcomm/qaihub_scripts/llama/qaihub_llama2_7b_runner",
)
output_file = "result.txt"
pos_embs_file = ["freq_cos", "freq_sin"]
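The script change above moves the `.pte` naming ahead of the export branch, so freshly exported programs and `--pre_gen_pte` runs now share the same `qaihub_llama2_7b_prompt`/`qaihub_llama2_7b_token` names instead of the old `qaihub_llama7b`. A small sketch of how the resulting file list is assembled, assuming four shards as in the script; `pte_file_list` is a hypothetical helper, not a function from this PR.

```python
# Sketch of the .pte naming introduced above: the name depends on the
# processor mode and is shared by both the export and pre-generated paths.
def pte_file_list(pre_gen_dir: str, use_prompt_processor: bool, num_shards: int = 4) -> list:
    pte_name = (
        "qaihub_llama2_7b_prompt" if use_prompt_processor else "qaihub_llama2_7b_token"
    )
    return [f"{pre_gen_dir}/{pte_name}_{i}.pte" for i in range(num_shards)]
```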
examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp
@@ -9,15 +9,13 @@
/**
* @file
*
* This tool can run ExecuTorch model files with Qualcomm AI Engine Direct
* and the portable kernels.
* This tool can run Llama2 7b with Qualcomm AI Engine Direct.
*
* User could specify arguments like desired input data, iterations, etc.
* Currently we assume that the outputs are all fp32 tensors.
* User could specify arguments like desired prompt, eval_mode, etc.
*/

#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
#include <executorch/examples/qualcomm/qaihub_scripts/llama2/runner/runner.h>
#include <executorch/examples/qualcomm/qaihub_scripts/llama/runner/runner.h>
#include <executorch/extension/runner_util/managed_tensor.h>
#include <executorch/runtime/platform/log.h>

@@ -68,6 +66,7 @@ int main(int argc, char** argv) {
Runner runner(
models_path,
pos_embs_path,
{8, 8, 8, 8},
FLAGS_tokenizer_path.c_str(),
FLAGS_eval_mode,
FLAGS_temperature,