Qualcomm AI Engine Direct - Support Llama3 QAIHub #4789

Merged · 1 commit · Aug 20, 2024
3 changes: 2 additions & 1 deletion backends/qualcomm/runtime/QnnManager.cpp
@@ -332,7 +332,8 @@ Error QnnManager::AllocateTensor() {
const std::string& tensor_name = tensor_wrapper->GetName();
// this is required for identifying the shared buffer mechanism
// the info might be missing if the context binary came from qnn_converter
if (tensor_name.find("output_") == std::string::npos) {
if (options_->is_from_context_binary() &&
tensor_name.find("output_") == std::string::npos) {
tensor_wrapper->SetName("output_" + tensor_name);
}
if (IsTensorDump()) {
43 changes: 42 additions & 1 deletion backends/qualcomm/tests/test_qnn_delegate.py
@@ -1923,7 +1923,7 @@ def test_llama2_7b(self):
prompt = "Explain the rules of baseball"
cmds = [
"python",
f"{self.executorch_root}/examples/qualcomm/qaihub_scripts/llama2/qaihub_llama2_7b.py",
f"{self.executorch_root}/examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py",
"--artifact",
self.artifact_dir,
"--build_folder",
@@ -1957,6 +1957,47 @@ def test_llama2_7b(self):
model_out = msg["result"]
self.assertTrue(model_out.startswith(prompt))

def test_llama3_8b(self):
if not self.required_envs():
self.skipTest("missing required envs")

prompt = "Explain the rules of baseball"
cmds = [
"python",
f"{self.executorch_root}/examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py",
"--artifact",
self.artifact_dir,
"--build_folder",
self.build_folder,
"--device",
self.device,
"--model",
self.model,
"--tokenizer_model",
f"{self.artifact_dir}/tokenizer.model",
"--context_binaries",
f"{self.artifact_dir}",
"--ip",
self.ip,
"--port",
str(self.port),
"--prompt",
f"{prompt}",
]
if self.host:
cmds.extend(["--host", self.host])

p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
with Listener((self.ip, self.port)) as listener:
conn = listener.accept()
p.communicate()
msg = json.loads(conn.recv())
if "Error" in msg:
self.fail(msg["Error"])
else:
model_out = msg["result"]
self.assertTrue(model_out.startswith(prompt))


class TestExampleScript(TestQNN):
def required_envs(self, conditions=None) -> bool:
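For context, the new `test_llama3_8b` test launches the example script as a subprocess and waits for it to report results over a `multiprocessing.connection` channel: the test listens on `--ip`/`--port`, and the script connects back and sends a JSON payload. Below is a minimal sketch of that handshake; the `report_result` helper is an assumption about what the script side does and is not code from this PR.

```python
# Sketch of the test <-> example-script handshake used by test_llama3_8b.
# report_result is a hypothetical stand-in for the example script's reporting
# code; wait_for_result mirrors the listener logic in the test itself.
import json
from multiprocessing.connection import Client, Listener

def report_result(ip: str, port: int, result: str) -> None:
    # Script side (assumed): connect back to the test and send a JSON string.
    with Client((ip, port)) as conn:
        conn.send(json.dumps({"result": result}))

def wait_for_result(ip: str, port: int) -> dict:
    # Test side: accept one connection and decode the JSON payload it sends.
    with Listener((ip, port)) as listener:
        conn = listener.accept()
        return json.loads(conn.recv())
```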
8 changes: 7 additions & 1 deletion backends/qualcomm/utils/utils.py
@@ -121,6 +121,7 @@ def replace_linear(module: torch.nn.Module):

def canonicalize_program(
exported_program: ExportedProgram | List[LoweredBackendModule],
custom_buffer_size=None,
):
# check if the user specified multi_contexts
# this is a generic approach in case multiple backends exist
@@ -140,7 +141,12 @@ def process_exported_program(prog):
return max_sf_buf_size, module_map

def process_lowered_module(module):
return len(module.processed_bytes), {
spill_fill_size = (
len(module.processed_bytes)
if custom_buffer_size is None
else custom_buffer_size
)
return spill_fill_size, {
module: convert_to_option(module.compile_specs[0].value)
}

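The `canonicalize_program` change above lets a caller override the spill-fill buffer size instead of always deriving it from the lowered module's payload. Here is a minimal sketch of that selection logic under a hypothetical helper name (`pick_spill_fill_size`); the real logic lives inside `process_lowered_module`.

```python
# Sketch of the spill-fill size selection added to canonicalize_program.
# pick_spill_fill_size is a hypothetical helper, not part of the backend API.
from typing import Optional

def pick_spill_fill_size(
    processed_bytes: bytes, custom_buffer_size: Optional[int] = None
) -> int:
    # Default to the size of the lowered module's payload unless the caller
    # supplies an explicit buffer size (useful for large, sharded models).
    if custom_buffer_size is None:
        return len(processed_bytes)
    return custom_buffer_size
```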
8 changes: 4 additions & 4 deletions examples/qualcomm/CMakeLists.txt
@@ -69,15 +69,15 @@ target_include_directories(

# build qnn_executor_runner
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/executor_runner
${CMAKE_CURRENT_SOURCE_DIR}/executor_runner
)

# build qnn_llama_runner
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/oss_scripts/llama2
${CMAKE_CURRENT_SOURCE_DIR}/oss_scripts/llama2
)

# build qaihub_llama2_7b_runner
# build qaihub_llama2_7b_runner and qaihub_llama3_8b_runner
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/qaihub_scripts/llama2
${CMAKE_CURRENT_SOURCE_DIR}/qaihub_scripts/llama
)
5 changes: 2 additions & 3 deletions examples/qualcomm/oss_scripts/llama2/qnn_llama_runner.cpp
@@ -9,10 +9,9 @@
/**
* @file
*
* This tool can run ExecuTorch model files with Qualcomm AI Engine Direct
* and the portable kernels.
* This tool can run ExecuTorch model files with Qualcomm AI Engine Direct.
*
* User could specify arguments like desired input data, iterations, etc.
* User could specify arguments like desired prompt, temperature, etc.
*/

#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
94 changes: 94 additions & 0 deletions examples/qualcomm/qaihub_scripts/llama/CMakeLists.txt
@@ -0,0 +1,94 @@
# Copyright (c) Qualcomm Innovation Center, Inc.
# All rights reserved
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# preprocess qaihub runner src files for llama2 and llama3
set(_qaihub_llama_runner__srcs ${_llama_runner__srcs})
list(TRANSFORM _qaihub_llama_runner__srcs PREPEND "${EXECUTORCH_SOURCE_DIR}/")
list(FILTER _qaihub_llama_runner__srcs EXCLUDE REGEX ".*(/runner/).*")
list(PREPEND _qaihub_llama_runner__srcs
${CMAKE_CURRENT_LIST_DIR}/runner/runner.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/runner.h
${CMAKE_CURRENT_LIST_DIR}/runner/io_memory.cpp
${CMAKE_CURRENT_LIST_DIR}/runner/io_memory.h
)


# preprocess qaihub llama2 7b runner src files
set(_qaihub_llama2_7b_runner__srcs ${_qaihub_llama_runner__srcs})

list(PREPEND _qaihub_llama2_7b_runner__srcs
${CMAKE_CURRENT_LIST_DIR}/llama2/qaihub_llama2_7b_runner.cpp
)

# build qaihub llama2 7b runner
add_executable(qaihub_llama2_7b_runner ${_qaihub_llama2_7b_runner__srcs})
target_include_directories(qaihub_llama2_7b_runner
PUBLIC ${_common_include_directories}
)
target_link_libraries(qaihub_llama2_7b_runner
qnn_executorch_backend
executorch_no_prim_ops
extension_data_loader
extension_module
gflags
)
target_compile_options(qaihub_llama2_7b_runner
PUBLIC ${_common_compile_options}
)


# preprocess qaihub llama3 8b runner src files
set(_qaihub_llama3_8b_runner__srcs ${_qaihub_llama_runner__srcs})

list(PREPEND _qaihub_llama3_8b_runner__srcs
${CMAKE_CURRENT_LIST_DIR}/llama3/qaihub_llama3_8b_runner.cpp
)

# Add a compile option to differentiate llama2 from llama3 logic
list(APPEND _common_compile_options -DQAIHUB_LLAMA3_RUNNER)

# find RE2 for tokenizer
set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)
set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/third-party/abseil-cpp
${CMAKE_CURRENT_BINARY_DIR}/abseil-cpp
)
add_subdirectory(
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/third-party/re2
${CMAKE_CURRENT_BINARY_DIR}/re2
)
set(CMAKE_POSITION_INDEPENDENT_CODE ${_pic_flag})


list(APPEND _qaihub_llama3_8b_runner__srcs
${CMAKE_CURRENT_SOURCE_DIR}/../../../../extension/llm/tokenizer/tiktoken.cpp
)
list(APPEND _qaihub_llama3_8b_runner__srcs
${CMAKE_CURRENT_SOURCE_DIR}/../../../models/llama2/tokenizer/llama_tiktoken.cpp
)
set(_preprocessor_flag -DET_USE_TIKTOKEN)


# build qaihub llama3 8b runner
add_executable(qaihub_llama3_8b_runner ${_qaihub_llama3_8b_runner__srcs})
target_include_directories(qaihub_llama3_8b_runner
PUBLIC ${_common_include_directories}
)

target_link_libraries(qaihub_llama3_8b_runner
qnn_executorch_backend
executorch_no_prim_ops
extension_data_loader
extension_module
gflags
re2::re2
)
target_compile_options(qaihub_llama3_8b_runner
PUBLIC ${_common_compile_options}
)
52 changes: 52 additions & 0 deletions examples/qualcomm/qaihub_scripts/llama/README.md
@@ -0,0 +1,52 @@
# Summary

## Overview
This file provides instructions for running LLAMA2 and LLAMA3 with different parameters via the Qualcomm HTP backend. The following settings support Llama-2-7b-chat-hf and Llama-3-8b-chat-hf.

Please check corresponding section for more information.

## Llama-2-7b-chat-hf
This example demonstrates how to run Llama-2-7b-chat-hf on mobile via the Qualcomm HTP backend. The model was precompiled into context binaries by [Qualcomm AI HUB](https://aihub.qualcomm.com/).
Note that the pre-compiled context binaries cannot be further fine-tuned for other downstream tasks.

### Instructions
#### Step 1: Setup
1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch.
2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend.

#### Step 2: Prepare Model
1. Create an account at https://aihub.qualcomm.com/
2. Follow the instructions at https://huggingface.co/qualcomm/Llama-v2-7B-Chat to export context binaries (this will take some time to finish)

```bash
# tokenizer.model: https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/tokenizer.model
# tokenizer.bin:
python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o tokenizer.bin
```

#### Step 3: Run default examples
```bash
# AIHUB_CONTEXT_BINARIES: ${PATH_TO_AIHUB_WORKSPACE}/build/llama_v2_7b_chat_quantized
python examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py -a ${ARTIFACTS} -b cmake-out-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_bin tokenizer.bin --prompt "What is Python?"
```

## Llama-3-8b-chat-hf
This example demonstrates how to run Llama-3-8b-chat-hf on mobile via the Qualcomm HTP backend. The model was precompiled into context binaries by [Qualcomm AI HUB](https://aihub.qualcomm.com/).
Note that the pre-compiled context binaries cannot be further fine-tuned for other downstream tasks. This example script has been tested on a device with 16GB of RAM and verified to work.

### Instructions
#### Step 1: Setup
1. Follow the [tutorial](https://pytorch.org/executorch/main/getting-started-setup) to set up ExecuTorch.
2. Follow the [tutorial](https://pytorch.org/executorch/stable/build-run-qualcomm-ai-engine-direct-backend.html) to build Qualcomm AI Engine Direct Backend.

#### Step 2: Prepare Model
1. Create an account at https://aihub.qualcomm.com/
2. Follow the instructions at https://huggingface.co/qualcomm/Llama-v3-8B-Chat to export context binaries (this will take some time to finish)
3. For the Llama 3 tokenizer, refer to https://github.com/meta-llama/llama-models/blob/main/README.md for instructions on how to download tokenizer.model.


#### Step 3: Run default examples
```bash
# AIHUB_CONTEXT_BINARIES: ${PATH_TO_AIHUB_WORKSPACE}/build/llama_v3_8b_chat_quantized
python examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py -a ${ARTIFACTS} -b cmake-out-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_model tokenizer.model --prompt "What is baseball?"
```
examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py
@@ -55,6 +55,11 @@ def main(args):
is_from_context_binary=True,
)

pte_name = (
"qaihub_llama2_7b_prompt"
if args.use_prompt_processor
else "qaihub_llama2_7b_token"
)
if args.pre_gen_pte is None:
# create custom operators as context loader
bundle_programs = [
@@ -69,7 +74,7 @@
# setup spill-fill buffer for relieving runtime memory usage
canonicalize_program(lowered_modules)
# export pte files
pte_name, pte_files = "qaihub_llama7b", []
pte_files = []
for i in range(len(target_names)):
print(f"pte {i} generating...")
memory_planning_pass = MemoryPlanningPass(
@@ -90,7 +95,6 @@
lowered_modules.pop(0)
gc.collect()
else:
pte_name = "qaihub_llama7b"
pte_files = [f"{args.pre_gen_pte}/{pte_name}_{i}.pte" for i in range(4)]

if args.compile_only:
@@ -109,12 +113,6 @@ def get_logit_encoding(path_to_last_shard: str):
qnn_mgr.Destroy()
return encoding.data["scale"].item(), encoding.data["offset"].item()

# setup required paths accordingly
# qnn_sdk : QNN SDK path setup in environment variable
# artifact_path : path where artifacts were built
# pte_path : path where executorch binary was stored
# device_id : serial number of android device
# workspace : folder for storing artifacts on android device
adb = SimpleADB(
qnn_sdk=os.getenv("QNN_SDK_ROOT"),
build_path=args.build_folder,
@@ -123,7 +121,7 @@ def get_logit_encoding(path_to_last_shard: str):
device_id=args.device,
host_id=args.host,
soc_model=args.model,
runner="examples/qualcomm/qaihub_scripts/llama2/qaihub_llama2_7b_runner",
runner="examples/qualcomm/qaihub_scripts/llama/qaihub_llama2_7b_runner",
)
output_file = "result.txt"
pos_embs_file = ["freq_cos", "freq_sin"]
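The script change above moves the `.pte` naming ahead of the export branch, so freshly exported programs and `--pre_gen_pte` runs now share the same `qaihub_llama2_7b_prompt`/`qaihub_llama2_7b_token` names instead of the old `qaihub_llama7b`. A small sketch of how the resulting file list is assembled, assuming four shards as in the script; `pte_file_list` is a hypothetical helper, not a function from this PR.

```python
# Sketch of the .pte naming introduced above: the name depends on the
# processor mode and is shared by both the export and pre-generated paths.
def pte_file_list(pre_gen_dir: str, use_prompt_processor: bool, num_shards: int = 4) -> list:
    pte_name = (
        "qaihub_llama2_7b_prompt" if use_prompt_processor else "qaihub_llama2_7b_token"
    )
    return [f"{pre_gen_dir}/{pte_name}_{i}.pte" for i in range(num_shards)]
```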
examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp
@@ -9,15 +9,13 @@
/**
* @file
*
* This tool can run ExecuTorch model files with Qualcomm AI Engine Direct
* and the portable kernels.
* This tool can run Llama2 7b with Qualcomm AI Engine Direct.
*
* User could specify arguments like desired input data, iterations, etc.
* Currently we assume that the outputs are all fp32 tensors.
* User could specify arguments like desired prompt, eval_mode, etc.
*/

#include <executorch/backends/qualcomm/runtime/QnnExecuTorch.h>
#include <executorch/examples/qualcomm/qaihub_scripts/llama2/runner/runner.h>
#include <executorch/examples/qualcomm/qaihub_scripts/llama/runner/runner.h>
#include <executorch/extension/runner_util/managed_tensor.h>
#include <executorch/runtime/platform/log.h>

@@ -68,6 +66,7 @@ int main(int argc, char** argv) {
Runner runner(
models_path,
pos_embs_path,
{8, 8, 8, 8},
FLAGS_tokenizer_path.c_str(),
FLAGS_eval_mode,
FLAGS_temperature,