
Commit 0c6a77e

Refine tokenizer (#4940)

1 parent 58efb8b commit 0c6a77e

18 files changed: +290 −242 lines changed

backends/qualcomm/scripts/build.sh

Lines changed: 2 additions & 2 deletions
@@ -70,7 +70,7 @@ if [ "$BUILD_AARCH64" = true ]; then
   rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT
 else
   # Force rebuild flatccrt for the correct platform
-  cd $BUILD_ROOT/sdk && make clean
+  cd $BUILD_ROOT/devtools && make clean
 fi
 
 cd $BUILD_ROOT
@@ -112,7 +112,7 @@ if [ "$BUILD_X86_64" = true ]; then
   rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT
 else
   # Force rebuild flatccrt for the correct platform
-  cd $BUILD_ROOT/sdk && make clean
+  cd $BUILD_ROOT/devtools && make clean
 fi
 
 cd $BUILD_ROOT

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 6 additions & 1 deletion
@@ -2036,7 +2036,12 @@ def test_llama3_8b(self):
             self.fail(msg["Error"])
         else:
             model_out = msg["result"]
-            self.assertTrue(model_out.startswith(prompt))
+            expected_result = (
+                "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
+                + prompt
+                + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
+            )
+            self.assertTrue(model_out.startswith(expected_result))
 
     def test_stable_diffusion(self):
         if not self.required_envs():
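For reference, the `expected_result` prefix asserted above is just the Llama 3 instruct chat template applied to the user prompt. A minimal sketch of that wrapping (the helper name here is ours, not part of this commit):

```python
def wrap_llama3_prompt(prompt: str) -> str:
    # Llama 3 instruct template: a user turn followed by the assistant
    # header, matching the prefix asserted in test_llama3_8b above.
    return (
        "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
        + prompt
        + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    )


print(wrap_llama3_prompt("What is the capital of France?"))
```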

docs/source/build-run-qualcomm-ai-engine-direct-backend.md

Lines changed: 14 additions & 14 deletions
@@ -126,8 +126,8 @@ Python APIs on x64 are required to compile models to Qualcomm AI Engine Direct b
 
 ```bash
 cd $EXECUTORCH_ROOT
-mkdir cmake-out
-cd cmake-out
+mkdir build-x86
+cd build-x86
 # Note that the below command might change.
 # Please refer to the above build.sh for latest workable commands.
 cmake .. \
@@ -158,8 +158,8 @@ Commands to build `qnn_executor_runner` for Android:
 
 ```bash
 cd $EXECUTORCH_ROOT
-mkdir cmake-out-android
-cd cmake-out-android
+mkdir build-android
+cd build-android
 # build executorch & qnn_executorch_backend
 cmake .. \
 -DCMAKE_INSTALL_PREFIX=$PWD \
@@ -189,7 +189,7 @@ cmake ../examples/qualcomm \
 cmake --build examples/qualcomm -j$(nproc)
 
 # qnn_executor_runner can be found under examples/qualcomm
-# The full path is $EXECUTORCH_ROOT/cmake-out-android/examples/qualcomm/qnn_executor_runner
+# The full path is $EXECUTORCH_ROOT/build-android/examples/qualcomm/qnn_executor_runner
 ls examples/qualcomm
 ```
 
@@ -209,7 +209,7 @@ cd $EXECUTORCH_ROOT
 cp schema/program.fbs exir/_serialize/program.fbs
 cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs
 
-python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8550 --compile_only --download
+python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8550 --compile_only --download
 ```
 
 You might see something like below:
@@ -239,7 +239,7 @@ We can test model inferences before deploying it to a device by HTP emulator.
 Let's build `qnn_executor_runner` for a x64 host:
 ```bash
 # assuming the AOT component is built.
-cd $EXECUTORCH_ROOT/cmake-out
+cd $EXECUTORCH_ROOT/build-x86
 cmake ../examples/qualcomm \
 -DCMAKE_PREFIX_PATH="$PWD/lib/cmake/ExecuTorch;$PWD/third-party/gflags;" \
 -DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
@@ -249,23 +249,23 @@ cmake ../examples/qualcomm \
 cmake --build examples/qualcomm -j$(nproc)
 
 # qnn_executor_runner can be found under examples/qualcomm
-# The full path is $EXECUTORCH_ROOT/cmake-out/examples/qualcomm/qnn_executor_runner
+# The full path is $EXECUTORCH_ROOT/build-x86/examples/qualcomm/qnn_executor_runner
 ls examples/qualcomm/
 ```
 
 To run the HTP emulator, the dynamic linker need to access QNN libraries and `libqnn_executorch_backend.so`.
 We set the below two paths to `LD_LIBRARY_PATH` environment variable:
 1. `$QNN_SDK_ROOT/lib/x86_64-linux-clang/`
-2. `$EXECUTORCH_ROOT/cmake-out/lib/`
+2. `$EXECUTORCH_ROOT/build-x86/lib/`
 
 The first path is for QNN libraries including HTP emulator. It has been configured in the AOT compilation section.
 
 The second path is for `libqnn_executorch_backend.so`.
 
 So, we can run `./deeplab_v3/dlv3_qnn.pte` by:
 ```bash
-cd $EXECUTORCH_ROOT/cmake-out
-export LD_LIBRARY_PATH=$EXECUTORCH_ROOT/cmake-out/lib/:$LD_LIBRARY_PATH
+cd $EXECUTORCH_ROOT/build-x86
+export LD_LIBRARY_PATH=$EXECUTORCH_ROOT/build-x86/lib/:$LD_LIBRARY_PATH
 examples/qualcomm/qnn_executor_runner --model_path ../deeplab_v3/dlv3_qnn.pte
 ```
 
@@ -308,8 +308,8 @@ So, we can run `qnn_executor_runner` like
 
 ```bash
 adb push ./deeplab_v3/dlv3_qnn.pte ${DEVICE_DIR}
-adb push ${EXECUTORCH_ROOT}/cmake-out-android/examples/qualcomm/executor_runner/qnn_executor_runner ${DEVICE_DIR}
-adb push ${EXECUTORCH_ROOT}/cmake-out-android/lib/libqnn_executorch_backend.so ${DEVICE_DIR}
+adb push ${EXECUTORCH_ROOT}/build-android/examples/qualcomm/executor_runner/qnn_executor_runner ${DEVICE_DIR}
+adb push ${EXECUTORCH_ROOT}/build-android/lib/libqnn_executorch_backend.so ${DEVICE_DIR}
 adb shell "cd ${DEVICE_DIR} \
 && export LD_LIBRARY_PATH=${DEVICE_DIR} \
 && export ADSP_LIBRARY_PATH=${DEVICE_DIR} \
@@ -333,7 +333,7 @@ I 00:00:00.364875 executorch:qnn_executor_runner.cpp:425] Write etdump to etdump
 The model is merely executed. If we want to feed real inputs and get model outputs, we can use
 ```bash
 cd $EXECUTORCH_ROOT
-python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8550 --download -s <device_serial>
+python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8550 --download -s <device_serial>
 ```
 The `<device_serial>` can be found by `adb devices` command.

examples/demo-apps/android/ExecuTorchDemo/README.md

Lines changed: 1 addition & 1 deletion
@@ -53,7 +53,7 @@ For delegating to Qualcomm Hexagon NPU, please follow the tutorial [here](build-
 After generating the model, copy the model to `assets` directory.
 
 ```bash
-python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8450 -s <adb_connected_device_serial>
+python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8450 -s <adb_connected_device_serial>
 cp deeplab_v3/dlv3_qnn.pte examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/
 ```

examples/qualcomm/README.md

Lines changed: 2 additions & 2 deletions
@@ -53,12 +53,12 @@ cd $EXECUTORCH_ROOT/examples/qualcomm/scripts
 
 #### For MobileNet_v2
 ```bash
-python mobilenet_v2.py -s <device_serial> -m "SM8550" -b path/to/cmake-out-android/ -d /path/to/imagenet-mini/val
+python mobilenet_v2.py -s <device_serial> -m "SM8550" -b path/to/build-android/ -d /path/to/imagenet-mini/val
 ```
 
 #### For DeepLab_v3
 ```bash
-python deeplab_v3.py -s <device_serial> -m "SM8550" -b path/to/cmake-out-android/ --download
+python deeplab_v3.py -s <device_serial> -m "SM8550" -b path/to/build-android/ --download
 ```
 
 #### Check context binary version

examples/qualcomm/oss_scripts/llama2/README.md

Lines changed: 1 addition & 1 deletion
@@ -32,7 +32,7 @@ echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps":
 Default example generates the story based on the given prompt, "Once".
 ```bash
 # 16a4w quant:
-python examples/qualcomm/oss_scripts/llama2/llama.py -a ${ARTIFACTS} -b cmake-out-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --prompt "Once"
+python examples/qualcomm/oss_scripts/llama2/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --prompt "Once"
 ```
 
 #### (Note) Customized PTQ data set

examples/qualcomm/qaihub_scripts/llama/README.md

Lines changed: 2 additions & 2 deletions
@@ -27,7 +27,7 @@ python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o token
 #### Step3: Run default examples
 ```bash
 # AIHUB_CONTEXT_BINARIES: ${PATH_TO_AIHUB_WORKSPACE}/build/llama_v2_7b_chat_quantized
-python examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py -a ${ARTIFACTS} -b cmake-out-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_bin tokenizer.bin --prompt "What is Python?"
+python examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_bin tokenizer.bin --prompt "What is Python?"
 ```
 
 ## Llama-3-8b-chat-hf
@@ -48,5 +48,5 @@ Note that the pre-compiled context binaries could not be futher fine-tuned for o
 #### Step3: Run default examples
 ```bash
 # AIHUB_CONTEXT_BINARIES: ${PATH_TO_AIHUB_WORKSPACE}/build/llama_v3_8b_chat_quantized
-python examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py -a ${ARTIFACTS} -b cmake-out-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_model tokenizer.model --prompt "What is baseball?"
+python examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_model tokenizer.model --prompt "What is baseball?"
 ```

examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py

Lines changed: 30 additions & 40 deletions
@@ -4,7 +4,6 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
-import gc
 import json
 import os
 from multiprocessing.connection import Client
@@ -15,18 +14,19 @@
     QcomChipset,
 )
 from executorch.backends.qualcomm.utils.utils import (
-    canonicalize_program,
     from_context_binary,
     generate_htp_compiler_spec,
     generate_qnn_executorch_compiler_spec,
     generate_qnn_executorch_option,
 )
+from executorch.examples.qualcomm.qaihub_scripts.utils.utils import (
+    gen_pte_from_ctx_bin,
+    get_encoding,
+)
 from executorch.examples.qualcomm.utils import (
     setup_common_args_and_variables,
     SimpleADB,
 )
-from executorch.exir.backend.backend_api import to_backend
-from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass
 
 
 def main(args):
@@ -55,45 +55,25 @@ def main(args):
         is_from_context_binary=True,
     )
 
-    pte_name = (
-        "qaihub_llama2_7b_prompt"
-        if args.use_prompt_processor
-        else "qaihub_llama2_7b_token"
-    )
+    if args.use_prompt_processor:
+        pte_name = "qaihub_llama2_7b_prompt"
+        last_shard_num_inputs = 4
+        last_shard_num_outputs = 513
+    else:
+        pte_name = "qaihub_llama2_7b_token"
+        last_shard_num_inputs = 516
+        last_shard_num_outputs = 513
 
     if args.pre_gen_pte is None:
         # create custom operators as context loader
         bundle_programs = [
             from_context_binary(f"{args.context_binaries}/{target}", f"ctx_loader_{i}")
             for i, target in enumerate(target_names)
         ]
-        # lower with QnnBackend
-        lowered_modules = [
-            to_backend("QnnBackend", prog["edge_program"], compiler_specs)
-            for prog in bundle_programs
-        ]
-        # setup spill-fill buffer for relieving runtime memory usage
-        canonicalize_program(lowered_modules)
-        # export pte files
-        pte_files = []
-        for i in range(len(target_names)):
-            print(f"pte {i} generating...")
-            memory_planning_pass = MemoryPlanningPass(
-                memory_planning_algo="greedy",
-                alloc_graph_input=False,
-                alloc_graph_output=False,
-            )
-            pte_files.append(f"{args.artifact}/{pte_name}_{i}.pte")
-            with open(pte_files[-1], "wb") as file:
-                file.write(
-                    lowered_modules[0].buffer(
-                        extract_delegate_segments=True,
-                        memory_planning=memory_planning_pass,
-                    )
-                )
-            # gc for reducing host memory consuming
-            bundle_programs.pop(0)
-            lowered_modules.pop(0)
-            gc.collect()
+        pte_names = [f"{pte_name}_{i}" for i in range(len(target_names))]
+        pte_files = gen_pte_from_ctx_bin(
+            args.artifact, pte_names, compiler_specs, bundle_programs
+        )
     else:
         pte_files = [f"{args.pre_gen_pte}/{pte_name}_{i}.pte" for i in range(4)]
 
@@ -125,7 +105,16 @@ def get_logit_encoding(path_to_last_shard: str):
     )
     output_file = "result.txt"
     pos_embs_file = ["freq_cos", "freq_sin"]
-    scale, offset = get_logit_encoding(target_names[-1])
+    encoding = get_encoding(
+        path_to_shard=f"{args.context_binaries}/{target_names[-1]}",
+        compiler_specs=compiler_specs,
+        get_input=False,
+        get_output=True,
+        num_input=last_shard_num_inputs,
+        num_output=last_shard_num_outputs,
+    )[0]
+    scale = encoding["scale"][-1]
+    offset = encoding["offset"][-1]
     outputs = []
     runner_args = [
         *[
@@ -173,7 +162,8 @@ def post_process():
         freq = (freq / scale + offset).clip(min=0, max=65535).detach()
         freq.to(dtype=torch.uint16).numpy().tofile(custom_files[-1])
 
-    adb.push(files=custom_files)
+    if not args.skip_push:
+        adb.push(files=custom_files)
     adb.execute(custom_runner_cmd=runner_cmds)
     adb.pull(args.artifact, callback=post_process)
     if args.ip and args.port != -1:
@@ -230,7 +220,7 @@ def post_process():
     parser.add_argument(
         "--temperature",
        help="sampling temperature for llama2",
-        default=0.8,
+        default=0.0,
         type=float,
     )
 
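The `scale` and `offset` pulled from `get_encoding` above are applied to the float position-embedding tables before they are written out and pushed to the device. A minimal sketch of that uint16 affine-quantization step (standalone illustration; the tensor shape and encoding values below are placeholders, and `torch.uint16` requires a recent PyTorch):

```python
import torch


def quantize_uint16(x: torch.Tensor, scale: float, offset: float) -> torch.Tensor:
    # Same affine mapping as in the script above:
    # quantized = real / scale + offset, clipped to the uint16 range.
    return (x / scale + offset).clip(min=0, max=65535).to(dtype=torch.uint16)


freq_cos = torch.rand(1024, 64)  # placeholder table; real shapes come from the model
quantize_uint16(freq_cos, scale=1.0 / 65535, offset=0.0).numpy().tofile("freq_cos")
```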
examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b_runner.cpp

Lines changed: 6 additions & 5 deletions
@@ -36,8 +36,8 @@ DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff.");
 DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt.");
 DEFINE_double(
     temperature,
-    0.8f,
-    "Temperature; Default is 0.8f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic");
+    0.0f,
+    "Temperature; Default is 0.0f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic");
 DEFINE_int32(
     eval_mode,
     0,
@@ -75,9 +75,10 @@ int main(int argc, char** argv) {
 
   // generate tokens & store inference output
   std::ofstream fout(FLAGS_output_path.c_str());
-  runner.generate(FLAGS_prompt, FLAGS_seq_len, [&](const std::string& piece) {
-    fout << piece;
-  });
+  runner.generate(
+      FLAGS_prompt, "", FLAGS_seq_len, [&](const std::string& piece) {
+        fout << piece;
+      });
   fout.close();
   return 0;
 }
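Both the Python script and this runner now default to a temperature of 0.0, i.e. greedy argmax decoding, which is deterministic and so lets tests assert a fixed output prefix. A minimal sketch of the sampling rule the flag's description refers to (illustrative Python, not the runner's actual C++ sampler):

```python
import torch


def sample_next_token(logits: torch.Tensor, temperature: float) -> int:
    # temperature == 0 -> greedy argmax sampling (deterministic)
    if temperature == 0.0:
        return int(torch.argmax(logits))
    # Otherwise sample from the temperature-scaled softmax; a lower
    # temperature sharpens the distribution (more deterministic).
    probs = torch.softmax(logits / temperature, dim=-1)
    return int(torch.multinomial(probs, num_samples=1))


logits = torch.randn(32000)  # placeholder logits over a vocabulary
next_id = sample_next_token(logits, temperature=0.0)
```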
