Qualcomm AI Engine Direct - Refine Llama3 Tokenizer #4940

Merged: 1 commit, merged on Aug 29, 2024

4 changes: 2 additions & 2 deletions backends/qualcomm/scripts/build.sh
@@ -70,7 +70,7 @@ if [ "$BUILD_AARCH64" = true ]; then
rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT
else
# Force rebuild flatccrt for the correct platform
cd $BUILD_ROOT/sdk && make clean
cd $BUILD_ROOT/devtools && make clean
fi

cd $BUILD_ROOT
@@ -112,7 +112,7 @@ if [ "$BUILD_X86_64" = true ]; then
rm -rf $BUILD_ROOT && mkdir $BUILD_ROOT
else
# Force rebuild flatccrt for the correct platform
cd $BUILD_ROOT/sdk && make clean
cd $BUILD_ROOT/devtools && make clean
fi

cd $BUILD_ROOT
7 changes: 6 additions & 1 deletion backends/qualcomm/tests/test_qnn_delegate.py
@@ -1996,7 +1996,12 @@ def test_llama3_8b(self):
self.fail(msg["Error"])
else:
model_out = msg["result"]
self.assertTrue(model_out.startswith(prompt))
expected_result = (
"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
+ prompt
+ "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
)
self.assertTrue(model_out.startswith(expected_result))

def test_stable_diffusion(self):
if not self.required_envs():
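For context, the updated assertion expects the runner's decoded output to begin with the prompt wrapped in the Llama 3 chat template rather than the bare prompt. A minimal sketch of that wrapping, using only the special tokens visible in the diff (the helper name is illustrative, not part of the test):

```python
# Minimal sketch of the Llama 3 chat-template prefix the test now expects.
def wrap_llama3_user_prompt(prompt: str) -> str:
    """Return the prefix a Llama 3 chat model echoes before its answer."""
    return (
        "<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n"
        + prompt
        + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
    )


if __name__ == "__main__":
    prompt = "What is baseball?"
    expected_prefix = wrap_llama3_user_prompt(prompt)
    # A decoded model output should start with this prefix, mirroring the
    # `model_out.startswith(expected_result)` assertion above.
    print(expected_prefix)
```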
28 changes: 14 additions & 14 deletions docs/source/build-run-qualcomm-ai-engine-direct-backend.md
@@ -126,8 +126,8 @@ Python APIs on x64 are required to compile models to Qualcomm AI Engine Direct b

```bash
cd $EXECUTORCH_ROOT
mkdir cmake-out
cd cmake-out
mkdir build-x86
cd build-x86
# Note that the below command might change.
# Please refer to the above build.sh for latest workable commands.
cmake .. \
@@ -158,8 +158,8 @@ Commands to build `qnn_executor_runner` for Android:

```bash
cd $EXECUTORCH_ROOT
mkdir cmake-out-android
cd cmake-out-android
mkdir build-android
cd build-android
# build executorch & qnn_executorch_backend
cmake .. \
-DCMAKE_INSTALL_PREFIX=$PWD \
@@ -189,7 +189,7 @@ cmake ../examples/qualcomm \
cmake --build examples/qualcomm -j$(nproc)

# qnn_executor_runner can be found under examples/qualcomm
# The full path is $EXECUTORCH_ROOT/cmake-out-android/examples/qualcomm/qnn_executor_runner
# The full path is $EXECUTORCH_ROOT/build-android/examples/qualcomm/qnn_executor_runner
ls examples/qualcomm
```

@@ -209,7 +209,7 @@ cd $EXECUTORCH_ROOT
cp schema/program.fbs exir/_serialize/program.fbs
cp schema/scalar_type.fbs exir/_serialize/scalar_type.fbs

python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8550 --compile_only --download
python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8550 --compile_only --download
```

You might see something like below:
@@ -239,7 +239,7 @@ We can test model inferences before deploying it to a device by HTP emulator.
Let's build `qnn_executor_runner` for a x64 host:
```bash
# assuming the AOT component is built.
cd $EXECUTORCH_ROOT/cmake-out
cd $EXECUTORCH_ROOT/build-x86
Contributor: Thanks for fixing this. I reverted the change to use build-x86 instead and it seems like some cases are missing.

cmake ../examples/qualcomm \
-DCMAKE_PREFIX_PATH="$PWD/lib/cmake/ExecuTorch;$PWD/third-party/gflags;" \
-DCMAKE_FIND_ROOT_PATH_MODE_PACKAGE=BOTH \
@@ -249,23 +249,23 @@ cmake ../examples/qualcomm \
cmake --build examples/qualcomm -j$(nproc)

# qnn_executor_runner can be found under examples/qualcomm
# The full path is $EXECUTORCH_ROOT/cmake-out/examples/qualcomm/qnn_executor_runner
# The full path is $EXECUTORCH_ROOT/build-x86/examples/qualcomm/qnn_executor_runner
ls examples/qualcomm/
```

To run the HTP emulator, the dynamic linker needs to access QNN libraries and `libqnn_executorch_backend.so`.
We set the two paths below in the `LD_LIBRARY_PATH` environment variable:
1. `$QNN_SDK_ROOT/lib/x86_64-linux-clang/`
2. `$EXECUTORCH_ROOT/cmake-out/lib/`
2. `$EXECUTORCH_ROOT/build-x86/lib/`

The first path is for QNN libraries including HTP emulator. It has been configured in the AOT compilation section.

The second path is for `libqnn_executorch_backend.so`.

So, we can run `./deeplab_v3/dlv3_qnn.pte` by:
```bash
cd $EXECUTORCH_ROOT/cmake-out
export LD_LIBRARY_PATH=$EXECUTORCH_ROOT/cmake-out/lib/:$LD_LIBRARY_PATH
cd $EXECUTORCH_ROOT/build-x86
export LD_LIBRARY_PATH=$EXECUTORCH_ROOT/build-x86/lib/:$LD_LIBRARY_PATH
examples/qualcomm/qnn_executor_runner --model_path ../deeplab_v3/dlv3_qnn.pte
```

@@ -308,8 +308,8 @@ So, we can run `qnn_executor_runner` like

```bash
adb push ./deeplab_v3/dlv3_qnn.pte ${DEVICE_DIR}
adb push ${EXECUTORCH_ROOT}/cmake-out-android/examples/qualcomm/executor_runner/qnn_executor_runner ${DEVICE_DIR}
adb push ${EXECUTORCH_ROOT}/cmake-out-android/lib/libqnn_executorch_backend.so ${DEVICE_DIR}
adb push ${EXECUTORCH_ROOT}/build-android/examples/qualcomm/executor_runner/qnn_executor_runner ${DEVICE_DIR}
adb push ${EXECUTORCH_ROOT}/build-android/lib/libqnn_executorch_backend.so ${DEVICE_DIR}
adb shell "cd ${DEVICE_DIR} \
&& export LD_LIBRARY_PATH=${DEVICE_DIR} \
&& export ADSP_LIBRARY_PATH=${DEVICE_DIR} \
@@ -333,7 +333,7 @@ I 00:00:00.364875 executorch:qnn_executor_runner.cpp:425] Write etdump to etdump
The model is merely executed. If we want to feed real inputs and get model outputs, we can use
```bash
cd $EXECUTORCH_ROOT
python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8550 --download -s <device_serial>
python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8550 --download -s <device_serial>
```
The `<device_serial>` can be found by `adb devices` command.

2 changes: 1 addition & 1 deletion examples/demo-apps/android/ExecuTorchDemo/README.md
@@ -53,7 +53,7 @@ For delegating to Qualcomm Hexagon NPU, please follow the tutorial [here](build-
After generating the model, copy the model to `assets` directory.

```bash
python -m examples.qualcomm.scripts.deeplab_v3 -b cmake-out-android -m SM8450 -s <adb_connected_device_serial>
python -m examples.qualcomm.scripts.deeplab_v3 -b build-android -m SM8450 -s <adb_connected_device_serial>
cp deeplab_v3/dlv3_qnn.pte examples/demo-apps/android/ExecuTorchDemo/app/src/main/assets/
```

4 changes: 2 additions & 2 deletions examples/qualcomm/README.md
@@ -53,12 +53,12 @@ cd $EXECUTORCH_ROOT/examples/qualcomm/scripts

#### For MobileNet_v2
```bash
python mobilenet_v2.py -s <device_serial> -m "SM8550" -b path/to/cmake-out-android/ -d /path/to/imagenet-mini/val
python mobilenet_v2.py -s <device_serial> -m "SM8550" -b path/to/build-android/ -d /path/to/imagenet-mini/val
```

#### For DeepLab_v3
```bash
python deeplab_v3.py -s <device_serial> -m "SM8550" -b path/to/cmake-out-android/ --download
python deeplab_v3.py -s <device_serial> -m "SM8550" -b path/to/build-android/ --download
```

#### Check context binary version
2 changes: 1 addition & 1 deletion examples/qualcomm/oss_scripts/llama2/README.md
@@ -32,7 +32,7 @@ echo '{"dim": 768, "multiple_of": 32, "n_heads": 12, "n_layers": 12, "norm_eps":
Default example generates the story based on the given prompt, "Once".
```bash
# 16a4w quant:
python examples/qualcomm/oss_scripts/llama2/llama.py -a ${ARTIFACTS} -b cmake-out-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --prompt "Once"
python examples/qualcomm/oss_scripts/llama2/llama.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --ptq 16a4w --checkpoint stories110M --params params.json --tokenizer_model tokenizer.model --tokenizer_bin tokenizer.bin --prompt "Once"
```

#### (Note) Customized PTQ data set
4 changes: 2 additions & 2 deletions examples/qualcomm/qaihub_scripts/llama/README.md
@@ -27,7 +27,7 @@ python -m examples.models.llama2.tokenizer.tokenizer -t tokenizer.model -o token
#### Step3: Run default examples
```bash
# AIHUB_CONTEXT_BINARIES: ${PATH_TO_AIHUB_WORKSPACE}/build/llama_v2_7b_chat_quantized
python examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py -a ${ARTIFACTS} -b cmake-out-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_bin tokenizer.bin --prompt "What is Python?"
python examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_bin tokenizer.bin --prompt "What is Python?"
```

## Llama-3-8b-chat-hf
@@ -48,5 +48,5 @@ Note that the pre-compiled context binaries could not be further fine-tuned for o
#### Step3: Run default examples
```bash
# AIHUB_CONTEXT_BINARIES: ${PATH_TO_AIHUB_WORKSPACE}/build/llama_v3_8b_chat_quantized
python examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py -a ${ARTIFACTS} -b cmake-out-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_model tokenizer.model --prompt "What is baseball?"
python examples/qualcomm/qaihub_scripts/llama/llama3/qaihub_llama3_8b.py -b build-android -s ${SERIAL_NUM} -m ${SOC_MODEL} --context_binaries ${AIHUB_CONTEXT_BINARIES} --tokenizer_model tokenizer.model --prompt "What is baseball?"
```
70 changes: 30 additions & 40 deletions examples/qualcomm/qaihub_scripts/llama/llama2/qaihub_llama2_7b.py
@@ -4,7 +4,6 @@
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import gc
import json
import os
from multiprocessing.connection import Client
@@ -15,18 +14,19 @@
QcomChipset,
)
from executorch.backends.qualcomm.utils.utils import (
canonicalize_program,
from_context_binary,
generate_htp_compiler_spec,
generate_qnn_executorch_compiler_spec,
generate_qnn_executorch_option,
)
from executorch.examples.qualcomm.qaihub_scripts.utils.utils import (
gen_pte_from_ctx_bin,
get_encoding,
)
from executorch.examples.qualcomm.utils import (
setup_common_args_and_variables,
SimpleADB,
)
from executorch.exir.backend.backend_api import to_backend
from executorch.exir.passes.memory_planning_pass import MemoryPlanningPass


def main(args):
@@ -55,45 +55,25 @@ def main(args):
is_from_context_binary=True,
)

pte_name = (
"qaihub_llama2_7b_prompt"
if args.use_prompt_processor
else "qaihub_llama2_7b_token"
)
if args.use_prompt_processor:
pte_name = "qaihub_llama2_7b_prompt"
last_shard_num_inputs = 4
last_shard_num_outputs = 513
else:
pte_name = "qaihub_llama2_7b_token"
last_shard_num_inputs = 516
last_shard_num_outputs = 513

if args.pre_gen_pte is None:
# create custom operators as context loader
bundle_programs = [
from_context_binary(f"{args.context_binaries}/{target}", f"ctx_loader_{i}")
for i, target in enumerate(target_names)
]
# lower with QnnBackend
lowered_modules = [
to_backend("QnnBackend", prog["edge_program"], compiler_specs)
for prog in bundle_programs
]
# setup spill-fill buffer for relieving runtime memory usage
canonicalize_program(lowered_modules)
# export pte files
pte_files = []
for i in range(len(target_names)):
print(f"pte {i} generating...")
memory_planning_pass = MemoryPlanningPass(
memory_planning_algo="greedy",
alloc_graph_input=False,
alloc_graph_output=False,
)
pte_files.append(f"{args.artifact}/{pte_name}_{i}.pte")
with open(pte_files[-1], "wb") as file:
file.write(
lowered_modules[0].buffer(
extract_delegate_segments=True,
memory_planning=memory_planning_pass,
)
)
# gc for reducing host memory consuming
bundle_programs.pop(0)
lowered_modules.pop(0)
gc.collect()
pte_names = [f"{pte_name}_{i}" for i in range(len(target_names))]
pte_files = gen_pte_from_ctx_bin(
args.artifact, pte_names, compiler_specs, bundle_programs
)
else:
pte_files = [f"{args.pre_gen_pte}/{pte_name}_{i}.pte" for i in range(4)]

@@ -125,7 +105,16 @@ def get_logit_encoding(path_to_last_shard: str):
)
output_file = "result.txt"
pos_embs_file = ["freq_cos", "freq_sin"]
scale, offset = get_logit_encoding(target_names[-1])
encoding = get_encoding(
path_to_shard=f"{args.context_binaries}/{target_names[-1]}",
compiler_specs=compiler_specs,
get_input=False,
get_output=True,
num_input=last_shard_num_inputs,
num_output=last_shard_num_outputs,
)[0]
scale = encoding["scale"][-1]
offset = encoding["offset"][-1]
outputs = []
runner_args = [
*[
@@ -173,7 +162,8 @@ def post_process():
freq = (freq / scale + offset).clip(min=0, max=65535).detach()
freq.to(dtype=torch.uint16).numpy().tofile(custom_files[-1])

adb.push(files=custom_files)
if not args.skip_push:
adb.push(files=custom_files)
adb.execute(custom_runner_cmd=runner_cmds)
adb.pull(args.artifact, callback=post_process)
if args.ip and args.port != -1:
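For context on the `scale`/`offset` pair read from the last shard's encoding above: it parameterizes an affine quantization, the same mapping the script applies to the `freq_cos`/`freq_sin` position-embedding files before pushing them. A minimal sketch of that mapping under placeholder values (the helper names and tensors are illustrative, not part of the diff):

```python
# Illustrative sketch of scale/offset (affine) uint16 quantization as used above.
import torch


def quantize_uint16(x: torch.Tensor, scale: float, offset: float) -> torch.Tensor:
    # Mirrors the script: divide by scale, add offset, clamp to the uint16 range.
    # Requires a PyTorch build with torch.uint16 support, as the script assumes.
    return (x / scale + offset).clip(min=0, max=65535).to(dtype=torch.uint16)


def dequantize_uint16(q: torch.Tensor, scale: float, offset: float) -> torch.Tensor:
    # Inverse mapping, e.g. for inspecting logits whose scale/offset come from
    # the last shard's encoding.
    return (q.to(dtype=torch.float32) - offset) * scale


if __name__ == "__main__":
    scale, offset = 0.001, 0.0        # placeholders; real values come from get_encoding(...)
    freq = torch.rand(1, 128)         # placeholder standing in for freq_cos / freq_sin
    q = quantize_uint16(freq, scale, offset)
    print(q.dtype, dequantize_uint16(q, scale, offset).shape)
```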
@@ -230,7 +220,7 @@ def post_process():
parser.add_argument(
"--temperature",
help="sampling temperature for llama2",
default=0.8,
default=0.0,
Contributor: Any specific reason we're using 0 temperature?

Collaborator (author): We changed the default to 0 because the output is more consistent, which is better for testing purposes.

type=float,
)

@@ -36,8 +36,8 @@ DEFINE_string(tokenizer_path, "tokenizer.bin", "Tokenizer stuff.");
DEFINE_string(prompt, "The answer to the ultimate question is", "Prompt.");
DEFINE_double(
temperature,
0.8f,
"Temperature; Default is 0.8f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic");
0.0f,
"Temperature; Default is 0.0f. 0 = greedy argmax sampling (deterministic). Lower temperature = more deterministic");
DEFINE_int32(
eval_mode,
0,
@@ -75,9 +75,10 @@ int main(int argc, char** argv) {

// generate tokens & store inference output
std::ofstream fout(FLAGS_output_path.c_str());
runner.generate(FLAGS_prompt, FLAGS_seq_len, [&](const std::string& piece) {
fout << piece;
});
runner.generate(
FLAGS_prompt, "", FLAGS_seq_len, [&](const std::string& piece) {
fout << piece;
});
fout.close();
return 0;
}
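The temperature default change (0.8 to 0.0) follows the flag's own description: 0 means greedy argmax sampling, which is deterministic and therefore easier to test against, as discussed in the review comments. A small sketch of the difference, assuming a toy logits vector (the helper name and values are illustrative):

```python
# Illustrative sketch: greedy argmax vs. temperature sampling for the next token.
import torch


def sample_next_token(logits: torch.Tensor, temperature: float) -> int:
    if temperature == 0.0:
        # Greedy argmax sampling: deterministic, ideal for reproducible tests.
        return int(torch.argmax(logits))
    # Lower temperature sharpens the distribution, making output more deterministic.
    probs = torch.softmax(logits / temperature, dim=-1)
    return int(torch.multinomial(probs, num_samples=1))


if __name__ == "__main__":
    logits = torch.tensor([1.0, 3.0, 0.5])  # toy vocabulary of three tokens
    print(sample_next_token(logits, temperature=0.0))  # always index 1
    print(sample_next_token(logits, temperature=0.8))  # may vary run to run
```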