Change bert to batch prefill

Joey Tsai · Joey Tsai · commit 00d9e0fe3d4a · 2024-11-22T09:53:19.000+08:00
diff --git a/examples/qualcomm/oss_scripts/llama2/llama.py b/examples/qualcomm/oss_scripts/llama2/llama.py
@@ -229,7 +229,7 @@ def sample_top_p(probs: torch.Tensor, top_p: float) -> torch.Tensor:
     print(f"calibration data:\n{sp_model.decode(token_list)}")
 
 
-def _bert_calibrate(
+def _batch_prefill_calibrate(
     example_inputs,
     user_prompts,
     module: torch.fx.GraphModule,
@@ -273,7 +273,7 @@ def calibrate(
     max_seq_len=512,
 ):
     if len(example_inputs) == 2:
-        _bert_calibrate(
+        _batch_prefill_calibrate(
             example_inputs,
             user_prompts,
             module,
@@ -332,7 +332,7 @@ def _tag_kv_ios(self, gm: torch.fx.GraphModule, kv_type):
                         == self.llama_meta["get_head_dim"]
                     ):
                         a.meta[QCOM_QUANTIZED_IO] = kv_type
-                    # single head, bert mode
+                    # single head, batch_prefill mode
                     elif a.meta["val"].flatten().size()[0] == self.llama_meta[
                         "get_head_dim"
                     ] * (self.llama_meta["get_max_seq_len"] - 1):
@@ -416,7 +416,7 @@ def compile(args):
 
     if args.model_mode == "kv":
         use_kv_cache = output_new_cache_only = True
-    elif args.model_mode == "bert" or args.model_mode == "hybrid":
+    elif args.model_mode == "batch_prefill" or args.model_mode == "hybrid":
         raise NotImplementedError(
             f"model_mode {args.model_mode} is not implemented yet."
         )
@@ -653,9 +653,9 @@ def post_process():
 
     parser.add_argument(
         "--model_mode",
-        help="Export and inference bert mode, kv mode or hybrid(TBD) mode",
+        help="Export and inference batch_prefill mode, kv mode or hybrid(TBD) mode",
         default="kv",
-        choices=["bert", "kv", "hybrid"],
+        choices=["batch_prefill", "kv", "hybrid"],
         type=str,
     )
 
diff --git a/examples/qualcomm/oss_scripts/llama2/model/static_llama.py b/examples/qualcomm/oss_scripts/llama2/model/static_llama.py
@@ -21,7 +21,7 @@ def apply_rotary_emb_single(
 ) -> torch.Tensor:
     x_r, x_i = x[..., ::2], x[..., 1::2]
 
-    # brodcast for bert mode input x
+    # broadcast for batch_prefill mode input x
     if x.dim() == 4:
         freqs_cos = freqs_cos[None, :, None, :]
         freqs_sin = freqs_sin[None, :, None, :]
@@ -111,7 +111,7 @@ def forward_sha(
             for i, _ in enumerate(k_caches):
                 kh.append(torch.cat([k_caches[i], k[i]], dim=-1))
                 vh.append(torch.cat([v_caches[i], v[i]], dim=1))
-        # bert/prefill mode
+        # batch_prefill mode
         else:
             kh = k
             vh = v
@@ -131,7 +131,7 @@ def forward_sha(
         if self.output_new_cache_only:
             if k_caches and v_caches:
                 return y, k, v
-            # bert mode. Consider to remove, it's not really used
+            # batch_prefill mode. Consider removing; it's not really used
             return y, k[-1], v[-1]
 
         return y, kh, vh
@@ -172,7 +172,7 @@ def forward(
 
                 output_y.append(y)
 
-        # bert/prefill mode
+        # batch_prefill mode
         else:
             kh = k
             vh = v
diff --git a/examples/qualcomm/oss_scripts/llama3_2/llama.py b/examples/qualcomm/oss_scripts/llama3_2/llama.py
@@ -103,7 +103,7 @@ def _kv_calibrate(
     print(f"calibration data:\n{sp_model.decode(token_list)}")
 
 
-def _bert_calibrate(
+def _batch_prefill_calibrate(
     example_inputs,
     user_prompts,
     module: torch.fx.GraphModule,
@@ -147,7 +147,7 @@ def calibrate(
     max_seq_len=512,
 ):
     if len(example_inputs) == 2:
-        _bert_calibrate(
+        _batch_prefill_calibrate(
             example_inputs,
             user_prompts,
             module,
@@ -206,7 +206,7 @@ def _tag_kv_ios(self, gm: torch.fx.GraphModule, kv_type, sharding_type):
                         == self.llama_meta["get_head_dim"]
                     ):
                         a.meta[QCOM_QUANTIZED_IO] = kv_type
-                    # single head, bert mode
+                    # single head, batch_prefill mode
                     elif a.meta["val"].flatten().size()[0] == self.llama_meta[
                         "get_head_dim"
                     ] * (self.llama_meta["get_max_seq_len"] - 1):
@@ -319,7 +319,7 @@ def compile(args):
 
     if args.model_mode == "kv":
         use_kv_cache = output_new_cache_only = True
-    elif args.model_mode == "bert":
+    elif args.model_mode == "batch_prefill":
         use_kv_cache = output_new_cache_only = False
     elif args.model_mode == "hybrid":
         raise NotImplementedError(
@@ -409,7 +409,7 @@ def compile(args):
 def inference(args, pre_gen_pte=""):
     workspace = f"/data/local/tmp/{getpass.getuser()}/executorch/single_llama"
 
-    if args.model_mode == "bert":
+    if args.model_mode == "batch_prefill":
         eval_mode = 0
     elif args.model_mode == "kv":
         eval_mode = 1
@@ -576,9 +576,9 @@ def post_process():
 
     parser.add_argument(
         "--model_mode",
-        help="Export and inference bert mode, kv mode or hybrid(TBD) mode",
+        help="Export and inference batch_prefill mode, kv mode or hybrid(TBD) mode",
         default="kv",
-        choices=["bert", "kv", "hybrid"],
+        choices=["batch_prefill", "kv", "hybrid"],
         type=str,
     )
 
diff --git a/examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp b/examples/qualcomm/oss_scripts/llama3_2/qnn_llama3_2_runner.cpp
@@ -46,7 +46,7 @@ DEFINE_int32(
 DEFINE_int32(
     eval_mode,
     0,
-    "0: PromptProcessor(bert) / 1: TokenGenerator(kv) / 2: HybridMode (TBD)");
+    "0: PromptProcessor(batch_prefill) / 1: TokenGenerator(kv) / 2: HybridMode (TBD)");
 
 int main(int argc, char** argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
diff --git a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.cpp b/examples/qualcomm/oss_scripts/llama3_2/runner/runner.cpp
@@ -107,7 +107,7 @@ Error Runner::load() {
 
   // prepare io
   auto methods_meta = get_methods_meta();
-  if (eval_mode_ == EvalMode::kBert) {
+  if (eval_mode_ == EvalMode::kBatchPrefill) {
     io_mem_->prepare_prefill_io(methods_meta);
   } else {
     io_mem_->prepare_kv_io(methods_meta);
@@ -217,7 +217,7 @@ Error Runner::generate(
   HybridMemory::IO* ptr =
       static_cast<HybridMemory::IO*>(io_mem_->get_mutable_ptr());
 
-  if (eval_mode_ == EvalMode::kBert) {
+  if (eval_mode_ == EvalMode::kBatchPrefill) {
     for (int i = 0; i < num_prompt_tokens; i++) {
       ptr->prefill_input_toks[i] = static_cast<int32_t>(prompt_tokens[i]);
       auto piece_res = tokenizer_->decode(prompt_tokens[i], prompt_tokens[i]);
diff --git a/examples/qualcomm/oss_scripts/llama3_2/runner/runner.h b/examples/qualcomm/oss_scripts/llama3_2/runner/runner.h
@@ -72,7 +72,7 @@ class Runner {
 
  private:
   enum EvalMode {
-    kBert = 0,
+    kBatchPrefill = 0,
     kKVCached,
     kUnsupported,
   };