
Commit b4a9e53

Comment out eot condition to generate all tokens
1 parent: 3933cb6

4 files changed: 11 additions, 62 deletions

backends/qualcomm/utils/utils.py

Lines changed: 1 addition & 0 deletions
@@ -823,6 +823,7 @@ def generate_multi_graph_program(
     )
     assert qnn_mgr.Init().value == 0, "failed to load processed bytes"
     binary_info = bytes(qnn_mgr.Compile())
+    print("Checking the size of QNN binary info: ", len(binary_info))
     assert len(binary_info) != 0, "failed to generate QNN context binary"
     graph_names = qnn_mgr.GetGraphNames()
     for graph_name in graph_names:
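
The only functional change here is a debug print of the compiled context binary's size before the existing non-empty assertion. A minimal sketch of the same check as a standalone helper, assuming only the qnn_mgr calls visible in the diff (the helper itself is hypothetical, not part of utils.py):

def compile_context_binary(qnn_mgr) -> bytes:
    # Hypothetical wrapper around the calls shown above.
    binary_info = bytes(qnn_mgr.Compile())
    print("Checking the size of QNN binary info: ", len(binary_info))
    assert len(binary_info) != 0, "failed to generate QNN context binary"
    return binary_info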

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 6 additions & 47 deletions
@@ -82,6 +82,7 @@ def _kv_calibrate(
     _, atten_mask, _, k_caches, v_caches = example_inputs
 
     # TODO: change criteria & support batch inputs if necessary
+    pos = torch.tensor(0, dtype=torch.int32)
     max_cache_len = max_seq_len - 1
 
     token_list = []
@@ -113,42 +114,10 @@ def _kv_calibrate(
                 for i, v_cache in enumerate(v_caches)
             ]
 
-    # token_list = sp_model.encode(user_prompts, bos=True, eos=False)
-
-    user_token_list = [
-        # what is the capital of the united states
-        [128000, 128006, 882, 128007, 271, 12840, 374, 279, 6864, 315, 279, 29292, 5415, 128009, 128006, 78191, 128007, 271],
-        # what is 1 + 1
-        [128000, 128006, 882, 128007, 271, 12840, 374, 220, 16, 489, 220, 16, 128009, 128006, 78191, 128007, 271],
-        # what is the meaning of life
-        [128000, 128006, 882, 128007, 271, 12840, 374, 279, 7438, 315, 2324, 128009, 128006, 78191, 128007, 271],
-    ]
-
-    for token_list in user_token_list:
-        _, atten_mask, _, k_caches, v_caches = copy.deepcopy(example_inputs)
-        pos = torch.tensor(0, dtype=torch.int32)
-        with torch.no_grad():
-            while token_list[-1] != sp_model.eos_id and pos < max_cache_len:
-                logits, new_k_caches, new_v_caches = module(
-                    torch.full((1, 1), token_list[pos], dtype=torch.int32),
-                    atten_mask,
-                    torch.full((1, 1), pos),
-                    *k_caches,
-                    *v_caches,
-                )
-                k_caches = [
-                    torch.cat([k_cache[:, :, 1:], new_k_caches[i]], dim=-1)
-                    for i, k_cache in enumerate(k_caches)
-                ]
-                v_caches = [
-                    torch.cat([v_cache[:, 1:, :], new_v_caches[i]], dim=1)
-                    for i, v_cache in enumerate(v_caches)
-                ]
-
-                pos += 1
-                atten_mask[0][-pos - 1] = 0
-                if pos >= len(token_list):
-                    token_list.append(torch.argmax(logits[:, -1], dim=-1).item())
+            pos += 1
+            atten_mask[0][-pos - 1] = 0
+            if pos >= len(token_list):
+                token_list.append(torch.argmax(logits[:, -1], dim=-1).item())
 
     print(f"kv calibration data:\n{tokenizer.decode(token_list)}")

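The restored single-prompt calibration loop drives the model with a sliding-window KV cache: each step drops the oldest cache entry, appends the new key/value slice, opens one more attention-mask position, and switches to greedy argmax decoding once the prompt tokens are exhausted. A self-contained sketch of that update pattern, with toy shapes and random tensors standing in for the real exported Llama module:

import torch

max_seq_len = 8                      # toy sizes, not the real config
max_cache_len = max_seq_len - 1
k_cache = torch.zeros(1, 4, max_cache_len)        # (batch, head_dim, cache_len)
v_cache = torch.zeros(1, max_cache_len, 4)        # (batch, cache_len, head_dim)
atten_mask = torch.full((1, max_seq_len), -1e9)   # fully masked to start

token_list = [1, 2, 3]               # stand-in prompt token ids
pos = 0
while pos < max_cache_len:
    # Stand-in for one call of the exported module: logits plus new K/V slices.
    logits = torch.randn(1, 1, 32)
    new_k = torch.randn(1, 4, 1)
    new_v = torch.randn(1, 1, 4)

    # Slide the caches: drop the oldest entry, append the newest (as in the diff).
    k_cache = torch.cat([k_cache[:, :, 1:], new_k], dim=-1)
    v_cache = torch.cat([v_cache[:, 1:, :], new_v], dim=1)

    pos += 1
    atten_mask[0][-pos - 1] = 0                    # expose one more position
    if pos >= len(token_list):                     # past the prompt: greedy decode
        token_list.append(torch.argmax(logits[:, -1], dim=-1).item())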
@@ -359,17 +328,7 @@ def quantize(self, quant_dtype, args, tokenizer, custom_annotations=()):
             max_seq_len=self.llama_meta["get_max_seq_len"],
         )
 
-        fx_graph_module = convert_pt2e(fx_graph_module)
-
-        logging.info("Evaluating the converted model...")
-        calibrate(
-            self.get_example_inputs(self.llama_meta["get_use_kv_cache"]),
-            args.prompt,
-            fx_graph_module,
-            tokenizer_model_path=args.tokenizer_model,
-            max_seq_len=self.llama_meta["get_max_seq_len"],
-        )
-        self.llama_model = fx_graph_module
+        self.llama_model = convert_pt2e(fx_graph_module)
 
     def lowering_modules(
         self,
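
After this change the quantize step is a plain calibrate-then-convert PTQ flow: the prepared FX graph is calibrated once (the calibrate call whose trailing arguments appear as context above), convert_pt2e produces the quantized module, and the extra evaluation pass over the already-converted model is gone. A hedged sketch of that flow using the core PT2E APIs; the toy model, the XNNPACKQuantizer stand-in (the script itself uses a Qualcomm quantizer), and the PyTorch 2.5+ export_for_training capture are assumptions, not the script's exact code:

import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e
from torch.ao.quantization.quantizer.xnnpack_quantizer import (
    XNNPACKQuantizer,
    get_symmetric_quantization_config,
)

# Toy network standing in for the exported Llama graph module.
model = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.ReLU()).eval()
example_inputs = (torch.randn(1, 8),)

quantizer = XNNPACKQuantizer()
quantizer.set_global(get_symmetric_quantization_config())

# Capture and prepare (PyTorch 2.5+; older releases used capture_pre_autograd_graph).
captured = torch.export.export_for_training(model, example_inputs).module()
prepared = prepare_pt2e(captured, quantizer)

# Calibrate the *prepared* graph only, with representative inputs.
for _ in range(4):
    prepared(*example_inputs)

# Convert once and keep the result; no second calibration of the converted model.
quantized = convert_pt2e(prepared)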

examples/qualcomm/oss_scripts/llama/runner/runner.cpp

Lines changed: 4 additions & 4 deletions
@@ -404,10 +404,10 @@ Error Runner::generate(
       token_callback(piece_res.get().c_str());
     }
 
-    if (pos >= num_prompt_tokens && eos_id_.count(cur_token) > 0) {
-      ET_LOG(Info, "\nReached to the end of generation");
-      break;
-    }
+    // if (pos >= num_prompt_tokens && eos_id_.count(cur_token) > 0) {
+    //   ET_LOG(Info, "\nReached to the end of generation");
+    //   break;
+    // }
   }
 };

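Commenting out this block removes the runner's only early-exit path: generation no longer stops when an end-of-text token is sampled after the prompt, so the loop runs until the requested sequence length, which is the commit's stated intent. A toy decode loop (in Python for brevity; the names are illustrative, not the runner's actual API) showing what the disabled condition does:

def generate(prompt_tokens, seq_len, step, eos_ids, stop_on_eos=True):
    """Toy decode loop illustrating the early-exit condition the commit disables."""
    tokens = list(prompt_tokens)
    pos = 0
    while pos < seq_len - 1:
        cur_token = tokens[pos]
        next_token = step(cur_token, pos)     # stand-in for one model step
        pos += 1
        if pos >= len(prompt_tokens):         # past the prompt: keep the sample
            tokens.append(next_token)
        # Early exit on end-of-text, mirroring the commented-out check above.
        if stop_on_eos and pos >= len(prompt_tokens) and next_token in eos_ids:
            break
    return tokens

Calling this with stop_on_eos=False mirrors the patched runner: for a prompt shorter than seq_len it keeps producing tokens until seq_len is reached, regardless of end-of-text.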
exir/emit/test/test_emit.py

Lines changed: 0 additions & 11 deletions
@@ -1682,10 +1682,7 @@ def forward(self, x):
         ]
         self.assertEqual(external_map["linear.weight"], 0)
         self.assertEqual(external_map["linear.bias"], 1)
-<<<<<<< HEAD
 
-=======
->>>>>>> c766f0dc0 (Apply calibration patch and deduplicate delegate cache patch)
     def test_delegate_deduplicate(self) -> None:
         class SharedModule(torch.nn.Module):
             def __init__(self):
@@ -1695,10 +1692,6 @@ def __init__(self):
             def forward(self, x):
                 return self.linear(x)
 
-<<<<<<< HEAD
-=======
-
->>>>>>> c766f0dc0 (Apply calibration patch and deduplicate delegate cache patch)
         class Module1(torch.nn.Module):
             def __init__(self, shared_module):
                 super().__init__()
@@ -1707,10 +1700,6 @@ def __init__(self, shared_module):
            def forward(self, x):
                 return self.shared_module(x)
 
-<<<<<<< HEAD
-=======
-
->>>>>>> c766f0dc0 (Apply calibration patch and deduplicate delegate cache patch)
         class Module2(torch.nn.Module):
             def __init__(self, shared_module):
                 super().__init__()
