@@ -73,37 +73,49 @@ def _kv_calibrate(
     max_seq_len=512,
 ):
     sp_model = get_tokenizer(tokenizer_model_path)
-    _, atten_mask, _, k_caches, v_caches = example_inputs
 
     # TODO: change criteria & support batch inputs if necessary
-    pos = torch.tensor(0, dtype=torch.int32)
     max_cache_len = max_seq_len - 1
-    token_list = sp_model.encode(user_prompts, bos=True, eos=False)
 
-    with torch.no_grad():
-        while token_list[-1] != sp_model.eos_id and pos < max_cache_len:
-            logits, new_k_caches, new_v_caches = module(
-                torch.full((1, 1), token_list[pos], dtype=torch.int32),
-                atten_mask,
-                torch.full((1, 1), pos),
-                *k_caches,
-                *v_caches,
-            )
-            k_caches = [
-                torch.cat([k_cache[:, :, 1:], new_k_caches[i]], dim=-1)
-                for i, k_cache in enumerate(k_caches)
-            ]
-            v_caches = [
-                torch.cat([v_cache[:, 1:, :], new_v_caches[i]], dim=1)
-                for i, v_cache in enumerate(v_caches)
-            ]
-
-            pos += 1
-            atten_mask[0][-pos - 1] = 0
-            if pos >= len(token_list):
-                token_list.append(torch.argmax(logits[:, -1], dim=-1).item())
-
-    print(f"calibration data:\n{sp_model.decode(token_list)}")
+
+    # token_list = sp_model.encode(user_prompts, bos=True, eos=False)
+
+    user_token_list = [
+        # what is the capital of the united states
+        [128000, 128006, 882, 128007, 271, 12840, 374, 279, 6864, 315, 279, 29292, 5415, 128009, 128006, 78191, 128007, 271],
+        # what is 1 + 1
+        [128000, 128006, 882, 128007, 271, 12840, 374, 220, 16, 489, 220, 16, 128009, 128006, 78191, 128007, 271],
+        # what is the meaning of life
+        [128000, 128006, 882, 128007, 271, 12840, 374, 279, 7438, 315, 2324, 128009, 128006, 78191, 128007, 271],
+    ]
+
+    for token_list in user_token_list:
+        _, atten_mask, _, k_caches, v_caches = copy.deepcopy(example_inputs)
+        pos = torch.tensor(0, dtype=torch.int32)
+        with torch.no_grad():
+            while token_list[-1] != sp_model.eos_id and pos < max_cache_len:
+                logits, new_k_caches, new_v_caches = module(
+                    torch.full((1, 1), token_list[pos], dtype=torch.int32),
+                    atten_mask,
+                    torch.full((1, 1), pos),
+                    *k_caches,
+                    *v_caches,
+                )
+                k_caches = [
+                    torch.cat([k_cache[:, :, 1:], new_k_caches[i]], dim=-1)
+                    for i, k_cache in enumerate(k_caches)
+                ]
+                v_caches = [
+                    torch.cat([v_cache[:, 1:, :], new_v_caches[i]], dim=1)
+                    for i, v_cache in enumerate(v_caches)
+                ]
+
+                pos += 1
+                atten_mask[0][-pos - 1] = 0
+                if pos >= len(token_list):
+                    token_list.append(torch.argmax(logits[:, -1], dim=-1).item())
+
+        logging.info(f"calibration data:\n{sp_model.decode(token_list)}")
 
 
 def _prefill_calibrate(
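
Note on the decode loop above: each calibration step shifts the caches by one slot before appending the new entry, with keys laid out time-last and values time-first. Below is a minimal standalone sketch of that update convention; the shapes are illustrative assumptions, not taken from the model definition in this commit.

import torch

# Assumed shapes: keys are (batch, head_dim, cache_len), values are (batch, cache_len, head_dim).
k_cache = torch.zeros(1, 4, 7)
v_cache = torch.zeros(1, 7, 4)
new_k = torch.ones(1, 4, 1)  # one new key column per decoded token
new_v = torch.ones(1, 1, 4)  # one new value row per decoded token

# Drop the oldest slot and append the newest, mirroring the calibration loop.
k_cache = torch.cat([k_cache[:, :, 1:], new_k], dim=-1)
v_cache = torch.cat([v_cache[:, 1:, :], new_v], dim=1)
assert k_cache.shape == (1, 4, 7) and v_cache.shape == (1, 7, 4)
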
@@ -114,32 +126,44 @@ def _prefill_calibrate(
     max_seq_len=512,
 ):
     sp_model = get_tokenizer(tokenizer_model_path)
-    _, atten_mask = example_inputs
     max_cache_len = max_seq_len - 1
 
     # TODO: change criteria & support batch inputs if necessary
-    token_list = sp_model.encode(user_prompts, bos=True, eos=False)
-    token_list = torch.tensor(token_list)[:max_cache_len].reshape(1, -1)
-    last_prompt_pos = token_list.numel()
-    if last_prompt_pos < max_cache_len:
-        token_list = torch.cat(
-            [
-                token_list,
-                torch.zeros((1, max_cache_len - last_prompt_pos), dtype=torch.int32),
-            ],
-            dim=1,
-        )
-    else:
-        token_list = token_list[:, :max_cache_len]
-
-    with torch.no_grad():
-        logits, new_k_caches, new_v_caches = module(
-            token_list,
-            atten_mask,
-        )
-    predict = [torch.argmax(logits[:, last_prompt_pos - 1], dim=-1).item()]
+
+    # token_list = sp_model.encode(user_prompts, bos=True, eos=False)
+
+    user_token_list = [
+        # what is the capital of the united states
+        [128000, 128006, 882, 128007, 271, 12840, 374, 279, 6864, 315, 279, 29292, 5415, 128009, 128006, 78191, 128007, 271],
+        # what is 1 + 1
+        [128000, 128006, 882, 128007, 271, 12840, 374, 220, 16, 489, 220, 16, 128009, 128006, 78191, 128007, 271],
+        # what is the meaning of life
+        [128000, 128006, 882, 128007, 271, 12840, 374, 279, 7438, 315, 2324, 128009, 128006, 78191, 128007, 271],
+    ]
+
+    for token_list in user_token_list:
+        _, atten_mask = copy.deepcopy(example_inputs)
+        token_list = torch.tensor(token_list)[:max_cache_len].reshape(1, -1)
+        last_prompt_pos = token_list.numel()
+        if last_prompt_pos < max_cache_len:
+            token_list = torch.cat(
+                [
+                    token_list,
+                    torch.zeros((1, max_cache_len - last_prompt_pos), dtype=torch.int32),
+                ],
+                dim=1,
+            )
+        else:
+            token_list = token_list[:, :max_cache_len]
 
-    print(f"calibration data:\n{sp_model.decode(predict)}")
+        with torch.no_grad():
+            logits, new_k_caches, new_v_caches = module(
+                token_list,
+                atten_mask,
+            )
+        predict = [torch.argmax(logits[:, last_prompt_pos - 1], dim=-1).item()]
+
+        logging.info(f"calibration data:\n{sp_model.decode(predict)}")
 
 
 def calibrate(
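
In the prefill path above, the zero-padded prompt is run through the module once and only the logits at index last_prompt_pos - 1 are read, since that row holds the prediction for the token that follows the prompt. A small sketch with assumed shapes:

import torch

vocab_size, max_cache_len = 11, 7
last_prompt_pos = 3  # prompt occupies positions 0..2; the rest is padding

# Assumed output layout: one row of logits per input position.
logits = torch.randn(1, max_cache_len, vocab_size)
next_token = torch.argmax(logits[:, last_prompt_pos - 1], dim=-1).item()
print(next_token)
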
@@ -249,7 +273,17 @@ def quantize(self, quant_dtype, args, custom_annotations=()):
             max_seq_len=self.llama_meta["get_max_seq_len"],
         )
 
-        self.llama_model = convert_pt2e(fx_graph_module)
+        fx_graph_module = convert_pt2e(fx_graph_module)
+
+        logging.info("Evaluating the converted model...")
+        calibrate(
+            self.get_example_inputs(self.llama_meta["get_use_kv_cache"]),
+            args.prompt,
+            fx_graph_module,
+            tokenizer_model_path=args.tokenizer_model,
+            max_seq_len=self.llama_meta["get_max_seq_len"],
+        )
+        self.llama_model = fx_graph_module
 
     def lowering_modules(
         self,
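
The hardcoded user_token_list entries replace on-the-fly encoding of user_prompts; judging by the inline comments, each list appears to be a chat-template encoding of one short question (the 128000/128006/128007/128009 IDs look like Llama 3 special tokens, which is an assumption here, not something stated in the commit). A quick sanity check, assuming the same get_tokenizer helper, tokenizer_model_path, and user_token_list already defined in the calibration functions:

sp_model = get_tokenizer(tokenizer_model_path)
for ids in user_token_list:
    # Should read back as the commented prompt wrapped in chat-template markers.
    print(sp_model.decode(ids))
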