Add tests that localize the prefill issue to the kv cache

jackzhxng · jackzhxng · commit aac90a05bff8 · 2024-12-10T16:44:30.000-08:00
diff --git a/extension/llm/modules/test/test_kv_cache.py b/extension/llm/modules/test/test_kv_cache.py
@@ -0,0 +1,176 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+from typing import Callable, Tuple
+
+import torch
+
+from executorch.exir import EdgeCompileConfig, to_edge
+from executorch.extension.llm.modules.kv_cache import KVCache as InferenceKVCache
+from executorch.runtime import Runtime
+from torch.testing import assert_close
+from torchtune.modules.kv_cache import KVCache
+
+
+def generate_cache_inputs(
+    seq_len: int,
+    batch_size: int = 1,
+    num_kv_heads: int = 64,
+    head_dim: int = 8,
+) -> Tuple[torch.Tensor, ...]:
+    """Build all-ones key/value inputs in both cache layouts.
+
+    Returns ``(k, v, k_t, v_t)``: ``k`` and ``v`` have shape ``(batch_size,
+    seq_len, num_kv_heads, head_dim)``; ``k_t`` and ``v_t`` are views of them
+    with dims 1 and 2 swapped, the layout passed to the torchtune cache.
+    """
+    shape = (batch_size, seq_len, num_kv_heads, head_dim)
+    keys, values = torch.ones(shape), torch.ones(shape)
+    return (keys, values, keys.transpose(1, 2), values.transpose(1, 2))
+
+
+class KVCacheTest(unittest.TestCase):
+    """Compare the ExecuTorch KV cache with torchtune's at each export stage."""
+
+    def setUp(self):
+        """Create a torchtune cache and an executorch cache of the same size."""
+        self.batch_size = 1
+        self.max_seq_len = 10
+        self.num_kv_heads = 1  # For testing purposes, usually this is 64.
+        self.head_dim = 8
+        self.dtype = torch.float
+
+        self.tt_kv_cache = KVCache(
+            batch_size=self.batch_size,
+            max_seq_len=self.max_seq_len,
+            num_kv_heads=self.num_kv_heads,
+            head_dim=self.head_dim,
+            dtype=self.dtype,
+        )
+        self.et_kv_cache = InferenceKVCache(
+            batch_size=self.batch_size,
+            max_seq_len=self.max_seq_len,
+            num_kv_heads=self.num_kv_heads,
+            head_dim=self.head_dim,
+            dtype=self.dtype,
+            transpose_cache=False,
+        )
+
+    def _update_and_compare(self, et_cache_module: Callable, seq_len: int):
+        """Feed seq_len new positions to both caches and check they agree.
+
+        Returns the k cache produced by the executorch side.
+        """
+        k_val, v_val, k_val_trans, v_val_trans = generate_cache_inputs(
+            seq_len, self.batch_size, self.num_kv_heads, self.head_dim
+        )
+        et_res = et_cache_module(k_val, v_val)
+        tt_res = self.tt_kv_cache.update(k_val_trans, v_val_trans)
+
+        # torchtune takes and returns k/v with dims 1 and 2 swapped.
+        tt_res_transposed = (tt_res[0].transpose(1, 2), tt_res[1].transpose(1, 2))
+        assert_close(et_res, tt_res_transposed)
+        return et_res[0]
+
+    def _assert_rows_filled(self, k_cache: torch.Tensor, num_filled: int):
+        """Check the first num_filled seq positions are 1 and the next is 0."""
+        for i in range(num_filled):
+            self.assertTrue(k_cache[0][i][0][0] == 1)
+        self.assertTrue(k_cache[0][num_filled][0][0] == 0)
+
+    def _test_kv_cache(self, et_cache_module: Callable):
+        """
+        Given an executorch kv cache anywhere along the export chain, compare its
+        results against torchtune and run basic tests.
+
+        et_cache_module is called as et_cache_module(k_val, v_val).
+        """
+        prefill_seq_len = 3
+
+        # Case 1: Prefill (seq_len = 3). Rows 0..2 become 1s, row 3 stays 0.
+        et_k_cache = self._update_and_compare(et_cache_module, prefill_seq_len)
+        self._assert_rows_filled(et_k_cache, prefill_seq_len)
+
+        # Case 2: Token-by-token (seq_len = 1). Exactly one more row is filled.
+        et_k_cache = self._update_and_compare(et_cache_module, 1)
+        self._assert_rows_filled(et_k_cache, prefill_seq_len + 1)
+
+    def export_kv_cache(
+        self,
+        kv_cache: torch.nn.Module,
+    ) -> torch.export.ExportedProgram:
+        """Export ``kv_cache.update`` with a dynamic sequence-length dimension.
+
+        Args:
+            kv_cache: module whose ``update(k_val, v_val)`` is exported.
+
+        Returns:
+            The exported program; dim 1 (shared by both inputs) may range from
+            1 to ``self.max_seq_len``, all other dims are static.
+        """
+
+        # Wrapper since torch.export only exports forward().
+        class EtCacheWrapper(torch.nn.Module):
+            def __init__(self, kv_cache: torch.nn.Module):
+                super().__init__()
+                self.kv_cache = kv_cache
+
+            def forward(self, k_val: torch.Tensor, v_val: torch.Tensor):
+                return self.kv_cache.update(k_val, v_val)
+
+        static = torch.export.Dim.STATIC
+        dim = torch.export.Dim("seq_len_dim", min=1, max=self.max_seq_len)
+        # 3 as example prefill seq_len.
+        example_shape = (self.batch_size, 3, self.num_kv_heads, self.head_dim)
+        return torch.export.export(
+            EtCacheWrapper(kv_cache),
+            (torch.zeros(example_shape), torch.zeros(example_shape)),
+            dynamic_shapes={
+                "k_val": {0: static, 1: dim, 2: static, 3: static},
+                "v_val": {0: static, 1: dim, 2: static, 3: static},
+            },
+        )
+
+    def _to_edge(self):
+        """Export the executorch cache and lower it to an edge program."""
+        return to_edge(
+            self.export_kv_cache(self.et_kv_cache),
+            compile_config=EdgeCompileConfig(
+                # Exempt _assert_async.msg from the core ATen op check and
+                # turn off edge IR validity checking.
+                _core_aten_ops_exception_list=[torch.ops.aten._assert_async.msg],
+                _check_ir_validity=False,
+            ),
+        )
+
+    def test_kv_cache_eager(self):
+        """Run the checks against the eager executorch cache."""
+        self._test_kv_cache(self.et_kv_cache.update)
+
+    def test_kv_cache_export(self):
+        """Run the checks against the torch.export-ed cache module."""
+        exported_kv_cache = self.export_kv_cache(self.et_kv_cache)
+        self._test_kv_cache(exported_kv_cache.module())
+
+    def test_kv_cache_edge(self):
+        """Run the checks against the edge program's "forward" module."""
+        edge_program = self._to_edge()
+        self._test_kv_cache(edge_program._edge_programs["forward"].module())
+
+    def test_kv_cache_executorch(self):
+        """Run the checks against the program loaded in the ExecuTorch runtime."""
+        et_program = self._to_edge().to_executorch()
+        runtime = Runtime.get()
+        program = runtime.load_program(et_program.buffer)
+        method = program.load_method("forward")
+
+        # method.execute takes a single sequence of args; its result is indexed
+        # like the (k, v) pair returned by the other stages.
+        def wrapped_callable(k_val: torch.Tensor, v_val: torch.Tensor):
+            return method.execute((k_val, v_val))
+
+        self._test_kv_cache(wrapped_callable)