 import torch
 from executorch.exir import EdgeCompileConfig, to_edge
 
+from executorch.exir.capture._config import ExecutorchBackendConfig
+from executorch.exir.passes.init_mutable_pass import InitializedMutableBufferPass
 from executorch.extension.llm.modules.attention import (
     MultiHeadAttention as ETMultiHeadAttention,
 )
@@ -114,7 +116,7 @@ def test_attention_eager(self):
         et_res = self.et_mha(self.x, self.x)  # Self attention.
         tt_res = self.tt_mha(self.x, self.x)  # Self attention.
 
-        self.assertTrue(torch.allclose(et_res, tt_res))
+        assert_close(et_res, tt_res)
         self.et_mha.reset_cache()
         self.tt_mha.reset_cache()
 
@@ -125,7 +127,7 @@ def test_attention_eager(self):
             self.x, self.x, input_pos=self.input_pos
         )  # Self attention with input pos.
 
-        self.assertTrue(torch.allclose(et_res, tt_res))
+        assert_close(et_res, tt_res)
 
         # test kv cache read. Input pos can be [10, 11, ..., 19]
         next_input_pos = torch.arange(10, 20).unsqueeze(0)
@@ -187,9 +189,8 @@ def test_attention_aoti(self):
 
     def test_attention_executorch(self):
         # Self attention.
-        # TODO: Fix kv cache
-        # self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100)
-        # self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100)
+        self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100)
+        self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=100)
 
         with torch.no_grad():
             et_mha_ep = torch.export.export(
@@ -202,9 +203,15 @@ def test_attention_executorch(self):
         et_program = to_edge(
             et_mha_ep,
             compile_config=EdgeCompileConfig(
-                _core_aten_ops_exception_list=[torch.ops.aten._assert_async.msg]
+                _core_aten_ops_exception_list=[torch.ops.aten._assert_async.msg],
+                _check_ir_validity=False,
             ),
-        ).to_executorch()
+        ).to_executorch(
+            config=ExecutorchBackendConfig(
+                passes=[InitializedMutableBufferPass(["cache_pos"])],
+            )
+        )
+
         runtime = Runtime.get()
         program = runtime.load_program(et_program.buffer)
         method = program.load_method("forward")
@@ -219,28 +226,23 @@ def test_attention_torch_cond_eager(self):
         self.et_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
         self.tt_mha.setup_cache(1, dtype=torch.float32, max_seq_len=self.max_seq_len)
 
-        # mask
         mask = self.causal_mask[self.input_pos, :]
-        # First run
+        # First run.
         et_res = self.et_mha(
             self.x, self.x, mask=mask, input_pos=self.input_pos
         )  # Self attention with input pos.
         tt_res = self.tt_mha(
             self.x, self.x, mask=mask, input_pos=self.input_pos
         )  # Self attention with input pos.
 
-        self.assertTrue(torch.allclose(et_res, tt_res))
+        assert_close(et_res, tt_res)
 
         # Second run test kv cache read. Input pos is [10, 11, ..., 19]
         next_input_pos = torch.arange(10, 20).unsqueeze(0)
 
         empty_y = torch.full_like(self.x, torch.nan)
         mask = self.causal_mask[next_input_pos, :]
-        et_res = self.et_mha(
-            self.x, empty_y, mask=mask, input_pos=next_input_pos
-        )  # Self attention with input pos.
-        tt_res = self.tt_mha(
-            self.x, None, mask=mask, input_pos=next_input_pos
-        )  # Self attention with input pos.
+        et_res = self.et_mha(self.x, empty_y, mask=mask, input_pos=next_input_pos)
+        tt_res = self.tt_mha(self.x, None, mask=mask, input_pos=next_input_pos)
 
         assert_close(et_res, tt_res)