Skip to content

Commit bbf7b76

Browse files
committed
Update base for Update on "Don't quantize the current token for attention"
Differential Revision: [D63497872](https://our.internmc.facebook.com/intern/diff/D63497872/) [ghstack-poisoned]
1 parent b690a36 commit bbf7b76

File tree

1 file changed

+20
-12
lines changed

1 file changed

+20
-12
lines changed

examples/models/llama2/source_transformation/test_sdpa_with_quantized_kv_cache.py

Lines changed: 20 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD-style license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
17
import unittest
28

39
import torch
@@ -57,23 +63,25 @@ def test_simple(self, is_dynamic_shape=False):
5763
self.quantized_sdpa = SDPACustom(self.quantized_kv_cache, self.dim)
5864
float_out = self.float_sdpa(input_pos, q, k, v, 1, self.seq_len, None)
5965
quantized_out = self.quantized_sdpa(input_pos, q, k, v, 1, self.seq_len, None)
60-
self.assertTrue(
61-
torch.allclose(
62-
float_out,
63-
quantized_out,
64-
)
66+
torch.testing.assert_close(
67+
float_out,
68+
quantized_out,
69+
# had to adjust rtol because switching to using custom_sdpa means we
70+
# will use dequantized k and v instead of original k and v
71+
# this leads to larger differences in the output.
72+
# subsequent diff in the stack will address this issue.
73+
rtol=1e-01,
74+
atol=1e-03,
6575
)
6676

6777
input_pos = torch.tensor([3], dtype=torch.int64)
6878
self.seq_len = 1
6979
q, k, v = self._init_kv()
7080
float_out = self.float_sdpa(input_pos, q, k, v, 1, self.seq_len, None)
7181
quantized_out = self.quantized_sdpa(input_pos, q, k, v, 1, self.seq_len, None)
72-
self.assertTrue(
73-
torch.allclose(
74-
float_out,
75-
quantized_out,
76-
rtol=1e-03,
77-
atol=1e-03,
78-
)
82+
torch.testing.assert_close(
83+
float_out,
84+
quantized_out,
85+
rtol=1e-03,
86+
atol=1e-03,
7987
)

0 commit comments

Comments (0)