
Commit 027ad54

larryliu0820 authored and facebook-github-bot committed

Fix a rotary position encoding bug in kv cache

Summary: The `Transformer` module has two branches depending on whether the kv cache is used. For the branch that uses the kv cache, the rotary position encoding should be obtained by slicing the precomputed values with `start_pos : start_pos + seqlen`. This diff fixes that.

Reviewed By: JacobSzwejbka

Differential Revision: D53954747

fbshipit-source-id: d79ea06e97d5a5f06533e4e4db11f61e2a0fae87
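For context, a minimal, self-contained sketch of why the slice offset matters when decoding with a kv cache. The helper `precompute_rotary` and the sizes below are illustrative assumptions, not the ExecuTorch implementation; the point is that with `[:seqlen]` every single-token decode step is rotated as if it sat at position 0, while `[start_pos : start_pos + seqlen]` uses the token's true absolute position.

    import torch

    def precompute_rotary(head_dim: int, max_seq_len: int, theta: float = 10000.0):
        # Standard RoPE-style table: one row of (cos, sin) angles per absolute position.
        freqs = 1.0 / (theta ** (torch.arange(0, head_dim, 2).float() / head_dim))
        positions = torch.arange(max_seq_len).float()
        angles = torch.outer(positions, freqs)
        return torch.cos(angles), torch.sin(angles)

    freqs_cos, freqs_sin = precompute_rotary(head_dim=64, max_seq_len=128)

    seqlen, start_pos = 1, 17  # decoding one token at absolute position 17

    buggy_cos = freqs_cos[:seqlen]                         # pre-fix: always position 0
    fixed_cos = freqs_cos[start_pos : start_pos + seqlen]  # post-fix: position 17

    # Different angles feed the attention rotation, so the pre-fix slice produces
    # wrong attention scores for every decoded token after the first.
    assert not torch.allclose(buggy_cos, fixed_cos)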
1 parent f4c4ad3 commit 027ad54

File tree

1 file changed (+15, -14 lines)


examples/models/llama2/model.py

Lines changed: 15 additions & 14 deletions
@@ -405,6 +405,9 @@ def forward(
     ) -> Union[
         torch.Tensor, Tuple[torch.Tensor, List[torch.Tensor], List[torch.Tensor]]
     ]:
+        _bsz, seqlen = tokens.shape
+        h = self.tok_embeddings(tokens)
+
         if self.use_kv_cache:
             assert (
                 cache_k is not None and cache_v is not None and start_pos is not None
@@ -415,29 +418,27 @@ def forward(
             assert (
                 cache_v.size(0) == self.n_layers
             ), f"{cache_v.size(0)} != {self.n_layers}"
-        else:
-            assert (
-                start_pos is None and cache_k is None and cache_v is None,
-                "Caches and start_pos are unused when use_kv_cache is False",
-            )
-
-        _bsz, seqlen = tokens.shape
-        h = self.tok_embeddings(tokens)
-        freqs_cos = self.freqs_cos[:seqlen]
-        freqs_sin = self.freqs_sin[:seqlen]
 
-        if self.use_kv_cache:
-            sp = start_pos.item()  # pyre-ignore[16]
+            sp = start_pos.item()
             # self.params.max_seq_len - 1 because of 0 based indexing, and - 1 again because our input seq len is 1 and its added to the cache before accessing the cache
             torch._constrain_as_size(sp, min=0, max=self.params.max_seq_len - 2)
             torch._constrain_as_value(
-                cache_k.shape[0],  # pyre-ignore[16]
-                min=self.n_layers,
+                cache_k.shape[0],
                 max=self.n_layers,
+                min=self.n_layers,
             )
             torch._constrain_as_value(
                 cache_v.shape[0], min=self.n_layers, max=self.n_layers
             )
+            # when KV cache is used, seqlen is most likely 1. We want to slice from the start_pos.
+            freqs_cos = self.freqs_cos[sp : sp + seqlen]
+            freqs_sin = self.freqs_sin[sp : sp + seqlen]
+        else:
+            assert (
+                start_pos is None and cache_k is None and cache_v is None
+            ), "Caches and start_pos are unused when use_kv_cache is False"
+            freqs_cos = self.freqs_cos[:seqlen]
+            freqs_sin = self.freqs_sin[:seqlen]
 
         for index, layer in enumerate(self.layers):
             if self.use_kv_cache:
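As a usage sketch only: assuming `forward(tokens, start_pos, cache_k, cache_v)` returns the logits together with updated per-layer caches (one plausible reading of the return annotation above, not a confirmed API), a greedy decode loop would pass an explicit `start_pos` each step so the rotary slice tracks the absolute token position.

    import torch

    def greedy_decode(model, bos_token: int, cache_k: torch.Tensor,
                      cache_v: torch.Tensor, steps: int) -> torch.Tensor:
        # Hypothetical loop for illustration; the real ExecuTorch runner may wire the
        # caches differently. `model` is assumed to be a Transformer built with
        # use_kv_cache=True, and cache_k/cache_v to be (n_layers, ...) tensors.
        tokens = torch.tensor([[bos_token]])  # shape (bsz=1, seqlen=1)
        out = [tokens]
        for pos in range(steps):
            # Each step feeds one token at absolute position `pos`; with the fix above,
            # the model rotates it with freqs_cos/freqs_sin[pos : pos + 1].
            start_pos = torch.tensor([pos])
            logits, new_k, new_v = model(tokens, start_pos, cache_k, cache_v)
            # The return annotation suggests per-layer cache lists; stacking them back
            # into single tensors is an assumption about the cache layout.
            cache_k, cache_v = torch.stack(list(new_k)), torch.stack(list(new_v))
            tokens = logits[:, -1, :].argmax(dim=-1, keepdim=True)  # greedy next token
            out.append(tokens)
        return torch.cat(out, dim=1)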
