Commit 3ea88cd

Update on "move mask as sdpa input instead of attribute"
sdpa (https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) takes the attention mask as an input argument, so refactor the SDPA module to match: pass the mask into forward() instead of storing it as a module attribute.

Differential Revision: [D56119739](https://our.internmc.facebook.com/intern/diff/D56119739/)

[ghstack-poisoned]
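For illustration only, a minimal sketch of the pattern this change adopts (not the actual ExecuTorch module; KV-cache update and q/k/v reshaping are omitted, and the class and argument names here are hypothetical): the mask arrives as a forward() argument, is sliced for the current positions, and is handed straight to torch.nn.functional.scaled_dot_product_attention as attn_mask.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class SDPASketch(nn.Module):
    """Simplified SDPA wrapper: the attention mask is a forward() argument
    rather than a module attribute (no self.mask)."""

    def __init__(self, dim: int, head_dim: int, n_rep: int):
        super().__init__()
        self.dim = dim
        self.head_dim = head_dim
        self.n_rep = n_rep  # times to repeat KV heads (grouped-query attention)

    def forward(self, q, k, v, input_pos, mask):
        # Slice the caller-provided mask for the current token positions,
        # mirroring `attn_mask = mask[None, None, input_pos]` in the diff below.
        attn_mask = mask[None, None, input_pos]
        # Expand KV heads so they match the number of query heads.
        k = k.repeat_interleave(self.n_rep, dim=1)
        v = v.repeat_interleave(self.n_rep, dim=1)
        # The mask flows directly into the built-in SDPA call.
        return F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
```

The caller (presumably the attention block that owns the causal mask) would then pass the mask explicitly on every call rather than relying on state stored inside the SDPA module.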
1 parent fa5197a commit 3ea88cd

File tree

1 file changed: +7 -4 lines


examples/models/llama2/llama_transformer.py

Lines changed: 7 additions & 4 deletions
```diff
@@ -214,11 +214,13 @@ def __init__(
         self,
         kv_cache: KVCache,
         dim: int,
+        head_dim: int,
         n_rep: int,
     ):
         super().__init__()
         self.kv_cache = kv_cache
         self.dim = dim
+        self.head_dim = head_dim
         self.n_rep = n_rep
 
     def forward(
@@ -236,7 +238,7 @@ def forward(
         v = v.transpose(1, 2)
 
         k, v = self.kv_cache.update(input_pos, k, v)
-        attn_mask = self.mask[None, None, input_pos]
+        attn_mask = mask[None, None, input_pos]
 
         k = k.repeat_interleave(self.n_rep, dim=1)
         v = v.repeat_interleave(self.n_rep, dim=1)
@@ -286,9 +288,10 @@ def __init__(self, args: ModelArgs, layer_id: int):
             not args.use_sdpa_with_kv_cache_op,  # if we are using the custom op dont transpose the cache. Expect untransposed q k v
         )
         self.SDPA = SDPA(
-            self.kv_cache,
-            self.dim,
-            self.n_rep,
+            kv_cache=self.kv_cache,
+            dim=self.dim,
+            head_dim=self.head_dim,
+            n_rep=self.n_rep,
         )
 
     def forward(
```
