Commit a8f04ae

move mask as sdpa input instead of attribute
sdpa (https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) takes the attention mask as a call input, so refactor the SDPA module to accept the mask in forward() rather than storing it as an attribute, bringing the module's inputs closer to sdpa's.

Differential Revision: [D56119739](https://our.internmc.facebook.com/intern/diff/D56119739/)

ghstack-source-id: 222465699
Pull Request resolved: #3036
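For reference, a minimal sketch of the sdpa call this refactor aligns with: the mask is supplied per call via the `attn_mask` argument rather than held as module state. Shapes below are arbitrary and chosen only for illustration.

```python
import torch
import torch.nn.functional as F

# Arbitrary shapes for illustration: (bsz, n_heads, seqlen, head_dim).
q = k = v = torch.randn(1, 4, 8, 16)

# An additive causal mask; passed to sdpa per call, not stored on a module.
attn_mask = torch.triu(torch.full((8, 8), float("-inf")), diagonal=1)

out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=0.0)
```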
1 parent 21fdc4e commit a8f04ae

2 files changed: +6 -9 lines changed

examples/models/llama2/export_llama_lib.py

Lines changed: 2 additions & 3 deletions
@@ -96,12 +96,10 @@ class SDPACustom(torch.nn.Module):
     def __init__(
         self,
         kv_cache: KVCache,
-        mask,
         dim: int,
     ):
         super().__init__()
         self.kv_cache = kv_cache
-        self.mask = mask
         self.dim = dim

     def forward(
@@ -112,6 +110,7 @@ def forward(
         v: torch.Tensor,
         bsz,
         seqlen,
+        mask,
     ):
         output = torch.ops.llama.sdpa_with_kv_cache(
             q,
@@ -131,7 +130,7 @@ def _replace_sdpa_with_custom_op(module: torch.nn.Module):
             setattr(
                 module,
                 name,
-                SDPACustom(child.kv_cache, child.mask, child.dim),
+                SDPACustom(child.kv_cache, child.dim),
             )
         else:
             _replace_sdpa_with_custom_op(child)
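With the constructor reduced to `SDPACustom(child.kv_cache, child.dim)`, the swap no longer needs to read a mask attribute off the child. The snippet below is a toy sketch of the same recursive setattr-based module swap, using stand-in Linear/Identity modules rather than the real SDPA/SDPACustom classes, just to illustrate the traversal pattern.

```python
import torch

def _replace_linear_with_identity(module: torch.nn.Module):
    # Same traversal pattern as _replace_sdpa_with_custom_op: walk direct
    # children, swap matching ones in place via setattr, recurse otherwise.
    for name, child in module.named_children():
        if isinstance(child, torch.nn.Linear):
            setattr(module, name, torch.nn.Identity())
        else:
            _replace_linear_with_identity(child)

model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU())
_replace_linear_with_identity(model)
print(model)  # the Linear child is now an Identity
```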

examples/models/llama2/llama_transformer.py

Lines changed: 4 additions & 6 deletions
@@ -213,13 +213,11 @@ class SDPA(nn.Module):
     def __init__(
         self,
         kv_cache: KVCache,
-        mask,
         dim: int,
         n_rep: int,
     ):
         super().__init__()
         self.kv_cache = kv_cache
-        self.mask = mask
         self.dim = dim
         self.n_rep = n_rep

@@ -231,17 +229,18 @@ def forward(
         v: torch.Tensor,
         bsz,
         seqlen,
+        mask: torch.Tensor,
     ) -> torch.Tensor:
         q = q.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
         k = k.transpose(1, 2)
         v = v.transpose(1, 2)

         k, v = self.kv_cache.update(input_pos, k, v)
-        mask = self.mask[None, None, input_pos]
+        attn_mask = mask[None, None, input_pos]

         k = k.repeat_interleave(self.n_rep, dim=1)
         v = v.repeat_interleave(self.n_rep, dim=1)
-        y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0)
+        y = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=0.0)

         return y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim)

@@ -288,7 +287,6 @@ def __init__(self, args: ModelArgs, layer_id: int):
             )
             self.SDPA = SDPA(
                 self.kv_cache,
-                self.mask,
                 self.dim,
                 self.n_rep,
             )
@@ -314,7 +312,7 @@ def forward(

         if self.use_kv_cache:
             assert input_pos is not None
-            output = self.SDPA(input_pos, q, k, v, bsz, seqlen)
+            output = self.SDPA(input_pos, q, k, v, bsz, seqlen, self.mask)
             return self.wo(output)

         q = q.transpose(1, 2)  # (bs, n_local_heads, seqlen, head_dim)
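A runnable sketch of the refactored eager call path, under the assumption (as in typical llama implementations) that the caller holds a (max_seq_len, max_seq_len) causal mask and a kv cache already updated for the current step; all sizes below are illustrative, not the model's real values.

```python
import torch
import torch.nn.functional as F

# Illustrative sizes only.
bsz, n_heads, n_kv_heads, head_dim = 1, 8, 2, 16
max_seq_len, seqlen = 32, 1
n_rep = n_heads // n_kv_heads

# Causal mask owned by the caller (the Attention module keeps self.mask)
# and now handed to SDPA.forward as an argument instead of stored on SDPA.
mask = torch.triu(torch.full((max_seq_len, max_seq_len), float("-inf")), diagonal=1)
input_pos = torch.tensor([3])  # current decode position

q = torch.randn(bsz, n_heads, seqlen, head_dim)
k = torch.randn(bsz, n_kv_heads, max_seq_len, head_dim)  # stands in for cached keys
v = torch.randn(bsz, n_kv_heads, max_seq_len, head_dim)  # stands in for cached values

# Mirrors the new SDPA.forward body: slice the mask at the current positions,
# expand grouped kv heads, and pass the slice as attn_mask.
attn_mask = mask[None, None, input_pos]  # (1, 1, seqlen, max_seq_len)
k = k.repeat_interleave(n_rep, dim=1)
v = v.repeat_interleave(n_rep, dim=1)
y = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, dropout_p=0.0)

out = y.transpose(1, 2).contiguous().view(bsz, seqlen, n_heads * head_dim)
print(out.shape)  # torch.Size([1, 1, 128])
```

Passing the mask per call matches the functional signature and keeps SDPA stateless apart from the kv cache, which also simplifies swapping it for SDPACustom, whose forward now takes the same extra mask argument.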
