Commit 474c9cf

Merge pull request #2236 from NightMachinery/patch-1
eva.py: fixed bug in applying attention mask
2 parents: 7160af4 + 4cca568

File tree: 1 file changed (+3 −1 lines)
timm/models/eva.py

Lines changed: 3 additions & 1 deletion
```diff
@@ -134,10 +134,12 @@ def forward(
         else:
             q = q * self.scale
             attn = (q @ k.transpose(-2, -1))
-            attn = attn.softmax(dim=-1)
+
             if attn_mask is not None:
                 attn_mask = attn_mask.to(torch.bool)
                 attn = attn.masked_fill(~attn_mask[:, None, None, :], float("-inf"))
+            attn = attn.softmax(dim=-1)
+
             attn = self.attn_drop(attn)
             x = attn @ v
```

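The fix moves the softmax so it runs after the attention mask is applied. A minimal standalone sketch of why the order matters, using hypothetical 1×3 logits rather than the real eva.py tensors:

```python
# Standalone sketch of the ordering bug fixed in this commit (hypothetical
# 1x3 logits, not the actual eva.py module). Masked positions must be set
# to -inf *before* softmax so they normalize to exactly zero probability.
import torch

scores = torch.tensor([[1.0, 2.0, 3.0]])    # raw attention logits
mask = torch.tensor([[True, True, False]])  # False marks a padded key

# Pre-fix order: softmax first, then mask -> -inf survives into the weights
buggy = scores.softmax(dim=-1).masked_fill(~mask, float("-inf"))

# Post-fix order: mask first, then softmax -> the masked key gets 0 weight
fixed = scores.masked_fill(~mask, float("-inf")).softmax(dim=-1)

print(buggy)  # third weight is -inf, which would poison attn @ v
print(fixed)  # third weight is exactly 0.0 and the row still sums to 1
```

With the pre-fix order, the `-inf` written into already-normalized weights propagates inf/NaN values through `attn @ v`; with the post-fix order, masked keys simply receive zero attention weight and contribute nothing.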