LLM export pass to swap in custom SDPA

sxu · web-flow · commit 7054b1f955be · 2025-04-28T22:00:51.000-07:00
Differential Revision: D73444078 Pull Request resolved: #10355
diff --git a/extension/llm/export/TARGETS b/extension/llm/export/TARGETS
@@ -41,6 +41,21 @@ runtime.python_library(
         "//executorch/exir:lib",
         "//executorch/exir/backend:backend_details",
         "//executorch/extension/export_util:export_util",
+        "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
+        "//executorch/extension/llm/custom_ops:custom_ops_aot_py",
         "//pytorch/tokenizers/pytorch_tokenizers:tokenizers",
     ],
 )
+
+runtime.python_test(
+    name = "export_passes_test",
+    srcs = [
+        "test_export_passes.py",
+    ],
+    preload_deps = [
+        "//executorch/extension/llm/custom_ops:custom_ops_aot_lib",
+    ],
+    deps = [
+        ":export_lib",
+    ],
+)
diff --git a/extension/llm/export/export_passes.py b/extension/llm/export/export_passes.py
@@ -1,3 +1,5 @@
+import logging
+
 import torch
 
 from executorch.exir.pass_base import ExportPass
@@ -95,3 +97,106 @@ def call(self, graph_module: torch.fx.GraphModule):
         graph_module.recompile()
 
         return PassResult(graph_module, graph_changed)
+
+
+class ReplaceSDPAWithCustomSDPAPass(ExportPass):
+    """
+    This pass replaces aten.scaled_dot_product_attention.default with llama.custom_sdpa.default.
+    If assume_causal_mask is set to True, this pass will ignore any explicit masks and simply set
+    is_causal to True in custom_sdpa.
+    """
+
+    def __init__(self, assume_causal_mask=False):
+        super().__init__()
+        self.assume_causal_mask = assume_causal_mask
+
+    def call_operator(self, op, args, kwargs, meta):
+        from executorch.extension.llm.custom_ops import custom_ops  # noqa
+
+        if op != torch.ops.aten.scaled_dot_product_attention.default:
+            return super().call_operator(op, args, kwargs, meta)
+
+        q, k, v, mask, dropout, is_causal, scale = self._extract_args(args, kwargs)
+
+        qT = self._transpose(q, meta)
+        kT = self._transpose(k, meta)
+        vT = self._transpose(v, meta)
+
+        if not (
+            q.node.meta["val"].dim()
+            == k.node.meta["val"].dim()
+            == v.node.meta["val"].dim()
+            == 4
+        ):
+            logging.info("ReplaceSDPAWithCustomSDPAPass only supports 4D QKV inputs.")
+            return super().call_operator(op, args, kwargs, meta)
+
+        if self.assume_causal_mask:
+            # Ignore any specified mask and simply set the is_causal flag.
+            mask = None
+            is_causal = True
+
+        if mask is not None:
+            mask_fake_tensor = mask.node.meta["val"]
+            if mask_fake_tensor.dim() > 2:
+                if all(d == 1 for d in mask_fake_tensor.size()[:-2]):
+                    mask = super().call_operator(
+                        torch.ops.aten.squeeze.dims,
+                        (mask, tuple(i for i in range(mask_fake_tensor.dim() - 2))),
+                        {},
+                        meta,
+                    )
+                else:
+                    logging.info(
+                        "ReplaceSDPAWithCustomSDPAPass only supports 2D attention mask."
+                    )
+                    return super().call_operator(op, args, kwargs, meta)
+
+            # TODO(kimishpatel): Remove once custom SDPA supports boolean mask.
+            if mask_fake_tensor.dtype == torch.bool:
+                mask = super().call_operator(
+                    torch.ops.aten.where.Scalar,
+                    (mask, 0.0, float("-inf")),
+                    {},
+                    meta,
+                )
+
+        custom_sdpa = super().call_operator(
+            torch.ops.llama.custom_sdpa.default,
+            (qT, kT, vT, 0, mask, dropout, is_causal, scale),
+            {},
+            meta,
+        )
+        return self._transpose(custom_sdpa, meta)
+
+    def _extract_args(self, args, kwargs):
+        q, k, v, *rest = args
+        mask = None
+        dropout = 0.0
+        is_causal = False
+        scale = None
+        if len(rest) > 0:
+            mask = rest[0]
+        if len(rest) > 1:
+            dropout = rest[1]
+        if len(rest) > 2:
+            is_causal = rest[2]
+        if "scale" in kwargs:
+            scale = kwargs["scale"]
+
+        return q, k, v, mask, dropout, is_causal, scale
+
+    def _transpose(self, x, meta):
+        transpose = super().call_operator(
+            torch.ops.aten.transpose.int,
+            (x, 1, 2),
+            {},
+            meta,
+        )
+        contiguous = super().call_operator(
+            torch.ops.aten.contiguous.default,
+            (transpose,),
+            {},
+            meta,
+        )
+        return contiguous
diff --git a/extension/llm/export/test_export_passes.py b/extension/llm/export/test_export_passes.py
@@ -2,7 +2,10 @@
 
 import torch
 
-from executorch.extension.llm.export.export_passes import RemoveRedundantTransposes
+from executorch.extension.llm.export.export_passes import (
+    RemoveRedundantTransposes,
+    ReplaceSDPAWithCustomSDPAPass,
+)
 
 from torch.export import export_for_training
 from torch.testing import FileCheck
@@ -160,3 +163,47 @@ def forward(self, x):
 
         m = TestModule2()
         self._check(m, (x,), key, 3, 2)
+
+
+class ReplaceSDPAWithCustomSDPAPassTest(unittest.TestCase):
+    class TestModule(torch.nn.Module):
+        def forward(self, x, mask, is_causal):
+            return torch.nn.functional.scaled_dot_product_attention(
+                x, x, x, attn_mask=mask, is_causal=is_causal
+            )
+
+    def setUp(self):
+        torch.manual_seed(0)
+
+    def _test(self, args, assume_causal_mask=False):
+        m = self.TestModule()
+        gm = export_for_training(m, args, strict=True).module()
+
+        sdpa_key = "torch.ops.aten.scaled_dot_product_attention.default"
+        custom_sdpa_key = "torch.ops.llama.custom_sdpa.default"
+        FileCheck().check_count(sdpa_key, 1, exactly=True).run(gm.code)
+        gm = ReplaceSDPAWithCustomSDPAPass(assume_causal_mask)(gm).graph_module
+        FileCheck().check_count(sdpa_key, 0, exactly=True).run(gm.code)
+        FileCheck().check_count(custom_sdpa_key, 1, exactly=True).run(gm.code)
+
+        y1 = m(*args)
+        y2 = gm(*args)
+        self.assertTrue(torch.allclose(y1, y2))
+
+    def test_causal_mask(self):
+        self._test((torch.rand(1, 4, 32, 64), None, True))
+
+    def test_explicit_causal_mask(self):
+        mask = torch.tril(torch.ones(32, 32, dtype=torch.bool))
+        self._test((torch.rand(1, 4, 32, 64), mask, False), assume_causal_mask=True)
+
+    def test_custom_mask(self):
+        m1 = torch.tril(torch.ones(32, 32, dtype=torch.bool))
+        m2 = torch.tril(torch.ones(32, 32, dtype=torch.bool), diagonal=-16)
+        self._test((torch.rand(1, 4, 32, 64), torch.logical_xor(m1, m2), False))
+
+    def test_squeezable_mask(self):
+        m1 = torch.tril(torch.ones(32, 32, dtype=torch.bool))
+        m2 = torch.tril(torch.ones(32, 32, dtype=torch.bool), diagonal=-16)
+        m = torch.logical_xor(m1, m2).view(1, 1, 32, 32)
+        self._test((torch.rand(1, 4, 32, 64), m, False))