Commit f93a5b5

[XNNPACK][Partitioner] SDPA Config (#4797)
We add the SDPA config for the partitioner here. There is currently an issue with SDPA when it is used from the FairSeq multihead attention models, so it is disabled in the base partitioner until that is resolved (tracked in D60553559); in our tests, however, we can opt in and use SDPA correctly. Will follow up on this later.

Differential Revision: [D60323285](https://our.internmc.facebook.com/intern/diff/D60323285/)

Co-authored-by: Max Ren <[email protected]>

Pull Request resolved: #4764
1 parent 7a2d885 commit f93a5b5

File tree: 4 files changed, +38 -5 lines changed

backends/xnnpack/operators/op_sdpa.py

Lines changed: 5 additions & 2 deletions
@@ -66,9 +66,12 @@ def define_node(
 
         # Hack to broadcast the scale
         q_shape = get_shape(get_input_node(node, 0))
-        scale = cast(float, node.kwargs["scale"])
+        embedding_dim = q_shape[-1]
+        scale = 1 / (embedding_dim**0.5)
+        if "scale" in node.kwargs and node.kwargs["scale"]:
+            scale = cast(float, node.kwargs["scale"])
 
-        t = torch.full((q_shape[-1],), scale, dtype=mask_dtype)
+        t = torch.full((embedding_dim,), scale, dtype=mask_dtype)
         scale_node = self.get_fake_attr("scale", t)
         self.define_tensor(
             scale_node,
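
For context, this default matches torch.nn.functional.scaled_dot_product_attention, which falls back to a scale of 1/sqrt(E), where E is the query's last (embedding) dimension, whenever no explicit scale is given. A minimal sketch of the fallback logic the diff implements; the shapes and the resolve_scale helper are illustrative, not part of the commit:

import torch

# Hypothetical query shape: (batch, num_heads, seq_len, embedding_dim)
q = torch.randn(1, 4, 16, 64)

def resolve_scale(q_shape, kwargs):
    # Mirror the diff: default to 1/sqrt(embedding_dim) unless an explicit,
    # truthy "scale" kwarg is present on the node. Note that a scale of 0.0
    # is falsy and would also hit the fallback, as in the committed code.
    embedding_dim = q_shape[-1]
    scale = 1 / (embedding_dim**0.5)
    if "scale" in kwargs and kwargs["scale"]:
        scale = float(kwargs["scale"])
    return scale

assert resolve_scale(q.shape, {}) == 1 / (64**0.5)
assert resolve_scale(q.shape, {"scale": 0.25}) == 0.25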

backends/xnnpack/partition/config/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -40,6 +40,7 @@
     PowConfig,
     QuantizedPerTensorConfig,
     ReLUConfig,
+    # SDPAConfig, TODO: D60553559: preserving SDPA for fairseq fails
     SigmoidConfig,
     SliceCopyConfig,
     SoftmaxConfig,
@@ -87,6 +88,7 @@
     PowConfig,
     PreluConfig,
     ReLUConfig,
+    # SDPAConfig, TODO: D60553559: preserving SDPA for fairseq fails
     SigmoidConfig,
     SliceCopyConfig,
     SoftmaxConfig,

backends/xnnpack/partition/config/generic_node_configs.py

Lines changed: 23 additions & 0 deletions
@@ -415,3 +415,26 @@ class BMMConfig(GenericNodePartitionerConfig):
 
     def supported_precision_types(self) -> List[ConfigPrecisionType]:
         return [ConfigPrecisionType.FP32]
+
+
+class SDPAConfig(GenericNodePartitionerConfig):
+    target_name = "scaled_dot_product_attention.default"
+
+    def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
+        """
+        Requires Mask to have Rank 2
+        """
+        if not self.check_common_constraints(node, ep):
+            return False
+
+        if len(node.all_input_nodes) < 4:
+            return False
+        mask_node = node.all_input_nodes[3]
+        mask_rank = mask_node.meta["val"].dim()
+        return mask_rank == 2
+
+    def get_original_aten(self) -> Optional[torch._ops.OpOverload]:
+        return torch.ops.aten.scaled_dot_product_attention.default
+
+    def supported_precision_types(self) -> List[ConfigPrecisionType]:
+        return [ConfigPrecisionType.FP32]
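
To make the rank-2 requirement concrete: after export, the attention mask is the fourth input to the SDPA node, and the rank recorded in its meta["val"] decides whether the node is partitionable. A hypothetical sketch of shapes (not from the commit) that would pass or fail the check:

import torch

# A (seq_len, seq_len) additive mask has rank 2 and satisfies the
# constraint; a (batch, heads, seq_len, seq_len) mask has rank 4 and
# would make check_constraints return False.
q = k = v = torch.randn(1, 4, 16, 64)
rank2_mask = torch.zeros(16, 16)        # accepted by SDPAConfig
rank4_mask = torch.zeros(1, 4, 16, 16)  # rejected by SDPAConfig

out = torch.nn.functional.scaled_dot_product_attention(
    q, k, v, attn_mask=rank2_mask
)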

backends/xnnpack/test/ops/sdpa.py

Lines changed: 8 additions & 3 deletions
@@ -8,7 +8,12 @@
 from typing import Optional
 
 import torch
+from executorch.backends.xnnpack.partition.config.generic_node_configs import SDPAConfig
+from executorch.backends.xnnpack.partition.xnnpack_partitioner2 import (
+    XnnpackPartitioner,
+)
 from executorch.backends.xnnpack.test.tester import Tester
+from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower
 
 
 class TestSDPA(unittest.TestCase):
@@ -61,9 +66,9 @@ def _test(self, module, inputs, atol=1e-03, rtol=1e-03):
         (
             Tester(module, inputs)
             .export()
-            .to_edge()
-            .check_count({"executorch_exir_dialects_edge__ops_aten_bmm_default": 2})
-            .partition()
+            .to_edge_transform_and_lower(
+                ToEdgeTransformAndLower([XnnpackPartitioner(configs=[SDPAConfig])])
+            )
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
             .check_not(
                 ["executorch_exir_dialects_edge__ops_aten_bmm_default"],
