
Commit 94892f6

[XNNPACK][Partitioner] SDPA Config
We add the SDPA config for the partitioner here. There is currently an issue with SDPA when it is used from the FairSeq multihead attention models, so the config is disabled in the base partitioner until that is resolved; tests can still pass it to the partitioner explicitly and exercise SDPA correctly (see the sketch below). Tracking this in D60553559; will follow up on it later. Differential Revision: [D60323285](https://our.internmc.facebook.com/intern/diff/D60323285/) [ghstack-poisoned]
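For reference, a minimal sketch of how a test opts in to the config explicitly while it stays out of the base partitioner. The toy module and tensor shapes are illustrative assumptions; the imports and the tester chain mirror the test change in this commit.

```python
# Minimal sketch, assuming an illustrative module and input shapes; the
# partitioner/tester imports mirror the test change in this commit.
import torch

from executorch.backends.xnnpack.partition.config.generic_node_configs import SDPAConfig
from executorch.backends.xnnpack.partition.xnnpack_partitioner2 import XnnpackPartitioner
from executorch.backends.xnnpack.test.tester import Tester
from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower


class ToySDPA(torch.nn.Module):
    def forward(self, q, k, v, mask):
        # The partitioner config requires the mask to be rank 2.
        return torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=mask)


q = k = v = torch.randn(1, 4, 8, 16)  # (batch, heads, seq_len, head_dim)
mask = torch.randn(8, 8)              # rank-2 mask, broadcast over batch/heads

(
    Tester(ToySDPA(), (q, k, v, mask))
    .export()
    .to_edge_transform_and_lower(
        ToEdgeTransformAndLower([XnnpackPartitioner(configs=[SDPAConfig])])
    )
    .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
)
```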
Parent: c2caa04

4 files changed · +39 −6 lines changed

backends/xnnpack/operators/op_sdpa.py

Lines changed: 6 additions & 3 deletions
@@ -4,7 +4,7 @@
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

-from typing import cast, Dict
+from typing import Dict

 import torch
 from executorch.backends.transforms import get_shape
@@ -66,9 +66,12 @@ def define_node(

         # Hack to broadcast the scale
         q_shape = get_shape(get_input_node(node, 0))
-        scale = cast(float, node.kwargs["scale"])
+        C = q_shape[-1]
+        scale = 1 / (C**0.5)
+        if "scale" in node.kwargs and node.kwargs["scale"]:
+            scale = node.kwargs["scale"]

-        t = torch.full((q_shape[-1],), scale, dtype=mask_dtype)
+        t = torch.full((C,), scale, dtype=mask_dtype)
         scale_node = self.get_fake_attr("scale", t)
         self.define_tensor(
             scale_node,
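The hunk above makes the serializer default the scale to 1/sqrt(C), where C is the last dimension of the query, and only use `node.kwargs["scale"]` when the caller actually supplied one; previously `node.kwargs["scale"]` would raise a KeyError if the kwarg was absent. A standalone sketch of that defaulting logic (the helper name and free-function form are illustrative, not the serializer's actual API):

```python
# Standalone sketch of the scale-defaulting logic in the hunk above; the helper
# name and free-function form are illustrative, not the serializer's actual API.
def resolve_sdpa_scale(q_shape, kwargs):
    C = q_shape[-1]               # last dimension of the query tensor
    scale = 1 / (C**0.5)          # default matches 1/sqrt(head_dim)
    if "scale" in kwargs and kwargs["scale"]:
        scale = kwargs["scale"]   # an explicit truthy scale overrides the default
    return scale                  # a missing, None, or zero scale keeps the default


assert resolve_sdpa_scale((1, 4, 8, 16), {}) == 1 / (16**0.5)
assert resolve_sdpa_scale((1, 4, 8, 16), {"scale": 0.5}) == 0.5
```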

backends/xnnpack/partition/config/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -40,6 +40,7 @@
     PowConfig,
     QuantizedPerTensorConfig,
     ReLUConfig,
+    # SDPAConfig, TODO: D60553559: preserving SDPA for fairseq fails
     SigmoidConfig,
     SliceCopyConfig,
     SoftmaxConfig,
@@ -87,6 +88,7 @@
     PowConfig,
     PreluConfig,
     ReLUConfig,
+    # SDPAConfig, TODO: D60553559: preserving SDPA for fairseq fails
     SigmoidConfig,
     SliceCopyConfig,
     SoftmaxConfig,

backends/xnnpack/partition/config/generic_node_configs.py

Lines changed: 23 additions & 0 deletions
@@ -415,3 +415,26 @@ class BMMConfig(GenericNodePartitionerConfig):

     def supported_precision_types(self) -> List[ConfigPrecisionType]:
         return [ConfigPrecisionType.FP32]
+
+
+class SDPAConfig(GenericNodePartitionerConfig):
+    target_name = "scaled_dot_product_attention.default"
+
+    def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
+        """
+        Requires Mask to have Rank 2
+        """
+        if not self.check_common_constraints(node, ep):
+            return False
+
+        if len(node.all_input_nodes) < 4:
+            return False
+        mask_node = node.all_input_nodes[3]
+        mask_rank = mask_node.meta["val"].dim()
+        return mask_rank == 2
+
+    def get_original_aten(self) -> Optional[torch._ops.OpOverload]:
+        return torch.ops.aten.scaled_dot_product_attention.default
+
+    def supported_precision_types(self) -> List[ConfigPrecisionType]:
+        return [ConfigPrecisionType.FP32]
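At the model level, the constraint above means an SDPA node is only handed to the delegate when it carries a mask input and that mask is a rank-2 tensor. A small illustrative sketch (module name and shapes are assumptions, not part of this commit):

```python
# Illustrative sketch only: the module name and shapes are assumptions, not part
# of this commit. A rank-2 mask satisfies SDPAConfig.check_constraints; a
# rank-3 or rank-4 mask keeps the node out of the delegate.
import torch
import torch.nn.functional as F


class MaskedSDPA(torch.nn.Module):
    def forward(self, q, k, v, mask):
        return F.scaled_dot_product_attention(q, k, v, attn_mask=mask)


q = k = v = torch.randn(1, 4, 8, 16)
good_mask = torch.randn(8, 8)        # dim() == 2 -> eligible for partitioning
bad_mask = torch.randn(1, 1, 8, 8)   # dim() == 4 -> check_constraints returns False
```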

backends/xnnpack/test/ops/sdpa.py

Lines changed: 8 additions & 3 deletions
@@ -8,7 +8,12 @@
 from typing import Optional

 import torch
+from executorch.backends.xnnpack.partition.config.generic_node_configs import SDPAConfig
+from executorch.backends.xnnpack.partition.xnnpack_partitioner2 import (
+    XnnpackPartitioner,
+)
 from executorch.backends.xnnpack.test.tester import Tester
+from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower


 class TestSDPA(unittest.TestCase):
@@ -61,9 +66,9 @@ def _test(self, module, inputs, atol=1e-03, rtol=1e-03):
         (
             Tester(module, inputs)
             .export()
-            .to_edge()
-            .check_count({"executorch_exir_dialects_edge__ops_aten_bmm_default": 2})
-            .partition()
+            .to_edge_transform_and_lower(
+                ToEdgeTransformAndLower([XnnpackPartitioner(configs=[SDPAConfig])])
+            )
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
             .check_not(
                 ["executorch_exir_dialects_edge__ops_aten_bmm_default"],
