
Commit 8d4abd9

[XNNPACK][Partitioner] SDPA Config
Differential Revision: D60323285
Pull Request resolved: #4764
1 parent e6f5435 commit 8d4abd9


4 files changed: +38 -5 lines changed


backends/xnnpack/operators/op_sdpa.py

Lines changed: 5 additions & 2 deletions
@@ -66,9 +66,12 @@ def define_node(
 
         # Hack to broadcast the scale
         q_shape = get_shape(get_input_node(node, 0))
-        scale = cast(float, node.kwargs["scale"])
+        embedding_dim = q_shape[-1]
+        scale = 1 / (embedding_dim**0.5)
+        if "scale" in node.kwargs and node.kwargs["scale"]:
+            scale = cast(float, node.kwargs["scale"])
 
-        t = torch.full((q_shape[-1],), scale, dtype=mask_dtype)
+        t = torch.full((embedding_dim,), scale, dtype=mask_dtype)
         scale_node = self.get_fake_attr("scale", t)
         self.define_tensor(
             scale_node,
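
For context, the fallback added above matches PyTorch's own SDPA semantics: when no scale kwarg is supplied, attention scores are scaled by 1/sqrt(embedding_dim), where embedding_dim is the last dimension of the query. A minimal sketch of that equivalence (illustration only, not ExecuTorch code, assuming a PyTorch build where F.scaled_dot_product_attention accepts the scale argument):

import math

import torch
import torch.nn.functional as F

# Toy query/key/value: (batch, heads, seq_len, embedding_dim)
q = torch.randn(1, 8, 16, 64)
k = torch.randn(1, 8, 16, 64)
v = torch.randn(1, 8, 16, 64)

# Default scale that the diff above now reproduces when node.kwargs has no "scale"
default_scale = 1 / math.sqrt(q.shape[-1])  # 1 / sqrt(64) == 0.125

out_implicit = F.scaled_dot_product_attention(q, k, v)                       # scale left unset
out_explicit = F.scaled_dot_product_attention(q, k, v, scale=default_scale)  # same result
assert torch.allclose(out_implicit, out_explicit, atol=1e-5)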

backends/xnnpack/partition/config/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -40,6 +40,7 @@
     PowConfig,
     QuantizedPerTensorConfig,
     ReLUConfig,
+    # SDPAConfig, TODO: D60553559: preserving SDPA for fairseq fails
     SigmoidConfig,
     SliceCopyConfig,
     SoftmaxConfig,
@@ -87,6 +88,7 @@
     PowConfig,
     PreluConfig,
     ReLUConfig,
+    # SDPAConfig, TODO: D60553559: preserving SDPA for fairseq fails
     SigmoidConfig,
     SliceCopyConfig,
     SoftmaxConfig,
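
Because SDPAConfig is deliberately left out of the default config lists above (see the TODO referencing D60553559), SDPA delegation has to be opted into explicitly. A sketch of opting in, mirroring the imports and the configs= argument used in the test change further down; that argument is assumed from the test, not from separate documentation:

from executorch.backends.xnnpack.partition.config.generic_node_configs import SDPAConfig
from executorch.backends.xnnpack.partition.xnnpack_partitioner2 import XnnpackPartitioner

# Pass the SDPA config explicitly since it is not part of the defaults.
partitioner = XnnpackPartitioner(configs=[SDPAConfig])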

backends/xnnpack/partition/config/generic_node_configs.py

Lines changed: 23 additions & 0 deletions
@@ -415,3 +415,26 @@ class BMMConfig(GenericNodePartitionerConfig):
 
     def supported_precision_types(self) -> List[ConfigPrecisionType]:
         return [ConfigPrecisionType.FP32]
+
+
+class SDPAConfig(GenericNodePartitionerConfig):
+    target_name = "scaled_dot_product_attention.default"
+
+    def check_constraints(self, node: torch.fx.Node, ep: ExportedProgram) -> bool:
+        """
+        Requires Mask to have Rank 2
+        """
+        if not self.check_common_constraints(node, ep):
+            return False
+
+        if len(node.all_input_nodes) < 4:
+            return False
+        mask_node = node.all_input_nodes[3]
+        mask_rank = mask_node.meta["val"].dim()
+        return mask_rank == 2
+
+    def get_original_aten(self) -> Optional[torch._ops.OpOverload]:
+        return torch.ops.aten.scaled_dot_product_attention.default
+
+    def supported_precision_types(self) -> List[ConfigPrecisionType]:
+        return [ConfigPrecisionType.FP32]
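
The constraint above only admits SDPA nodes whose attention mask (the fourth input) has rank 2. A toy illustration of what that means at the model level (hypothetical module, not part of the commit):

import torch
import torch.nn.functional as F

class ToySDPA(torch.nn.Module):
    # Hypothetical module used only to show which mask shapes the config accepts.
    def forward(self, q, k, v, mask):
        return F.scaled_dot_product_attention(q, k, v, attn_mask=mask)

q = k = v = torch.randn(1, 4, 8, 32)   # (batch, heads, seq_len, embedding_dim)
mask_2d = torch.zeros(8, 8)            # rank-2 mask: check_constraints returns True
mask_4d = torch.zeros(1, 4, 8, 8)      # rank-4 mask: check_constraints returns False

# Both run eagerly; only the rank-2 case is eligible for XNNPACK delegation.
ToySDPA()(q, k, v, mask_2d)
ToySDPA()(q, k, v, mask_4d)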

backends/xnnpack/test/ops/sdpa.py

Lines changed: 8 additions & 3 deletions
@@ -8,7 +8,12 @@
 from typing import Optional
 
 import torch
+from executorch.backends.xnnpack.partition.config.generic_node_configs import SDPAConfig
+from executorch.backends.xnnpack.partition.xnnpack_partitioner2 import (
+    XnnpackPartitioner,
+)
 from executorch.backends.xnnpack.test.tester import Tester
+from executorch.backends.xnnpack.test.tester.tester import ToEdgeTransformAndLower
 
 
 class TestSDPA(unittest.TestCase):
@@ -61,9 +66,9 @@ def _test(self, module, inputs, atol=1e-03, rtol=1e-03):
         (
             Tester(module, inputs)
             .export()
-            .to_edge()
-            .check_count({"executorch_exir_dialects_edge__ops_aten_bmm_default": 2})
-            .partition()
+            .to_edge_transform_and_lower(
+                ToEdgeTransformAndLower([XnnpackPartitioner(configs=[SDPAConfig])])
+            )
             .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
             .check_not(
                 ["executorch_exir_dialects_edge__ops_aten_bmm_default"],
