
Commit 92cab3c

mcremon-meta authored and facebook-github-bot committed
Add support for quantized bmm (#4047)
Summary: Pull Request resolved: #4047

The current quantizer only captures "fake" bmm, i.e. torch.matmul calls whose input shapes make them batched matrix multiplications. Add support for `torch.bmm` as well.

Differential Revision: D58959269
1 parent 38046ba commit 92cab3c
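
As context for the change (not part of the commit): the new pattern targets graphs where the batched matmul is written directly as torch.bmm rather than as a torch.matmul with bmm-like shapes. A hypothetical minimal module of that kind:

# Hypothetical example (not from the commit): torch.bmm takes two 3-D tensors
# of shapes (B, M, K) and (B, K, N) and returns (B, M, N). Previously only
# matmuls with such shapes were matched by the quantizer.
import torch

class BmmModule(torch.nn.Module):
    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        return torch.bmm(x, y)

out = BmmModule()(torch.randn(2, 4, 8), torch.randn(2, 8, 16))
print(out.shape)  # torch.Size([2, 4, 16])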

File tree

3 files changed: +26 −1 lines


backends/cadence/aot/quantizer/fusion_pass.py

Lines changed: 4 additions & 1 deletion
@@ -11,6 +11,7 @@
 import torch
 from executorch.backends.cadence.aot.quantizer.patterns import (
     AddmmPattern,
+    BmmPattern,
     Conv1dPattern,
     Conv2dPattern,
     LayerNormFunctionalPattern,
@@ -396,7 +397,9 @@ def call(self, graph_module: fx.GraphModule) -> PassResult:  # noqa: C901
                     other_inputs,
                     quant_node,
                 )
-            elif isinstance(pattern, MatmulPattern):
+            elif isinstance(pattern, BmmPattern) or isinstance(
+                pattern, MatmulPattern
+            ):
                 args, kwargs = get_args_and_kwargs_matmul(
                     inputs_inputs,
                     dequants_inputs,

backends/cadence/aot/quantizer/patterns.py

Lines changed: 20 additions & 0 deletions
@@ -95,6 +95,26 @@ def replacement_op(self):
         return torch.ops.cadence.quantized_linear


+class BmmPattern(QuantizationPattern):
+    def partition_types(self):
+        return [torch.bmm]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
+    ) -> PartitionAnchors:
+        bmm_node = fused_partition[0].nodes[-1]
+
+        return PartitionAnchors(
+            inputs=[(bmm_node, 0), (bmm_node, 1)],
+            weights=[],
+            biases=[],
+            output=[(bmm_node,)],
+        )
+
+    def replacement_op(self):
+        return torch.ops.cadence.quantized_matmul.default
+
+
 class Conv1dPattern(QuantizationPattern):
     def partition_types(self) -> List[Type[torch.nn.Module]]:
         return [torch.nn.Conv1d]
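
Not from the commit: a minimal FX sketch of what BmmPattern anchors. The traced graph contains a single call_function node targeting torch.bmm; its two positional args (indices 0 and 1) become the quantized-input anchors, there are no weight or bias anchors, and the node itself is the output anchor.

# Illustrative sketch (assumption, not part of the commit).
import torch
import torch.fx

def batched_matmul(x, y):
    return torch.bmm(x, y)

gm = torch.fx.symbolic_trace(batched_matmul)
bmm_node = next(n for n in gm.graph.nodes if n.target is torch.bmm)
print(bmm_node.op, bmm_node.args)  # call_function (x, y): the two anchored inputs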

backends/cadence/aot/quantizer/quantizer.py

Lines changed: 2 additions & 0 deletions
@@ -9,6 +9,7 @@
 import torch
 from executorch.backends.cadence.aot.quantizer.patterns import (
     AddmmPattern,
+    BmmPattern,
     Conv1dPattern,
     Conv2dPattern,
     LayerNormFunctionalPattern,
@@ -133,6 +134,7 @@ def __init__(self):
         super().__init__(
             [
                 CadenceGenericQuantizer(AddmmPattern(), static_qconfig),
+                CadenceGenericQuantizer(BmmPattern(), static_qconfig),
                 CadenceGenericQuantizer(Conv1dPattern(), static_qconfig),
                 CadenceGenericQuantizer(Conv2dPattern(), static_qconfig),
                 CadenceGenericQuantizer(LayerNormPattern(), static_qconfig),
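
Not part of the commit: a sketch of how the updated quantizer might be exercised end-to-end. It assumes the composed quantizer defined in this file is exported as CadenceQuantizer and uses the generic PT2E prepare/convert flow; the export entry point has changed names across PyTorch releases, so treat the capture call as illustrative.

# Illustrative only; class and entry-point names are assumptions, not
# guaranteed by this commit.
import torch
from executorch.backends.cadence.aot.quantizer.quantizer import CadenceQuantizer
from torch._export import capture_pre_autograd_graph
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

class BmmModule(torch.nn.Module):
    def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
        return torch.bmm(x, y)

inputs = (torch.randn(2, 4, 8), torch.randn(2, 8, 16))

# Capture a pre-autograd graph (API location/name varies by PyTorch version).
exported = capture_pre_autograd_graph(BmmModule(), inputs)

quantizer = CadenceQuantizer()
prepared = prepare_pt2e(exported, quantizer)  # annotates the torch.bmm node via BmmPattern
prepared(*inputs)                             # calibration pass to collect observer stats
converted = convert_pt2e(prepared)            # inserts quant/dequant pairs around bmm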
