Arm backend: Add MM to BMM pass (#7341)

Sebastian-Larsson · web-flow · commit 05abf894f1c6 · 2025-01-09T16:00:44.000+01:00
aten.mm only operates on rank 2 input and output tensors, whereas TOSA
requires rank 3 for the MM operation. Therefore, create a pass that
converts any MM nodes to BMM (which always has rank 3). The pass
unsqueezes the rank 2 input tensors to rank 3 and squeezes the rank 3
output tensor back to rank 2.

As a result of the new pass, op_mm.py is no longer required and has
been removed.


Change-Id: I8459dd73bb366452b5139b48a5724c300b2d5a26

Signed-off-by: Sebastian Larsson &lt;sebastian.larsson@arm.com&gt;
diff --git a/backends/arm/_passes/annotate_decomposed_matmul.py b/backends/arm/_passes/annotate_decomposed_matmul.py
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -36,7 +36,6 @@ def call(self, graph_module: GraphModule) -> PassResult:
             itertools.chain.from_iterable(matmul_partitions.values())
         )
         matmul_targets = {
-            exir_ops.edge.aten.mm.default,
             exir_ops.edge.aten.bmm.default,
         }
         for partition in matmul_partitions:
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
@@ -1,5 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -45,6 +45,7 @@
 from executorch.backends.arm._passes.meandim_to_averagepool_pass import (
     ConvertMeanDimToAveragePool,
 )
+from executorch.backends.arm._passes.mm_to_bmm_pass import ConvertMmToBmmPass
 from executorch.backends.arm._passes.remove_clone_pass import RemoveClonePass
 from executorch.backends.arm._passes.scalars_to_attribute_pass import (
     ScalarsToAttributePass,
@@ -79,6 +80,7 @@ def transform_to_backend_pipeline(
         self.add_pass(ConvertMeanDimToAveragePool())
         self.add_pass(DecomposeMeanDimPass())
         self.add_pass(ConvertSplitToSlicePass())
+        self.add_pass(ConvertMmToBmmPass())
         # TODO MLETORCH-558
         self.add_pass(AnnotateDecomposedMatmulPass())
         self.add_pass(QuantizeFullArgument())
@@ -99,7 +101,6 @@ def transform_to_backend_pipeline(
                     exir_ops.edge.aten.hardtanh.default,
                     exir_ops.edge.aten.log.default,
                     exir_ops.edge.aten.max_pool2d.default,
-                    exir_ops.edge.aten.mm.default,
                     exir_ops.edge.aten.mul.Tensor,
                     exir_ops.edge.aten.permute_copy.default,
                     exir_ops.edge.aten.reciprocal.default,
diff --git a/backends/arm/_passes/mm_to_bmm_pass.py b/backends/arm/_passes/mm_to_bmm_pass.py
@@ -0,0 +1,98 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.backends.arm._passes.arm_pass_utils import (
+    create_node,
+    get_first_fake_tensor,
+    insert_q_dq_pair,
+)
+from executorch.backends.arm.tosa_quant_utils import dq_op, q_op
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch.fx import Node
+
+
+class ConvertMmToBmmPass(ExportPass):
+    """
+    This pass converts a MM node to a BMM one and turns input and output tensors
+    from rank 2 to rank 3. The TOSA specification requires rank 3. The graph is
+    modified to do the following:
+    1) Unsqueeze input tensors to rank 3.
+    2) Convert MM node to BMM.
+    3) Squeeze output tensor to rank 2.
+    """
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        modified_graph = False
+        graph = graph_module.graph
+        node_list = graph.find_nodes(
+            op="call_function", target=exir_ops.edge.aten.mm.default
+        )
+        for node in node_list:
+            # Unsqueeze input tensors to rank 3
+            for input_node in node.args:
+                if not isinstance(input_node, Node):
+                    continue
+
+                shape = get_first_fake_tensor(input_node).shape
+                rank = len(shape)
+                if rank != 2:
+                    raise RuntimeError(f"Input tensor has rank {rank}, must be 2")
+
+                with graph.inserting_before(node):
+                    unsqueeze_before = create_node(
+                        graph, exir_ops.edge.aten.unsqueeze_copy.default
+                    )
+                    unsqueeze_before.args = (
+                        input_node,  # Input is node's original input
+                        0,
+                    )
+                    node.replace_input_with(input_node, unsqueeze_before)
+
+                # If Quantized we must insert unsqueeze --> q --> dq --> node
+                if input_node.target == dq_op:
+                    q_params = input_node.args[1:]
+                    insert_q_dq_pair(graph, unsqueeze_before, q_params)
+
+            # Replace mm node with bmm
+            with graph.inserting_before(node):
+                bmm_node = create_node(
+                    graph,
+                    exir_ops.edge.aten.bmm.default,
+                )
+                bmm_node.args = node.args
+                node.replace_all_uses_with(bmm_node)
+                graph.erase_node(node)
+
+            # Squeeze output tensor back to rank 2
+            with graph.inserting_after(bmm_node):
+                squeeze_after = create_node(
+                    graph,
+                    exir_ops.edge.aten.squeeze_copy.dims,
+                )
+                squeeze_after.args = (
+                    bmm_node,
+                    [0],
+                )
+                original_users = [
+                    user for user in bmm_node.users if user != squeeze_after
+                ]
+                for user in original_users:
+                    user.replace_input_with(bmm_node, squeeze_after)
+
+            # If quantized, insert bmm --> q --> dq --> squeeze
+            if all(original_user.target == q_op for original_user in original_users):
+                q_params = original_users[0].args[1:]
+                insert_q_dq_pair(graph, bmm_node, q_params)
+
+            modified_graph = True
+
+        if modified_graph:
+            graph_module.recompile()
+            graph_module = super().call(graph_module).graph_module
+
+        return PassResult(graph_module, modified_graph)
diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py
@@ -1,4 +1,4 @@
-# Copyright 2023-2024 Arm Limited and/or its affiliates.
+# Copyright 2023-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -22,7 +22,6 @@
     op_max,
     op_max_pool2d,
     op_min,
-    op_mm,
     op_mul,
     op_permute,
     op_quant,
diff --git a/backends/arm/operators/op_mm.py b/backends/arm/operators/op_mm.py

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-# Copyright 2024 Arm Limited and/or its affiliates.`
	`1`	`+# Copyright 2024-2025 Arm Limited and/or its affiliates.`
`2`	`2`	`# All rights reserved.`
`3`	`3`	`#`
`4`	`4`	`# This source code is licensed under the BSD-style license found in the`
`@@ -36,7 +36,6 @@ def call(self, graph_module: GraphModule) -> PassResult:`
`36`	`36`	`itertools.chain.from_iterable(matmul_partitions.values())`
`37`	`37`	`)`
`38`	`38`	`matmul_targets = {`
`39`		`- exir_ops.edge.aten.mm.default,`
`40`	`39`	`exir_ops.edge.aten.bmm.default,`
`41`	`40`	`}`
`42`	`41`	`for partition in matmul_partitions:`