Implement bmm op for Arm backend

Erik-Lundell · web-flow · commit 66b2f73c5fd3 · 2024-08-28T19:47:05.000-07:00
Differential Revision: D61852906 Pull Request resolved: #4926
diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py
@@ -40,6 +40,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
             exir_ops.edge.aten.addmm.default,
             exir_ops.edge.aten.expand_copy.default,
             exir_ops.edge.aten.cat.default,
+            exir_ops.edge.aten.bmm.default,
             exir_ops.edge.aten.permute_copy.default,
             exir_ops.edge.aten.hardtanh.default,
             exir_ops.edge.aten.convolution.default,
diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py
@@ -9,6 +9,7 @@
     op_addmm,
     op_avg_pool2d,
     op_batch_norm,
+    op_bmm,
     op_cat,
     op_conv2d,
     op_dequant,
diff --git a/backends/arm/operators/op_bmm.py b/backends/arm/operators/op_bmm.py
@@ -0,0 +1,82 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+from typing import List
+
+import serializer.tosa_serializer as ts
+import torch.fx
+from executorch.backends.arm.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.arm.tosa_mapping import TosaArg
+from executorch.backends.arm.tosa_quant_utils import build_rescale, get_quant_node_args
+from executorch.backends.arm.tosa_utils import get_two_inputs
+from serializer.tosa_serializer import TosaOp
+
+
+@register_node_visitor
+class BMMVisitor(NodeVisitor):
+    target = "aten.bmm.default"
+
+    def __init__(self, *args):
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: torch.fx.Node,
+        tosa_graph: ts.TosaSerializer,
+        inputs: List[TosaArg],
+        output: TosaArg,
+        is_quant_node: bool,
+    ) -> None:
+        input0, input1 = get_two_inputs(node)
+
+        # aten.bmm is lowered to a single TOSA MATMUL operator.
+        # NOTE: For now, only INT8 and FP32 are supported.
+
+        # For INT8, read both input zero points and send the MATMUL result to an
+        # INT32 intermediate tensor; it is rescaled into `output` further down.
+        if is_quant_node:
+            input0_zp = get_quant_node_args(input0).zp
+            input1_zp = get_quant_node_args(input1).zp
+            bmm_result = tosa_graph.addIntermediate(output.shape, ts.DType.INT32)
+            bmm_output_name = bmm_result.name
+        else:
+            input0_zp, input1_zp = 0, 0
+            bmm_output_name = output.name
+
+        # Add the MATMUL to the TOSA graph, passing the zero points as attributes.
+        attr = ts.TosaSerializerAttribute()
+        attr.MatMulAttribute(A_zp=input0_zp, B_zp=input1_zp)
+
+        tosa_graph.addOperator(
+            TosaOp.Op().MATMUL,
+            [input0.name, input1.name],
+            [bmm_output_name],
+            attr,
+        )
+
+        # The INT8 path wrote an INT32 result above; rescale it back to INT8.
+        if is_quant_node:
+            input0_q_params = get_quant_node_args(input0)
+            input1_q_params = get_quant_node_args(input1)
+            output_q_params = get_quant_node_args(list(node.users)[0])
+
+            final_output_scale = (
+                input0_q_params.scale * input1_q_params.scale
+            ) / output_q_params.scale
+
+            build_rescale(
+                tosa_fb=tosa_graph,
+                scale=final_output_scale,
+                input_node=bmm_result,
+                output_name=output.name,
+                output_type=ts.DType.INT8,
+                output_shape=bmm_result.shape,
+                input_zp=0,
+                output_zp=output_q_params.zp,
+                is_double_round=False,
+            )
diff --git a/backends/arm/quantizer/quantization_annotation/mm_annotator.py b/backends/arm/quantizer/quantization_annotation/mm_annotator.py
@@ -22,7 +22,7 @@ def _annotate_mm(
     quantization_config: QuantizationConfig,
     filter_fn: Optional[Callable[[Node], bool]] = None,
 ) -> Optional[List[List[Node]]]:
-    mm_partitions = get_source_partitions(gm.graph, [torch.mm], filter_fn)
+    mm_partitions = get_source_partitions(gm.graph, [torch.mm, torch.bmm], filter_fn)
     mm_partitions = list(itertools.chain.from_iterable(mm_partitions.values()))
     annotated_partitions = []
     for mm_partition in mm_partitions:
diff --git a/backends/arm/test/ops/test_bmm.py b/backends/arm/test/ops/test_bmm.py
@@ -0,0 +1,135 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+from typing import Tuple
+
+import torch
+from executorch.backends.arm.test import common
+from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from parameterized import parameterized
+
+torch.manual_seed(1)
+
+
+class TestBMM(unittest.TestCase):
+    """Tests Batch MatMul"""
+
+    class BMM(torch.nn.Module):
+        test_parameters = [
+            (torch.rand(5, 3, 5), torch.rand(5, 5, 2)),
+            (torch.rand(2, 1, 1), torch.rand(2, 1, 1)),
+            (torch.ones(1, 55, 3), torch.ones(1, 3, 44)),
+            (10000 * torch.randn(10, 1, 10), torch.randn(10, 10, 5)),
+            (-10 * torch.randn(2, 32, 64), 5 + 5 * torch.randn(2, 64, 32)),
+        ]
+
+        def forward(self, x, y):
+            return torch.bmm(x, y)
+
+    class BMMSingleInput(torch.nn.Module):
+        test_parameters = [
+            (torch.rand(20, 3, 3),),
+            (torch.ones(2, 128, 128),),
+            (10000 * torch.randn(4, 25, 25),),
+            (5 + 5 * torch.randn(3, 64, 64),),
+        ]
+
+        def forward(self, x):
+            return torch.bmm(x, x)
+
+    def _test_bmm_tosa_MI_pipeline(
+        self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, ...]
+    ):
+        (
+            ArmTester(
+                module,
+                example_inputs=test_data,
+                compile_spec=common.get_tosa_compile_spec(),
+            )
+            .export()
+            .check_count({"torch.ops.aten.bmm.default": 1})
+            .check_not(["torch.ops.quantized_decomposed"])
+            .to_edge()
+            .partition()
+            .check_not(["executorch_exir_dialects_edge__ops_aten_bmm_default"])
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .to_executorch()
+            .run_method_and_compare_outputs(inputs=test_data)
+        )
+
+    def _test_bmm_tosa_BI_pipeline(
+        self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, ...]
+    ):
+        (
+            ArmTester(
+                module,
+                example_inputs=test_data,
+                compile_spec=common.get_tosa_compile_spec(),
+            )
+            .quantize()
+            .export()
+            .check_count({"torch.ops.aten.bmm.default": 1})
+            .check(["torch.ops.quantized_decomposed"])
+            .to_edge()
+            .partition()
+            .check_not(["executorch_exir_dialects_edge__ops_aten_bmm_default"])
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .to_executorch()
+            .run_method_and_compare_outputs(inputs=test_data)
+        )
+
+    def _test_bmm_u55_BI_pipeline(
+        self, module: torch.nn.Module, test_data: Tuple[torch.Tensor, ...]
+    ):
+        (
+            ArmTester(
+                module,
+                example_inputs=test_data,
+                compile_spec=common.get_u55_compile_spec(),
+            )
+            .quantize()
+            .export()
+            .check_count({"torch.ops.aten.bmm.default": 1})
+            .check(["torch.ops.quantized_decomposed"])
+            .to_edge()
+            .partition()
+            .check_count({"torch.ops.higher_order.executorch_call_delegate": 1})
+            .to_executorch()
+        )
+
+    @parameterized.expand(BMM.test_parameters)
+    def test_bmm_tosa_MI(self, operand1: torch.Tensor, operand2: torch.Tensor):
+        test_data = (operand1, operand2)
+        self._test_bmm_tosa_MI_pipeline(self.BMM(), test_data)
+
+    @parameterized.expand(BMMSingleInput.test_parameters)
+    def test_bmm_single_input_tosa_MI(self, operand1: torch.Tensor):
+        test_data = (operand1,)
+        self._test_bmm_tosa_MI_pipeline(self.BMMSingleInput(), test_data)
+
+    @parameterized.expand(BMM.test_parameters)
+    def test_bmm_tosa_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
+        test_data = (operand1, operand2)
+        self._test_bmm_tosa_BI_pipeline(self.BMM(), test_data)
+
+    @parameterized.expand(BMMSingleInput.test_parameters)
+    def test_bmm_single_input_tosa_BI(self, operand1: torch.Tensor):
+        test_data = (operand1,)
+        self._test_bmm_tosa_BI_pipeline(self.BMMSingleInput(), test_data)
+
+    @parameterized.expand(BMM.test_parameters)
+    def test_bmm_u55_BI(self, operand1: torch.Tensor, operand2: torch.Tensor):
+        test_data = (operand1, operand2)  # U55 flow only lowers; outputs are not compared
+        self._test_bmm_u55_BI_pipeline(self.BMM(), test_data)
+
+    # Expected to fail with error: Warning, unsupported fusing of TOSA Rescale previous operator is of type: Memcpy
+    @parameterized.expand(BMMSingleInput.test_parameters)
+    @unittest.expectedFailure
+    def test_bmm_single_input_u55_BI(self, operand1: torch.Tensor):
+        test_data = (operand1,)
+        self._test_bmm_u55_BI_pipeline(self.BMMSingleInput(), test_data)