Arm backend: Add additional tosa_supported_op checks for BI (#8593)

oscarandersson8218 · Erik-Lundell · web-flow · commit c35df8bfe6b0 · 2025-02-20T14:53:11.000+01:00
Add additional tosa_supported_op checks for BI

If a TosaSpecification without floating point support is used,
additional checks will be made during partitioning to make sure that we
don't partition operators that:
- are not quantized properly, i.e. do not have a dq-q pair
  surrounding them.
- should have been decomposed prior to quantization, e.g. div should
  be decomposed to a mul and recip before quantization.

Signed-off-by: Oscar Andersson &lt;oscar.andersson@arm.com&gt;
Co-authored-by: Erik Lundell &lt;erik.lundell@arm.com&gt;
diff --git a/backends/arm/_passes/fuse_quantized_activation_pass.py b/backends/arm/_passes/fuse_quantized_activation_pass.py
@@ -13,7 +13,8 @@
 
 
 class FuseQuantizedActivationPass(ExportPass):
-    def _is_fuseable_quantized_activation(self, node: Node):
+    @staticmethod
+    def _is_fuseable_quantized_activation(node: Node):
         """Fuse activations that have a 0 lower bound and quantized with a qmin zero-point"""
         is_fuseable = node.target == exir_ops.edge.aten.relu.default
         if node.target == exir_ops.edge.aten.hardtanh.default:
@@ -29,7 +30,8 @@ def _is_fuseable_quantized_activation(self, node: Node):
         else:
             return False
 
-    def _is_fuseable_input(self, node: Node):
+    @staticmethod
+    def _is_fuseable_input(node: Node):
         return (
             node.target
             in (
@@ -45,11 +47,11 @@ def call(self, graph_module: torch.fx.GraphModule):
             if node.op != "call_function":
                 continue
 
-            if not self._is_fuseable_quantized_activation(node):
+            if not FuseQuantizedActivationPass._is_fuseable_quantized_activation(node):
                 continue
 
             input_node = node.args[0]
-            if not self._is_fuseable_input(input_node):
+            if not FuseQuantizedActivationPass._is_fuseable_input(input_node):
                 continue
 
             node.replace_all_uses_with(input_node)
diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py
@@ -5,13 +5,22 @@
 
 # pyre-unsafe
 
+import itertools
 import operator
+import typing
 from typing import final, Optional, Sequence, Type
 
+import torch
+
 import torch.fx as fx
+from executorch.backends.arm._passes.arm_pass_utils import get_first_fake_tensor
+from executorch.backends.arm._passes.fuse_quantized_activation_pass import (
+    FuseQuantizedActivationPass,
+)
 from executorch.backends.arm.tosa_specification import TosaSpecification
 from executorch.exir.dialects._ops import ops as exir_ops
 from torch.fx.passes.operator_support import any_chain, chain, OperatorSupportBase
+from torch.fx.passes.utils.source_matcher_utils import get_source_partitions
 
 
 class SupportedTOSAOperatorCheck(OperatorSupportBase):
@@ -27,7 +36,9 @@ def __init__(self, tosa_spec: TosaSpecification):
     targets: list[str] = []
 
     @final
-    def is_node_supported(self, submodules, node: fx.Node) -> bool:
+    def is_node_supported(
+        self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
+    ) -> bool:
         if node.target not in self.targets:
             return False
         return self.is_node_tosa_supported(node, self.tosa_spec)
@@ -75,6 +86,10 @@ def tosa_support_factory(
     tosa_spec: TosaSpecification,
     additional_checks: Optional[Sequence[OperatorSupportBase]] = None,
 ) -> OperatorSupportBase:
+    negative_checks: list[OperatorSupportBase] = []
+    if not tosa_spec.support_float():
+        negative_checks.append(NeedsDecompositionCheck())
+        negative_checks.append(CheckProperQuantization())
     return chain(
         any_chain(
             BaseTOSASupportList(),
@@ -83,13 +98,16 @@ def tosa_support_factory(
                 for check in get_registered_tosa_support_checks(tosa_spec)
             ),
         ),
+        *negative_checks,
         *additional_checks if additional_checks else [],
     )
 
 
 class BaseTOSASupportList(OperatorSupportBase):
 
-    def is_node_supported(self, submodules, node: fx.Node) -> bool:
+    def is_node_supported(
+        self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
+    ) -> bool:
         supported = node.op == "call_function" and node.target in [
             exir_ops.edge.aten.abs.default,
             exir_ops.edge.aten.add.Tensor,
@@ -150,3 +168,154 @@ def is_node_supported(self, submodules, node: fx.Node) -> bool:
         ]
 
         return supported
+
+
+class NeedsDecompositionCheck(OperatorSupportBase):
+    """
+    Targeted operators need to be decomposed prior to quantization in order to get a pair of q-dq-nodes surrounding
+    the operator, and to get optimal quantization parameters for each operator. This check will reject operators
+    that need to be decomposed.
+    """
+
+    def is_node_supported(
+        self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
+    ) -> bool:
+        # Returns False for nodes that must be decomposed first; `submodules` is not used by this check.
+        if node.op != "call_function":
+            return True  # only call_function nodes are ever rejected here
+        if node.target == exir_ops.edge.aten.mean.dim:
+            dim = node.args[1]
+            return dim == [-1, -2]  # mean.dim passes only if dim is exactly the list [-1, -2]
+        needs_decomp = node.target in [
+            exir_ops.edge.aten.div.Tensor,
+            exir_ops.edge.aten._native_batch_norm_legit_no_training.default,
+            exir_ops.edge.aten.native_layer_norm.default,
+            exir_ops.edge.aten.mean.dim,  # never matched here: mean.dim already returned above
+            exir_ops.edge.aten._softmax.default,
+            exir_ops.edge.aten._log_softmax.default,
+            exir_ops.edge.aten.var.correction,
+            exir_ops.edge.aten.var.dim,
+        ]
+        return not needs_decomp
+
+
+class CheckProperQuantization(OperatorSupportBase):
+    """
+    For targeted nodes, check that it has been quantized as expected. In most cases this means that a pair of quantize
+    and dequantize nodes surrounds the node. This is necessary for table operators and operators that need to rescale
+    activations.
+    """
+
+    dq_op = exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default
+    q_op = exir_ops.edge.quantized_decomposed.quantize_per_tensor.default
+
+    def _is_matmul_node_supported(
+        self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
+    ):
+        """
+        Find the matmul source partition containing this node and check that all its inputs and outputs are quantized.
+        """
+        for graph_module in submodules.values():
+            graph_module = typing.cast(fx.GraphModule, graph_module)
+            matmul_partitions = get_source_partitions(
+                graph_module.graph,
+                [
+                    torch.matmul,
+                ],
+                None,
+            )
+            matmul_partitions = list(
+                itertools.chain.from_iterable(matmul_partitions.values())
+            )
+            matched_partition = None
+            for partition in matmul_partitions:
+                if node in partition.nodes:
+                    matched_partition = partition  # no break: the last matching partition wins
+            if matched_partition is not None:
+                input_quantized = all(
+                    input_node.target == self.dq_op
+                    for input_node in matched_partition.input_nodes
+                )
+                if not input_quantized:
+                    return False
+                output_quantized = all(
+                    output_node_user.target == self.q_op
+                    for output_node_user in matched_partition.output_nodes[0].users  # only the first output node
+                )
+                if not output_quantized:
+                    return False
+            else:
+                return False  # NOTE(review): one submodule without a matching partition rejects the node — confirm
+
+        return True
+
+    def is_node_supported(
+        self, submodules: typing.Mapping[str, torch.nn.Module], node: fx.Node
+    ) -> bool:
+        output_quantized = False
+        input_quantized = False
+        if node.target not in (
+            exir_ops.edge.aten.add.Tensor,
+            exir_ops.edge.aten.avg_pool2d.default,
+            exir_ops.edge.aten.bmm.default,
+            exir_ops.edge.aten.convolution.default,
+            exir_ops.edge.aten.exp.default,
+            exir_ops.edge.aten.hardtanh.default,
+            exir_ops.edge.aten.linear.default,
+            exir_ops.edge.aten.log.default,
+            exir_ops.edge.aten.max_pool2d_with_indices.default,
+            exir_ops.edge.aten.mm.default,
+            exir_ops.edge.aten.mul.Tensor,
+            exir_ops.edge.aten.reciprocal.default,
+            exir_ops.edge.aten.relu.default,
+            exir_ops.edge.aten.rsqrt.default,
+            exir_ops.edge.aten.sigmoid.default,
+            exir_ops.edge.aten.sub.Tensor,
+            exir_ops.edge.aten.tanh.default,
+            exir_ops.edge.aten.upsample_nearest2d.vec,
+        ):
+            return True  # not a targeted op: this check has nothing to verify
+        elif node.target in (
+            exir_ops.edge.aten.bmm.default,
+            exir_ops.edge.aten.mm.default,
+        ):
+            source_fn_stack: tuple[typing.Any] = node.meta.get("source_fn_stack", [])
+            if len(source_fn_stack) > 0:
+                if source_fn_stack[-1][1] in (torch.matmul,):
+                    return self._is_matmul_node_supported(submodules, node)
+        # bmm/mm nodes whose last source_fn_stack entry is not torch.matmul fall through to the generic checks below.
+        elif node.target in (exir_ops.edge.aten.max_pool2d_with_indices.default,):
+            users = node.users
+            output_quantized = all(  # every user must be a getitem whose own users are all quantize nodes
+                user.target == operator.getitem
+                and all(user_user.target == self.q_op for user_user in user.users)
+                for user in users
+            )
+        elif FuseQuantizedActivationPass._is_fuseable_input(node):
+            users = node.users
+            output_quantized = all(  # accepted when every user is a fuseable quantized activation
+                FuseQuantizedActivationPass._is_fuseable_quantized_activation(user)
+                for user in users
+            )
+        elif FuseQuantizedActivationPass._is_fuseable_quantized_activation(node):
+            input_node = node.all_input_nodes[0]
+            input_quantized = FuseQuantizedActivationPass._is_fuseable_input(input_node)
+        # Otherwise every input must be a dequantize node or have a non-floating-point dtype per get_first_fake_tensor.
+        input_quantized = input_quantized or all(
+            (input_node.target == self.dq_op)
+            or (not get_first_fake_tensor(input_node).dtype.is_floating_point)
+            for input_node in node.all_input_nodes
+        )
+
+        if not input_quantized:
+            return False
+        # Likewise every user must be a quantize node or have a non-floating-point dtype per get_first_fake_tensor.
+        output_quantized = output_quantized or all(
+            (output_node.target == self.q_op)
+            or (not get_first_fake_tensor(output_node).dtype.is_floating_point)
+            for output_node in node.users
+        )
+
+        if not output_quantized:
+            return False
+        return True
diff --git a/backends/arm/test/misc/test_partition_decomposed_quantized_ops.py b/backends/arm/test/misc/test_partition_decomposed_quantized_ops.py
@@ -0,0 +1,65 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Test that tosa_supported_operators rejects operators that are not
+# quantized properly. This is typically a consequence of a torch op
+# such as Softplus that is decomposed into many other ops without
+# surrounding q/dq nodes.
+
+from typing import Tuple
+
+import torch
+from executorch.backends.arm.test import common
+
+from executorch.backends.arm.test.tester.test_pipeline import (
+    TosaPipelineBI,
+    TosaPipelineMI,
+)
+
+input_t1 = Tuple[torch.Tensor]  # the model input: a tuple holding one tensor
+aten_op: list[str] = ["torch.ops.aten.add.Tensor", "torch.ops.aten.softplus.default"]
+exir_op: list[str] = [
+    "executorch_exir_dialects_edge__ops_aten_add_Tensor",  # exir_op[0]; the BI test checks only exir_op[1:]
+    "executorch_exir_dialects_edge__ops_aten_mul_Tensor",
+    "executorch_exir_dialects_edge__ops_aten_exp_default",
+    "executorch_exir_dialects_edge__ops_aten_div_Tensor",
+]
+
+# NOTE(review): dict[input_t1] names only one type parameter; dict[str, input_t1] looks intended.
+test_data: dict[input_t1] = {
+    "3d_rand": (torch.rand(1, 5, 5),),
+}
+
+
+class Module(torch.nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.softplus = torch.nn.Softplus()
+
+    def forward(self, x: torch.Tensor):
+        return self.softplus(x + x)  # an add feeding Softplus, the two ops listed in aten_op
+
+
+@common.parametrize("test_data", test_data)
+def test_softplus_tosa_MI(test_data: input_t1):
+    pipeline = TosaPipelineMI[input_t1](
+        Module(), test_data=test_data, aten_op=aten_op, exir_op=exir_op
+    )
+    # Remove the "check_count.exir" stage, as there will be more than one delegate.
+    pipeline.pop_stage("check_count.exir")
+    pipeline.run()
+
+
+@common.parametrize("test_data", test_data)
+def test_softplus_tosa_BI(test_data: input_t1):
+    pipeline = TosaPipelineBI[input_t1](
+        Module(), test_data=test_data, aten_op=aten_op, exir_op=exir_op
+    )
+    pipeline.pop_stage("check_not.exir")  # presumably dropped because rejected ops stay in the edge graph — confirm
+    # Check that all ops in exir_op except add (i.e. exir_op[1:]) are rejected.
+    pipeline.add_stage_after(
+        "partition", pipeline.tester.check, exir_op[1:], suffix="exir_post_partition"
+    )
+    pipeline.run()