Add lowering of TOSA.MIN and TOSA.MAX

per · per · commit 70f95d083e71 · 2024-12-16T09:45:15.000+01:00
Uses the fold DQ/Q pass to encapsulate the quantization information within the node.

Signed-off-by: Per Åstrand <per.astrand@arm.com>
Change-Id: I3adbab7e2a23a0208a03bbc423b38c15221a4959
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
@@ -29,6 +29,9 @@
     DecomposeSoftmaxesPass,
 )
 from executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass
+from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
+    FoldAndAnnotateQParamsPass,
+)
 from executorch.backends.arm._passes.keep_dims_false_to_squeeze_pass import (
     KeepDimsFalseToSqueezePass,
 )
@@ -50,6 +53,7 @@
 from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass
 from executorch.exir import ExportedProgram
 from executorch.exir.backend.compile_spec_schema import CompileSpec
+from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_manager import PassManager
 
 
@@ -80,6 +84,14 @@ def transform_to_backend_pipeline(
         self.add_pass(Conv1dUnsqueezePass(exported_program))
         self.add_pass(DecomposeSoftmaxesPass())
         self.add_pass(DecomposeLinearPass())
+        self.add_pass(
+            FoldAndAnnotateQParamsPass(
+                [
+                    exir_ops.edge.aten.minimum.default,
+                    exir_ops.edge.aten.maximum.default,
+                ]
+            )
+        )
         for spec in compile_spec:
             if spec.key == "permute_memory_format":
                 memory_format = spec.value.decode()
diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py
@@ -94,6 +94,8 @@ def is_node_supported(self, submodules, node: fx.Node) -> bool:
             exir_ops.edge.aten.sigmoid.default,
             exir_ops.edge.aten.mean.dim,
             exir_ops.edge.aten.mm.default,
+            exir_ops.edge.aten.minimum.default,
+            exir_ops.edge.aten.maximum.default,
             exir_ops.edge.aten.repeat.default,
             exir_ops.edge.aten.reciprocal.default,
             exir_ops.edge.aten.relu.default,
diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py
@@ -19,7 +19,9 @@
     op_get_item,
     op_hardtanh,
     op_log,
+    op_max,
     op_max_pool2d,
+    op_min,
     op_mm,
     op_mul,
     op_permute,
diff --git a/backends/arm/operators/op_max.py b/backends/arm/operators/op_max.py
@@ -0,0 +1,81 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+from typing import cast, List
+
+import executorch.backends.arm.tosa_quant_utils as tqutils
+
+import serializer.tosa_serializer as ts
+from executorch.backends.arm.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.arm.tosa_mapping import TosaArg
+from executorch.backends.arm.tosa_utils import tosa_shape
+
+from serializer.tosa_serializer import TosaOp
+from torch.fx import Node
+
+
+@register_node_visitor
+class MaxVisitor(NodeVisitor):
+    """Lower ``aten.maximum.default`` to a TOSA ``MAXIMUM`` operator.
+
+    INT8 inputs are first passed through
+    ``tqutils.insert_rescale_ops_to_int32``; MAXIMUM is then emitted on an
+    INT32 intermediate and, when the output dtype is INT8, the result is
+    handed to ``tqutils.insert_rescale_node_back_to_int8``. For any other
+    input dtype MAXIMUM is emitted directly on the given inputs and output.
+    """
+
+    target = "aten.maximum.default"
+
+    def __init__(self, *args):
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: Node,
+        tosa_graph: ts.TosaSerializer,
+        inputs: List[TosaArg],
+        output: TosaArg,
+        is_quant_node: bool,
+    ) -> None:
+        """Serialize ``node`` into ``tosa_graph`` as a MAXIMUM operator.
+
+        Args:
+            node: The FX node being lowered; ``node.meta["input_qparams"]``
+                is read unconditionally.
+            tosa_graph: Serializer that receives the operator(s).
+            inputs: The two operands; both must have the same dtype.
+            output: Destination tensor. Its ``shape`` is overwritten with the
+                result of ``tosa_shape`` on the INT8 path.
+            is_quant_node: Not consulted by this visitor.
+
+        Raises:
+            AssertionError: If the input dtypes differ, or if INT8 inputs do
+                not share zero point and scale.
+            KeyError: If ``node.meta`` has no ``"input_qparams"`` entry.
+        """
+        assert inputs[0].dtype == inputs[1].dtype
+
+        input_qparams = cast(dict[int, tqutils.QuantArgs], node.meta["input_qparams"])
+        # Tensor that MAXIMUM writes to; swapped for an INT32 intermediate below
+        # when the inputs are INT8.
+        max_output = output
+
+        if inputs[0].dtype == ts.DType.INT8:
+            # insert RESCALEs to int32
+            x_scale = input_qparams[0].scale
+            x_zp = input_qparams[0].zp
+
+            y_scale = input_qparams[1].scale
+            y_zp = input_qparams[1].zp
+
+            assert (
+                x_zp == y_zp
+            ), "Different zp for inputs, MAX should be quantized with shared quantization!"
+            assert (
+                x_scale == y_scale
+            ), "Different scale for input, MAX should be quantized with shared quantization!"
+
+            operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(
+                tosa_graph, inputs, node
+            )
+
+            output.shape = tosa_shape(output.shape, output.dim_order)
+            max_output = tosa_graph.addIntermediate(output.shape, ts.DType.INT32)
+        else:
+            operand_inputs = inputs
+
+        tosa_graph.addOperator(
+            TosaOp.Op().MAXIMUM,
+            [
+                operand_inputs[0].name,
+                operand_inputs[1].name,
+            ],
+            [max_output.name],
+        )
+
+        # NOTE(review): scale_back is only bound in the INT8 input branch above,
+        # so this relies on an INT8 output always coming with INT8 inputs.
+        if output.dtype == ts.DType.INT8:
+            # insert RESCALE from int32 back to int8
+            tqutils.insert_rescale_node_back_to_int8(
+                tosa_graph, max_output, scale_back, node
+            )
diff --git a/backends/arm/operators/op_min.py b/backends/arm/operators/op_min.py
@@ -0,0 +1,81 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+from typing import cast, List
+
+import executorch.backends.arm.tosa_quant_utils as tqutils
+
+import serializer.tosa_serializer as ts
+from executorch.backends.arm.operators.node_visitor import (
+    NodeVisitor,
+    register_node_visitor,
+)
+from executorch.backends.arm.tosa_mapping import TosaArg
+from executorch.backends.arm.tosa_utils import tosa_shape
+
+from serializer.tosa_serializer import TosaOp
+from torch.fx import Node
+
+
+@register_node_visitor
+class MinVisitor(NodeVisitor):
+    """Node visitor that emits a TOSA ``MINIMUM`` for ``aten.minimum.default``.
+
+    INT8 operands take a detour through INT32: they are rescaled with
+    ``tqutils.insert_rescale_ops_to_int32``, MINIMUM writes to an INT32
+    intermediate, and an INT8 output is restored with
+    ``tqutils.insert_rescale_node_back_to_int8``. Other dtypes are lowered
+    to a single MINIMUM on the original tensors.
+    """
+
+    target = "aten.minimum.default"
+
+    def __init__(self, *args):
+        super().__init__(*args)
+
+    def define_node(
+        self,
+        node: Node,
+        tosa_graph: ts.TosaSerializer,
+        inputs: List[TosaArg],
+        output: TosaArg,
+        is_quant_node: bool,
+    ) -> None:
+        """Add the MINIMUM operator (plus rescales for INT8) to ``tosa_graph``.
+
+        Both entries of ``inputs`` must share a dtype, and INT8 inputs must
+        share zero point and scale; violations trip an assertion.
+        ``node.meta["input_qparams"]`` is looked up for every dtype.
+        ``is_quant_node`` is not used here.
+        """
+        assert inputs[0].dtype == inputs[1].dtype
+
+        input_qparams = cast(dict[int, tqutils.QuantArgs], node.meta["input_qparams"])
+
+        # Defaults for the non-INT8 path: operate on the tensors as given.
+        operands = inputs
+        result = output
+
+        if inputs[0].dtype == ts.DType.INT8:
+            lhs_qargs = input_qparams[0]
+            lhs_scale, lhs_zp = lhs_qargs.scale, lhs_qargs.zp
+            rhs_qargs = input_qparams[1]
+            rhs_scale, rhs_zp = rhs_qargs.scale, rhs_qargs.zp
+
+            assert (
+                lhs_zp == rhs_zp
+            ), "Different zp for inputs, MIN should be quantized with shared quantization!"
+            assert (
+                lhs_scale == rhs_scale
+            ), "Different scale for input, MIN should be quantized with shared quantization!"
+
+            # Widen both operands to INT32 and compute into an INT32 scratch tensor.
+            operands, scale_back = tqutils.insert_rescale_ops_to_int32(
+                tosa_graph, inputs, node
+            )
+            output.shape = tosa_shape(output.shape, output.dim_order)
+            result = tosa_graph.addIntermediate(output.shape, ts.DType.INT32)
+
+        operand_names = [operands[0].name, operands[1].name]
+        result_names = [result.name]
+        tosa_graph.addOperator(TosaOp.Op().MINIMUM, operand_names, result_names)
+
+        if output.dtype == ts.DType.INT8:
+            # Narrow the INT32 result back to the INT8 output.
+            tqutils.insert_rescale_node_back_to_int8(
+                tosa_graph, result, scale_back, node
+            )
diff --git a/backends/arm/quantizer/arm_quantizer.py b/backends/arm/quantizer/arm_quantizer.py
@@ -77,6 +77,7 @@ def _supported_symmetric_quantized_operators() -> Dict[str, List[OperatorPattern
         ],
         "mul": [[torch.mul]],
         "sub": [[torch.sub]],
+        "min_max": [[torch.min], [torch.max]],
     }
     return copy.deepcopy(supported_operators)
 
@@ -267,6 +268,7 @@ class ArmQuantizer(Quantizer):
         "add",
         "sub",
         "mul",
+        "min_max",
         "mm",
         "one_to_one",
         "generic",
diff --git a/backends/arm/quantizer/quantization_annotation/__init__.py b/backends/arm/quantizer/quantization_annotation/__init__.py
@@ -55,6 +55,7 @@ def decorator(annotator: AnnotatorType):
     generic_annotator,
     linear_annotator,
     max_pool2d_annotator,
+    min_max_annotator,
     mm_annotator,
     mul_annotator,
     one_to_one_annotator,
diff --git a/backends/arm/quantizer/quantization_annotation/min_max_annotator.py b/backends/arm/quantizer/quantization_annotation/min_max_annotator.py
@@ -0,0 +1,46 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# Copyright 2024 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+from typing import Callable, List, Optional
+
+import torch
+from executorch.backends.arm.quantizer import arm_quantizer_utils
+from executorch.backends.arm.quantizer.quantization_annotation import register_annotator
+from executorch.backends.arm.quantizer.quantization_config import QuantizationConfig
+from torch.ao.quantization.quantizer import QuantizationAnnotation
+from torch.fx import GraphModule, Node
+
+
+@register_annotator("min_max")
+def _annotate_min_max(
+    gm: GraphModule,
+    quantization_config: QuantizationConfig,
+    filter_fn: Optional[Callable[[Node], bool]] = None,
+) -> Optional[List[List[Node]]]:
+    """Annotate ``aten.minimum``/``aten.maximum`` nodes for quantization.
+
+    Every node in ``gm.graph`` whose target is ``aten.minimum.default`` or
+    ``aten.maximum.default`` is recorded as a single-node partition. Nodes
+    that are not yet annotated receive a ``QuantizationAnnotation`` built
+    from ``arm_quantizer_utils.get_shared_qspec``, provided that helper
+    returns an input qspec map.
+
+    Args:
+        gm: Graph module whose nodes are scanned and annotated in place.
+        quantization_config: Forwarded to ``get_shared_qspec``.
+        filter_fn: Accepted for signature compatibility with other
+            annotators; it is not consulted here.
+
+    Returns:
+        One single-element list per matched node, including nodes that were
+        already annotated or for which no qspec could be derived.
+    """
+    annotated_partitions: List[List[Node]] = []
+    for node in gm.graph.nodes:
+        if node.target not in (
+            torch.ops.aten.minimum.default,
+            torch.ops.aten.maximum.default,
+        ):
+            continue
+        # Wrap the node in a list so the result matches the declared
+        # List[List[Node]] return type (one partition per matched node).
+        annotated_partitions.append([node])
+        min_max_node = node
+        if arm_quantizer_utils.is_annotated(min_max_node):
+            continue
+
+        input_qspec_map, output_qspec = arm_quantizer_utils.get_shared_qspec(
+            min_max_node, gm, quantization_config
+        )
+        # Leave the node untouched when no input qspec map is available.
+        if input_qspec_map is not None:
+            min_max_node.meta["quantization_annotation"] = QuantizationAnnotation(
+                input_qspec_map=input_qspec_map,
+                output_qspec=output_qspec,
+                _annotated=True,
+            )
+    return annotated_partitions
diff --git a/backends/arm/test/ops/test_maximum.py b/backends/arm/test/ops/test_maximum.py
diff --git a/backends/arm/test/ops/test_minimum.py b/backends/arm/test/ops/test_minimum.py