Skip to content

Commit 9f56b40

Browse files
Martin Lindström
authored and committed
Arm backend: Add POW operator
Implement support for torch.pow in the MI and BI profiles of TOSA. For MI, the operator works as PyTorch's reference implementation, except that the base operand cannot be a scalar but must be a tensor. This is due to a general bug. For BI, the exponent operand must be a scalar and a constant value. The base operand must be a tensor. Change-Id: I9c91b2a19ef43ae2ef884640974017824327dbf3
1 parent 8179aa3 commit 9f56b40

File tree

8 files changed

+322
-16
lines changed

8 files changed

+322
-16
lines changed

backends/arm/_passes/insert_table_ops.py

Lines changed: 72 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,18 +6,20 @@
66

77
# pyre-unsafe
88

9-
from typing import Callable, Dict
9+
from typing import Callable, cast, Dict, Set
1010

1111
import torch
1212
from executorch.backends.arm._passes.arm_pass_utils import create_node
1313
from executorch.backends.arm.tosa_quant_utils import QuantArgs
14+
from executorch.backends.transforms.utils import delete_constant_placeholder
1415
from executorch.exir import ExportedProgram
1516

1617
from executorch.exir.dialects._ops import ops as exir_ops
1718
from executorch.exir.dialects.edge._ops import EdgeOpOverload
1819

1920
from executorch.exir.pass_base import ExportPass, PassResult
2021
from torch.fx import GraphModule
22+
from torch.fx.node import Node
2123
from torch.library import impl, Library
2224

2325
lib = Library("tosa", "DEF")
@@ -29,6 +31,59 @@ def _table_impl(*args, **kwargs): # pyre-ignore
2931
return args[0]
3032

3133

34+
class TableOps:
35+
"""
36+
Helper class for finding the corresponding table operator for a given Node.
37+
"""
38+
39+
def __init__(self, exported_program: ExportedProgram):
40+
self.exported_program = exported_program
41+
42+
# Targets that follow a straigtforward one-to-one mapping to their table op
43+
self.unary_table_ops: Dict[
44+
EdgeOpOverload, Callable[[torch.Tensor], torch.Tensor]
45+
] = {
46+
exir_ops.edge.aten.exp.default: torch.exp,
47+
exir_ops.edge.aten.floor.default: torch.floor,
48+
exir_ops.edge.aten.log.default: torch.log,
49+
exir_ops.edge.aten.reciprocal.default: torch.reciprocal,
50+
exir_ops.edge.aten.rsqrt.default: torch.rsqrt,
51+
exir_ops.edge.aten.sigmoid.default: torch.sigmoid,
52+
exir_ops.edge.aten.tanh.default: torch.tanh,
53+
exir_ops.edge.aten.hardsigmoid.default: torch.nn.functional.hardsigmoid,
54+
exir_ops.edge.aten.hardswish.default: torch.nn.functional.hardswish,
55+
}
56+
57+
# Targets that must be treated explicitly
58+
self.special_table_ops: Set[EdgeOpOverload] = {
59+
exir_ops.edge.aten.pow.Tensor_Tensor,
60+
}
61+
62+
def __contains__(self, node: Node) -> bool:
63+
return (
64+
node.target in self.unary_table_ops or node.target in self.special_table_ops
65+
)
66+
67+
def __getitem__(self, node: Node):
68+
target = cast(EdgeOpOverload, node.target)
69+
if target in self.unary_table_ops:
70+
return self.unary_table_ops[target]
71+
elif target in self.special_table_ops:
72+
match target:
73+
case exir_ops.edge.aten.pow.Tensor_Tensor:
74+
# Exponent is a constant. Retrieve it from the graph and embed it into a lambda.
75+
exp_node = cast(Node, node.args[1])
76+
exp_name = self.exported_program.graph_signature.inputs_to_buffers[
77+
exp_node.name
78+
]
79+
exp = self.exported_program.state_dict[exp_name]
80+
return lambda x: torch.pow(x, exp).flatten()
81+
case _:
82+
raise NotImplementedError("Unhandled table operation")
83+
else:
84+
raise KeyError("Table op for {target} does not exist")
85+
86+
3287
class InsertTableOpsPass(ExportPass):
3388
"""
3489
For ops in self.table_ops they need to be serialized as a TOSA TABLE. This pass replaces these
@@ -37,21 +92,10 @@ class InsertTableOpsPass(ExportPass):
3792
which will be used to produce the table values in operators/op_table.py.
3893
"""
3994

40-
table_ops: Dict[EdgeOpOverload, Callable[[torch.Tensor], torch.Tensor]] = {
41-
exir_ops.edge.aten.exp.default: torch.exp,
42-
exir_ops.edge.aten.floor.default: torch.floor,
43-
exir_ops.edge.aten.log.default: torch.log,
44-
exir_ops.edge.aten.reciprocal.default: torch.reciprocal,
45-
exir_ops.edge.aten.rsqrt.default: torch.rsqrt,
46-
exir_ops.edge.aten.sigmoid.default: torch.sigmoid,
47-
exir_ops.edge.aten.tanh.default: torch.tanh,
48-
exir_ops.edge.aten.hardsigmoid.default: torch.nn.functional.hardsigmoid,
49-
exir_ops.edge.aten.hardswish.default: torch.nn.functional.hardswish,
50-
}
51-
5295
def __init__(self, exported_program: ExportedProgram) -> None:
5396
super().__init__()
5497
self.exported_program = exported_program
98+
self.table_ops = TableOps(exported_program)
5599

56100
def register_buffer(self, buffer_name: str, buffer: torch.Tensor) -> None:
57101
"""
@@ -86,7 +130,7 @@ def f(x: torch.Tensor) -> torch.Tensor:
86130
def call(self, graph_module: GraphModule) -> PassResult:
87131
modified = False
88132
for node in graph_module.graph.nodes:
89-
if node.op != "call_function" or node.target not in self.table_ops:
133+
if node.op != "call_function" or node not in self.table_ops:
90134
continue
91135
input_qparams = node.meta["input_qparams"]
92136
output_qparams = node.meta["output_qparams"]
@@ -104,7 +148,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
104148
assert len(output_qparams) == 1
105149
# Generate table buffer
106150
buffer = self.generate_table_values(
107-
torch_op=self.table_ops[node.target],
151+
torch_op=self.table_ops[node],
108152
in_quantargs=input_qparams[0],
109153
out_quantargs=output_qparams[0],
110154
)
@@ -115,7 +159,19 @@ def call(self, graph_module: GraphModule) -> PassResult:
115159
buffer_name=table_node.name.replace("_default", ""), buffer=buffer
116160
)
117161
node.replace_all_uses_with(table_node)
118-
graph_module.graph.erase_node(node)
162+
163+
if node.target in self.table_ops.special_table_ops:
164+
# The node must be treated explicitly
165+
match node.target:
166+
case exir_ops.edge.aten.pow.Tensor_Tensor:
167+
exp_node = node.args[1]
168+
graph_module.graph.erase_node(node)
169+
delete_constant_placeholder(self.exported_program, exp_node)
170+
case _:
171+
raise NotImplementedError("Unhandled table operation")
172+
else:
173+
graph_module.graph.erase_node(node)
174+
119175
table_node.meta["input_qparams"] = input_qparams
120176
table_node.meta["output_qparams"] = output_qparams
121177
modified = True

backends/arm/_passes/match_arg_ranks_pass.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@ def __init__(self, exported_program):
4545
exir_ops.edge.aten.sub.Tensor,
4646
exir_ops.edge.aten.mul.Tensor,
4747
exir_ops.edge.aten.div.Tensor,
48+
exir_ops.edge.aten.pow.Tensor_Tensor,
4849
]
4950

5051
def _match_op_rank(self, graph_module, node, arg, max_rank):

backends/arm/operator_support/tosa_supported_operators.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,8 @@ def is_node_supported(
194194
exir_ops.edge.aten.clone.default,
195195
exir_ops.edge.aten.unsqueeze_copy.default,
196196
exir_ops.edge.aten.squeeze_copy.dims,
197+
exir_ops.edge.aten.pow.Tensor_Scalar,
198+
exir_ops.edge.aten.pow.Tensor_Tensor,
197199
operator.getitem,
198200
exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
199201
exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,

backends/arm/operators/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
op_minimum,
3232
op_mul,
3333
op_permute,
34+
op_pow,
3435
op_reciprocal,
3536
op_repeat,
3637
op_rescale,

backends/arm/operators/op_pow.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# Copyright 2025 Arm Limited and/or its affiliates.
2+
#
3+
# This source code is licensed under the BSD-style license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
# pyre-unsafe
7+
8+
from typing import List
9+
10+
import serializer.tosa_serializer as ts
11+
from executorch.backends.arm.operators.node_visitor import (
12+
NodeVisitor,
13+
register_node_visitor,
14+
)
15+
from executorch.backends.arm.tosa_mapping import TosaArg
16+
from executorch.backends.arm.tosa_specification import TosaSpecification
17+
from serializer.tosa_serializer import TosaOp
18+
from torch.fx import Node
19+
20+
21+
@register_node_visitor
class PowVisitor_080_MI(NodeVisitor):
    """Serialize aten.pow.Tensor_Tensor as a TOSA POW operator (TOSA-0.80+MI).

    Both operands must be tensors of the same floating-point dtype
    (FP32 or FP16), matching the output dtype.
    """

    target = "aten.pow.Tensor_Tensor"

    tosa_specs = [
        TosaSpecification.create_from_string("TOSA-0.80+MI"),
    ]

    def __init__(self, *args):
        super().__init__(*args)

    def define_node(
        self,
        node: Node,
        tosa_graph: ts.TosaSerializer,
        inputs: List[TosaArg],
        output: TosaArg,
    ) -> None:
        """Add a POW operator to *tosa_graph* for *node*.

        Raises:
            ValueError: if dtypes of inputs/output disagree, or the dtype
                is not FP32/FP16.
        """
        if not (inputs[0].dtype == inputs[1].dtype == output.dtype):
            raise ValueError(
                # Fix: original implicit concatenation produced "...dtype.Got ..."
                # with no separating space.
                "All inputs and outputs need same dtype. "
                f"Got {inputs[0].dtype=}, {inputs[1].dtype=}, {output.dtype=}"
            )
        if inputs[0].dtype not in [ts.DType.FP32, ts.DType.FP16]:
            raise ValueError(
                f"All inputs need to be FP32 or FP16. Got {inputs[0].dtype}"
            )

        tosa_graph.addOperator(
            TosaOp.Op().POW,
            [
                inputs[0].name,
                inputs[1].name,
            ],
            [output.name],
            None,
        )

backends/arm/quantizer/quantization_annotator.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,7 @@ def _match_pattern(
137137
torch.ops.aten.hardsigmoid.default,
138138
torch.ops.aten.hardswish.default,
139139
torch.ops.aten.full_like.default,
140+
torch.ops.aten.pow.Tensor_Tensor,
140141
]
141142

142143
_one_to_one_shared_input_qspec = [

0 commit comments

Comments
 (0)