pytorch
diff --git a/backends/arm/_passes/__init__.py b/backends/arm/_passes/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
Lines changed: 3 additions & 0 deletions
diff --git a/backends/arm/_passes/broadcast_args_pass.py b/backends/arm/_passes/broadcast_args_pass.py
Lines changed: 63 additions & 0 deletions
diff --git a/backends/arm/arm_vela.py b/backends/arm/arm_vela.py
Lines changed: 2 additions & 2 deletions
diff --git a/backends/arm/operator_support/to_copy_support.py b/backends/arm/operator_support/to_copy_support.py
Lines changed: 4 additions & 1 deletion
diff --git a/backends/arm/operator_support/tosa_supported_operators.py b/backends/arm/operator_support/tosa_supported_operators.py
Lines changed: 1 addition & 0 deletions
diff --git a/backends/arm/operators/op_abs.py b/backends/arm/operators/op_abs.py
Lines changed: 2 additions & 2 deletions
diff --git a/backends/arm/operators/op_add.py b/backends/arm/operators/op_add.py
Lines changed: 2 additions & 2 deletions
diff --git a/backends/arm/operators/op_eq.py b/backends/arm/operators/op_eq.py
Lines changed: 1 addition & 1 deletion
diff --git a/backends/arm/operators/op_ge.py b/backends/arm/operators/op_ge.py
Lines changed: 1 addition & 1 deletion
diff --git a/backends/arm/operators/op_gt.py b/backends/arm/operators/op_gt.py
Lines changed: 1 addition & 1 deletion
diff --git a/backends/arm/operators/op_le.py b/backends/arm/operators/op_le.py
Lines changed: 1 addition & 1 deletion
diff --git a/backends/arm/operators/op_lt.py b/backends/arm/operators/op_lt.py
Lines changed: 1 addition & 1 deletion
diff --git a/backends/arm/operators/op_maximum.py b/backends/arm/operators/op_maximum.py
Lines changed: 2 additions & 2 deletions
diff --git a/backends/arm/operators/op_minimum.py b/backends/arm/operators/op_minimum.py
Lines changed: 2 additions & 2 deletions
diff --git a/backends/arm/operators/op_mul.py b/backends/arm/operators/op_mul.py
Lines changed: 3 additions & 3 deletions
diff --git a/backends/arm/operators/op_neg.py b/backends/arm/operators/op_neg.py
Lines changed: 10 additions & 11 deletions
diff --git a/backends/arm/operators/op_sub.py b/backends/arm/operators/op_sub.py
Lines changed: 16 additions & 5 deletions
@@ -8,6 +8,7 @@
 from .annotate_channels_last_dim_order_pass import AnnotateChannelsLastDimOrder  # noqa
 from .annotate_decomposed_matmul import AnnotateDecomposedMatmulPass  # noqa
 from .arm_pass import ArmPass  # noqa
+from .broadcast_args_pass import BroadcastArgsPass  # noqa
 from .cast_int64_pass import CastInt64BuffersToInt32Pass  # noqa
 from .cast_to_int32_pass import CastToInt32Pass  # noqa
 from .conv1d_unsqueeze_pass import Conv1dUnsqueezePass  # noqa
 
@@ -10,6 +10,7 @@
 from executorch.backends.arm._passes import (
     AnnotateChannelsLastDimOrder,
     AnnotateDecomposedMatmulPass,
+    BroadcastArgsPass,
     CastInt64BuffersToInt32Pass,
     CastToInt32Pass,
     ComputeConstantOpsAOT,
@@ -104,6 +105,8 @@ def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(RetraceFoldedDtypesPass())
         self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
         self.add_pass(MatchArgRanksPass(exported_program))
+        if self.tosa_spec.is_U55_subset:
+            self.add_pass(BroadcastArgsPass())
         self.add_pass(ComputeConstantOpsAOT(exported_program))
 
         self.add_pass(RemoveClonePass())
 
@@ -0,0 +1,63 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from executorch.backends.arm._passes import ArmPass
+
+from executorch.backends.arm._passes.arm_pass_utils import (
+    create_node,
+    get_first_fake_tensor,
+)
+
+from executorch.exir.dialects._ops import ops as exir_ops
+
+from executorch.exir.pass_base import PassResult
+from torch.fx import GraphModule, Node
+
+
+class BroadcastArgsPass(ArmPass):
+    """
+    Pass to manually broadcast arguments by inserting repeats.
+    This is done when more than one arg needs broadcasting.
+    """
+
+    targeted_ops = {
+        exir_ops.edge.aten.add.Tensor,
+        exir_ops.edge.aten.sub.Tensor,
+        # mul is indirectly targeting div as div is decompsed to reciprocal + mul
+        exir_ops.edge.aten.mul.Tensor,
+    }
+
+    def call(self, graph_module: GraphModule) -> PassResult:
+        for node in graph_module.graph.nodes:
+            if node.op != "call_function" or node.target not in self.targeted_ops:
+                continue
+
+            output_shape = get_first_fake_tensor(node).shape
+            nbr_of_broacasts = 0
+            for arg in node.args:
+                if not isinstance(arg, Node):
+                    continue
+
+                shape = get_first_fake_tensor(arg).shape
+                if shape != output_shape:
+                    nbr_of_broacasts += 1
+                if nbr_of_broacasts > 1:
+                    multiples = [
+                        int(output_shape[d] / shape[d])
+                        for d in range(len(output_shape))
+                    ]
+                    with graph_module.graph.inserting_before(node):
+                        repeat = create_node(
+                            graph_module.graph,
+                            exir_ops.edge.aten.repeat.default,
+                            args=(arg, multiples),
+                            kwargs={},
+                            from_node=node,
+                        )
+                        node.replace_input_with(arg, repeat)
+
+        graph_module.recompile()
+        graph_module = super().call(graph_module).graph_module
+        return PassResult(graph_module, True)
@@ -73,8 +73,8 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False)
             np_path = os.path.join(tmpdir, "output", "out_vela.npz")
         else:
             np_path = os.path.join(tmpdir, "output", "out_sg0_vela.npz")
-        blocks = b""
 
+        blocks = b""
         with np.load(np_path, allow_pickle=False) as data:
             # Construct our modified output_blocks with data in a form easily
             # digested on the device side
@@ -92,7 +92,7 @@ def vela_compile(tosa_flatbuffer: bytes, args: List[str], verbose: bool = False)
             if not isinstance(data["scratch_shape"][0], np.int64):
                 raise RuntimeError("Expected scratch to be int64")
             block_length = int(data["scratch_shape"][0])
-            bin_blocks["scratch_data"] = b"\x00" * block_length
+            bin_blocks["scratch_size"] = struct.pack("<I", block_length)
 
             # Capture inputs and outputs
             bin_blocks["inputs"] = vela_bin_pack_io("input", data)
 
@@ -4,6 +4,7 @@
 # LICENSE file in the root directory of this source tree.
 
 # pyre-unsafe
+import copy
 import logging
 
 import torch
@@ -42,7 +43,9 @@ def _merge_supported_types(
         dtypes1: SupportedTypeDict,
         dtypes2: SupportedTypeDict,
     ) -> SupportedTypeDict:
-        merged_dtypes = dtypes1
+        merged_dtypes = copy.deepcopy(
+            dtypes1
+        )  # Use deepcopy to avoid unintentionally modifying SUPPORTED_INT_TYPES
         for k, v in dtypes2.items():
             merged_dtypes[k] = merged_dtypes.get(k, []) + v
         return merged_dtypes
 
@@ -306,6 +306,7 @@ class CheckProperQuantization(OperatorSupportBase):
         exir_ops.edge.aten.sub.Tensor,
         exir_ops.edge.aten.upsample_bilinear2d.vec,
         exir_ops.edge.aten.upsample_nearest2d.vec,
+        torch.ops.aten.scalar_tensor.default,
         *TableOps.included_ops(),
     )
 
 
@@ -164,7 +164,7 @@ def define_node(
         scale_back = 1.0
         if inputs[0].dtype == ts.DType.INT8:
             rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_specs
+                tosa_graph, inputs, node, self.tosa_spec
             )  # type: ignore[possibly-undefined]
         else:
             # input[0].dtype == ts.DType.INT32
@@ -192,7 +192,7 @@ def define_node(
             # Scale output back to 8 bit
             # pyre-ignore
             tqutils.insert_rescale_op_to_int8(
-                tosa_graph, abs_output, scale_back, node, self.tosa_specs
+                tosa_graph, abs_output, scale_back, node, self.tosa_spec
             )  # type: ignore[possibly-undefined]
 
 
 
@@ -174,7 +174,7 @@ def define_node(
         scale_back = 1.0
         if inputs[0].dtype == ts.DType.INT8:
             rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_specs
+                tosa_graph, inputs, node, self.tosa_spec
             )
         else:
             # input[0].dtype == ts.DType.INT32
@@ -202,7 +202,7 @@ def define_node(
             # Scale output back to 8 bit
             # pyre-ignore
             tqutils.insert_rescale_op_to_int8(
-                tosa_graph, add_output, scale_back, node, self.tosa_specs
+                tosa_graph, add_output, scale_back, node, self.tosa_spec
             )  # type: ignore[possibly-undefined]
 
 
 
@@ -98,7 +98,7 @@ def define_node(
         if inputs[0].dtype == ts.DType.INT8:
             # Rescale inputs to 32 bit
             rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_specs
+                tosa_graph, inputs, node, self.tosa_spec
             )
 
             # Update IO
 
@@ -97,7 +97,7 @@ def define_node(
         if inputs[0].dtype == ts.DType.INT8:
             # Rescale inputs to 32 bit
             rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_specs
+                tosa_graph, inputs, node, self.tosa_spec
             )
 
             # Update IO
 
@@ -97,7 +97,7 @@ def define_node(
         if inputs[0].dtype == ts.DType.INT8:
             # Rescale inputs to 32 bit
             rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_specs
+                tosa_graph, inputs, node, self.tosa_spec
             )
 
             # Update IO
 
@@ -97,7 +97,7 @@ def define_node(
         if inputs[0].dtype == ts.DType.INT8:
             # Rescale inputs to 32 bit
             rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_specs
+                tosa_graph, inputs, node, self.tosa_spec
             )
 
             # Update IO
 
@@ -97,7 +97,7 @@ def define_node(
         if inputs[0].dtype == ts.DType.INT8:
             # Rescale inputs to 32 bit
             rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_specs
+                tosa_graph, inputs, node, self.tosa_spec
             )
 
             # Update IO
 
@@ -129,7 +129,7 @@ def define_node(
                 )
 
             operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_specs
+                tosa_graph, inputs, node, self.tosa_spec
             )
 
             output.shape = tosa_shape(output.shape, output.dim_order)
@@ -155,5 +155,5 @@ def define_node(
         if output.dtype == ts.DType.INT8:
             # insert RESCALE from int32 back to int8
             tqutils.insert_rescale_op_to_int8(
-                tosa_graph, max_output, scale_back, node, self.tosa_specs
+                tosa_graph, max_output, scale_back, node, self.tosa_spec
             )
@@ -128,7 +128,7 @@ def define_node(
                 )
 
             operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_specs
+                tosa_graph, inputs, node, self.tosa_spec
             )
 
             output.shape = tosa_shape(output.shape, output.dim_order)
@@ -154,5 +154,5 @@ def define_node(
         if output.dtype == ts.DType.INT8:
             # insert RESCALE from int32 back to int8
             tqutils.insert_rescale_op_to_int8(
-                tosa_graph, min_output, scale_back, node, self.tosa_specs
+                tosa_graph, min_output, scale_back, node, self.tosa_spec
             )
@@ -189,14 +189,14 @@ def define_node(
             input_A,
             input_A_qargs.zp,
             [1.0],
-            tosa_spec=self.tosa_specs,
+            tosa_spec=self.tosa_spec,
         )
         input_B_rescaled = tqutils.build_rescale_to_int32(
             tosa_graph,
             input_B,
             input_B_qargs.zp,
             [1.0],
-            tosa_spec=self.tosa_specs,
+            tosa_spec=self.tosa_spec,
         )
 
         output_shape = tutils.tosa_shape(output.shape, output.dim_order)
@@ -211,7 +211,7 @@ def define_node(
         )
         output_scale = input_A_qargs.scale * input_B_qargs.scale
         tqutils.insert_rescale_op_to_int8(
-            tosa_graph, mul_output, output_scale, node, self.tosa_specs
+            tosa_graph, mul_output, output_scale, node, self.tosa_spec
         )
 
 
 
@@ -16,7 +16,10 @@
     NodeVisitor,
     register_node_visitor,
 )
-
+from executorch.backends.arm.operators.operator_validation_utils import (
+    validate_num_inputs,
+    validate_same_dtype,
+)
 from executorch.backends.arm.tosa_mapping import TosaArg
 
 
@@ -60,14 +63,12 @@ def define_node(
             ts.DType.FP32,
         }
 
+        validate_num_inputs(self.target, inputs, 1)
+        validate_same_dtype(self.target, [*inputs, output])
+
         if inputs[0].dtype not in supported_dtypes:
             raise ValueError(f"Unsupported dtype for NEGATE: {inputs[0].dtype}")
 
-        if inputs[0].dtype != output.dtype:
-            raise ValueError(
-                "All inputs and output need same dtype."
-                f"Got {inputs[0].dtype=}, {output.dtype=}"
-            )
         input_zp, output_zp = get_negate_zero_points(
             node, inputs[0].dtype == ts.DType.INT8
         )
@@ -109,14 +110,12 @@ def define_node(
             ts.DType.FP32,
         }
 
+        validate_num_inputs(self.target, inputs, 1)
+        validate_same_dtype(self.target, [*inputs, output])
+
         if inputs[0].dtype not in supported_dtypes:
             raise ValueError(f"Unsupported dtype for NEGATE: {inputs[0].dtype}")
 
-        if inputs[0].dtype != output.dtype:
-            raise ValueError(
-                "All inputs and output need same dtype."
-                f"Got {inputs[0].dtype=}, {output.dtype=}"
-            )
         input_zp, output_zp = get_negate_zero_points(
             node, inputs[0].dtype == ts.DType.INT8
         )
 
@@ -163,12 +163,16 @@ def define_node(
         validate_same_dtype(self.target, [*inputs, output])
 
         # Handle int8 (quantized) and int32
-        assert inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]
+        supported_dtypes = [ts.DType.INT8, ts.DType.INT32]
+        if inputs[0].dtype not in supported_dtypes:
+            raise TypeError(
+                f'IO data type needs to be {supported_dtypes}, got "{inputs[0].dtype}"'
+            )
 
         scale_back = 1.0
         if inputs[0].dtype == ts.DType.INT8:
             rescaled_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(
-                tosa_graph, inputs, node, self.tosa_specs
+                tosa_graph, inputs, node, self.tosa_spec
             )
         else:
             # input[0].dtype == ts.DType.INT32
@@ -197,7 +201,7 @@ def define_node(
             # Scale output back to 8 bit
             # pyre-ignore
             tqutils.insert_rescale_op_to_int8(
-                tosa_graph, sub_output, scale_back, node, self.tosa_specs
+                tosa_graph, sub_output, scale_back, node, self.tosa_spec
             )  # type: ignore[possibly-undefined]
 
 
@@ -228,8 +232,15 @@ def define_node(
             super().define_node(node, tosa_graph, inputs, output)
         else:
             # FP32 Sub lowering
-            assert inputs[0].dtype == ts.DType.FP32
-            assert output.dtype == ts.DType.FP32
+            if (
+                inputs[0].dtype != ts.DType.FP32
+                or inputs[1].dtype != ts.DType.FP32
+                or output.dtype != ts.DType.FP32
+            ):
+                raise TypeError(
+                    f"All IO needs to have data type fp32. Got: {inputs[0].dtype}, "
+                    f"input 2: {inputs[1].dtype} and output: {output.dtype}"
+                )
 
             # MI lowering
             tosa_graph.addOperator(
Original file line number	Diff line number	Diff line change
`@@ -306,6 +306,7 @@ class CheckProperQuantization(OperatorSupportBase):`
`306`	`306`	`exir_ops.edge.aten.sub.Tensor,`
`307`	`307`	`exir_ops.edge.aten.upsample_bilinear2d.vec,`
`308`	`308`	`exir_ops.edge.aten.upsample_nearest2d.vec,`
	`309`	`+ torch.ops.aten.scalar_tensor.default,`
`309`	`310`	`*TableOps.included_ops(),`
`310`	`311`	`)`
`311`	`312`
Original file line number	Diff line number	Diff line change
`@@ -98,7 +98,7 @@ def define_node(`
`98`	`98`	`if inputs[0].dtype == ts.DType.INT8:`
`99`	`99`	`# Rescale inputs to 32 bit`
`100`	`100`	`rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(`
`101`		`- tosa_graph, inputs, node, self.tosa_specs`
	`101`	`+ tosa_graph, inputs, node, self.tosa_spec`
`102`	`102`	`)`
`103`	`103`
`104`	`104`	`# Update IO`
Original file line number	Diff line number	Diff line change
`@@ -97,7 +97,7 @@ def define_node(`
`97`	`97`	`if inputs[0].dtype == ts.DType.INT8:`
`98`	`98`	`# Rescale inputs to 32 bit`
`99`	`99`	`rescaled_inputs, _ = tqutils.insert_rescale_ops_to_int32(`
`100`		`- tosa_graph, inputs, node, self.tosa_specs`
	`100`	`+ tosa_graph, inputs, node, self.tosa_spec`
`101`	`101`	`)`
`102`	`102`
`103`	`103`	`# Update IO`
Original file line number	Diff line number	Diff line change
`@@ -129,7 +129,7 @@ def define_node(`
`129`	`129`	`)`
`130`	`130`
`131`	`131`	`operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(`
`132`		`- tosa_graph, inputs, node, self.tosa_specs`
	`132`	`+ tosa_graph, inputs, node, self.tosa_spec`
`133`	`133`	`)`
`134`	`134`
`135`	`135`	`output.shape = tosa_shape(output.shape, output.dim_order)`
`@@ -155,5 +155,5 @@ def define_node(`
`155`	`155`	`if output.dtype == ts.DType.INT8:`
`156`	`156`	`# insert RESCALE from int32 back to int8`
`157`	`157`	`tqutils.insert_rescale_op_to_int8(`
`158`		`- tosa_graph, max_output, scale_back, node, self.tosa_specs`
	`158`	`+ tosa_graph, max_output, scale_back, node, self.tosa_spec`
`159`	`159`	`)`
Original file line number	Diff line number	Diff line change
`@@ -128,7 +128,7 @@ def define_node(`
`128`	`128`	`)`
`129`	`129`
`130`	`130`	`operand_inputs, scale_back = tqutils.insert_rescale_ops_to_int32(`
`131`		`- tosa_graph, inputs, node, self.tosa_specs`
	`131`	`+ tosa_graph, inputs, node, self.tosa_spec`
`132`	`132`	`)`
`133`	`133`
`134`	`134`	`output.shape = tosa_shape(output.shape, output.dim_order)`
`@@ -154,5 +154,5 @@ def define_node(`
`154`	`154`	`if output.dtype == ts.DType.INT8:`
`155`	`155`	`# insert RESCALE from int32 back to int8`
`156`	`156`	`tqutils.insert_rescale_op_to_int8(`
`157`		`- tosa_graph, min_output, scale_back, node, self.tosa_specs`
	`157`	`+ tosa_graph, min_output, scale_back, node, self.tosa_spec`
`158`	`158`	`)`
Original file line number	Diff line number	Diff line change
`@@ -189,14 +189,14 @@ def define_node(`
`189`	`189`	`input_A,`
`190`	`190`	`input_A_qargs.zp,`
`191`	`191`	`[1.0],`
`192`		`- tosa_spec=self.tosa_specs,`
	`192`	`+ tosa_spec=self.tosa_spec,`
`193`	`193`	`)`
`194`	`194`	`input_B_rescaled = tqutils.build_rescale_to_int32(`
`195`	`195`	`tosa_graph,`
`196`	`196`	`input_B,`
`197`	`197`	`input_B_qargs.zp,`
`198`	`198`	`[1.0],`
`199`		`- tosa_spec=self.tosa_specs,`
	`199`	`+ tosa_spec=self.tosa_spec,`
`200`	`200`	`)`
`201`	`201`
`202`	`202`	`output_shape = tutils.tosa_shape(output.shape, output.dim_order)`
`@@ -211,7 +211,7 @@ def define_node(`
`211`	`211`	`)`
`212`	`212`	`output_scale = input_A_qargs.scale * input_B_qargs.scale`
`213`	`213`	`tqutils.insert_rescale_op_to_int8(`
`214`		`- tosa_graph, mul_output, output_scale, node, self.tosa_specs`
	`214`	`+ tosa_graph, mul_output, output_scale, node, self.tosa_spec`
`215`	`215`	`)`
`216`	`216`
`217`	`217`