pytorch
diff --git a/backends/arm/README.md b/backends/arm/README.md
Lines changed: 12 additions & 2 deletions
diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
Lines changed: 11 additions & 78 deletions
diff --git a/backends/arm/arm_partitioner.py b/backends/arm/arm_partitioner.py
Lines changed: 6 additions & 52 deletions
diff --git a/backends/arm/operators/__init__.py b/backends/arm/operators/__init__.py
Lines changed: 1 addition & 2 deletions
diff --git a/backends/arm/operators/op_add.py b/backends/arm/operators/op_add.py
Lines changed: 3 additions & 7 deletions
@@ -15,11 +15,19 @@ ethos-u-vela compilation stack. which follows the fully AoT flow.
 ## Layout
 
 Export:
-- `arm_backend.py` - Main entrypoint for the ArmPartitioner and ArmBackend. For more information see the section on [Arm Bac
-kend Architecture](#arm-backend-architecture). For examples of use see `executorch/examples/arm`.
+- `arm_backend.py` - Main entrypoint for the ArmPartitioner and ArmBackend. For more information see the section on
+[Arm Backend Architecture](#arm-backend-architecture). For examples of use see `executorch/examples/arm`.
 - `tosa_mapping.py` - utilities for mapping edge dialect to TOSA
 - `tosa_quant_utils.py` - utilities for mapping quantization information to TOSA encoding
 
+Operators:
+- `node_visitor.py` - Base class for edge operator lowering
+- `op_*.py` - Edge operator lowering/serialization to TOSA
+
+Passes:
+- `arm_pass_manager.py` - Pass manager. Will decide which passes need to be applied depending on the compile_spec.
+- `*_pass.py` - Compiler passes derived from ExportPass
+
 Quantization:
 - `arm_quantizer.py` - Quantizer for Arm backend
 - `arm_quantizer_utils.py` - Utilities for quantization
@@ -36,8 +44,10 @@ This is the structure of the test directory
 
 ```
 test                            #  Root test folder
+├── misc                        #  Testing of debug features
 ├── models                      #  Full model tests
 ├── ops                         #  Single op tests
+├── passes                      #  Compiler passes tests
 ├── tester                      #  Arm Tester class
 ├── tosautil                    #  Utility functions for TOSA artifacts
 ├ common.py                     #  Common functions and definitions used by many tests
 
@@ -16,14 +16,13 @@
 import serializer.tosa_serializer as ts
 from executorch.backends.arm.arm_vela import vela_compile
 from executorch.backends.arm.operators.node_visitor import get_node_visitors
+from executorch.backends.arm.operators.op_output import process_output
 from executorch.backends.arm.operators.op_placeholder import process_placeholder
-from executorch.backends.arm.tosa_mapping import map_dtype, TosaArg
-from executorch.backends.arm.tosa_quant_utils import get_quant_node_dtype, is_quant_node
+from executorch.backends.arm.passes.arm_pass_manager import ArmPassManager
 from executorch.backends.arm.tosa_utils import (
     dbg_fail,
     dbg_tosa_dump,
-    is_consumer_node_depthwise_conv2d,
-    is_permute_node_before_addmm,
+    process_call_function,
 )
 from executorch.exir.backend.backend_details import BackendDetails, PreprocessResult
 from executorch.exir.backend.compile_spec_schema import CompileSpec
@@ -44,6 +43,7 @@ def __init__(self):
         self.compiler_flags = []
         self.output_format = None
         self.path_for_intermediates = None
+        # TODO MLETORCH-265 Remove permute_nhwc flag
         self.permute_nhwc = False
         self.quantize_io = False
 
@@ -216,18 +216,13 @@ def preprocess(  # noqa: C901
         artifact_path = None
         output_format = ""
         compile_flags = []
-        permute_memory_to_nhwc = False
         for spec in compile_spec:
             if spec.key == "debug_artifact_path":
                 artifact_path = spec.value.decode()
             if spec.key == "output_format":
                 output_format = spec.value.decode()
             if spec.key == "compile_flags":
                 compile_flags.append(spec.value.decode())
-            if spec.key == "permute_memory_format":
-                memory_format = spec.value.decode()
-                if memory_format == "nhwc":
-                    permute_memory_to_nhwc = True
 
         # Check that the output format is set in the compile spec
         if not output_format:
@@ -241,81 +236,19 @@ def preprocess(  # noqa: C901
         # Converted output for this subgraph, serializer needs path early as it emits
         # const data directly. Path created and data written only in debug builds.
         tosa_graph = ts.TosaSerializer(artifact_path)
+        graph_module = ArmPassManager().transform_to_backend_pipeline(
+            graph_module=edge_program.graph_module, compile_spec=compile_spec
+        )
 
         node_visitors = get_node_visitors(edge_program)
 
-        for node in edge_program.graph.nodes:
+        for node in graph_module.graph.nodes:
             if node.op == "call_function":
-                # Unpack arguments and convert
-                inputs = []
-                for arg in node.args:
-                    inputs.append(TosaArg(arg))
-
-                # Convert output (this node itself)
-                output = TosaArg(node)
-
-                # TODO: fragile code for temporary fix, not all outputs will be
-                # rank 4
-                if permute_memory_to_nhwc and len(output.shape) == 4:
-                    # TODO: remove this if check
-                    # this is added because we need to align the quant node
-                    # output shape before the depthwise_conv2d node. The output
-                    # shape between TOSA conv2d and depthwise_conv2d are different.
-                    if (
-                        node.all_input_nodes[0].op
-                        == "placeholder"  # check its parent is a placeholder
-                        and is_quant_node(node)
-                        and is_consumer_node_depthwise_conv2d(node)
-                    ):
-                        NHWC_Order = [2, 3, 0, 1]
-                    else:
-                        NHWC_Order = [0, 2, 3, 1]
-                    output.shape = [output.shape[i] for i in NHWC_Order]
-
-                # Add output to TOSA graph
-                tosa_graph.currRegion.currBasicBlock.addTensor(
-                    output.name,
-                    (
-                        inputs[0].shape
-                        if is_permute_node_before_addmm(node)
-                        else output.shape
-                    ),
-                    (
-                        map_dtype(get_quant_node_dtype(node))
-                        if is_quant_node(node)
-                        else output.dtype
-                    ),
-                )
-
-                # Visiting each Node
-                if node.target.__name__ in node_visitors:
-                    if node.target.__name__ in [
-                        "aten.add.Tensor",
-                        "aten._native_batch_norm_legit_no_training.default",
-                    ]:
-                        node_visitors[node.target.__name__].define_node(
-                            node,
-                            tosa_graph,
-                            inputs,
-                            output,
-                            is_quant_node(node),
-                            permute_memory_to_nhwc,
-                        )
-                    else:
-                        node_visitors[node.target.__name__].define_node(
-                            node, tosa_graph, inputs, output, is_quant_node(node)
-                        )
-                else:
-                    raise RuntimeError(f"Unknown operator {node.target}")
+                process_call_function(node, tosa_graph, node_visitors)
             elif node.op == "placeholder":
-                process_placeholder(
-                    node, tosa_graph, edge_program, permute_memory_to_nhwc
-                )
+                process_placeholder(node, tosa_graph, edge_program)
             elif node.op == "output":
-                for output in node.args[0]:
-                    tosa_graph.addOutputTensor(
-                        tosa_graph.currRegion.currBasicBlock.tensors[output.name]
-                    )
+                process_output(node, tosa_graph)
             else:
                 # This will only happen if an unpartitioned graph is passed without
                 # any checking of compatibility.
 
@@ -10,6 +10,7 @@
 
 import torch
 from executorch.backends.arm.arm_backend import ArmBackend
+from executorch.backends.arm.passes.tag_io_quant_pass import TagIOQuantPass
 from executorch.exir.backend.compile_spec_schema import CompileSpec
 from executorch.exir.backend.partitioner import (
     DelegationSpec,
@@ -18,6 +19,7 @@
 )
 from executorch.exir.backend.utils import tag_constant_data
 from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.passes import PassManager
 from torch.export.exported_program import ExportedProgram
 from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
 
@@ -54,9 +56,9 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
         supported &= self.is_node_supported_custom(node)
 
         # Override partitioning based on pre partition passes
-        if supported and "arm_partition" in node.meta:
-            supported = supported & node.meta["arm_partition"]
-            node.meta.pop("arm_partition")
+        if "arm_override_partition" in node.meta:
+            supported = supported & node.meta["arm_override_partition"]
+            node.meta.pop("arm_override_partition")
 
         return supported
 
@@ -69,54 +71,6 @@ def is_node_supported_custom(self, node: torch.fx.Node) -> bool:
         return True
 
 
-from executorch.exir.pass_base import ExportPass, PassResult
-from executorch.exir.passes import PassManager
-
-
-class TagIOQuant(ExportPass):
-    """
-    Pass run before partitioning to tag Q/DQ on any placeholder and output
-    to ensure we don't greedily partition them for device. Float conversion
-    has to happen outside a TOSA base inference profile.
-    """
-
-    def __init__(self, edge_program: torch.export.ExportedProgram):
-        super(TagIOQuant, self).__init__()
-        self.edge_program = edge_program
-
-    def is_quant_node(self, node: torch.fx.node.Node):
-        return node.target in {
-            exir_ops.edge.quantized_decomposed.quantize_per_channel.default,
-            exir_ops.edge.quantized_decomposed.quantize_per_tensor.default,
-            exir_ops.edge.quantized_decomposed.quantize_per_tensor.tensor,
-        }
-
-    def is_dequant_node(self, node: torch.fx.node.Node):
-        return node.target in {
-            exir_ops.edge.quantized_decomposed.dequantize_per_channel.default,
-            exir_ops.edge.quantized_decomposed.dequantize_per_tensor.default,
-            exir_ops.edge.quantized_decomposed.dequantize_per_tensor.tensor,
-        }
-
-    def call(self, graph_module: torch.fx.GraphModule):
-        for node in graph_module.graph.nodes:
-            # tag q of input
-            if node.op == "placeholder":
-                for user in node.users.keys():
-                    # if we have an input going into a quantize
-                    if self.is_quant_node(user):
-                        user.meta["arm_partition"] = False
-
-            # tag dq of outputs
-            if node.op == "output":
-                quant, *_ = node.args[0]
-                if self.is_dequant_node(quant):
-                    quant.meta["arm_partition"] = False
-
-        graph_module.recompile()
-        return PassResult(graph_module, True)
-
-
 @final
 class ArmPartitioner(Partitioner):
     def __init__(self, compile_spec: List[CompileSpec]) -> None:
@@ -133,7 +87,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
                 # Exclude IO quantization from the partition
                 passes = PassManager(
                     passes=[
-                        TagIOQuant(exported_program),
+                        TagIOQuantPass(),
                     ]
                 )
                 passes(exported_program.graph_module)
 
@@ -1,4 +1,4 @@
-# Copyright 2023 Arm Limited and/or its affiliates.
+# Copyright 2023-2024 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -9,7 +9,6 @@
     op_addmm,
     op_avg_pool2d,
     op_batch_norm,
-    op_clone,
     op_conv2d,
     op_dequant,
     op_div,
 
@@ -16,7 +16,7 @@
     build_rescale_from_int32,
     build_rescale_to_int32,
 )
-from executorch.backends.arm.tosa_utils import broadcast_shapes, getNodeArgs
+from executorch.backends.arm.tosa_utils import broadcast_shapes, getNodeArgs, tosa_shape
 from serializer.tosa_serializer import TosaOp
 
 
@@ -34,7 +34,6 @@ def define_node(
         inputs: List[TosaArg],
         output: TosaArg,
         is_quant_node: bool,
-        permute_memory_to_nhwc: bool,
     ) -> None:
         if is_quant_node:
             # Single input or not
@@ -54,12 +53,9 @@ def define_node(
             inputA_rescale_scale = input_A_scale.number / min_scale
             inputB_rescale_scale = input_B_scale.number / min_scale
 
+            input_A.shape = tosa_shape(input_A.shape, input_A.dim_order)
+            input_B.shape = tosa_shape(input_B.shape, input_B.dim_order)
             broadcasted_shape = broadcast_shapes(input_A.shape, input_B.shape)
-            if permute_memory_to_nhwc:
-                NHWC_Order = [0, 2, 3, 1]
-                broadcasted_shape = [broadcasted_shape[i] for i in NHWC_Order]
-                input_A.shape = [input_A.shape[i] for i in NHWC_Order]
-                input_B.shape = [input_B.shape[i] for i in NHWC_Order]
 
             input_A_rescaled_to_int32 = build_rescale_to_int32(
                 tosa_graph,