Fixes for permute/annotation-pass

oscarandersson8218 · oscarandersson8218 · commit 0cd6a4310ef8 · 2024-07-01T11:03:41.000+02:00
Signed-off-by: Oscar Andersson <oscar.andersson@arm.com>
Change-Id: Ica6addb95d6b925beef4696780334268821af608
diff --git a/backends/arm/arm_backend.py b/backends/arm/arm_backend.py
@@ -18,7 +18,9 @@
 from executorch.backends.arm.operators.node_visitor import get_node_visitors
 from executorch.backends.arm.operators.op_output import process_output
 from executorch.backends.arm.operators.op_placeholder import process_placeholder
-from executorch.backends.arm.passes.permute_memory_pass import PermuteMemoryPass
+from executorch.backends.arm.passes.annotate_channels_last_dim_order_pass import (
+    AnnotateChannelsLastDimOrder,
+)
 from executorch.backends.arm.tosa_utils import (
     dbg_fail,
     dbg_tosa_dump,
@@ -44,6 +46,7 @@ def __init__(self):
         self.compiler_flags = []
         self.output_format = None
         self.path_for_intermediates = None
+        # TODO MLETORCH-265 Remove permute_nhwc flag
         self.permute_nhwc = False
         self.quantize_io = False
 
@@ -245,7 +248,7 @@ def preprocess(  # noqa: C901
         tosa_graph = ts.TosaSerializer(path)
         passes = PassManager()
         if permute_memory_to_nhwc:
-            passes.add_pass(PermuteMemoryPass(edge_program))
+            passes.add_pass(AnnotateChannelsLastDimOrder())
         passes(edge_program.graph_module)
 
         node_visitors = get_node_visitors(edge_program)
diff --git a/backends/arm/operators/op_placeholder.py b/backends/arm/operators/op_placeholder.py
@@ -21,6 +21,7 @@ def process_inputs(
     node: torch.fx.Node,
     tosa_graph: ts.TosaSerializer,
 ):
+    """Serialize an input node"""
     inputs = [TosaArg(node)]
     input_shape = inputs[0].shape
     input_dim_order = inputs[0].dim_order
@@ -39,6 +40,10 @@ def process_quantized_bias(
     tosa_graph: ts.TosaSerializer,
     parameter_values,
 ):
+    """
+    Serialize bias node that needs to be quantized.
+    This can be either an addmm or conv bias node.
+    """
     consumer_node = list(node.users)[0]
     if is_bias_node_for_quantized_addmm(node):
         (
@@ -73,17 +78,12 @@ def process_quantized_bias(
     )
 
 
-def permute(data, dim_order):
-    if len(data.shape) == 4:
-        data = np.transpose(data, dim_order)
-    return data
-
-
 def process_inputs_to_parameters(
     node: torch.fx.Node,
     tosa_graph: ts.TosaSerializer,
     edge_program: ExportedProgram,
 ):
+    """Serialize bias and non-quantized weights"""
     inputs = [TosaArg(node)]
     parameter_name = edge_program.graph_signature.inputs_to_parameters[node.name]
     parameter_data = edge_program.state_dict[parameter_name]
@@ -92,17 +92,11 @@ def process_inputs_to_parameters(
     parameter_values = parameter_data.detach().numpy()
 
     if is_bias_node_for_quantized_addmm(node) or is_bias_node_for_quantized_conv(node):
+        # BI bias
         process_quantized_bias(node, tosa_graph, parameter_values)
     else:
-        # Cases for:
-        # - MI_AddMM_bias
-        # - MI_AddMM_weight
-        # - MI_Conv2d_non_bias_weight
-        # - MI_Conv2d_weight
-        # - MI_Conv2d_bias
-        # - MI_DepthwiseConv2d_weight
-        # - MI_DepthwiseConv2d_bias
-        parameter_values = permute(parameter_values, inputs[0].dim_order)
+        # MI weights or bias
+        parameter_values = np.transpose(parameter_values, inputs[0].dim_order)
 
         tosa_graph.addConst(
             parameter_values.shape, inputs[0].dtype, parameter_values, name=node.name
@@ -114,6 +108,7 @@ def process_inputs_to_buffers(
     tosa_graph: ts.TosaSerializer,
     edge_program: ExportedProgram,
 ):
+    """Serialize quantized weights"""
     inputs = [TosaArg(node)]
     buffer_name = edge_program.graph_signature.inputs_to_buffers[node.name]
     buffer_data = edge_program.state_dict[buffer_name]
@@ -124,7 +119,7 @@ def process_inputs_to_buffers(
     # TODO: fragile code for temporary fix
     # the mean and var tensors are also stored here but they have shape (1, )
     # we only transpose weights here
-    buffer_values = permute(buffer_values, inputs[0].dim_order)
+    buffer_values = np.transpose(buffer_values, inputs[0].dim_order)
 
     tosa_graph.addConst(
         buffer_values.shape, inputs[0].dtype, buffer_values, name=node.name
@@ -136,6 +131,7 @@ def process_placeholder(
     tosa_graph: ts.TosaSerializer,
     edge_program: ExportedProgram,
 ):
+    """Wrapper for processing and serializing all types of placeholders"""
     assert node.name == node.target, "Expect placeholder name and target to match"
     assert 0 == len(node.args), "Can't handle default input values"
 
diff --git a/backends/arm/passes/annotate_channels_last_dim_order_pass.py b/backends/arm/passes/annotate_channels_last_dim_order_pass.py
@@ -0,0 +1,62 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.backends.arm.tosa_quant_utils import dq_op
+from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d
+from executorch.exir.pass_base import ExportPass, PassResult
+
+
+class AnnotateChannelsLastDimOrder(ExportPass):
+    """
+    Annotates each node with a tosa_dim_order. tosa_dim_order can be seen as a channels-last dim-order
+    that in most cases will be (0, 2, 3, 1) for nodes with 4D-shapes.
+    The annotated tosa_dim_order is used to permute the node's shape such that it
+    gives a TOSA-compliant shape.
+
+    Depthwise-convolution weights (and a dequantize node sitting between such a
+    weight and the convolution) are annotated with (2, 3, 0, 1) instead.
+    Nodes whose value is not 4D get the identity order (0, 1, ..., rank - 1).
+    """
+
+    def is_weight_node_for_dw_conv(self, node: torch.fx.Node):
+        """
+        returns True for dq and w in the following sequences;
+        w -> dw_conv -> ...
+        w -> dq -> dw_conv -> ...
+
+        Only the first user of a node is inspected. A node without any users
+        cannot feed a convolution, so it yields False.
+        """
+        # Guard against dangling nodes (e.g. an unused graph input): indexing
+        # the first user of such a node would otherwise raise IndexError.
+        if not node.users:
+            return False
+
+        if node.op == "call_function":
+            if node.target != dq_op:
+                return False
+            prev_node = node.args[0]
+            if prev_node.op != "placeholder":
+                return False
+            return is_consumer_node_depthwise_conv2d(node)
+        elif node.op == "placeholder":
+            # node is an input, weight or bias node
+            consumer_node = list(node.users)[0]
+            if self.is_weight_node_for_dw_conv(consumer_node):
+                return True
+            if is_consumer_node_depthwise_conv2d(node):
+                # Check that node is the weight-argument and not input or bias
+                return consumer_node.args[1] == node
+
+        return False
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        """
+        Write meta["tosa_dim_order"] on every node of graph_module.
+
+        The rank is read from node.meta["val"]; when that value is a tuple or
+        list, its first element is used. Returns a PassResult wrapping the same
+        graph_module, constructed with True as its second argument.
+        """
+        NHWC_Order = (0, 2, 3, 1)
+        HWCM_Order = (2, 3, 0, 1)
+        for node in graph_module.graph.nodes:
+            node_val = node.meta["val"]
+            # Multi-output values may be stored as a tuple or a list; the first
+            # element determines the rank in both cases.
+            if isinstance(node_val, (tuple, list)):
+                node_data = node_val[0].data
+            else:
+                node_data = node_val.data
+
+            if len(node_data.shape) == 4:
+                dim_order = NHWC_Order
+                if self.is_weight_node_for_dw_conv(node):
+                    dim_order = HWCM_Order
+            else:
+                # Identity order: non-4D values are left unpermuted.
+                dim_order = tuple(range(node_data.dim()))
+            node.meta["tosa_dim_order"] = dim_order
+        graph_module.recompile()
+        return PassResult(graph_module, True)
diff --git a/backends/arm/passes/arm_pass.py b/backends/arm/passes/arm_pass.py
diff --git a/backends/arm/passes/permute_memory_pass.py b/backends/arm/passes/permute_memory_pass.py
diff --git a/backends/arm/test/ops/test_conv.py b/backends/arm/test/ops/test_conv.py
@@ -114,7 +114,7 @@ def forward(self, x):
         return x
 
 
-conv2d_2x2_3x1x40x40_nobias = Conv2d(
+conv2d_2x2_3x2x40x40_nobias = Conv2d(
     in_channels=2,
     out_channels=3,
     kernel_size=(2, 2),
@@ -221,7 +221,7 @@ def forward(self, x):
 # Shenanigan to get a nicer output when test fails. With unittest it looks like:
 # FAIL: test_conv2d_tosa_BI_2_3x3_1x3x12x12_st2_pd1
 testsuite = [
-    ("2x2_3x1x40x40_nobias", conv2d_2x2_3x1x40x40_nobias),
+    ("2x2_3x2x40x40_nobias", conv2d_2x2_3x2x40x40_nobias),
     ("3x3_1x3x256x256_st1", conv2d_3x3_1x3x256x256_st1),
     ("3x3_1x3x12x12_st2_pd1", conv2d_3x3_1x3x12x12_st2_pd1),
     ("1x1_1x2x128x128_st1", conv2d_1x1_1x2x128x128_st1),
@@ -236,7 +236,7 @@ def forward(self, x):
 # Check: https://review.mlplatform.org/plugins/gitiles/ml/ethos-u/ethos-u-vela/+/refs/heads/main/SUPPORTED_OPS.md
 #     IFM Tensor batch size must be 1 - [FULLY_CONNECTED, RESHAPE, SHAPE, SLICE, SOFTMAX, SPLIT, SPLIT_V, SQUEEZE, STRIDED_SLICE, UNPACK]
 testsuite_u55 = testsuite.copy()
-testsuite_u55.remove(("2x2_3x1x40x40_nobias", conv2d_2x2_3x1x40x40_nobias))
+testsuite_u55.remove(("2x2_3x2x40x40_nobias", conv2d_2x2_3x2x40x40_nobias))
 testsuite_u55.remove(("5x5_3x2x128x128_st1", conv2d_5x5_3x2x128x128_st1))
 
 
diff --git a/backends/arm/tosa_utils.py b/backends/arm/tosa_utils.py
@@ -87,33 +87,6 @@ def promote_shape(tosa_fb, arg, promoted_shape, out_dtype):
     return reshape_res
 
 
-# Helper transpose function to match TOSA's shape requirements
-# E.g., TOSA 0.80.0 specification - 2.3.3 CONV2D shapes:
-# https://www.mlplatform.org/tosa/tosa_spec.html#_conv2d
-def transpose_helper(tosa_fb, input, new_order, out_dtype):
-    # Check new_order's length is equal to input rank
-    assert len(input.shape) == len(new_order), "Wrong shape order length"
-
-    # Check no duplications
-    assert len(set(new_order)) == len(new_order), "Contain duplicated dim numbers"
-
-    # Check all dims are valid
-    for idx in new_order:
-        if idx < 0:
-            assert True, "Negative dim number"
-        elif idx >= len(input.shape):
-            assert True, "Dim is greater than input rank"
-
-    input_shape_transpoed = [input.shape[i] for i in new_order]
-    attr = ts.TosaSerializerAttribute()
-    attr.TransposeAttribute(new_order)
-    input_transposed = tosa_fb.addIntermediate(input_shape_transpoed, out_dtype)
-    tosa_fb.addOperator(
-        TosaOp.Op().TRANSPOSE, [input.name], [input_transposed.name], attr
-    )
-    return input_transposed
-
-
 def getNodeArgs(node):
     return [TosaArg(arg) for arg in node.args]