Commit 1bcb626

Merge remote-tracking branch 'origin/main' into toupstream/select_op

2 parents c82c5c2 + cbfdf78

File tree

27 files changed: +538, -276 lines

.ci/docker/ci_commit_pins/pytorch.txt

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-d1b87e26e5c4343f5b56bb1e6f89b479b389bfac
+export-D64151426

.github/workflows/apple-perf.yml

Lines changed: 1 addition & 1 deletion

@@ -76,7 +76,7 @@ jobs:
           # on-demand and periodic benchmarking.
           CRON_DEFAULT_MODELS: "stories110M,mv3,mv2,ic4,ic3,resnet50,edsr,mobilebert,w2l"
           CRON_DEFAULT_DEVICES: "apple_iphone_15"
-          CRON_DEFAULT_DELEGATES: "nnpack,coreml,mps"
+          CRON_DEFAULT_DELEGATES: "xnnpack,coreml,mps"
         run: |
           set -ex
           MODELS="${{ inputs.models }}"

README.md

Lines changed: 1 addition & 1 deletion

@@ -25,7 +25,7 @@ Check out the [Getting Started](https://pytorch.org/executorch/stable/getting-st
 Check out the examples of [Llama](./examples/models/llama/README.md), [Llava](./examples/models/llava/README.md) and [other models](./examples/README.md) running on edge devices using ExecuTorch.
 
 
-**[UPDATE - 09/25]** We have added support for running [Llama 3.2 1B/3B](./examples/models/llama/README.md) models via ExecuTorch.
+**[UPDATE - 10/24]** We have added support for running [Llama 3.2 Quantized 1B/3B](./examples/models/llama/README.md) models via ExecuTorch.
 
 ## Feedback
 

backends/arm/_passes/annotate_channels_last_dim_order_pass.py

Lines changed: 79 additions & 26 deletions

@@ -12,8 +12,9 @@
 from executorch.backends.arm._passes.arm_pass_utils import (
     create_node,
     get_first_fake_tensor,
+    insert_q_dq_pair,
 )
-from executorch.backends.arm.tosa_quant_utils import dq_op
+from executorch.backends.arm.tosa_quant_utils import dq_op, q_op
 from executorch.backends.arm.tosa_utils import is_consumer_node_depthwise_conv2d
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult

@@ -79,37 +80,89 @@ def is_weight_node_for_depthwise_conv2d(self, node: torch.fx.Node):
 
         return False
 
+    def insert_input_transpose(self, node, input_node, graph_module):
+        quantize = input_node.target == dq_op
+        q_params = input_node.args[1:] if quantize else None
+        with graph_module.graph.inserting_before(node):
+            permute_node = create_node(
+                graph_module.graph,
+                torch.ops.passthrough_to_tosa._transpose,
+                args=(input_node, list(self.NHWC_inverse_order)),
+                quantize=quantize,
+                q_params=q_params,
+            )
+            node.replace_input_with(input_node, permute_node)
+
+            permute_node.meta["tosa_dim_order"] = tuple(
+                range(len(input_node.meta["val"].size()))
+            )
+
+    def insert_output_transpose(self, node, graph_module):
+        with graph_module.graph.inserting_after(node):
+            permute_node = create_node(
+                graph_module.graph,
+                torch.ops.passthrough_to_tosa._transpose,
+                args=(node, list(self.NHWC_order)),
+            )
+            permute_node.meta["tosa_dim_order"] = self.NHWC_order
+            node.meta["tosa_dim_order"] = (0, 1, 2, 3)
+            users = [user for user in node.users if user != permute_node]
+            for user in users:
+                user.replace_input_with(node, permute_node)
+
+            quantize = node.args[0] == q_op
+            if quantize:
+                q_params = node.args[0].args[1:]
+                insert_q_dq_pair(graph_module.graph, node, q_params)
+
     def insert_tosa_transposes(self, graph_module: torch.fx.GraphModule):
+        """
+        Reshape operations are not equivalent in NCHW and NHWC.
+        To get around this, transposes need to be added if the previous or new shape
+        fulfil the following condition:
+            C > 1 and (H or W > 1)
+
+        This is relevant for the following operations:
+            squeeze:    4D ->  3D
+            unsqueeze: <4D ->  4D
+            view:      <4D ->  4D
+            view:       4D -> <4D
+            view:       4D ->  4D
+        """
+
+        def transpose_condition(shape):
+            if len(shape) != 4:
+                return False
+            C = shape[1]
+            H = shape[2]
+            W = shape[3]
+            return C > 1 and (H > 1 or W > 1)
+
         for node in graph_module.graph.nodes:
             if node.op != "call_function":
                 continue
             if node.target == exir_ops.edge.aten.squeeze_copy.dims:
                 input_node = node.args[0]
-                if input_node.meta["val"].dim() == 4:
-                    with graph_module.graph.inserting_before(node):
-                        permute_node = create_node(
-                            graph_module.graph,
-                            torch.ops.passthrough_to_tosa._transpose,
-                            args=(input_node, list(self.NHWC_inverse_order)),
-                        )
-                        permute_node.meta["tosa_dim_order"] = tuple(
-                            range(len(input_node.meta["val"].size()))
-                        )
-                        node.replace_input_with(input_node, permute_node)
-
-            if node.target == exir_ops.edge.aten.unsqueeze_copy.default:
-                if node.meta["val"].dim() == 4:
-                    with graph_module.graph.inserting_after(node):
-                        permute_node = create_node(
-                            graph_module.graph,
-                            torch.ops.passthrough_to_tosa._transpose,
-                            args=(node, list(self.NHWC_order)),
-                        )
-                        permute_node.meta["tosa_dim_order"] = self.NHWC_order
-                        node.meta["tosa_dim_order"] = (0, 1, 2, 3)
-                        users = [user for user in node.users if user != permute_node]
-                        for user in users:
-                            user.replace_input_with(node, permute_node)
+                input_shape = input_node.meta["val"].shape
+                if transpose_condition(input_shape):
+                    self.insert_input_transpose(node, input_node, graph_module)
+
+            elif node.target == exir_ops.edge.aten.unsqueeze_copy.default:
+                output_shape = node.meta["val"].shape
+                if transpose_condition(output_shape):
+                    self.insert_output_transpose(node, graph_module)
+
+            elif node.target == exir_ops.edge.aten.view_copy.default:
+                input_node = node.args[0]
+
+                old_shape = input_node.meta["val"].shape
+                new_shape = node.meta["val"].shape
+
+                if transpose_condition(old_shape):
+                    self.insert_input_transpose(node, input_node, graph_module)
+
+                if transpose_condition(new_shape):
+                    self.insert_output_transpose(node, graph_module)
 
     def call(self, graph_module: torch.fx.GraphModule):
         for node in graph_module.graph.nodes:
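
To make the guard concrete, here is a minimal sketch (not part of the commit, plain PyTorch only) of why a reshape is layout-sensitive exactly when C > 1 and (H > 1 or W > 1): flattening the same logical tensor yields different element orders under NCHW and NHWC storage, unless the extra dimensions are singletons.

import torch

# N=1, C=2, H=3, W=4: the transpose_condition above holds.
x_nchw = torch.arange(2 * 3 * 4).reshape(1, 2, 3, 4)
x_nhwc = x_nchw.permute(0, 2, 3, 1).contiguous()  # same values, NHWC storage

# The flattened element orders differ, so a view/squeeze on NHWC-stored data
# would produce wrong results without first transposing back to NCHW.
print(torch.equal(x_nchw.reshape(-1), x_nhwc.reshape(-1)))  # False

# With C == 1 the two storage orders coincide and no transpose is needed.
y_nchw = torch.arange(3 * 4).reshape(1, 1, 3, 4)
y_nhwc = y_nchw.permute(0, 2, 3, 1).contiguous()
print(torch.equal(y_nchw.reshape(-1), y_nhwc.reshape(-1)))  # True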

backends/arm/_passes/arm_pass_manager.py

Lines changed: 6 additions & 1 deletion

@@ -23,6 +23,9 @@
     DecomposeLayerNormPass,
 )
 from executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass
+from executorch.backends.arm._passes.decompose_softmaxes_pass import (
+    DecomposeSoftmaxesPass,
+)
 from executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass
 from executorch.backends.arm._passes.insert_squeeze_after_sum_pass import (
     InsertSqueezeAfterSumPass,

@@ -66,6 +69,7 @@ def transform_to_backend_pipeline(
         self.add_pass(DecomposeDivPass())
         self.add_pass(InsertSqueezeAfterSumPass())
         self.add_pass(ConvertSplitToSlicePass())
+        self.add_pass(DecomposeSoftmaxesPass())
         for spec in compile_spec:
             if spec.key == "permute_memory_format":
                 memory_format = spec.value.decode()

@@ -75,9 +79,10 @@ def transform_to_backend_pipeline(
         return self._transform(exported_program.graph_module)
 
     def transform_for_annotation_pipeline(self, graph_module: torch.fx.GraphModule):
+        self.add_pass(ScalarsToAttributePass())
         self.add_pass(DecomposeLayerNormPass())
         self.add_pass(DecomposeVarPass())
         self.add_pass(DecomposeMeanDimPass())
-        self.add_pass(ScalarsToAttributePass())
         self.add_pass(DecomposeDivPass())
+        self.add_pass(DecomposeSoftmaxesPass())
         return self._transform(graph_module)
backends/arm/_passes/decompose_softmaxes_pass.py

Lines changed: 74 additions & 0 deletions

@@ -0,0 +1,74 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass
+
+# For BI case
+torch_softmax = (torch.ops.aten.softmax.int, torch.ops.aten.log_softmax.int)
+
+# For MI case
+edge_softmax = (
+    exir_ops.edge.aten._softmax.default,
+    exir_ops.edge.aten._log_softmax.default,
+)
+
+log_softmax = (torch.ops.aten.log_softmax.int, exir_ops.edge.aten._log_softmax.default)
+
+
+def get_logsoftmax_ops(op) -> tuple:
+    """
+    Returns the (log_op, exp_op, sum_op, reciprocal_op, mul_op) tuple, where the
+    ops depend on whether the (log)softmax op is from exir_ops or torch.ops.aten.
+    """
+    if op in edge_softmax:
+        return (
+            exir_ops.edge.aten.log.default,
+            exir_ops.edge.aten.exp.default,
+            exir_ops.edge.aten.sum.dim_IntList,
+            exir_ops.edge.aten.reciprocal.default,
+            exir_ops.edge.aten.mul.Tensor,
+        )
+    if op in torch_softmax:
+        return (
+            torch.ops.aten.log.default,
+            torch.ops.aten.exp.default,
+            torch.ops.aten.sum.dim_IntList,
+            torch.ops.aten.reciprocal.default,
+            torch.ops.aten.mul.Tensor,
+        )
+    raise RuntimeError(f"Can't get softmax decomposition ops for op {op}")
+
+
+class DecomposeSoftmaxesPass(ExportPass):
+    """
+    This pass decomposes log softmax or softmax into more primitive ops.
+
+    Example:
+        %op1 = exp(x)
+        %op2 = sum(%op1, dim)
+        %op3 = reciprocal(%op2)
+        %op4 = mul(%op1, %op3)
+        (in logsoftmax case: %op5 = log(%op4))
+    """
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in torch_softmax + edge_softmax:
+            return super().call_operator(op, args, kwargs, meta)
+
+        log_op, exp_op, sum_op, reciprocal_op, mul_op = get_logsoftmax_ops(op)
+
+        _input = args[0]
+        dim = [args[1]]
+
+        op1 = super().call_operator(exp_op, (_input,), {}, meta)
+        op2 = super().call_operator(sum_op, (op1, dim, True), {}, meta)
+        op3 = super().call_operator(reciprocal_op, (op2,), {}, meta)
+        op4 = super().call_operator(mul_op, (op1, op3), {}, meta)
+        if op in log_softmax:
+            op4 = super().call_operator(log_op, (op4,), {}, meta)
+        return op4
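
A quick eager-mode sanity check (not part of the commit; plain torch ops standing in for the aten/exir ops the pass emits) confirms that the exp/sum/reciprocal/mul chain reproduces softmax and, with a final log, log_softmax:

import torch

x = torch.randn(2, 5)
dim = -1

op1 = torch.exp(x)                             # exp_op
op2 = torch.sum(op1, dim=[dim], keepdim=True)  # sum_op with keepdim=True
op3 = torch.reciprocal(op2)                    # reciprocal_op
op4 = op1 * op3                                # mul_op -> softmax(x)

print(torch.allclose(op4, torch.softmax(x, dim=dim)))                 # True
print(torch.allclose(torch.log(op4), torch.log_softmax(x, dim=dim)))  # True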

backends/arm/arm_partitioner.py

Lines changed: 1 addition & 0 deletions

@@ -63,6 +63,7 @@ def is_node_supported(self, submodules, node: torch.fx.Node) -> bool:
             exir_ops.edge.aten.rsqrt.default,
             exir_ops.edge.aten._softmax.default,
             exir_ops.edge.aten.select_copy.int,
+            exir_ops.edge.aten._log_softmax.default,
             exir_ops.edge.aten.slice_copy.Tensor,
             exir_ops.edge.aten.sub.Tensor,
             exir_ops.edge.aten.sum.dim_IntList,

backends/arm/operators/__init__.py

Lines changed: 0 additions & 1 deletion

@@ -31,7 +31,6 @@
     op_select,
     op_sigmoid,
     op_slice,
-    op_softmax,
     op_squeeze,
     op_sub,
     op_sum,

backends/arm/operators/op_exp.py

Lines changed: 0 additions & 1 deletion

@@ -42,7 +42,6 @@ def define_node(
     ) -> None:
 
         assert len(node.all_input_nodes) == 1
-        assert len(node.users) == 1
 
         if is_quant_node:
            # Assume quantized input is 8 bit.

backends/arm/operators/op_softmax.py

Lines changed: 0 additions & 99 deletions
This file was deleted.

backends/arm/test/misc/test_debug_feats.py

Lines changed: 1 addition & 2 deletions

@@ -107,9 +107,8 @@ def test_numerical_diff_prints(self):
             ArmTester(
                 model,
                 example_inputs=model.get_inputs(),
-                compile_spec=common.get_tosa_compile_spec(),
+                compile_spec=common.get_tosa_compile_spec(permute_memory_to_nhwc=False),
             )
-            .quantize()
             .export()
             .to_edge()
             .partition()
