pytorch
diff --git a/.ci/docker/ci_commit_pins/buck2.txt b/.ci/docker/ci_commit_pins/buck2.txt
Lines changed: 1 addition & 1 deletion
diff --git a/.ci/scripts/setup-arm-baremetal-tools.sh b/.ci/scripts/setup-arm-baremetal-tools.sh
Lines changed: 11 additions & 0 deletions
diff --git a/.ci/scripts/utils.sh b/.ci/scripts/utils.sh
Lines changed: 0 additions & 11 deletions
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
Lines changed: 3 additions & 5 deletions
diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml
Lines changed: 6 additions & 5 deletions
diff --git a/backends/arm/README.md b/backends/arm/README.md
Lines changed: 51 additions & 0 deletions
diff --git a/backends/arm/_passes/annotate_decomposed_matmul.py b/backends/arm/_passes/annotate_decomposed_matmul.py
Lines changed: 89 additions & 0 deletions
diff --git a/backends/arm/_passes/arm_pass_manager.py b/backends/arm/_passes/arm_pass_manager.py
Lines changed: 46 additions & 12 deletions
@@ -1 +1 @@
-2024-05-15
+2024-12-16
@@ -0,0 +1,11 @@
+#!/bin/bash
+# Copyright 2024 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# NB: This script could be used to install Arm dependencies
+# Setup arm example environment (including TOSA tools)
+git config --global user.email "[email protected]"
+git config --global user.name "Github Executorch"
+bash examples/arm/setup.sh --i-agree-to-the-contained-eula
@@ -59,17 +59,6 @@ install_flatc_from_source() {
   popd || return
 }
 
-install_arm() {
-  # NB: This function could be used to install Arm dependencies
-  # Setup arm example environment (including TOSA tools)
-  git config --global user.email "[email protected]"
-  git config --global user.name "Github Executorch"
-  bash examples/arm/setup.sh --i-agree-to-the-contained-eula
-
-  # Test tosa_reference flow
-  source examples/arm/ethos-u-scratch/setup_path.sh
-}
-
 build_executorch_runner_buck2() {
   # Build executorch runtime with retry as this step is flaky on macos CI
   retry buck2 build //examples/portable/executor_runner:executor_runner
 
@@ -354,13 +354,11 @@ jobs:
         EXECUTORCH_BUILD_ARM_BAREMETAL=ON \
         .ci/scripts/setup-linux.sh "${BUILD_TOOL}"
 
-        source .ci/scripts/utils.sh
         # Install Arm dependencies
-        install_arm
-
-        # Run pytest with coverage
-        pytest -c /dev/null -v -n auto --cov=./ --cov-report=xml backends/arm/test
+        .ci/scripts/setup-arm-baremetal-tools.sh
 
+        # Run pytest without simulator
+        backends/arm/test/test_arm_baremetal.sh test_pytest
 
   test-llama-runner-qnn-linux:
     name: test-llama-runner-qnn-linux
 
@@ -146,14 +146,15 @@ jobs:
         source .ci/scripts/utils.sh
         install_executorch
 
-        install_arm
+        .ci/scripts/setup-arm-baremetal-tools.sh
 
         # Increase number of files user can monitor to bypass buck failures.
         # Hopefully this is high enough for this setup.
         sudo sysctl fs.inotify.max_user_watches=1048576 # 1024 * 1024
 
         # Test ethos-u delegate examples with run.sh
-        PYTHON_EXECUTABLE=python bash examples/arm/run.sh examples/arm/ethos-u-scratch/
+        backends/arm/test/test_arm_baremetal.sh test_run_ethosu_fvp
+
 
   test-arm-reference-delegation:
     name: test-arm-reference-delegation
@@ -172,10 +173,10 @@ jobs:
         source .ci/scripts/utils.sh
         install_executorch
 
-        install_arm
+        .ci/scripts/setup-arm-baremetal-tools.sh
 
-        # Run arm unit tests
-        pytest -c /dev/null -v -n auto --cov=./ --cov-report=xml backends/arm/test
+        # Run arm unit tests using the simulator
+        backends/arm/test/test_arm_baremetal.sh test_pytest_ethosu_fvp
 
   test-coreml-delegate:
     name: test-coreml-delegate
 
@@ -39,6 +39,28 @@ Other:
 - `third-party/` - Dependencies on other code - in particular the TOSA serialization_lib for compiling to TOSA and the ethos-u-core-driver for the bare-metal backend supporting Ethos-U
 - `test/` - Unit test and test support functions
 
+## Testing
+
+After setup you can run unit tests with the test_arm_baremetal.sh script.
+
+To run the pytest suite, run
+
+```
+backends/arm/test/test_arm_baremetal.sh test_pytest
+```
+
+To run the unit test suite with Corstone3x0 FVP simulator support use
+
+```
+backends/arm/test/test_arm_baremetal.sh test_pytest_ethosu_fvp
+```
+
+You can test running some models with the run.sh flow
+
+```
+backends/arm/test/test_arm_baremetal.sh test_run_ethosu_fvp
+```
+
 ## Unit tests
 This is the structure of the test directory
 
@@ -51,6 +73,8 @@ test                            #  Root test folder
 ├── tester                      #  Arm Tester class
 ├── tosautil                    #  Utility functions for TOSA artifacts
 ├ common.py                     #  Common functions and definitions used by many tests
+├ setup_testing.sh              #  Script to prepare testing for using the Corstone 3x0 FVP
+├ test_arm_baremetal.sh         #  Help script to trigger testing
 ```
 
 Some example commands to run these tests follow. Run a single test:
@@ -59,6 +83,12 @@ Some example commands to run these tests follow. Run a single test:
 python -m unittest backends.arm.test.ops.test_add.TestSimpleAdd -k test_add2_tosa_BI
 ```
 
+or with pytest
+
+```
+pytest -c /dev/null -v -n auto backends/arm/test/ops/test_add.py -k test_add2_tosa_BI
+```
+
 Or all tests in "TestSimpleAdd":
 
 ```
@@ -71,6 +101,27 @@ Or discover and run many tests:
 python -m unittest discover -s backends/arm/test/ops/
 ```
 
+or with pytest
+
+```
+pytest -c /dev/null -v -n auto backends/arm/test/ops/
+```
+
+
+You can run tests using Corstone3x0 simulators to see how they would behave on something closer to a real target.
+First you need to build and prepare the target libraries that are used:
+
+```
+examples/arm/run.sh --model_name=add --build_only
+backends/arm/test/setup_testing.sh
+```
+
+Then you can run the tests with
+
+```
+pytest -c /dev/null -v -n auto backends/arm/test --arm_quantize_io --arm_run_corstoneFVP
+```
+
 ### Code coverage
 
 To get code coverage:
 
@@ -0,0 +1,89 @@
+# Copyright 2024 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import itertools
+
+import torch
+from executorch.backends.arm._passes.arm_pass_utils import create_node
+from executorch.backends.arm.tosa_quant_utils import dq_op, q_op
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+from torch.fx import GraphModule
+from torch.fx.passes.utils.source_matcher_utils import get_source_partitions
+
+
+class AnnotateDecomposedMatmulPass(ExportPass):
+    """
+    torch.matmul can be decomposed in many ways, for instance:
+    dq -> matmul -> q can become
+    dq -> repeat -> view -> bmm -> view -> q which makes quantization folding
+    difficult. This pass finds all matmul partitions and moves their dq/q nodes
+    directly around the matmul-op (can be mm or bmm).
+    """
+
+    def call(self, graph_module: GraphModule) -> PassResult:
+        matmul_partitions = get_source_partitions(
+            graph_module.graph,
+            [
+                torch.matmul,
+            ],
+            None,
+        )
+        matmul_partitions = list(
+            itertools.chain.from_iterable(matmul_partitions.values())
+        )
+        matmul_targets = {
+            exir_ops.edge.aten.mm.default,
+            exir_ops.edge.aten.bmm.default,
+        }
+        for partition in matmul_partitions:
+            quantized_input = all(
+                input_node.target == dq_op for input_node in partition.input_nodes
+            )
+            matmul_node = [
+                node for node in partition.nodes if node.target in matmul_targets
+            ][0]
+            if quantized_input:
+                matmul_args = matmul_node.all_input_nodes
+                for i in range(len(matmul_args)):
+                    input_node = partition.input_nodes[i]
+                    matmul_input_node = matmul_args[i]
+                    # Remove partition input dq-node
+                    input_node.replace_all_uses_with(input_node.all_input_nodes[0])
+                    graph_module.graph.erase_node(input_node)
+                    input_node_qargs = input_node.args[1:]
+                    with graph_module.graph.inserting_before(matmul_node):
+                        # Create new dq-node before matmul
+                        dq_node = create_node(
+                            graph=graph_module.graph,
+                            op_target=dq_op,
+                        )
+                        dq_node.args = (matmul_input_node, *input_node_qargs)
+                        matmul_node.replace_input_with(matmul_input_node, dq_node)
+
+            partition_output = list(partition.output_nodes[0].users)[0]
+            quantized_output = partition_output.target == q_op
+            if quantized_output:
+                output_node_qargs = partition_output.args[1:]
+                with graph_module.graph.inserting_after(matmul_node):
+                    # Create q-node after matmul
+                    q_node = create_node(
+                        graph=graph_module.graph,
+                        op_target=q_op,
+                    )
+                    matmul_node.replace_all_uses_with(q_node)
+                    q_node.args = (matmul_node, *output_node_qargs)
+                # Remove partition output q-node
+                partition_output.replace_all_uses_with(
+                    partition_output.all_input_nodes[0]
+                )
+                graph_module.graph.erase_node(partition_output)
+
+        # retrace the graph to update the fake tensor types
+        graph_module = super().call(graph_module).graph_module
+
+        graph_module.recompile()
+        return PassResult(graph_module, True)
@@ -11,6 +11,9 @@
 from executorch.backends.arm._passes.annotate_channels_last_dim_order_pass import (
     AnnotateChannelsLastDimOrder,
 )
+from executorch.backends.arm._passes.annotate_decomposed_matmul import (
+    AnnotateDecomposedMatmulPass,
+)
 from executorch.backends.arm._passes.cast_int64_pass import CastInt64ToInt32Pass
 from executorch.backends.arm._passes.conv1d_unsqueeze_pass import Conv1dUnsqueezePass
 from executorch.backends.arm._passes.convert_expand_copy_to_repeat import (
@@ -32,7 +35,9 @@
 from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
     FoldAndAnnotateQParamsPass,
     QuantizeFullArgument,
+    RetraceFoldedDtypesPass,
 )
+from executorch.backends.arm._passes.insert_table_ops import InsertTableOpsPass
 from executorch.backends.arm._passes.keep_dims_false_to_squeeze_pass import (
     KeepDimsFalseToSqueezePass,
 )
@@ -67,24 +72,15 @@ def transform_to_backend_pipeline(
         self, exported_program: ExportedProgram, compile_spec: list[CompileSpec]
     ):
         """Apply passes before transforming program to backend"""
-        self.add_pass(CastInt64ToInt32Pass(exported_program))
+        self.add_pass(DecomposeLinearPass())
         self.add_pass(RemoveGetItemPass())
-        self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
-        self.add_pass(SizeAdjustConv2DPass())
-        self.add_pass(RemoveClonePass())
-        self.add_pass(ConvertExpandCopyToRepeatPass())
         self.add_pass(DecomposeLayerNormPass())
-        self.add_pass(UnsqueezeBeforeRepeatPass())
         self.add_pass(DecomposeVarPass())
         self.add_pass(ConvertMeanDimToAveragePool())
         self.add_pass(DecomposeMeanDimPass())
-        self.add_pass(MatchArgRanksPass(exported_program))
-        self.add_pass(DecomposeDivPass())
-        self.add_pass(KeepDimsFalseToSqueezePass())
         self.add_pass(ConvertSplitToSlicePass())
-        self.add_pass(Conv1dUnsqueezePass(exported_program))
-        self.add_pass(DecomposeSoftmaxesPass())
-        self.add_pass(DecomposeLinearPass())
+        # TODO MLETORCH-558
+        self.add_pass(AnnotateDecomposedMatmulPass())
         self.add_pass(QuantizeFullArgument())
         self.add_pass(
             FoldAndAnnotateQParamsPass(
@@ -93,11 +89,49 @@ def transform_to_backend_pipeline(
                     exir_ops.edge.aten.maximum.default,
                     exir_ops.edge.aten.add.Tensor,
                     exir_ops.edge.aten.avg_pool2d.default,
+                    exir_ops.edge.aten.bmm.default,
+                    exir_ops.edge.aten.cat.default,
                     exir_ops.edge.aten.convolution.default,
+                    exir_ops.edge.aten.clone.default,
+                    exir_ops.edge.aten.exp.default,
+                    exir_ops.edge.aten.expand_copy.default,
                     exir_ops.edge.aten.full.default,
+                    exir_ops.edge.aten.hardtanh.default,
+                    exir_ops.edge.aten.log.default,
+                    exir_ops.edge.aten.max_pool2d.default,
+                    exir_ops.edge.aten.mm.default,
+                    exir_ops.edge.aten.mul.Tensor,
+                    exir_ops.edge.aten.permute_copy.default,
+                    exir_ops.edge.aten.reciprocal.default,
+                    exir_ops.edge.aten.relu.default,
+                    exir_ops.edge.aten.repeat.default,
+                    exir_ops.edge.aten.rsqrt.default,
+                    exir_ops.edge.aten.select_copy.int,
+                    exir_ops.edge.aten.sigmoid.default,
+                    exir_ops.edge.aten.slice_copy.Tensor,
+                    exir_ops.edge.aten.squeeze_copy.dims,
+                    exir_ops.edge.aten.sub.Tensor,
+                    exir_ops.edge.aten.sum.dim_IntList,
+                    exir_ops.edge.aten.tanh.default,
+                    exir_ops.edge.aten.unsqueeze_copy.default,
+                    exir_ops.edge.aten.upsample_nearest2d.vec,
+                    exir_ops.edge.aten.view_copy.default,
                 ]
             )
         )
+        self.add_pass(RetraceFoldedDtypesPass())
+        self.add_pass(InsertTableOpsPass(exported_program))
+        self.add_pass(ConvertExpandCopyToRepeatPass())
+        self.add_pass(UnsqueezeBeforeRepeatPass())
+        self.add_pass(CastInt64ToInt32Pass(exported_program))
+        self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
+        self.add_pass(SizeAdjustConv2DPass())
+        self.add_pass(RemoveClonePass())
+        self.add_pass(MatchArgRanksPass(exported_program))
+        self.add_pass(DecomposeDivPass())
+        self.add_pass(KeepDimsFalseToSqueezePass())
+        self.add_pass(Conv1dUnsqueezePass(exported_program))
+        self.add_pass(DecomposeSoftmaxesPass())
         for spec in compile_spec:
             if spec.key == "permute_memory_format":
                 memory_format = spec.value.decode()