Commit d5ef3f1

Update
[ghstack-poisoned]
2 parents 8282b6d + c0676fe commit d5ef3f1

175 files changed (+5216, -3937 lines)


.ci/scripts/test_eval_llama_mmlu.sh
Lines changed: 1 addition & 0 deletions

@@ -43,6 +43,7 @@ run_and_verify() {
     --tasks mmlu \
     -f 5 \
     --max_seq_length 2048 \
+    --max_context_length 2048 \
     --limit 5 > result.txt

   # Verify result.txt

.ci/scripts/test_eval_llama_wikitext.sh
Lines changed: 1 addition & 0 deletions

@@ -41,6 +41,7 @@ run_and_verify() {
     -kv \
     -d fp32 \
     --max_seq_length 2048 \
+    --max_context_length 2048 \
     --limit 5 > result.txt

   # Verify result.txt

.github/workflows/apple.yml
Lines changed: 1 addition & 1 deletion

@@ -37,7 +37,7 @@ jobs:
       id: set_version
       shell: bash
       run: |
-        VERSION="0.4.0.$(TZ='PST8PDT' date +%Y%m%d)"
+        VERSION="0.5.0.$(TZ='PST8PDT' date +%Y%m%d)"
         echo "version=$VERSION" >> "$GITHUB_OUTPUT"

   build-demo-ios:

.github/workflows/pull.yml
Lines changed: 5 additions & 5 deletions

@@ -221,7 +221,7 @@ jobs:
       PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

       # install pybind
-      bash install_executorch.sh --pybind xnnpack
+      bash install_executorch.sh --pybind xnnpack --use-pt-pinned-commit

       # install Llava requirements
       bash examples/models/llama/install_requirements.sh

@@ -484,7 +484,7 @@ jobs:
       PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

       # install pybind
-      bash install_executorch.sh --pybind xnnpack
+      bash install_executorch.sh --pybind xnnpack --use-pt-pinned-commit

       # install phi-3-mini requirements
       bash examples/models/phi-3-mini/install_requirements.sh

@@ -514,7 +514,7 @@ jobs:
       PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

       # install pybind
-      bash install_executorch.sh --pybind xnnpack
+      bash install_executorch.sh --pybind xnnpack --use-pt-pinned-commit

       # install llama requirements
       bash examples/models/llama/install_requirements.sh

@@ -544,7 +544,7 @@ jobs:
       PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

       # install pybind
-      bash install_executorch.sh --pybind xnnpack
+      bash install_executorch.sh --pybind xnnpack --use-pt-pinned-commit

       # install llama requirements
       bash examples/models/llama/install_requirements.sh

@@ -574,7 +574,7 @@ jobs:
       PYTHON_EXECUTABLE=python bash .ci/scripts/setup-linux.sh "cmake"

       # install pybind
-      bash install_executorch.sh --pybind xnnpack
+      bash install_executorch.sh --pybind xnnpack --use-pt-pinned-commit

       # install llama requirements
       bash examples/models/llama/install_requirements.sh

.github/workflows/trunk.yml
Lines changed: 2 additions & 2 deletions

@@ -150,7 +150,7 @@ jobs:
       conda activate "${CONDA_ENV}"

       source .ci/scripts/utils.sh
-      install_executorch
+      install_executorch "use-pt-pinned-commit"

       .ci/scripts/setup-arm-baremetal-tools.sh

@@ -180,7 +180,7 @@ jobs:
       conda activate "${CONDA_ENV}"

       source .ci/scripts/utils.sh
-      install_executorch
+      install_executorch "use-pt-pinned-commit"

       .ci/scripts/setup-arm-baremetal-tools.sh

.lintrunner.toml
Lines changed: 1 addition & 1 deletion

@@ -1,4 +1,4 @@
-merge_base_with = "origin/main"
+merge_base_with = "main"

 [[linter]]
 code = 'FLAKE8'

CMakeLists.txt
Lines changed: 9 additions & 0 deletions

@@ -1,4 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the

@@ -819,6 +820,14 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
     list(APPEND _executor_runner_libs quantized_ops_lib)
   endif()

+  if(EXECUTORCH_ENABLE_EVENT_TRACER)
+    if(EXECUTORCH_BUILD_DEVTOOLS)
+      list(APPEND _executor_runner_libs etdump flatccrt)
+    else()
+      message(SEND_ERROR "Use of 'EXECUTORCH_ENABLE_EVENT_TRACER' requires 'EXECUTORCH_BUILD_DEVTOOLS' to be enabled.")
+    endif()
+  endif()
+
   add_executable(executor_runner ${_executor_runner__srcs})
   if(CMAKE_BUILD_TYPE STREQUAL "Release")
     if(APPLE)

CONTRIBUTING.md
Lines changed: 32 additions & 0 deletions

@@ -44,6 +44,38 @@ Meta has a [bounty program](https://www.facebook.com/whitehat/) for the safe
 disclosure of security bugs. In those cases, please go through the process
 outlined on that page and do not file a public issue.

+### Issue Labels
+
+#### Module/Partner Labels
+
+[Labels beginning with `module:`](https://github.com/pytorch/executorch/labels?q=%22module%3A+%22)
+indicate the area that the issue relates to. The ExecuTorch oncall will
+typically add this label.
+
+[Labels beginning with `partner:`](https://github.com/pytorch/executorch/labels?q=%22partner%3A+%22)
+indicate the ExecuTorch partner who owns the issue. The ExecuTorch oncall will
+typically add this label.
+
+#### Lifecycle Labels
+
+The ExecuTorch oncall will triage new issues. If the issue requires more
+information from the issue's author, oncall will add the `need-user-input` label
+and wait for the author to respond.
+
+Once the issue contains enough information, the oncall will:
+- Ensure that the title is descriptive
+- Add one of the labels:
+  - `bug`: The issue describes an unexpected problem
+  - `feature`: The issue describes a request for new functionality
+  - `rfc`: The issue describes a proposed change to functionality
+- Add one `module:` label or one `partner:` label, as described above
+- Add the `triaged` label
+
+After this point, the oncall has finished the triage process, and the
+module owner or partner is responsible for resolving the issue. (See
+https://github.com/pytorch/executorch/issues/7679 for the mapping of labels to
+owners.)
+
 ### Claiming Issues
 We'd love your help closing out [open
 issues](https://github.com/pytorch/executorch/issues?q=sort%3Aupdated-desc+is%3Aissue+is%3Aopen)

README-wheel.md
Lines changed: 8 additions & 7 deletions

@@ -4,20 +4,21 @@ standard on-device iOS and Android mobile deployments. One of the main goals for
 ExecuTorch is to enable wider customization and deployment capabilities of the
 PyTorch programs.

-The `executorch` pip package is in alpha.
-* Supported python versions: 3.10, 3.11
+The `executorch` pip package is in beta.
+* Supported python versions: 3.10, 3.11, 3.12
 * Compatible systems: Linux x86_64, macOS aarch64

-The prebuilt `executorch.extension.pybindings.portable_lib` module included in
-this package provides a way to run ExecuTorch `.pte` files, with some
-restrictions:
+The prebuilt `executorch.runtime` module included in this package provides a way
+to run ExecuTorch `.pte` files, with some restrictions:
 * Only [core ATen
   operators](https://pytorch.org/executorch/stable/ir-ops-set-definition.html)
   are linked into the prebuilt module
 * Only the [XNNPACK backend
   delegate](https://pytorch.org/executorch/main/native-delegates-executorch-xnnpack-delegate.html)
-  is linked into the prebuilt module
-* [macOS only] [Core ML](https://pytorch.org/executorch/main/build-run-coreml.html) and [MPS](https://pytorch.org/executorch/main/build-run-mps.html) backend delegates are linked into the prebuilt module.
+  is linked into the prebuilt module.
+* \[macOS only] [Core ML](https://pytorch.org/executorch/main/build-run-coreml.html)
+  and [MPS](https://pytorch.org/executorch/main/build-run-mps.html) backend
+  delegates are also linked into the prebuilt module.

 Please visit the [ExecuTorch website](https://pytorch.org/executorch/) for
 tutorials and documentation. Here are some starting points:
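
For context on the `executorch.runtime` module referenced in this hunk, here is a minimal sketch of loading and running a `.pte` file with the prebuilt package. It is illustrative only: the API names (`Runtime.get()`, `load_program`, `load_method`, `execute`) are taken from the `executorch.runtime` Python module as the editor understands it, and `model.pte` plus the input shape are placeholder assumptions.

```python
import torch
from executorch.runtime import Runtime

# Get the runtime backed by the prebuilt portable/XNNPACK kernels.
runtime = Runtime.get()

# "model.pte" is a placeholder for a program exported ahead of time.
program = runtime.load_program("model.pte")
method = program.load_method("forward")

# Execute with a placeholder input; real inputs must match the exported model.
outputs = method.execute([torch.randn(1, 3, 224, 224)])
print(outputs)
```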

backends/arm/README.md
Lines changed: 12 additions & 0 deletions

@@ -122,6 +122,18 @@ The you can run the tests with
 pytest -c /dev/null -v -n auto backends/arm/test --arm_run_corstoneFVP
 ```

+## Passes
+
+With the default passes in the Arm Ethos-U backend, assuming the model lowers fully to the
+Ethos-U, the exported program is composed of a Quantize node, Ethos-U custom delegate
+and a Dequantize node. In some circumstances, you may want to feed quantized input to the Neural
+Network straight away, e.g. if you have a camera sensor outputting (u)int8 data and keep all the
+arithmetic of the application in the int8 domain. For these cases, you can apply the
+`exir/passes/quantize_io_pass.py`. See the unit test in `executorch/backends/arm/
+test/passes/test_ioquantization_pass.py` for an example of how to feed quantized inputs and
+obtain quantized outputs.
+
+
 ### Code coverage

 To get code coverage:
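
To make the new "Passes" note in this hunk concrete, below is a hedged sketch of applying IO quantization when the model lowers fully to the Ethos-U delegate. The class names `QuantizeInputs`/`QuantizeOutputs` and their arguments are assumptions based on `exir/passes/quantize_io_pass.py`; the unit test in `backends/arm/test/passes/test_ioquantization_pass.py` mentioned above is the authoritative example.

```python
# Illustrative sketch only; verify names and signatures against quantize_io_pass.py.
from executorch.exir.passes.quantize_io_pass import QuantizeInputs, QuantizeOutputs

# `edge` is assumed to be an EdgeProgramManager for a quantized model already
# lowered to the Ethos-U delegate (Quantize -> delegate call -> Dequantize).
edge = edge.transform(
    [
        QuantizeInputs(edge, quantized_inputs_idx=[0]),    # feed (u)int8 data for input 0
        QuantizeOutputs(edge, quantized_outputs_idx=[0]),  # return (u)int8 data for output 0
    ]
)
executorch_program = edge.to_executorch()
```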

backends/arm/_passes/annotate_channels_last_dim_order_pass.py
Lines changed: 6 additions & 2 deletions

@@ -116,7 +116,7 @@ def insert_input_transpose(node, input_node, graph_module):
         with graph_module.graph.inserting_before(node):
             permute_node = create_node(
                 graph_module.graph,
-                torch.ops.passthrough_to_tosa._transpose,
+                torch.ops.passthrough_to_tosa._transpose.default,
                 args=(
                     input_node,
                     list(AnnotateChannelsLastDimOrder.NHWC_inverse_order),

@@ -129,18 +129,22 @@ def insert_input_transpose(node, input_node, graph_module):
             permute_node.meta["tosa_dim_order"] = tuple(
                 range(len(input_node.meta["val"].size()))
             )
+            permute_node.meta["val"] = input_node.meta["val"]

     @staticmethod
     def insert_output_transpose(node, graph_module):
         with graph_module.graph.inserting_after(node):
             permute_node = create_node(
                 graph_module.graph,
-                torch.ops.passthrough_to_tosa._transpose,
+                torch.ops.passthrough_to_tosa._transpose.default,
                 args=(node, list(AnnotateChannelsLastDimOrder.NHWC_order)),
             )
             permute_node.meta["tosa_dim_order"] = (
                 AnnotateChannelsLastDimOrder.NHWC_order
             )
+            permute_node.meta["val"] = node.meta["val"].permute(
+                AnnotateChannelsLastDimOrder.NHWC_order
+            )
             node.meta["tosa_dim_order"] = (0, 1, 2, 3)
             users = [user for user in node.users if user != permute_node]
             for user in users:

backends/arm/_passes/annotate_decomposed_matmul.py
Lines changed: 35 additions & 8 deletions

@@ -6,9 +6,12 @@

 import itertools

+from typing import List
+
 import torch
 from executorch.backends.arm._passes.arm_pass_utils import create_node
-from executorch.backends.arm.tosa_quant_utils import dq_op, q_op
+
+from executorch.backends.arm.tosa_quant_utils import dq_op, q_op, QuantArgs
 from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_base import ExportPass, PassResult
 from torch.fx import GraphModule

@@ -24,6 +27,22 @@ class AnnotateDecomposedMatmulPass(ExportPass):
     matmul-op (can be mm or bmm).
     """

+    def _match_partition_to_node(
+        self, node: torch.fx.Node, partitioned_inputs: List[torch.fx.Node]
+    ) -> torch.fx.Node:
+        """
+        The partition.input_nodes order is not guaranteed. Compare these
+        with the matmul node inputs coming in and return the nodes
+        in the correct order.
+        """
+        if not node or node in partitioned_inputs or node.op == "placeholder":
+            return node
+        else:
+            return self._match_partition_to_node(
+                node.all_input_nodes[0], partitioned_inputs
+            )
+        raise RuntimeError(f"Cannot find an input node which matches, {node}.")
+
     def call(self, graph_module: GraphModule) -> PassResult:
         matmul_partitions = get_source_partitions(
             graph_module.graph,

@@ -45,28 +64,36 @@ def call(self, graph_module: GraphModule) -> PassResult:
             matmul_node = [
                 node for node in partition.nodes if node.target in matmul_targets
             ][0]
+
             if quantized_input:
                 matmul_args = matmul_node.all_input_nodes
-                for i in range(len(matmul_args)):
-                    input_node = partition.input_nodes[i]
-                    matmul_input_node = matmul_args[i]
+                for node in matmul_args:
+                    input_node = self._match_partition_to_node(
+                        node, partition.input_nodes
+                    )
+
                     # Remove partition input dq-node
                     input_node.replace_all_uses_with(input_node.all_input_nodes[0])
                     graph_module.graph.erase_node(input_node)
-                    input_node_qargs = input_node.args[1:]
+                    input_node_qargs = QuantArgs.from_operator(
+                        input_node.target, input_node.args
+                    )
+
                     with graph_module.graph.inserting_before(matmul_node):
                         # Create new dq-node before matmul
                         dq_node = create_node(
                             graph=graph_module.graph,
                             op_target=dq_op,
                         )
-                        dq_node.args = (matmul_input_node, *input_node_qargs)
-                        matmul_node.replace_input_with(matmul_input_node, dq_node)
+                        dq_node.args = (node, *input_node_qargs)
+                        matmul_node.replace_input_with(node, dq_node)

             partition_output = list(partition.output_nodes[0].users)[0]
             quantized_output = partition_output.target == q_op
             if quantized_output:
-                output_node_qargs = partition_output.args[1:]
+                output_node_qargs = QuantArgs.from_operator(
+                    partition_output.target, partition_output.args
+                )
                 with graph_module.graph.inserting_after(matmul_node):
                     # Create q-node after matmul
                     q_node = create_node(

backends/arm/_passes/arm_pass_manager.py
Lines changed: 10 additions & 3 deletions

@@ -24,6 +24,9 @@
 from executorch.backends.arm._passes.convert_squeezes_to_view import (  # type: ignore[import-not-found]
     ConvertSqueezesToViewPass,
 )
+from executorch.backends.arm._passes.decompose_batchnorm_pass import (
+    DecomposeBatchNormPass,
+)
 from executorch.backends.arm._passes.decompose_div_pass import DecomposeDivPass
 from executorch.backends.arm._passes.decompose_layernorm_pass import (
     DecomposeLayerNormPass,

@@ -39,9 +42,10 @@
 from executorch.backends.arm._passes.decompose_var_pass import DecomposeVarPass
 from executorch.backends.arm._passes.fold_qdq_with_annotated_qparams_pass import (
     FoldAndAnnotateQParamsPass,
-    QuantizeFullArgument,
+    QuantizeOperatorArguments,
     RetraceFoldedDtypesPass,
 )
+from executorch.backends.arm._passes.fuse_batchnorm2d_pass import FuseBatchnorm2DPass
 from executorch.backends.arm._passes.fuse_quantized_activation_pass import (  # type: ignore[import-not-found]
     FuseQuantizedActivationPass,
 )

@@ -86,13 +90,14 @@ def _transform(self, graph_module: GraphModule):
     def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(FuseQuantizedActivationPass())
         self.add_pass(RemoveGetItemPass())
+        self.add_pass(DecomposeBatchNormPass())
         self.add_pass(ConvertSplitToSlicePass())
         self.add_pass(ConvertMmToBmmPass())
         self.add_pass(DecomposeLinearPass())
         self.add_pass(ConvertMeanDimToAveragePoolPass())

         self.add_pass(AnnotateDecomposedMatmulPass())
-        self.add_pass(QuantizeFullArgument())
+        self.add_pass(QuantizeOperatorArguments())
         self.add_pass(FoldAndAnnotateQParamsPass())  # type: ignore[call-arg]
         self.add_pass(RetraceFoldedDtypesPass())
         self.add_pass(InsertTableOpsPass(exported_program))

@@ -120,15 +125,17 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(ConvertSplitToSlicePass())
         self.add_pass(ConvertMmToBmmPass())
         self.add_pass(DecomposeLinearPass())
+        self.add_pass(DecomposeBatchNormPass())
         self.add_pass(DecomposeLayerNormPass())
         self.add_pass(DecomposeVarPass())
         self.add_pass(DecomposeMeanDimPass())
         self.add_pass(ConvertMeanDimToAveragePoolPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeSoftmaxesPass())
+        self.add_pass(FuseBatchnorm2DPass(exported_program))

         self.add_pass(AnnotateDecomposedMatmulPass())
-        self.add_pass(QuantizeFullArgument())
+        self.add_pass(QuantizeOperatorArguments())
         self.add_pass(FoldAndAnnotateQParamsPass())  # type: ignore[call-arg]
         self.add_pass(RetraceFoldedDtypesPass())
         self.add_pass(InsertTableOpsPass(exported_program))
