pytorch
diff --git a/‎.ci/docker/requirements-ci.txt
Lines changed: 4 additions & 7 deletions b/‎.ci/docker/requirements-ci.txt
Lines changed: 4 additions & 7 deletions
diff --git a/‎.ci/scripts/build-qnn-sdk.sh
Lines changed: 9 additions & 2 deletions b/‎.ci/scripts/build-qnn-sdk.sh
Lines changed: 9 additions & 2 deletions
diff --git a/‎.ci/scripts/setup-qnn-deps.sh
Lines changed: 2 additions & 2 deletions b/‎.ci/scripts/setup-qnn-deps.sh
Lines changed: 2 additions & 2 deletions
diff --git a/‎.ci/scripts/test_llama.sh
Lines changed: 1 addition & 1 deletion b/‎.ci/scripts/test_llama.sh
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/pytorch-probot.yml
Lines changed: 1 addition & 0 deletions b/‎.github/pytorch-probot.yml
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/android-perf.yml
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/android-perf.yml
Lines changed: 2 additions & 2 deletions
diff --git a/‎backends/apple/coreml/README.md
Lines changed: 3 additions & 3 deletions b/‎backends/apple/coreml/README.md
Lines changed: 3 additions & 3 deletions
diff --git a/‎backends/apple/coreml/scripts/install_requirements.sh
Lines changed: 1 addition & 5 deletions b/‎backends/apple/coreml/scripts/install_requirements.sh
Lines changed: 1 addition & 5 deletions
diff --git a/‎backends/arm/README.md
Lines changed: 1 addition & 1 deletion b/‎backends/arm/README.md
Lines changed: 1 addition & 1 deletion
diff --git a/‎backends/arm/_passes/arm_pass_manager.py
Lines changed: 66 additions & 60 deletions b/‎backends/arm/_passes/arm_pass_manager.py
Lines changed: 66 additions & 60 deletions
diff --git a/‎backends/arm/_passes/cast_int64_pass.py
Lines changed: 5 additions & 1 deletion b/‎backends/arm/_passes/cast_int64_pass.py
Lines changed: 5 additions & 1 deletion
diff --git a/‎backends/arm/_passes/decompose_select.py
Lines changed: 56 additions & 0 deletions b/‎backends/arm/_passes/decompose_select.py
Lines changed: 56 additions & 0 deletions
@@ -1,17 +1,14 @@
 mpmath==1.3.0
-numpy==1.21.3; python_version == '3.10'
-numpy==1.23.2; python_version == '3.11'
-numpy; python_version >= '3.12'
+numpy==2.0.0; python_version >= '3.10'
 PyYAML==6.0.1
 ruamel.yaml==0.17.32
 sympy==1.12
 timm==0.6.13
 tomli==2.0.1
 torchsr==1.0.4
-transformers==4.38.0
+transformers==4.47.1
 zstd==1.5.5.1
-pandas==2.0.3; python_version == '3.10'
-pandas; python_version >= '3.11'
+pandas==2.2.2; python_version >= '3.10'
 pytest==7.2.0
 pytest-cov==4.1.0
 expecttest==0.1.6
@@ -24,7 +21,7 @@ sphinx-gallery==0.14.0
 breathe==4.34.0
 exhale==0.2.3
 docutils==0.16
-matplotlib==3.7.2
+matplotlib==3.9.4
 # PyTorch Theme
 -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
 myst-parser==0.18.1
 
@@ -1,5 +1,6 @@
 #!/bin/bash
 # Copyright (c) Meta Platforms, Inc. and affiliates.
+# Copyright 2025 Arm Limited and/or its affiliates.
 # All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
@@ -11,10 +12,16 @@ set -o xtrace
 build_qnn_backend() {
   echo "Start building qnn backend."
   export ANDROID_NDK_ROOT=/opt/ndk
-  export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
+  export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
   export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/../.." && pwd)"
 
-  bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release
+  # Workaround to avoid issues around missing flatccrt library (depending on the
+  # number of jobs used), see issue #7300:
+  # If the first build fails, retry once with `--no_clean` so that the
+  # libflatccrt.a produced by the first attempt is available.
+  # TODO: Remove this workaround once the underlying issue is fixed.
+  bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release || \
+  bash backends/qualcomm/scripts/build.sh --skip_aarch64 --job_number 2 --release --no_clean
 }
 
 set_up_aot() {
 
@@ -16,9 +16,9 @@ install_qnn() {
   QNN_INSTALLATION_DIR=/tmp/qnn
   mkdir -p "${QNN_INSTALLATION_DIR}"
 
-  curl -Lo /tmp/v2.25.0.24.07.28.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.25.0.240728.zip"
+  curl -Lo /tmp/v2.28.0.24.10.29.zip "https://softwarecenter.qualcomm.com/api/download/software/qualcomm_neural_processing_sdk/v2.28.0.241029.zip"
   echo "Finishing downloading qnn sdk."
-  unzip -qo /tmp/v2.25.0.24.07.28.zip -d /tmp
+  unzip -qo /tmp/v2.28.0.24.10.29.zip -d /tmp
   echo "Finishing unzip qnn sdk."
 
 
 
@@ -121,7 +121,7 @@ echo "COREML option ${COREML}"
 if [[ "${MODE}" =~ .*qnn.* ]]; then
   QNN=ON
   export EXECUTORCH_ROOT="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
-  export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
+  export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
   export LD_LIBRARY_PATH="${QNN_SDK_ROOT}/lib/x86_64-linux-clang"
   export PYTHONPATH=".."
   cp schema/program.fbs exir/_serialize/program.fbs
 
@@ -1,4 +1,5 @@
 # The schema is from https://github.com/pytorch/pytorch/blob/main/.github/pytorch-probot.yml
+tracking_issue: 7679
 ciflow_push_tags:
 - ciflow/android
 - ciflow/apple
 
@@ -260,7 +260,7 @@ jobs:
                       --output_name="${OUT_ET_MODEL_NAME}.pte"
                     ls -lh "${OUT_ET_MODEL_NAME}.pte"
                 elif [[ ${{ matrix.config }} == "llama3_qnn_htp" ]]; then
-                    export QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728
+                    export QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029
                     export LD_LIBRARY_PATH=$QNN_SDK_ROOT/lib/x86_64-linux-clang/
                     export PYTHONPATH=$(pwd)/..
 
@@ -347,7 +347,7 @@ jobs:
         PYTHON_EXECUTABLE=python bash .ci/scripts/build-qnn-sdk.sh
 
         export ANDROID_ABIS="arm64-v8a"
-        PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.25.0.240728 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
+        PYTHON_EXECUTABLE=python EXECUTORCH_BUILD_QNN=ON QNN_SDK_ROOT=/tmp/qnn/2.28.0.241029 bash build/build_android_llm_demo.sh ${ARTIFACTS_DIR_NAME}
 
   # Let's see how expensive this job is, we might want to tone it down by running it periodically
   benchmark-on-device:
 
@@ -93,14 +93,14 @@ class Model(torch.nn.Module):
 source_model = Model()
 example_inputs = (torch.randn((1, 3, 256, 256)), )
 
-pre_autograd_aten_dialect = export_for_training(model, example_inputs).module()
+pre_autograd_aten_dialect = export_for_training(source_model, example_inputs).module()
 
 quantization_config = LinearQuantizerConfig.from_dict(
     {
         "global_config": {
             "quantization_scheme": QuantizationScheme.symmetric,
-            "activation_dtype": torch.uint8,
-            "weight_dtype": torch.int8,
+            "activation_dtype": torch.quint8,
+            "weight_dtype": torch.qint8,
             "weight_per_channel": True,
         }
     }
 
@@ -47,11 +47,7 @@ cmake --build "$COREMLTOOLS_DIR_PATH/build" --parallel
 
 echo "${green}ExecuTorch: Installing coremltools."
 pip install "$COREMLTOOLS_DIR_PATH"
-# CoreMLTools have started supporting numpy 2.0,
-# but ExecuTorch example model test env is still using older transformers,
-# so for now we will need to downgrade numpy to 1.x
-# TODO: Remove this numpy downgrade once later transformers starts to be used
-pip install numpy==1.26.4
+
 STATUS=$?
 if [ $STATUS -ne 0 ]; then
     echo "${red}ExecuTorch: Failed to install coremltools."
 
@@ -119,7 +119,7 @@ backends/arm/test/setup_testing.sh
 Then you can run the tests with
 
 ```
-pytest -c /dev/null -v -n auto backends/arm/test --arm_quantize_io --arm_run_corstoneFVP
+pytest -c /dev/null -v -n auto backends/arm/test --arm_run_corstoneFVP
 ```
 
 ### Code coverage
 
@@ -7,7 +7,6 @@
 
 # pyre-unsafe
 
-import torch
 from executorch.backends.arm._passes.annotate_channels_last_dim_order_pass import (
     AnnotateChannelsLastDimOrder,
 )
@@ -28,6 +27,7 @@
 )
 from executorch.backends.arm._passes.decompose_linear_pass import DecomposeLinearPass
 from executorch.backends.arm._passes.decompose_meandim_pass import DecomposeMeanDimPass
+from executorch.backends.arm._passes.decompose_select import DecomposeSelectPass
 from executorch.backends.arm._passes.decompose_softmaxes_pass import (
     DecomposeSoftmaxesPass,
 )
@@ -46,7 +46,7 @@
 )
 from executorch.backends.arm._passes.match_arg_ranks_pass import MatchArgRanksPass
 from executorch.backends.arm._passes.meandim_to_averagepool_pass import (
-    ConvertMeanDimToAveragePool,
+    ConvertMeanDimToAveragePoolPass,
 )
 from executorch.backends.arm._passes.mm_to_bmm_pass import ConvertMmToBmmPass
 from executorch.backends.arm._passes.remove_clone_pass import RemoveClonePass
@@ -60,92 +60,98 @@
 from executorch.backends.arm._passes.unsqueeze_scalar_placeholders_pass import (
     UnsqueezeScalarPlaceholdersPass,
 )
+from executorch.backends.arm.tosa_specification import TosaSpecification
 from executorch.backends.xnnpack._passes.remove_getitem_op import RemoveGetItemPass
 from executorch.exir import ExportedProgram
-from executorch.exir.backend.compile_spec_schema import CompileSpec
-from executorch.exir.dialects._ops import ops as exir_ops
 from executorch.exir.pass_manager import PassManager
+from torch.fx import GraphModule
 
 
 class ArmPassManager(PassManager):
 
-    def _transform(self, graph_module: torch.fx.GraphModule):
+    def __init__(self, tosa_spec: TosaSpecification) -> None:
+        self.tosa_spec = tosa_spec
+        super().__init__()
+
+    def _transform(self, graph_module: GraphModule):
         return self(graph_module).graph_module
 
-    def transform_to_backend_pipeline(
-        self, exported_program: ExportedProgram, compile_spec: list[CompileSpec]
-    ):
-        """Apply passes before transforming program to backend"""
+    def _tosa_080_BI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
         self.add_pass(FuseQuantizedActivationPass())
+        self.add_pass(RemoveGetItemPass())
+        self.add_pass(ConvertSplitToSlicePass())
+        self.add_pass(ConvertMmToBmmPass())
         self.add_pass(DecomposeLinearPass())
+        self.add_pass(ConvertMeanDimToAveragePoolPass())
+
+        self.add_pass(AnnotateDecomposedMatmulPass())
+        self.add_pass(QuantizeFullArgument())
+        self.add_pass(FoldAndAnnotateQParamsPass())
+        self.add_pass(RetraceFoldedDtypesPass())
+        self.add_pass(InsertTableOpsPass(exported_program))
+
+        self.add_pass(RemoveClonePass())
+        self.add_pass(SizeAdjustConv2DPass())
+        self.add_pass(ConvertExpandCopyToRepeatPass())
+        self.add_pass(UnsqueezeBeforeRepeatPass())
+        self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
+        self.add_pass(CastInt64ToInt32Pass(exported_program))
+        self.add_pass(MatchArgRanksPass(exported_program))
+        self.add_pass(KeepDimsFalseToSqueezePass())
+        self.add_pass(Conv1dUnsqueezePass(exported_program))
+        self.add_pass(DecomposeSelectPass())
+
+        self.add_pass(AnnotateChannelsLastDimOrder())
+
+        return self._transform(exported_program.graph_module)
+
+    def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModule:
+
+        self.add_pass(FuseQuantizedActivationPass())
         self.add_pass(RemoveGetItemPass())
+        self.add_pass(ConvertSplitToSlicePass())
+        self.add_pass(ConvertMmToBmmPass())
+        self.add_pass(DecomposeLinearPass())
         self.add_pass(DecomposeLayerNormPass())
         self.add_pass(DecomposeVarPass())
-        self.add_pass(ConvertMeanDimToAveragePool())
         self.add_pass(DecomposeMeanDimPass())
-        self.add_pass(ConvertSplitToSlicePass())
-        self.add_pass(ConvertMmToBmmPass())
-        # TODO MLETORCH-558
+        self.add_pass(ConvertMeanDimToAveragePoolPass())
+        self.add_pass(DecomposeDivPass())
+        self.add_pass(DecomposeSoftmaxesPass())
+
         self.add_pass(AnnotateDecomposedMatmulPass())
         self.add_pass(QuantizeFullArgument())
-        self.add_pass(
-            FoldAndAnnotateQParamsPass(
-                [
-                    exir_ops.edge.aten.minimum.default,
-                    exir_ops.edge.aten.maximum.default,
-                    exir_ops.edge.aten.add.Tensor,
-                    exir_ops.edge.aten.avg_pool2d.default,
-                    exir_ops.edge.aten.bmm.default,
-                    exir_ops.edge.aten.cat.default,
-                    exir_ops.edge.aten.convolution.default,
-                    exir_ops.edge.aten.clone.default,
-                    exir_ops.edge.aten.exp.default,
-                    exir_ops.edge.aten.expand_copy.default,
-                    exir_ops.edge.aten.full.default,
-                    exir_ops.edge.aten.hardtanh.default,
-                    exir_ops.edge.aten.log.default,
-                    exir_ops.edge.aten.max_pool2d.default,
-                    exir_ops.edge.aten.mul.Tensor,
-                    exir_ops.edge.aten.permute_copy.default,
-                    exir_ops.edge.aten.reciprocal.default,
-                    exir_ops.edge.aten.relu.default,
-                    exir_ops.edge.aten.repeat.default,
-                    exir_ops.edge.aten.rsqrt.default,
-                    exir_ops.edge.aten.select_copy.int,
-                    exir_ops.edge.aten.sigmoid.default,
-                    exir_ops.edge.aten.slice_copy.Tensor,
-                    exir_ops.edge.aten.squeeze_copy.dims,
-                    exir_ops.edge.aten.sub.Tensor,
-                    exir_ops.edge.aten.sum.dim_IntList,
-                    exir_ops.edge.aten.tanh.default,
-                    exir_ops.edge.aten.unsqueeze_copy.default,
-                    exir_ops.edge.aten.upsample_nearest2d.vec,
-                    exir_ops.edge.aten.view_copy.default,
-                ]
-            )
-        )
+        self.add_pass(FoldAndAnnotateQParamsPass())
         self.add_pass(RetraceFoldedDtypesPass())
         self.add_pass(InsertTableOpsPass(exported_program))
+
+        self.add_pass(RemoveClonePass())
+        self.add_pass(SizeAdjustConv2DPass())
         self.add_pass(ConvertExpandCopyToRepeatPass())
         self.add_pass(UnsqueezeBeforeRepeatPass())
-        self.add_pass(CastInt64ToInt32Pass(exported_program))
         self.add_pass(UnsqueezeScalarPlaceholdersPass(exported_program))
-        self.add_pass(SizeAdjustConv2DPass())
-        self.add_pass(RemoveClonePass())
+        self.add_pass(CastInt64ToInt32Pass(exported_program))
         self.add_pass(MatchArgRanksPass(exported_program))
-        self.add_pass(DecomposeDivPass())
         self.add_pass(KeepDimsFalseToSqueezePass())
         self.add_pass(Conv1dUnsqueezePass(exported_program))
-        self.add_pass(DecomposeSoftmaxesPass())
-        for spec in compile_spec:
-            if spec.key == "permute_memory_format":
-                memory_format = spec.value.decode()
-                if memory_format == "nhwc":
-                    self.add_pass(AnnotateChannelsLastDimOrder())
+        self.add_pass(DecomposeSelectPass())
+
+        self.add_pass(AnnotateChannelsLastDimOrder())
 
         return self._transform(exported_program.graph_module)
 
-    def transform_for_annotation_pipeline(self, graph_module: torch.fx.GraphModule):
+    def transform_to_backend_pipeline(self, exported_program: ExportedProgram):
+        """Apply passes before transforming program to backend"""
+        if self.tosa_spec == TosaSpecification.create_from_string("TOSA-0.80.0+BI"):
+            return self._tosa_080_BI_pipeline(exported_program)
+        elif self.tosa_spec == TosaSpecification.create_from_string("TOSA-0.80.0+MI"):
+            return self._tosa_080_MI_pipeline(exported_program)
+        else:
+            raise NotImplementedError(
+                f"No pass pipeline implemented for {self.tosa_spec=}"
+            )
+
+    def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(ScalarsToAttributePass())
         self.add_pass(DecomposeLayerNormPass())
         self.add_pass(DecomposeVarPass())
 
@@ -1,4 +1,4 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -17,6 +17,10 @@
 
 
 class CastInt64ToInt32Pass(ExportPass):
+    """
+    Cast int64 buffers to int32 if the int64 data is in int32 range.
+    """
+
     def __init__(self, exported_program: torch.export.ExportedProgram):
         super(CastInt64ToInt32Pass, self).__init__()
         self.exported_program = exported_program
 
@@ -0,0 +1,56 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# pyre-unsafe
+
+import torch
+from executorch.backends.arm._passes.arm_pass_utils import create_node
+from executorch.exir.dialects._ops import ops as exir_ops
+from executorch.exir.pass_base import ExportPass, PassResult
+
+
+class DecomposeSelectPass(ExportPass):
+    """
+    Decompose select into slice + squeeze so that the Aten and TOSA outputs
+    have the same rank (input rank - 1).
+
+    ``select(x, dim, index)`` is rewritten as
+    ``squeeze(slice(x, dim, index, index + 1), [dim])``. Only ``dim`` is
+    squeezed, so any other size-1 dimensions of the input are preserved.
+    """
+
+    def call(self, graph_module: torch.fx.GraphModule):
+        """
+        Replace every select/select_copy node in ``graph_module`` with a
+        slice_copy node followed by a squeeze_copy node.
+
+        Returns a PassResult wrapping the retraced graph module; the
+        ``modified`` flag is always reported as True.
+        """
+        for node in graph_module.graph.nodes:
+
+            if node.op != "call_function":
+                continue
+
+            if node.target in (
+                exir_ops.edge.aten.select.int,
+                exir_ops.edge.aten.select_copy.int,
+            ):
+                slice_op = exir_ops.edge.aten.slice_copy.Tensor
+                squeeze_op = exir_ops.edge.aten.squeeze_copy.dims
+            else:
+                continue
+
+            input_node, dim, index = node.args
+
+            shape = input_node.meta["val"].size()
+            rank = len(shape)
+            dim = dim % rank if dim < 0 else dim
+            # A negative index counts from the end of the selected dimension,
+            # so it wraps around that dimension's size, not the tensor rank.
+            index = index % shape[dim] if index < 0 else index
+
+            with graph_module.graph.inserting_before(node):
+                # Keep the selected dimension as a size-1 slice ...
+                slice_node = create_node(
+                    graph_module.graph, slice_op, (input_node, dim, index, index + 1)
+                )
+                # ... then drop exactly that dimension. Squeezing every
+                # dimension would also remove unrelated size-1 dimensions and
+                # yield a lower rank than select does.
+                squeeze_node = create_node(
+                    graph_module.graph, squeeze_op, (slice_node, [dim])
+                )
+
+            node.replace_all_uses_with(squeeze_node)
+            graph_module.graph.erase_node(node)
+
+        graph_module.graph.eliminate_dead_code()
+        graph_module.recompile()
+        # Re-run the base ExportPass over the rewritten graph.
+        graph_module = super().call(graph_module).graph_module
+        return PassResult(graph_module, True)
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`# The schema is from https://github.com/pytorch/pytorch/blob/main/.github/pytorch-probot.yml`
	`2`	`+tracking_issue: 7679`
`2`	`3`	`ciflow_push_tags:`
`3`	`4`	`- ciflow/android`
`4`	`5`	`- ciflow/apple`
Original file line number	Diff line number	Diff line change
`@@ -93,14 +93,14 @@ class Model(torch.nn.Module):`
`93`	`93`	`source_model = Model()`
`94`	`94`	`example_inputs = (torch.randn((1, 3, 256, 256)), )`
`95`	`95`
`96`		`-pre_autograd_aten_dialect = export_for_training(model, example_inputs).module()`
	`96`	`+pre_autograd_aten_dialect = export_for_training(source_model, example_inputs).module()`
`97`	`97`
`98`	`98`	`quantization_config = LinearQuantizerConfig.from_dict(`
`99`	`99`	`{`
`100`	`100`	`"global_config": {`
`101`	`101`	`"quantization_scheme": QuantizationScheme.symmetric,`
`102`		`- "activation_dtype": torch.uint8,`
`103`		`- "weight_dtype": torch.int8,`
	`102`	`+ "activation_dtype": torch.quint8,`
	`103`	`+ "weight_dtype": torch.qint8,`
`104`	`104`	`"weight_per_channel": True,`
`105`	`105`	`}`
`106`	`106`	`}`