Commit be44fb4

Update base for Update on "[Executorch] Refactor op_mul's broadcasting utils"
Summary: Refactoring the broadcast handling utils that were added for op_mul. This is in preparation for using these utils to handle broadcast for other ops such as add, sub, and div. Plus, remove a redundant test.

Test Plan: optimized_kernels_test in CI

Reviewers:

Subscribers:

Tasks:

Tags:

cc larryliu0820 manuelcandales

[ghstack-poisoned]
2 parents: ed79e8c + 78752a0

189 files changed: +5186 additions, -1054 deletions


.ci/docker/common/install_base.sh

Lines changed: 5 additions & 0 deletions
@@ -26,6 +26,11 @@ install_ubuntu() {
     libssl-dev \
     zip

+  # These libraries are needed by TorchVision
+  apt-get install -y --no-install-recommends \
+    libjpeg-dev \
+    libpng-dev
+
   # Cleanup package manager
   apt-get autoclean && apt-get clean
   rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*

.ci/docker/common/install_conda.sh

Lines changed: 10 additions & 2 deletions
@@ -31,8 +31,16 @@ install_miniconda() {

 install_python() {
   pushd /opt/conda
-  # Install the correct Python version
+  # Install the selected Python version for CI jobs
   as_ci_user conda create -n "py_${PYTHON_VERSION}" -y --file /opt/conda/conda-env-ci.txt python="${PYTHON_VERSION}"
+
+  # From https://github.com/pytorch/pytorch/blob/main/.ci/docker/common/install_conda.sh
+  if [[ $(uname -m) == "aarch64" ]]; then
+    conda_install "openblas==0.3.28=*openmp*"
+  else
+    conda_install mkl=2022.1.0 mkl-include=2022.1.0
+  fi
+
   popd
 }

@@ -53,7 +61,7 @@ fix_conda_ubuntu_libstdcxx() {
   # PyTorch sev: https://github.com/pytorch/pytorch/issues/105248
   # Ref: https://github.com/pytorch/pytorch/blob/main/.ci/docker/common/install_conda.sh
   if grep -e "2[02].04." /etc/issue >/dev/null; then
-    rm "/opt/conda/envs/py_${PYTHON_VERSION}/lib/libstdc++.so.6"
+    rm /opt/conda/envs/py_${PYTHON_VERSION}/lib/libstdc++.so*
   fi
 }

.ci/scripts/gather_benchmark_configs.py

Lines changed: 1 addition & 1 deletion
@@ -238,7 +238,7 @@ def set_output(name: str, val: Any) -> None:
     try:
         with open(github_output, "a") as env:
             env.write(f"{name}={val}\n")
-    except PermissionError:
+    except (PermissionError, FileNotFoundError):
         # Fall back to printing in case of permission error in unit tests
         print(f"::set-output name={name}::{val}")
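As context for the broadened exception handling above, here is a minimal, self-contained sketch of the pattern. It assumes the output path comes from the standard `GITHUB_OUTPUT` environment variable, which is an assumption made for illustration rather than a description of the actual script.

```python
import os
from typing import Any


def set_output(name: str, val: Any) -> None:
    # Assumption for this sketch: the target file path is taken from the
    # GITHUB_OUTPUT environment variable, as on GitHub Actions runners.
    github_output = os.environ.get("GITHUB_OUTPUT", "")
    try:
        with open(github_output, "a") as env:
            env.write(f"{name}={val}\n")
    except (PermissionError, FileNotFoundError):
        # Outside CI (or in unit tests) the file may be missing or not
        # writable, so fall back to the legacy workflow-command syntax.
        print(f"::set-output name={name}::{val}")


set_output("benchmark_configs", '{"models": []}')
```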

.github/workflows/lint.yml

Lines changed: 2 additions & 1 deletion
@@ -31,7 +31,7 @@ jobs:
       # The generic Linux job chooses to use base env, not the one setup by the image
       CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
       conda activate "${CONDA_ENV}"
-
+
       # For mypy linting, we need to first install executorch first so that
       # it builds the python package information.
       BUILD_TOOL="cmake"

@@ -74,6 +74,7 @@ jobs:
     docker-image: executorch-ubuntu-22.04-linter
     fetch-depth: 0
     ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+    timeout: 90
     script: |
       FILES_NEEDS_FORMAT=$(/opt/google-java-format -n extension/android/src/main/java/org/pytorch/executorch/*.java \
         examples/demo-apps/android/ExecuTorchDemo/app/src/main/java/com/example/executorchdemo/*.java \

.github/workflows/pull.yml

Lines changed: 2 additions & 2 deletions
@@ -212,7 +212,7 @@ jobs:
     docker-image: executorch-ubuntu-22.04-clang12
     submodules: 'true'
     ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-    timeout: 180
+    timeout: 90
     script: |
       # The generic Linux job chooses to use base env, not the one setup by the image
       CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")

@@ -526,7 +526,7 @@ jobs:
     docker-image: executorch-ubuntu-22.04-clang12
     submodules: 'true'
     ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
-    timeout: 180
+    timeout: 90
     script: |
       # The generic Linux job chooses to use base env, not the one setup by the image
       CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")

.gitmodules

Lines changed: 3 additions & 0 deletions
@@ -67,3 +67,6 @@
 [submodule "backends/cadence/utils/FACTO"]
 	path = backends/cadence/utils/FACTO
 	url = https://github.com/pytorch-labs/FACTO.git
+[submodule "third-party/pocketfft"]
+	path = third-party/pocketfft
+	url = https://github.com/mreineck/pocketfft

CMakeLists.txt

Lines changed: 12 additions & 0 deletions
@@ -182,6 +182,10 @@ option(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER "Build the Data Loader extension"
        OFF
 )

+option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "Build the Flat Tensor extension"
+       OFF
+)
+
 option(EXECUTORCH_BUILD_EXTENSION_MODULE "Build the Module extension" OFF)

 option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension"

@@ -240,6 +244,9 @@ cmake_dependent_option(
   "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF
 )

+if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
+  set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
+endif()

 if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
   set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)

@@ -694,6 +701,11 @@ if(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/data_loader)
 endif()

+if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor/serialize)
+endif()
+
 if(EXECUTORCH_BUILD_EXTENSION_MODULE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
 endif()

CONTRIBUTING.md

Lines changed: 8 additions & 2 deletions
@@ -215,6 +215,14 @@ must work with threading**

 ## Testing

+### Running Tests Locally
+
+CI is run automatically on all pull requests. However, if you want to run tests locally, here are some example commands (not exhaustive):
+
+- Running `sh test/build_size_test.sh` will compile the C++ runtime along with portable kernels.
+- The `test/run_oss_cpp_tests.sh` script will build and run C++ tests locally.
+- Running `pytest` from the root directory will run Python tests locally.
+
 ### Writing Tests
 To help keep code quality high, ExecuTorch uses a combination of unit tests and
 end-to-end (e2e) tests. If you add a new feature or fix a bug, please add tests

@@ -229,8 +237,6 @@ If it's not clear how to add a test for your PR, take a look at the blame for
 the code you're modifying and find an author who has more context. Ask them
 for their help in the PR comments.

-The `test/run_oss_cpp_tests.sh` script will build and run C++ tests locally.
-
 ### Continuous Integration
 See https://hud.pytorch.org/hud/pytorch/executorch/main for the current state of
 the CI (continuous integration) jobs. If `main` is broken, consider rebasing

README.md

Lines changed: 3 additions & 3 deletions
@@ -7,7 +7,7 @@
 <div align="center">
   <a href="https://github.com/pytorch/executorch/graphs/contributors"><img src="https://img.shields.io/github/contributors/pytorch/executorch?style=for-the-badge&color=blue" alt="Contributors"></a>
   <a href="https://github.com/pytorch/executorch/stargazers"><img src="https://img.shields.io/github/stars/pytorch/executorch?style=for-the-badge&color=blue" alt="Stargazers"></a>
-  <a href="https://discord.gg/MeacgB7A"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community"></a>
+  <a href="https://discord.gg/Dh43CKSAdc"><img src="https://img.shields.io/badge/Discord-Join%20Us-purple?logo=discord&logoColor=white&style=for-the-badge" alt="Join our Discord community"></a>
   <a href="https://pytorch.org/executorch/stable/index.html"><img src="https://img.shields.io/badge/Documentation-000?logo=googledocs&logoColor=FFE165&style=for-the-badge" alt="Check out the documentation"></a>
   <hr>
 </div>

@@ -55,11 +55,11 @@ To get started you can:
 ## Feedback and Engagement

 We welcome any feedback, suggestions, and bug reports from the community to help
-us improve our technology. Check out the [Discussion Board](https://github.com/pytorch/executorch/discussions) or chat real time with us on [Discord](https://discord.gg/MeacgB7A)
+us improve our technology. Check out the [Discussion Board](https://github.com/pytorch/executorch/discussions) or chat real time with us on [Discord](https://discord.gg/Dh43CKSAdc)

 ## Contributing

-We welcome contributions. To get started review the [guidelines](CONTRIBUTING.md) and chat with us on [Discord](https://discord.gg/MeacgB7A)
+We welcome contributions. To get started review the [guidelines](CONTRIBUTING.md) and chat with us on [Discord](https://discord.gg/Dh43CKSAdc)


 ## Directory Structure

backends/arm/_passes/_debug_passes.py

Lines changed: 23 additions & 0 deletions
@@ -0,0 +1,23 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.devtools.visualization.visualization_utils import visualize_graph
+from executorch.exir import ExportedProgram
+from executorch.exir.pass_base import ExportPass, PassResult
+
+
+class VisualizePass(ExportPass):
+    """
+    This pass visualizes the graph at the point of insertion in the pass manager
+    """
+
+    def __init__(self, exported_program: ExportedProgram) -> None:
+        super().__init__()
+        self.exported_program = exported_program
+
+    def call(self, graph_module: torch.fx.GraphModule) -> PassResult:
+        visualize_graph(graph_module, self.exported_program)
+        return PassResult(graph_module, False)
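As a rough illustration of how this pass could be exercised, here is a hedged sketch that runs it directly on the graph module of an exported toy model. It assumes executorch and its visualization tooling are installed; `TinyModule` and its inputs are made up for the example and are not part of this commit.

```python
# Hedged sketch: apply VisualizePass directly to an exported program's graph.
import torch
from executorch.backends.arm._passes._debug_passes import VisualizePass


class TinyModule(torch.nn.Module):  # hypothetical example model
    def forward(self, x):
        return torch.nn.functional.relu(x + 1)


exported = torch.export.export(TinyModule(), (torch.randn(2, 2),))
result = VisualizePass(exported)(exported.graph_module)
print(result.modified)  # False: the pass only visualizes, it does not change the graph
```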

backends/arm/_passes/arm_pass_manager.py

Lines changed: 1 addition & 1 deletion
@@ -123,6 +123,7 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(FuseQuantizedActivationPass())
         self.add_pass(RemoveGetItemPass())
         self.add_pass(ConvertSplitToSlicePass())
+        self.add_pass(FuseBatchnorm2DPass(exported_program))
         self.add_pass(ConvertMmToBmmPass())
         self.add_pass(DecomposeLinearPass())
         self.add_pass(DecomposeBatchNormPass())

@@ -132,7 +133,6 @@ def _tosa_080_MI_pipeline(self, exported_program: ExportedProgram) -> GraphModul
         self.add_pass(ConvertMeanDimToAveragePoolPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeSoftmaxesPass())
-        self.add_pass(FuseBatchnorm2DPass(exported_program))

         self.add_pass(AnnotateDecomposedMatmulPass())
         self.add_pass(QuantizeOperatorArguments())

backends/arm/_passes/decompose_select.py

Lines changed: 1 addition & 2 deletions
@@ -37,14 +37,13 @@ def call(self, graph_module: torch.fx.GraphModule):
             rank = len(input_node.meta["val"].size())
             dim = dim % rank if dim < 0 else dim
             index = index % rank if index < 0 else index
-            dim_list = list(range(rank))

             with graph_module.graph.inserting_before(node):
                 slice_node = create_node(
                     graph_module.graph, slice_op, (input_node, dim, index, index + 1)
                 )
                 squeeze_node = create_node(
-                    graph_module.graph, squeeze_op, (slice_node, dim_list)
+                    graph_module.graph, squeeze_op, (slice_node, [dim])
                 )

                 node.replace_all_uses_with(squeeze_node)
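For intuition about why squeezing only `[dim]` is enough here, a small hedged sketch in plain torch (not the FX pass itself) of the equivalence this decomposition relies on: selecting an index along a dimension is the same as slicing a single element along that dimension and then squeezing just that dimension.

```python
import torch

x = torch.randn(2, 3, 4)
dim, index = 1, 2

# aten.select keeps every dimension except `dim`, which is dropped.
selected = torch.select(x, dim, index)

# Equivalent decomposition: slice out a single element along `dim`
# (a length-1 dim remains), then squeeze only that dimension.
decomposed = torch.narrow(x, dim, index, 1).squeeze(dim)

assert torch.equal(selected, decomposed)
print(selected.shape)  # torch.Size([2, 4])
```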

backends/arm/arm_partitioner.py

Lines changed: 11 additions & 4 deletions
@@ -7,14 +7,14 @@

 import logging
 import os
-from typing import Callable, final, List, Optional, Tuple
+from typing import Callable, final, List, Optional, Sequence, Tuple

 import torch
 from executorch.backends.arm.arm_backend import (  # type: ignore[attr-defined]
     ArmBackend,
 )  # usort: skip
 from executorch.backends.arm.operator_support.tosa_supported_operators import (
-    TOSASupportedOperators,
+    tosa_support_factory,
 )
 from executorch.backends.arm.tosa_specification import TosaSpecification
 from executorch.exir.backend.compile_spec_schema import CompileSpec

@@ -27,6 +27,8 @@
 from executorch.exir.dialects._ops import ops as exir_ops
 from torch.export.exported_program import ExportedProgram
 from torch.fx.passes.infra.partitioner import CapabilityBasedPartitioner
+from torch.fx.passes.operator_support import OperatorSupportBase
+

 logger = logging.getLogger(__name__)
 logger.setLevel(logging.WARNING)

@@ -54,8 +56,13 @@ def is_dequant_node(node: torch.fx.node.Node) -> bool:

 @final
 class ArmPartitioner(Partitioner):
-    def __init__(self, compile_spec: List[CompileSpec]) -> None:
+    def __init__(
+        self,
+        compile_spec: List[CompileSpec],
+        additional_checks: Optional[Sequence[OperatorSupportBase]] = None,
+    ) -> None:
         self.delegation_spec = DelegationSpec(ArmBackend.__name__, compile_spec)
+        self.additional_checks = additional_checks

     def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         # Run the CapabilityBasedPartitioner to return the largest possible

@@ -72,7 +79,7 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:

         capability_partitioner = CapabilityBasedPartitioner(
             exported_program.graph_module,
-            TOSASupportedOperators(tosa_spec),
+            tosa_support_factory(tosa_spec, self.additional_checks),
             allows_single_node_partition=True,
         )
         partition_list = capability_partitioner.propose_partitions()
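A hedged sketch of how the new `additional_checks` hook might be used: callers can pass extra `OperatorSupportBase` checks that further restrict what gets partitioned. The `RejectLargeIntermediates` class and its size threshold are made up for illustration; only the constructor signature comes from this diff.

```python
from torch.fx.passes.operator_support import OperatorSupportBase


class RejectLargeIntermediates(OperatorSupportBase):
    """Hypothetical extra check: refuse nodes whose output tensor is too big."""

    def is_node_supported(self, submodules, node) -> bool:
        val = node.meta.get("val", None)
        return not hasattr(val, "numel") or val.numel() <= 1_000_000


# Assuming `compile_spec` was built as usual for the Arm backend:
# partitioner = ArmPartitioner(compile_spec, additional_checks=[RejectLargeIntermediates()])
# edge_program = edge_program.to_backend(partitioner)
```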
Lines changed: 9 additions & 2 deletions
@@ -1,8 +1,15 @@
-# Copyright 2024 Arm Limited and/or its affiliates.
+# Copyright 2024-2025 Arm Limited and/or its affiliates.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.

 # pyre-unsafe

-from . import right_shift_support, to_copy_support, tosa_supported_operators  # noqa
+from . import (  # noqa
+    convolution_support,
+    pool_2d_support,
+    reduce_sum_support,
+    right_shift_support,
+    to_copy_support,
+    tosa_supported_operators,
+)
Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from typing import cast
+
+import torch
+import torch.fx as fx
+from executorch.backends.arm.operator_support.tosa_supported_operators import (
+    register_tosa_support_check,
+    SupportedTOSAOperatorCheck,
+)
+from executorch.backends.arm.tosa_specification import Tosa_0_80, TosaSpecification
+from executorch.exir.dialects._ops import ops as exir_ops
+
+
+@register_tosa_support_check
+class ConvolutionSupported(SupportedTOSAOperatorCheck):
+    targets = [exir_ops.edge.aten.convolution.default]
+
+    tosa_specs = [
+        TosaSpecification.create_from_string("TOSA-0.80+BI"),
+        TosaSpecification.create_from_string("TOSA-0.80+MI"),
+    ]
+
+    def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
+
+        # Not implemented
+        transposed = cast(bool, node.args[6])
+        output_padding = cast(list[int], node.args[7])
+        if transposed:
+            return False
+
+        for pad in output_padding:
+            if pad != 0:
+                return False
+
+        # Hardware specific constraints
+        if not (isinstance(tosa_spec, Tosa_0_80) and tosa_spec.is_U55_subset):
+            return True
+        else:
+            return self._is_node_supported_u55(node)
+
+    def _is_node_supported_u55(self, node: fx.Node):
+        """Hardware constraints for Ethos-U-55 case, Vela 4.2.0 (25.02 release)"""
+
+        shape_in = cast(torch.Tensor, node.all_input_nodes[0].meta["val"]).shape
+        shape_out = node.meta["val"].shape
+        kernel = cast(fx.Node, node.args[1]).meta["val"].shape
+        group = cast(int, node.args[8])
+
+        C_in = shape_in[1]
+        C_out = shape_out[1]
+        if (C_in == group) and (C_out % C_in) == 0:
+            # Depthwise convolution
+            for dim in shape_in[1:]:
+                if not 1 <= dim <= 65536:
+                    return False
+        else:
+            # Convolution
+            if not 1 <= C_in <= 65536:
+                return False
+
+        kernel_w = kernel[2]
+        kernel_h = kernel[3] if len(kernel) > 3 else 1
+        # Kernel condition misses constraint on sum of absolute weights
+        if not 1 <= kernel_h <= 64 or not 1 <= kernel_w * kernel_h <= 4096:
+            return False
+
+        if not self._stride_condition(node):
+            return False
+
+        return True
+
+    def _stride_condition(self, node: fx.Node) -> bool:
+        """This condition is somewhat complex but boils down
+        to not supporting stride > 3, unless we have some special conditions.
+        This condition is a simplified, relaxed version of the hardware constraint,
+        since the actual constraint requires information not available
+        here (without a lot of work).

+        This means that we might accept ops that are not actually supported.
+        """
+        strides = cast(list[int], node.args[3])
+        has_padding = any(pad > 0 for pad in cast(list[int], node.args[4]))
+        dilations = cast(list[int], node.args[5])
+        if len(dilations) == 1:
+            dilations = [dilations[0]] * 2
+        if len(strides) == 1:
+            strides = [strides[0]] * 2
+
+        for stride, dilation in zip(strides, dilations):
+            stride_condition = 1 <= stride <= 3
+            dilation_condition = (not has_padding) and (dilation == 1)
+            if (not stride_condition) and (not dilation_condition):
+                return False
+
+        return True
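For a concrete reading of the depthwise branch above, a small hedged sketch in plain torch: the check treats a convolution as depthwise when the input channel count equals the group count and the output channels are a multiple of it. The channel and kernel sizes below are chosen only for illustration.

```python
import torch

# Example convolution that would hit the depthwise branch of the U55 check:
# C_in == groups (32 == 32) and C_out % C_in == 0 (64 % 32 == 0).
conv = torch.nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, groups=32)
x = torch.randn(1, 32, 16, 16)

out = conv(x)
print(out.shape)  # torch.Size([1, 64, 14, 14])

# Kernel-size constraint from the check: dims within limits and their
# product at most 4096 (trivially satisfied by a 3x3 kernel).
k1, k2 = conv.weight.shape[2], conv.weight.shape[3]
assert 1 <= k1 <= 64 and 1 <= k1 * k2 <= 4096
```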
