
Commit 1fc075b (2 parents: 8b4c0ed + bf50527)

Update base for Update on "Xnnpack test for program-data separation"

Add xnnpack test for program-data separation

Differential Revision: [D73794695](https://our.internmc.facebook.com/intern/diff/D73794695/)

[ghstack-poisoned]

209 files changed: +9288 -8261 lines


.ci/docker/ci_commit_pins/buck2.txt

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-2024-12-16
+2025-05-06

.ci/scripts/test_model.sh

Lines changed: 0 additions & 4 deletions

@@ -87,10 +87,6 @@ test_model() {
     bash examples/models/llava/install_requirements.sh
     STRICT="--no-strict"
   fi
-  if [[ "$MODEL_NAME" == "llama3_2_vision_encoder" || "$MODEL_NAME" == "llama3_2_text_decoder" ]]; then
-    # Install requirements for llama vision.
-    bash examples/models/llama3_2_vision/install_requirements.sh
-  fi
   if [[ "${MODEL_NAME}" == "qwen2_5" ]]; then
     # Install requirements for export_llama
     bash examples/models/llama/install_requirements.sh

.ci/scripts/unittest-linux.sh

Lines changed: 0 additions & 3 deletions

@@ -24,9 +24,6 @@ if [[ "$BUILD_TOOL" == "cmake" ]]; then
  CMAKE_ARGS="-DEXECUTORCH_BUILD_PYBIND=ON -DEXECUTORCH_BUILD_XNNPACK=ON -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON" \
    .ci/scripts/setup-linux.sh "$@"

-  # Install llama3_2_vision dependencies.
-  PYTHON_EXECUTABLE=python ./examples/models/llama3_2_vision/install_requirements.sh
-
  .ci/scripts/unittest-linux-cmake.sh
elif [[ "$BUILD_TOOL" == "buck2" ]]; then
  # Removing this breaks sccache in the Buck build, apparently

.ci/scripts/unittest-macos.sh

Lines changed: 0 additions & 1 deletion

@@ -29,7 +29,6 @@ if [[ "$BUILD_TOOL" == "cmake" ]]; then
  # Install llama3_2_vision dependencies.
  PYTHON_EXECUTABLE=python \
    ${CONDA_RUN} --no-capture-output \
-    ./examples/models/llama3_2_vision/install_requirements.sh

  .ci/scripts/unittest-macos-cmake.sh
elif [[ "$BUILD_TOOL" == "buck2" ]]; then

.github/workflows/apple.yml

Lines changed: 2 additions & 0 deletions

@@ -5,6 +5,8 @@ on:
    branches:
      - main
      - release/*
+    tags:
+      - ciflow/trunk/*
  pull_request:
    paths:
      - .ci/scripts/setup-ios.sh

.github/workflows/build-presets.yml

Lines changed: 17 additions & 0 deletions

@@ -11,3 +11,20 @@ on:
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true
+
+jobs:
+  apple:
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    strategy:
+      matrix:
+        preset: [macos-arm64]
+    with:
+      job-name: build
+      runner: macos-latest-xlarge
+      python-version: 3.12
+      submodules: recursive
+      script: |
+        set -eux
+        ${CONDA_RUN} ./install_requirements.sh > /dev/null
+        ${CONDA_RUN} cmake --preset ${{ matrix.preset }}
+        ${CONDA_RUN} cmake --build cmake-out --parallel

.github/workflows/pull.yml

Lines changed: 1 addition & 3 deletions

@@ -434,9 +434,7 @@ jobs:
          output=$(ls -la cmake-out/test/size_test)
          arr=($output)
          size=${arr[4]}
-          # threshold=48120 on devserver with gcc11.4
-          # todo(lfq): update once binary size is below 50kb.
-          threshold="47552"
+          threshold="47560"
          if [[ "$size" -le "$threshold" ]]; then
            echo "Success $size <= $threshold"
          else

.lintrunner.toml

Lines changed: 7 additions & 0 deletions

@@ -220,6 +220,13 @@ exclude_patterns = [
  'extension/**',
  'kernels/optimized/**',
  # Justified <functional> include.
+  'kernels/portable/cpu/op_bitwise*.cpp',
+  'kernels/portable/cpu/op_eq.cpp',
+  'kernels/portable/cpu/op_ge.cpp',
+  'kernels/portable/cpu/op_gt.cpp',
+  'kernels/portable/cpu/op_le.cpp',
+  'kernels/portable/cpu/op_lt.cpp',
+  'kernels/portable/cpu/op_ne.cpp',
  'runtime/kernel/thread_parallel_interface.h',
  'scripts/**',
  'third-party/**',

CMakeLists.txt

Lines changed: 19 additions & 5 deletions

@@ -44,6 +44,19 @@

cmake_minimum_required(VERSION 3.24)
project(executorch)
+
+# MARK: - Start EXECUTORCH_H12025_BUILD_MIGRATION --------------------------------------------------
+
+include(${PROJECT_SOURCE_DIR}/tools/cmake/common/preset.cmake)
+
+load_build_preset()
+include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/default.cmake)
+
+# Print all the configs that were called with announce_configured_options.
+print_configured_options()
+
+# MARK: - End EXECUTORCH_H12025_BUILD_MIGRATION ----------------------------------------------------
+
include(tools/cmake/Utils.cmake)
include(CMakeDependentOption)

@@ -96,9 +109,6 @@ set(EXECUTORCH_PAL_DEFAULT
    "Which PAL default implementation to use: one of {posix, minimal}"
)

-option(EXECUTORCH_ENABLE_LOGGING "Build with ET_LOG_ENABLED"
-       ${_default_release_disabled_options}
-)
if(NOT EXECUTORCH_ENABLE_LOGGING)
  # Avoid pulling in the logging strings, which can be large. Note that this
  # will set the compiler flag for all targets in this directory, and for all

@@ -170,8 +180,6 @@ option(EXECUTORCH_BUILD_ARM_BAREMETAL
       "Build the Arm Baremetal flow for Cortex-M and Ethos-U" OFF
)

-option(EXECUTORCH_BUILD_COREML "Build the Core ML backend" OFF)
-
option(EXECUTORCH_BUILD_KERNELS_CUSTOM "Build the custom kernels" OFF)

option(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT "Build the custom ops lib for AOT"

@@ -234,6 +242,8 @@ option(EXECUTORCH_USE_DL "Use libdl library" ON)

option(EXECUTORCH_BUILD_CADENCE "Build the Cadence DSP backend" OFF)

+option(EXECUTORCH_BUILD_CORTEX_M "Build the Cortex-M backend" OFF)
+
#
# pthreadpool: build pthreadpool library. Disable on unsupported platforms
#

@@ -707,6 +717,10 @@ if(EXECUTORCH_BUILD_XNNPACK)
  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/xnnpack)
endif()

+if(EXECUTORCH_BUILD_CORTEX_M)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/backends/cortex_m)
+endif()
+
if(EXECUTORCH_BUILD_DEVTOOLS)
  if(NOT EXECUTORCH_BUILD_ARM_BAREMETAL)
    set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER

CMakePresets.json

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+{
+  "version": 10,
+  "cmakeMinimumRequired": {
+    "major": 3,
+    "minor": 31,
+    "patch": 0
+  },
+  "$comment": "On-device AI across mobile, embedded and edge for PyTorch.",
+  "configurePresets": [
+    {
+      "name": "common",
+      "hidden": true,
+      "binaryDir": "${sourceDir}/cmake-out",
+      "generator": "Unix Makefiles"
+    },
+    {
+      "name": "macos-arm64",
+      "inherits": ["common"],
+      "generator": "Xcode",
+      "cacheVariables": {
+        "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/third-party/ios-cmake/ios.toolchain.cmake",
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/macos-arm64.cmake",
+        "PLATFORM": "MAC_ARM64",
+        "DEPLOYMENT_TARGET": "10.15"
+      },
+      "condition": {
+        "lhs": "${hostSystemName}",
+        "type": "equals",
+        "rhs": "Darwin"
+      }
+    }
+  ]
+}
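Note: the new `jobs.apple` entry in build-presets.yml above is what exercises this preset, running `${CONDA_RUN} cmake --preset macos-arm64` followed by `cmake --build cmake-out --parallel`, so the `binaryDir` of `cmake-out` and the Darwin-only `condition` here line up with that workflow.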

backends/arm/_passes/annotate_decomposed_matmul.py

Lines changed: 3 additions & 3 deletions

@@ -1,13 +1,12 @@
# Copyright 2024-2025 Arm Limited and/or its affiliates.
-# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# pyre-unsafe

import itertools
-
+import operator
from typing import List

import torch

@@ -22,7 +21,7 @@

class AnnotateDecomposedMatmulPass(ExportPass):
    """
-    torch.matmul can be decomposed in many ways, for instance:
+    torch.matmul and its equivalent operator @ can be decomposed in many ways, for instance:
    dq -> matmul -> q can become
    dq -> repeat -> view -> bmm -> view -> dq which makes quantization folding
    difficult. This helper function find all matmul partitions and annotate its

@@ -50,6 +49,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
            graph_module.graph,
            [
                torch.matmul,
+                operator.matmul,
            ],
            None,
        )
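Why the pass now also lists `operator.matmul`: when a model uses the `@` infix, the traced graph records `operator.matmul` as the source function rather than `torch.matmul`, so matching only the latter misses those partitions. A minimal sketch of the equivalence, with illustrative shapes:

import operator

import torch

a = torch.randn(2, 3)
b = torch.randn(3, 4)

# The @ infix dispatches through operator.matmul, i.e. a.__matmul__(b),
# and produces the same result as torch.matmul.
assert torch.equal(a @ b, operator.matmul(a, b))
assert torch.equal(a @ b, torch.matmul(a, b))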

backends/arm/operator_support/pool_2d_support.py

Lines changed: 4 additions & 1 deletion

@@ -54,8 +54,11 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
        kernel = cast(tuple[int, int], node.args[1])
        stride = cast(tuple[int, int], node.args[2])
        if len(node.args) > 3:
+            padding = cast(tuple[int, int], node.args[3])
            # Padding case
-            if not all(1 <= k <= 8 for k in kernel):
+            if not all(1 <= k <= 8 for k in kernel) and not all(
+                v == 0 for v in padding
+            ):
                self.reporter.report_reject(
                    node, f"Avgpool2d with padding needs kernel dims < 8, got {kernel}"
                )
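The effect of the reworked condition: an out-of-range kernel is now rejected only when the padding is actually nonzero. A hedged sketch of the predicate with hypothetical values:

kernel = (9, 9)   # outside the [1, 8] range allowed with padding
padding = (0, 0)  # but no padding is applied

# Mirrors the check above: both conditions must hold to reject the node.
rejected = not all(1 <= k <= 8 for k in kernel) and not all(v == 0 for v in padding)
assert rejected is False  # with zero padding, the kernel-size limit no longer applies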

backends/arm/operator_support/tosa_supported_operators.py

Lines changed: 2 additions & 1 deletion

@@ -335,6 +335,7 @@ def _is_matmul_node_supported(
            graph_module.graph,
            [
                torch.matmul,
+                operator.matmul,
            ],
            None,
        )

@@ -385,7 +386,7 @@ def is_node_supported(
        ):
            source_fn_stack: tuple[typing.Any] = node.meta.get("source_fn_stack", [])
            if len(source_fn_stack) > 0:
-                if source_fn_stack[-1][1] in (torch.matmul,):
+                if source_fn_stack[-1][1] in (torch.matmul, operator.matmul):
                    return self._is_matmul_node_supported(submodules, node)

        elif node.target in (exir_ops.edge.aten.max_pool2d_with_indices.default,):

backends/arm/operators/op_max_pool2d.py

Lines changed: 46 additions & 0 deletions

@@ -23,6 +23,24 @@
from executorch.backends.arm.tosa_specification import TosaSpecification


+# Similarly to Conv2d, the TOSA spec requires that the following is exactly divisible:
+# `(input + 2 * pad - kernel_size) / stride`
+# PyTorch, however, does not require this, so we must adjust the padding as needed.
+def adjust_pad_if_needed(
+    input_size: int, kernel_size: int, stride: int, pad: int
+) -> int:
+    if pad == 0:
+        return pad
+
+    mod_remainder = (input_size + 2 * pad - kernel_size) % stride
+
+    # No need to adjust
+    if mod_remainder == 0:
+        return pad
+
+    return pad - mod_remainder
+
+
@register_node_visitor
class MaxPool2dVisitor_0_80(NodeVisitor):
    target = "aten.max_pool2d.default"

@@ -61,6 +79,20 @@ def define_node(
        except IndexError:
            pad_size_list = [0, 0, 0, 0]

+        # Adjust the padding as necessary
+        pad_size_list[1] = adjust_pad_if_needed(
+            input_tensor.shape[2],
+            kernel_size[0],
+            stride[0],
+            pad_size_list[1],
+        )
+        pad_size_list[3] = adjust_pad_if_needed(
+            input_tensor.shape[3],
+            kernel_size[1],
+            stride[1],
+            pad_size_list[3],
+        )
+
        accumulator_type = output.dtype

        # Initialize zero point to zero.

@@ -131,6 +163,20 @@ def define_node(
        except IndexError:
            pad_size_list = [0, 0, 0, 0]

+        # Adjust the padding as necessary
+        pad_size_list[1] = adjust_pad_if_needed(
+            input_tensor.shape[2],
+            kernel_size[0],
+            stride[0],
+            pad_size_list[1],
+        )
+        pad_size_list[3] = adjust_pad_if_needed(
+            input_tensor.shape[3],
+            kernel_size[1],
+            stride[1],
+            pad_size_list[3],
+        )
+
        attr = ts.TosaSerializerAttribute()
        attr.MaxPool2dAttribute(
            kernel=kernel_size, stride=stride, pad=pad_size_list, nan_mode=1
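A worked example of the adjustment (illustrative numbers; the import path assumes the module layout in this diff): with input size 7, kernel 2, stride 2, and a symmetric pad of 1, the remainder is `(7 + 2*1 - 2) % 2 == 1`, so the bottom/right pad is reduced by that remainder and the padded extent becomes exactly divisible by the stride.

from executorch.backends.arm.operators.op_max_pool2d import adjust_pad_if_needed

# Pad on this side drops from 1 to 0:
# 7 + 1 (top) + 0 (bottom) - 2 = 6, which divides evenly by stride 2.
assert adjust_pad_if_needed(input_size=7, kernel_size=2, stride=2, pad=1) == 0

# Already divisible: (8 + 2*1 - 2) % 2 == 0, so the pad is returned unchanged.
assert adjust_pad_if_needed(input_size=8, kernel_size=2, stride=2, pad=1) == 1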

backends/arm/operators/op_permute.py

Lines changed: 12 additions & 10 deletions

@@ -46,24 +46,26 @@ def permutation_matrix_to_vector(permutation_matrix: torch.Tensor) -> list[int]:
    (1,0,2)
    """
    N = len(permutation_matrix)
-    assert N == len(
-        permutation_matrix[0]
-    ), f"A permutation matrix must be square, got shape {permutation_matrix.shape}"
+    if N != len(permutation_matrix[0]):
+        raise ValueError(
+            f"A permutation matrix must be square, got shape {permutation_matrix.shape}"
+        )

    p = [0] * N
    for row_index, row in enumerate(permutation_matrix):
        saw_one = False
        for col_index, value in enumerate(row):
            if value == 1:
-                assert (
-                    not saw_one
-                ), f"A permutation matrix can only have one 1 per row, got row {row}."
+                if saw_one:
+                    raise ValueError(
+                        f"A permutation matrix can only have one 1 per row, got {row=}"
+                    )
                p[row_index] = col_index
                saw_one = True
-            else:
-                assert (
-                    value == 0
-                ), f"A permutation matrix only contains 1's and 0's, got value {value}."
+            elif value != 0:
+                raise ValueError(
+                    f"A permutation matrix only contains 1's and 0's, got {value=}"
+                )
    return p
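For reference, the converted function's behavior on the docstring's own example, now raising ValueError instead of AssertionError on malformed input (import path assumed from this diff):

import torch

from executorch.backends.arm.operators.op_permute import permutation_matrix_to_vector

# The permutation matrix that swaps the first two dimensions of a rank-3 tensor.
m = torch.tensor([
    [0, 1, 0],
    [1, 0, 0],
    [0, 0, 1],
])
# Each row contributes the column index of its single 1.
assert permutation_matrix_to_vector(m) == [1, 0, 2]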

backends/arm/operators/op_slice.py

Lines changed: 20 additions & 4 deletions

@@ -68,8 +68,16 @@ def define_node(
        end_index = _fixup_end(end, shape, dim)
        size = end_index - start_index

-        assert size > 0
-        assert size <= shape[dim]
+        if size <= 0:
+            raise ValueError(
+                f"The calculated slice size must be positive. Got {size=} "
+                f"with {start_index=} and {end_index=}."
+            )
+        if size > shape[dim]:
+            raise ValueError(
+                f"The calculated slice size cannot be greater than the dimension size"
+                f". Got {size=} and {shape[dim]=}."
+            )

        # Convert aten args to Tosa's start and size attributes and in TOSA dim order.
        attr = ts.TosaSerializerAttribute()

@@ -122,8 +130,16 @@ def define_node(
        end_index = _fixup_end(end, shape, dim)
        size = end_index - start_index

-        assert size > 0
-        assert size <= shape[dim]
+        if size <= 0:
+            raise ValueError(
+                f"The calculated slice size must be positive. Got {size=} "
+                f"with {start_index=} and {end_index=}."
+            )
+        if size > shape[dim]:
+            raise ValueError(
+                f"The calculated slice size cannot be greater than the dimension size"
+                f". Got {size=} and {shape[dim]=}."
+            )

        # Convert aten args to Tosa's start and size shape_t tensors and in TOSA dim order.
        starts = [