pytorch
diff --git a/‎.ci/docker/ci_commit_pins/buck2.txt
Lines changed: 1 addition & 1 deletion b/‎.ci/docker/ci_commit_pins/buck2.txt
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/apple.yml
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/apple.yml
Lines changed: 2 additions & 0 deletions
diff --git a/‎.github/workflows/build-presets.yml
Lines changed: 17 additions & 0 deletions b/‎.github/workflows/build-presets.yml
Lines changed: 17 additions & 0 deletions
diff --git a/‎.github/workflows/pull.yml
Lines changed: 1 addition & 3 deletions b/‎.github/workflows/pull.yml
Lines changed: 1 addition & 3 deletions
diff --git a/‎CMakeLists.txt
Lines changed: 13 additions & 5 deletions b/‎CMakeLists.txt
Lines changed: 13 additions & 5 deletions
diff --git a/‎CMakePresets.json
Lines changed: 33 additions & 0 deletions b/‎CMakePresets.json
Lines changed: 33 additions & 0 deletions
diff --git a/‎backends/arm/_passes/__init__.py
Lines changed: 1 addition & 0 deletions b/‎backends/arm/_passes/__init__.py
Lines changed: 1 addition & 0 deletions
diff --git a/‎backends/arm/_passes/annotate_decomposed_matmul.py
Lines changed: 3 additions & 3 deletions b/‎backends/arm/_passes/annotate_decomposed_matmul.py
Lines changed: 3 additions & 3 deletions
diff --git a/‎backends/arm/_passes/arm_pass_manager.py
Lines changed: 2 additions & 0 deletions b/‎backends/arm/_passes/arm_pass_manager.py
Lines changed: 2 additions & 0 deletions
diff --git a/‎backends/arm/_passes/decompose_cosine_similarity_pass.py
Lines changed: 75 additions & 0 deletions b/‎backends/arm/_passes/decompose_cosine_similarity_pass.py
Lines changed: 75 additions & 0 deletions
diff --git a/‎backends/arm/operator_support/pool_2d_support.py
Lines changed: 4 additions & 1 deletion b/‎backends/arm/operator_support/pool_2d_support.py
Lines changed: 4 additions & 1 deletion
diff --git a/‎backends/arm/operator_support/tosa_supported_operators.py
Lines changed: 2 additions & 1 deletion b/‎backends/arm/operator_support/tosa_supported_operators.py
Lines changed: 2 additions & 1 deletion
diff --git a/‎backends/arm/operators/TARGETS
Lines changed: 6 additions & 0 deletions b/‎backends/arm/operators/TARGETS
Lines changed: 6 additions & 0 deletions
diff --git a/‎backends/arm/operators/op_max_pool2d.py
Lines changed: 46 additions & 0 deletions b/‎backends/arm/operators/op_max_pool2d.py
Lines changed: 46 additions & 0 deletions
@@ -1 +1 @@
-2024-12-16
+2025-05-06
@@ -5,6 +5,8 @@ on:
     branches:
       - main
       - release/*
+    tags:
+      - ciflow/trunk/*
   pull_request:
     paths:
       - .ci/scripts/setup-ios.sh
 
@@ -11,3 +11,20 @@ on:
 concurrency:
   group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
   cancel-in-progress: true
+
+jobs:
+  apple:  # calls the reusable macos_job.yml workflow once per matrix entry
+    uses: pytorch/test-infra/.github/workflows/macos_job.yml@main
+    strategy:
+      matrix:
+        preset: [macos-arm64]  # handed to `cmake --preset` in the script below
+    with:
+      job-name: build
+      runner: macos-latest-xlarge
+      python-version: 3.12
+      submodules: recursive
+      script: |
+        set -eux
+        ${CONDA_RUN} ./install_requirements.sh > /dev/null
+        ${CONDA_RUN} cmake --preset ${{ matrix.preset }}
+        ${CONDA_RUN} cmake --build cmake-out --parallel
@@ -434,9 +434,7 @@ jobs:
         output=$(ls -la cmake-out/test/size_test)
         arr=($output)
         size=${arr[4]}
-        # threshold=48120 on devserver with gcc11.4
-        # todo(lfq): update once binary size is below 50kb.
-        threshold="47552"
+        threshold="47560"
         if [[ "$size" -le "$threshold" ]]; then
           echo "Success $size <= $threshold"
         else
 
@@ -44,6 +44,19 @@
 
 cmake_minimum_required(VERSION 3.24)
 project(executorch)
+
+# MARK: - Start EXECUTORCH_H12025_BUILD_MIGRATION --------------------------------------------------
+
+include(${PROJECT_SOURCE_DIR}/tools/cmake/common/preset.cmake)
+# NOTE(review): presumably reads EXECUTORCH_BUILD_PRESET_FILE (set in CMakePresets.json) -- confirm.
+load_build_preset()
+include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/default.cmake)
+
+# Print all the configs that were called with announce_configured_options.
+print_configured_options()
+
+# MARK: - End EXECUTORCH_H12025_BUILD_MIGRATION ----------------------------------------------------
+
 include(tools/cmake/Utils.cmake)
 include(CMakeDependentOption)
 
@@ -96,9 +109,6 @@ set(EXECUTORCH_PAL_DEFAULT
           "Which PAL default implementation to use: one of {posix, minimal}"
 )
 
-option(EXECUTORCH_ENABLE_LOGGING "Build with ET_LOG_ENABLED"
-       ${_default_release_disabled_options}
-)
 if(NOT EXECUTORCH_ENABLE_LOGGING)
   # Avoid pulling in the logging strings, which can be large. Note that this
   # will set the compiler flag for all targets in this directory, and for all
@@ -170,8 +180,6 @@ option(EXECUTORCH_BUILD_ARM_BAREMETAL
        "Build the Arm Baremetal flow for Cortex-M and Ethos-U" OFF
 )
 
-option(EXECUTORCH_BUILD_COREML "Build the Core ML backend" OFF)
-
 option(EXECUTORCH_BUILD_KERNELS_CUSTOM "Build the custom kernels" OFF)
 
 option(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT "Build the custom ops lib for AOT"
 
@@ -0,0 +1,33 @@
+{
+  "version": 10,
+  "cmakeMinimumRequired": {
+    "major": 3,
+    "minor": 31,
+    "patch": 0
+  },
+  "$comment": "On-device AI across mobile, embedded and edge for PyTorch.",
+  "configurePresets": [
+    {
+      "name": "common",
+      "hidden": true,
+      "binaryDir": "${sourceDir}/cmake-out",
+      "generator": "Unix Makefiles"
+    },
+    {
+      "name": "macos-arm64",
+      "inherits": ["common"],
+      "generator": "Xcode",
+      "cacheVariables": {
+        "CMAKE_TOOLCHAIN_FILE": "${sourceDir}/third-party/ios-cmake/ios.toolchain.cmake",
+        "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/macos-arm64.cmake",
+        "PLATFORM": "MAC_ARM64",
+        "DEPLOYMENT_TARGET": "10.15"
+      },
+      "condition": {
+        "lhs": "${hostSystemName}",
+        "type": "equals",
+        "rhs": "Darwin"
+      }
+    }
+  ]
+}
@@ -19,6 +19,7 @@
 from .convert_squeezes_to_view import ConvertSqueezesToViewPass  # noqa
 from .convert_to_clamp import ConvertToClampPass  # noqa
 from .decompose_batchnorm_pass import DecomposeBatchNormPass  # noqa
+from .decompose_cosine_similarity_pass import DecomposeCosineSimilarityPass  # noqa
 from .decompose_div_pass import DecomposeDivPass  # noqa
 from .decompose_gelu_pass import DecomposeGeluPass  # noqa
 from .decompose_layernorm_pass import DecomposeLayerNormPass  # noqa
 
@@ -1,13 +1,12 @@
 # Copyright 2024-2025 Arm Limited and/or its affiliates.
-# All rights reserved.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
 
 # pyre-unsafe
 
 import itertools
-
+import operator
 from typing import List
 
 import torch
@@ -22,7 +21,7 @@
 
 class AnnotateDecomposedMatmulPass(ExportPass):
     """
-    torch.matmul can be decomposed in many ways, for instance:
+    torch.matmul and its equivalent operator @ can be decomposed in many ways, for instance:
     dq -> matmul -> q can become
     dq -> repeat -> view -> bmm -> view -> dq which makes quantization folding
     difficult. This helper function find all matmul partitions and annotate its
@@ -50,6 +49,7 @@ def call(self, graph_module: GraphModule) -> PassResult:
             graph_module.graph,
             [
                 torch.matmul,
+                operator.matmul,
             ],
             None,
         )
 
@@ -24,6 +24,7 @@
     ConvertSqueezesToViewPass,
     ConvertToClampPass,
     DecomposeBatchNormPass,
+    DecomposeCosineSimilarityPass,
     DecomposeDivPass,
     DecomposeGeluPass,
     DecomposeLayerNormPass,
@@ -205,6 +206,7 @@ def transform_for_annotation_pipeline(self, graph_module: GraphModule):
         self.add_pass(DecomposeVarPass())
         self.add_pass(DecomposeMeanDimPass())
         self.add_pass(DecomposeNotEqualPass())
+        self.add_pass(DecomposeCosineSimilarityPass())
         self.add_pass(DecomposeDivPass())
         self.add_pass(DecomposeLeakyReLUPass())
         self.add_pass(DecomposeSqrtPass())
 
@@ -0,0 +1,75 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+from executorch.exir.pass_base import ExportPass
+
+torch_cosine_similarity = (torch.ops.aten.cosine_similarity.default,)
+
+
+class DecomposeCosineSimilarityPass(ExportPass):
+    """
+    Decomposition of aten.cosine_similarity:
+
+      dot    = sum(mul(x1, x2), dims, keepdim=False)
+      norm   = pow( sum(mul(x, x), dims, keepdim=False), 0.5 )
+      eps    = full_like( norm1, eps_scalar )
+      n1c    = max(norm1, eps)
+      n2c    = max(norm2, eps)
+      denom  = mul(n1c, n2c)
+      out    = div(dot, denom)
+    """
+
+    def call_operator(self, op, args, kwargs, meta):
+        if op not in torch_cosine_similarity:
+            return super().call_operator(op, args, kwargs, meta)
+
+        x1, x2 = args[0], args[1]
+        # dim and eps are not keyword-only in the aten schema, so they may be
+        # passed positionally; check args first, then kwargs, then the defaults.
+        dim = args[2] if len(args) > 2 else kwargs.get("dim", 1)
+        eps = args[3] if len(args) > 3 else kwargs.get("eps", 1e-8)
+        dims = [dim] if isinstance(dim, int) else list(dim)
+
+        # 1) dot
+        prod = super().call_operator(torch.ops.aten.mul.Tensor, (x1, x2), {}, meta)
+        dot = super().call_operator(
+            torch.ops.aten.sum.dim_IntList, (prod, dims, False), {}, meta
+        )
+
+        # 2a) norm1 = pow(sum(x1*x1), 0.5)
+        x1_sq = super().call_operator(torch.ops.aten.mul.Tensor, (x1, x1), {}, meta)
+        s1 = super().call_operator(
+            torch.ops.aten.sum.dim_IntList, (x1_sq, dims, False), {}, meta
+        )
+        norm1 = super().call_operator(
+            torch.ops.aten.pow.Tensor_Scalar, (s1, 0.5), {}, meta
+        )
+
+        # 2b) norm2 = pow(sum(x2*x2), 0.5)
+        x2_sq = super().call_operator(torch.ops.aten.mul.Tensor, (x2, x2), {}, meta)
+        s2 = super().call_operator(
+            torch.ops.aten.sum.dim_IntList, (x2_sq, dims, False), {}, meta
+        )
+        norm2 = super().call_operator(
+            torch.ops.aten.pow.Tensor_Scalar, (s2, 0.5), {}, meta
+        )
+
+        # 3) eps tensor - we broadcast it ourselves as TOSA does not do this for scalars
+        eps_t = super().call_operator(
+            torch.ops.aten.full_like.default, (norm1, eps), {}, meta
+        )
+
+        # 4) clamp to avoid zero division
+        n1c = super().call_operator(
+            torch.ops.aten.maximum.default, (norm1, eps_t), {}, meta
+        )
+        n2c = super().call_operator(
+            torch.ops.aten.maximum.default, (norm2, eps_t), {}, meta
+        )
+
+        # 5) denom and divide
+        denom = super().call_operator(torch.ops.aten.mul.Tensor, (n1c, n2c), {}, meta)
+        return super().call_operator(torch.ops.aten.div.Tensor, (dot, denom), {}, meta)
@@ -54,8 +54,11 @@ def is_node_tosa_supported(self, node: fx.Node, tosa_spec: TosaSpecification):
         kernel = cast(tuple[int, int], node.args[1])
         stride = cast(tuple[int, int], node.args[2])
         if len(node.args) > 3:
+            padding = cast(tuple[int, int], node.args[3])
             # Padding case
-            if not all(1 <= k <= 8 for k in kernel):
+            if not all(1 <= k <= 8 for k in kernel) and not all(
+                v == 0 for v in padding
+            ):
                 self.reporter.report_reject(
                     node, f"Avgpool2d with padding needs kernel dims < 8, got {kernel}"
                 )
 
@@ -335,6 +335,7 @@ def _is_matmul_node_supported(
                 graph_module.graph,
                 [
                     torch.matmul,
+                    operator.matmul,
                 ],
                 None,
             )
@@ -385,7 +386,7 @@ def is_node_supported(
         ):
             source_fn_stack: tuple[typing.Any] = node.meta.get("source_fn_stack", [])
             if len(source_fn_stack) > 0:
-                if source_fn_stack[-1][1] in (torch.matmul,):
+                if source_fn_stack[-1][1] in (torch.matmul, operator.matmul):
                     return self._is_matmul_node_supported(submodules, node)
 
         elif node.target in (exir_ops.edge.aten.max_pool2d_with_indices.default,):
 
@@ -10,13 +10,19 @@ python_library(
     ],
 )
 
+python_library(
+    name = "operator_validation_utils",
+    srcs = ["operator_validation_utils.py"],  # not matched by the op_*/ops_* glob of :ops
+)
+
 python_library(
     name = "ops",
     srcs = glob(["op_*.py", "ops_*.py"]),
     deps = [
         "fbsource//third-party/tosa_tools/v0.80/serialization_lib/python/tosa:tosa",
         "fbsource//third-party/tosa_tools/v1.00/serialization_lib/python/tosa:tosa",
         ":node_visitor",
+        ":operator_validation_utils",
         "//executorch/backends/arm:tosa_mapping",
         "//executorch/backends/arm:tosa_quant_utils",
         "//executorch/backends/arm:tosa_utils",
 
@@ -23,6 +23,24 @@
 from executorch.backends.arm.tosa_specification import TosaSpecification
 
 
+# Similarly to Conv2d, the TOSA spec requires that the following division is exact:
+# `(input + 2 * pad - kernel_size) / stride`
+# PyTorch, however, does not require this, so we adjust the padding as needed.
+def adjust_pad_if_needed(
+    input_size: int, kernel_size: int, stride: int, pad: int
+) -> int:
+    """Return `pad` minus `(input_size + 2 * pad - kernel_size) % stride`.
+
+    A zero `pad` is returned as-is. NOTE(review): the result is negative when
+    the remainder exceeds `pad` -- confirm callers never hit that case.
+    """
+    if pad == 0:
+        return pad
+    excess = (input_size + 2 * pad - kernel_size) % stride
+    # A zero remainder means the padded extent already fits the stride.
+    return pad if excess == 0 else pad - excess
+
+
 @register_node_visitor
 class MaxPool2dVisitor_0_80(NodeVisitor):
     target = "aten.max_pool2d.default"
@@ -61,6 +79,20 @@ def define_node(
         except IndexError:
             pad_size_list = [0, 0, 0, 0]
 
+        # Adjust the padding as necessary
+        pad_size_list[1] = adjust_pad_if_needed(
+            input_tensor.shape[2],
+            kernel_size[0],
+            stride[0],
+            pad_size_list[1],
+        )
+        pad_size_list[3] = adjust_pad_if_needed(
+            input_tensor.shape[3],
+            kernel_size[1],
+            stride[1],
+            pad_size_list[3],
+        )
+
         accumulator_type = output.dtype
 
         # Initilize zero point to zero.
@@ -131,6 +163,20 @@ def define_node(
         except IndexError:
             pad_size_list = [0, 0, 0, 0]
 
+        # Adjust the padding as necessary
+        pad_size_list[1] = adjust_pad_if_needed(
+            input_tensor.shape[2],
+            kernel_size[0],
+            stride[0],
+            pad_size_list[1],
+        )
+        pad_size_list[3] = adjust_pad_if_needed(
+            input_tensor.shape[3],
+            kernel_size[1],
+            stride[1],
+            pad_size_list[3],
+        )
+
         attr = ts.TosaSerializerAttribute()
         attr.MaxPool2dAttribute(
             kernel=kernel_size, stride=stride, pad=pad_size_list, nan_mode=1