Commit edca720

Author: morelos

Update base for Update on "[ET-VK][Ops] aten.tan.default from scratch implementation"

Following the instructions here for creating a new operator from scratch, for learning purposes: https://www.internalfb.com/wiki/ExecuTorch_Vulkan_Backend/Development_0/Adding_a_New_Operator_Implementation/

The goal is to create a tan operator and its test case.

Differential Revision: [D75100188](https://our.internmc.facebook.com/intern/diff/D75100188/)

[ghstack-poisoned]

2 parents 2efb215 + 71275e5, commit edca720
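
As context for the stated goal (the tan changes themselves are not part of this base-update diff), here is a minimal sketch of the reference computation such a test case would compare a backend kernel against. It uses only stock PyTorch and does not touch the Vulkan backend:

import torch

# torch.ops.aten.tan.default is the ATen overload a backend kernel registers
# against; eager torch.tan provides the reference values a test would check.
x = torch.linspace(-1.0, 1.0, steps=8)
assert torch.allclose(torch.ops.aten.tan.default(x), torch.tan(x))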

File tree: 73 files changed, +2284 / -1544 lines

.ci/scripts/build_llama_android.sh

Lines changed: 3 additions & 13 deletions

@@ -22,18 +22,12 @@ install_executorch_and_backend_lib() {
   ANDROID_NDK=/opt/ndk
   BUCK2=buck2
   ANDROID_ABI=arm64-v8a
-  cmake -DBUCK2="${BUCK2}" \
+  cmake --preset llm \
+    -DBUCK2="${BUCK2}" \
     -DCMAKE_TOOLCHAIN_FILE="${ANDROID_NDK}/build/cmake/android.toolchain.cmake" \
     -DANDROID_ABI="${ANDROID_ABI}" \
     -DCMAKE_INSTALL_PREFIX=cmake-android-out \
     -DCMAKE_BUILD_TYPE=Release \
-    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
     -DXNNPACK_ENABLE_ARM_BF16=OFF \
     -Bcmake-android-out .

@@ -51,11 +45,7 @@ build_llama_runner() {
     -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK"/build/cmake/android.toolchain.cmake \
     -DANDROID_ABI="${ANDROID_ABI}" \
     -DCMAKE_INSTALL_PREFIX=cmake-android-out \
-    -DCMAKE_BUILD_TYPE=Release -DPYTHON_EXECUTABLE=python \
-    -DEXECUTORCH_BUILD_XNNPACK=ON \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+    -DCMAKE_BUILD_TYPE=Release \
     -Bcmake-android-out/examples/models/llama examples/models/llama

   cmake --build cmake-android-out/examples/models/llama -j4 --config Release

.ci/scripts/test_llama.sh

Lines changed: 1 addition & 15 deletions

@@ -152,21 +152,11 @@ which "${PYTHON_EXECUTABLE}"
 cmake_install_executorch_libraries() {
   echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
   rm -rf cmake-out
-  retry cmake \
+  retry cmake --preset llm \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
-    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
-    -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
-    -DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
-    -DEXECUTORCH_BUILD_MPS="$MPS" \
-    -DEXECUTORCH_BUILD_COREML="$COREML" \
     -DEXECUTORCH_BUILD_QNN="$QNN" \
     -DQNN_SDK_ROOT="$QNN_SDK_ROOT" \
-    -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out .
   cmake --build cmake-out -j9 --target install --config "$CMAKE_BUILD_TYPE"
 }

@@ -181,10 +171,6 @@ cmake_build_llama_runner() {
   retry cmake \
     -DCMAKE_INSTALL_PREFIX=cmake-out \
     -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
-    -DEXECUTORCH_BUILD_KERNELS_CUSTOM="$CUSTOM" \
-    -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
-    -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out/${dir} \
     ${dir}
   cmake --build cmake-out/${dir} -j9 --config "$CMAKE_BUILD_TYPE"

.github/workflows/build-presets.yml

Lines changed: 4 additions & 4 deletions

@@ -20,7 +20,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        preset: [macos-arm64, pybind]
+        preset: [macos-arm64, pybind, llm]
     with:
       job-name: build
       ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}

@@ -32,14 +32,14 @@ jobs:
         set -eux
         ${CONDA_RUN} ./install_requirements.sh > /dev/null
         ${CONDA_RUN} cmake --preset ${{ matrix.preset }}
-        ${CONDA_RUN} cmake --build cmake-out --parallel
+        ${CONDA_RUN} cmake --build cmake-out -j$(( $(sysctl -n hw.ncpu) - 1 ))

   linux:
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     strategy:
       fail-fast: false
       matrix:
-        preset: [pybind]
+        preset: [pybind, llm]
         runner: [linux.2xlarge, linux.arm64.2xlarge]
         docker-image: [executorch-ubuntu-22.04-clang12, executorch-ubuntu-22.04-gcc11-aarch64]
         # Excluding specific runner + docker image combinations that don't make sense:

@@ -65,4 +65,4 @@ jobs:

         ./install_requirements.sh > /dev/null
         cmake --preset ${{ matrix.preset }}
-        cmake --build cmake-out --parallel
+        cmake --build cmake-out -j$(( $(nproc) - 1 ))

.github/workflows/trunk.yml

Lines changed: 26 additions & 0 deletions

@@ -692,3 +692,29 @@ jobs:
       build-mode: Release
       build-tool: cmake
       docker-image: executorch-ubuntu-22.04-clang12
+
+  unittest-nxp-neutron:
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-clang12
+      submodules: 'true'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Build and install Executorch
+        PYTHON_EXECUTABLE=python \
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \
+        .ci/scripts/setup-linux.sh --build-tool "cmake"
+
+        # Run pytest
+        PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh

CMakePresets.json

Lines changed: 20 additions & 0 deletions

@@ -36,6 +36,26 @@
                 "string": "${hostSystemName}",
                 "list": ["Darwin", "Linux", "Windows"]
             }
+        },
+        {
+            "name": "llm",
+            "displayName": "Build LLM libraries",
+            "inherits": [
+                "common"
+            ],
+            "cacheVariables": {
+                "EXECUTORCH_BUILD_PRESET_FILE": "${sourceDir}/tools/cmake/preset/llm.cmake",
+                "CMAKE_OSX_DEPLOYMENT_TARGET": "10.15"
+            },
+            "condition": {
+                "type": "inList",
+                "string": "${hostSystemName}",
+                "list": [
+                    "Darwin",
+                    "Linux",
+                    "Windows"
+                ]
+            }
         }
     ]
 }

backends/apple/coreml/CMakeLists.txt

Lines changed: 0 additions & 3 deletions

@@ -25,8 +25,6 @@ endif()

 option(COREML_BUILD_EXECUTOR_RUNNER "Build CoreML executor runner." OFF)

-set(CMAKE_OSX_DEPLOYMENT_TARGET 10.15)
-
 # inmemoryfs sources
 set(INMEMORYFS_SOURCES
   runtime/inmemoryfs/inmemory_filesystem.cpp

@@ -240,7 +238,6 @@ if(EXECUTORCH_BUILD_COREML AND EXECUTORCH_BUILD_PYBIND)

   pybind11_add_module(executorchcoreml SHARED runtime/inmemoryfs/inmemory_filesystem_py.cpp)

-  target_compile_options(executorchcoreml PRIVATE -mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET})
   if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
     target_compile_options(executorchcoreml PRIVATE -g)
   endif()

backends/arm/scripts/install_reference_model.sh

Lines changed: 1 addition & 1 deletion

@@ -13,7 +13,7 @@ tosa_reference_model_url="https://git.gitlab.arm.com/tosa/tosa-reference-model.g
 tosa_reference_model_0_80_branch="v0.80"
 tosa_reference_model_0_80_rev="70ed0b40fa831387e36abdb4f7fb9670a3464f5a"
 tosa_serialization_lib_0_80_rev="v0.80.1"
-tosa_reference_model_1_0_rev="4d17b5b960cd986d8cb8052188fbe3ae494789e8"
+tosa_reference_model_1_0_rev="d102f426dd2e3c1f25bbf23292ec8ee51aa9c677"

 script_dir=$(cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd)

backends/example/example_quantizer.py

Lines changed: 5 additions & 2 deletions

@@ -10,8 +10,11 @@
 import torch
 from executorch.backends.example.example_operators.ops import module_to_annotator
 from torch import fx
-from torchao.quantization.pt2e.graph_utils import find_sequential_partitions
-from torchao.quantization.pt2e.observer import HistogramObserver, MinMaxObserver
+from torchao.quantization.pt2e import (
+    find_sequential_partitions,
+    HistogramObserver,
+    MinMaxObserver,
+)
 from torchao.quantization.pt2e.quantizer import (
     OperatorConfig,
     QuantizationSpec,

backends/mediatek/quantizer/qconfig.py

Lines changed: 5 additions & 2 deletions

@@ -10,8 +10,11 @@

 import torch

-from torchao.quantization.pt2e.fake_quantize import FakeQuantize
-from torchao.quantization.pt2e.observer import MinMaxObserver, PerChannelMinMaxObserver
+from torchao.quantization.pt2e import (
+    FakeQuantize,
+    MinMaxObserver,
+    PerChannelMinMaxObserver,
+)
 from torchao.quantization.pt2e.quantizer import QuantizationSpec

Lines changed: 146 additions & 0 deletions (new file; the file name is hidden in this view)

# Copyright 2025 NXP
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
from typing import Optional

import torch
from torch.export.unflatten import _assign_attr, _AttrKind
from torch.fx import GraphModule, Node
from torch.fx.passes.infra.pass_base import PassBase, PassResult
from torch.nn.parameter import Parameter
from torch.nn.utils import fuse_conv_bn_weights


class FuseBatchNormWithConvPass(PassBase):
    """The executorch batch normalization carries out the following computation [1].

        (x - mean) / sqrt(var + eps) * W + B

    Which can be expressed as

        x * (W / sqrt(var + eps)) + (B - mean * (W / sqrt(var + eps)))

    So the batch norm can be done as 1 multiplication and 1 addition, provided that the parameters are static,
    and the terms can be precomputed. If there is a `Conv` operator before the batch normalization, this scale and
    bias can be statically integrated into the weights and bias of the `Conv`, which allows the batch norm to be
    completely removed.

                      │
        ┌─────────────▼─────────────┐
        │ aten.conv1d | aten.conv2d │
        └─────────────┬─────────────┘
                      │                                            │
        ┌─────────────▼─────────────┐   replace with  ┌─────────────▼─────────────┐
        │      aten.batch_norm      │  ──────────────►│ aten.conv1d | aten.conv2d │
        └─────────────┬─────────────┘                 └─────────────┬─────────────┘
                      │                                             ▼

    [1] https://github.com/pytorch/executorch/blob/v0.5.0-rc2/kernels/portable/cpu/op_native_batch_norm.cpp#L118-L128
    """

    def _get_tensor_constant_from_node(self, graph_module, node) -> Parameter | None:
        """Get the static data from a given node. If it doesn't have any data, return `None`."""
        if node is None or node.op != "get_attr":
            return None

        target_atoms = node.target.split(".")
        attr_itr = graph_module
        for atom in target_atoms:
            if not hasattr(attr_itr, atom):
                return None
            attr_itr = getattr(attr_itr, atom)
        return attr_itr

    def call(self, graph_module: GraphModule) -> Optional[PassResult]:
        def _is_batch_norm(node_: Node) -> bool:
            return (
                node_.op == "call_function"
                and node_.target == torch.ops.aten.batch_norm.default
            )

        def _is_conv(node_: Node):
            is_conv = node_.op == "call_function" and node_.target in (
                torch.ops.aten.conv1d.default,
                torch.ops.aten.conv2d.default,
            )
            # The conv must feed only the batch norm; rewriting its weights would
            # otherwise change the result seen by any other users.
            has_single_user = len(node_.users) == 1

            return is_conv and has_single_user

        made_changes = False

        if not any(map(_is_batch_norm, graph_module.graph.nodes)):
            return PassResult(
                graph_module, made_changes
            )  # No batch norm nodes in the model.

        for node in graph_module.graph.nodes:
            if not _is_batch_norm(node):
                continue  # Not BatchNorm.

            bn_node = node

            if not _is_conv(bn_node.args[0]):
                continue  # Something other than a Conv node comes before the BatchNorm.

            conv_node = bn_node.args[0]
            conv_weight_node = conv_node.args[1]
            conv_bias_node = conv_node.args[2] if len(conv_node.args) > 2 else None

            # conv args: input, weight, bias, stride, padding, dilation, ...
            conv_w = self._get_tensor_constant_from_node(graph_module, conv_weight_node)
            conv_b = self._get_tensor_constant_from_node(graph_module, conv_bias_node)

            # batch norm args: input, weight, bias, running_mean, running_var, training, momentum, eps
            bn_w = self._get_tensor_constant_from_node(graph_module, bn_node.args[1])
            bn_b = self._get_tensor_constant_from_node(graph_module, bn_node.args[2])
            bn_rm = self._get_tensor_constant_from_node(graph_module, bn_node.args[3])
            bn_rv = self._get_tensor_constant_from_node(graph_module, bn_node.args[4])
            bn_eps = bn_node.args[7]

            if any(
                t is None for t in (conv_w, bn_rm, bn_rv)
            ):  # The other inputs can be None.
                continue  # The data is not static. Leave this BatchNorm as is (probably a rare case).

            fused_weight, fused_bias = fuse_conv_bn_weights(
                conv_w, conv_b, bn_rm, bn_rv, bn_eps, bn_w, bn_b
            )

            # Update the weight and bias for Conv.
            conv_args = list(conv_node.args)
            if len(conv_args) == 2:
                # Fill in the default bias argument.
                conv_args.append(None)

            weight_attr_name = conv_weight_node.target
            _assign_attr(
                fused_weight, graph_module, weight_attr_name, _AttrKind.PARAMETER
            )

            if conv_bias_node is not None:
                bias_attr_name = conv_bias_node.target
                _assign_attr(
                    fused_bias, graph_module, str(bias_attr_name), _AttrKind.PARAMETER
                )
            else:
                # The Conv doesn't have a bias. Create a new one.
                bias_attr_name = weight_attr_name + "_bias"
                _assign_attr(
                    fused_bias, graph_module, bias_attr_name, _AttrKind.PARAMETER
                )
                with graph_module.graph.inserting_before(conv_node):
                    get_bias_node = graph_module.graph.get_attr(bias_attr_name)

                conv_args[2] = get_bias_node

            conv_node.args = tuple(conv_args)

            # Replace the uses of the BatchNorm with the Conv.
            bn_node.replace_all_uses_with(conv_node)

            made_changes = True

        return PassResult(graph_module, made_changes)
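
As a sanity check on the algebra in the docstring above, here is a small, self-contained sketch (not part of the commit, assuming only a standard PyTorch install) showing that torch.nn.utils.fuse_conv_bn_weights, the same helper the pass calls, yields a conv whose output matches conv followed by batch norm in eval mode:

import torch
from torch.nn.utils import fuse_conv_bn_weights

conv = torch.nn.Conv2d(3, 8, kernel_size=3, bias=True).eval()
bn = torch.nn.BatchNorm2d(8).eval()
# Give the BN non-trivial statistics so the comparison is meaningful.
bn.running_mean.uniform_(-1.0, 1.0)
bn.running_var.uniform_(0.5, 1.5)
bn.weight.data.uniform_(0.5, 1.5)
bn.bias.data.uniform_(-1.0, 1.0)

# Fold the BN scale and shift into the conv parameters, as the pass does.
fused = torch.nn.Conv2d(3, 8, kernel_size=3, bias=True).eval()
fused.weight, fused.bias = fuse_conv_bn_weights(
    conv.weight, conv.bias, bn.running_mean, bn.running_var, bn.eps, bn.weight, bn.bias
)

x = torch.randn(2, 3, 16, 16)
assert torch.allclose(bn(conv(x)), fused(x), atol=1e-5)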
