Commit 4372d61

Update on "Add new export LLM config"
Differential Revision: [D75263991](https://our.internmc.facebook.com/intern/diff/D75263991) [ghstack-poisoned]
2 parents: 0bf2ea4 + dfbb585

260 files changed: +8436 additions, -2547 deletions


.ci/scripts/build-qnn-sdk.sh

Lines changed: 2 additions & 0 deletions
@@ -32,6 +32,8 @@ set_up_aot() {
     -DQNN_SDK_ROOT=${QNN_SDK_ROOT} \
     -DEXECUTORCH_BUILD_DEVTOOLS=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
     -DPYTHON_EXECUTABLE=python3
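
For local reproduction, a minimal sketch of the updated QNN AOT configure step with the two newly added extension flags; the flags mirror the script above, while the build directory name and the parallel build invocation are assumptions. The same -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON addition recurs in the other CI scripts in this commit.

  # Sketch only: configure ExecuTorch for the QNN AOT flow with the new flags.
  cmake -S . -B build-qnn-aot \
      -DQNN_SDK_ROOT="${QNN_SDK_ROOT}" \
      -DEXECUTORCH_BUILD_DEVTOOLS=ON \
      -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
      -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
      -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
      -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
      -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
      -DPYTHON_EXECUTABLE=python3
  cmake --build build-qnn-aot -j"$(nproc)"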

.ci/scripts/build_llama_android.sh

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@ build_llama_runner() {
     popd
     ANDROID_ABI=arm64-v8a
     cmake -DBUCK2="${BUCK2}" \
+        -DBUILD_TESTING=OFF \
         -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK"/build/cmake/android.toolchain.cmake \
         -DANDROID_ABI="${ANDROID_ABI}" \
         -DCMAKE_INSTALL_PREFIX=cmake-android-out \

.ci/scripts/test_llama.sh

Lines changed: 1 addition & 0 deletions
@@ -169,6 +169,7 @@ cmake_build_llama_runner() {
     popd
     dir="examples/models/llama"
     retry cmake \
+        -DBUILD_TESTING=OFF \
        -DCMAKE_INSTALL_PREFIX=cmake-out \
        -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
        -Bcmake-out/${dir} \

.ci/scripts/test_llama_torchao_lowbit.sh

Lines changed: 2 additions & 0 deletions
@@ -30,6 +30,7 @@ cmake -DPYTHON_EXECUTABLE=python \
     -DCMAKE_BUILD_TYPE=Release \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_BUILD_XNNPACK=OFF \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
@@ -40,6 +41,7 @@ cmake --build cmake-out -j16 --target install --config Release
 
 # Install llama runner with torchao
 cmake -DPYTHON_EXECUTABLE=python \
+    -DBUILD_TESTING=OFF \
     -DCMAKE_BUILD_TYPE=Release \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \

.ci/scripts/test_llava.sh

Lines changed: 3 additions & 1 deletion
@@ -37,6 +37,7 @@ EXECUTORCH_COMMON_CMAKE_ARGS=" \
     -DEXECUTORCH_ENABLE_LOGGING=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
@@ -64,9 +65,10 @@ cmake_install_executorch_libraries_for_android() {
 
 
 LLAVA_COMMON_CMAKE_ARGS=" \
+    -DBUILD_TESTING=OFF \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
-    -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
+    -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE} \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_XNNPACK=ON"

.ci/scripts/test_phi_3_mini.sh

Lines changed: 1 addition & 0 deletions
@@ -27,6 +27,7 @@ cmake_install_executorch_libraries() {
     -DEXECUTORCH_ENABLE_LOGGING=1 \
     -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+    -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
     -DEXECUTORCH_BUILD_XNNPACK=ON \

.github/workflows/pull.yml

Lines changed: 2 additions & 2 deletions
@@ -371,7 +371,7 @@ jobs:
           size=${arr[4]}
           # threshold=48120 on devserver with gcc11.4
           # todo(lfq): update once binary size is below 50kb.
-          threshold="55504"
+          threshold="55584"
           if [[ "$size" -le "$threshold" ]]; then
             echo "Success $size <= $threshold"
           else
@@ -406,7 +406,7 @@ jobs:
           output=$(ls -la cmake-out/test/size_test)
           arr=($output)
           size=${arr[4]}
-          threshold="51656"
+          threshold="51728"
           if [[ "$size" -le "$threshold" ]]; then
             echo "Success $size <= $threshold"
          else
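
The size gate above can be approximated locally with the same commands the workflow runs; the binary path and the `ls -la` parsing are taken from the step itself, while running it as a standalone snippet is an assumption.

  # Sketch only: check the size_test binary against the updated threshold.
  output=$(ls -la cmake-out/test/size_test)
  arr=($output)
  size=${arr[4]}
  threshold="51728"
  if [[ "$size" -le "$threshold" ]]; then
    echo "Success $size <= $threshold"
  else
    echo "Fail $size > $threshold"
    exit 1
  fi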

.github/workflows/trunk.yml

Lines changed: 31 additions & 1 deletion
@@ -262,7 +262,7 @@ jobs:
           output=$(ls -la ${elf})
           arr=($output)
           size=${arr[4]}
-          threshold="102400" # 100KiB
+          threshold="103068" # ~100KiB
           echo "size: $size, threshold: $threshold"
           if [[ "$size" -le "$threshold" ]]; then
             echo "Success $size <= $threshold"
@@ -552,6 +552,7 @@ jobs:
             -DEXECUTORCH_ENABLE_LOGGING=1 \
             -DCMAKE_BUILD_TYPE=Release \
             -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
+            -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
             -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
             -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
             -DEXECUTORCH_BUILD_XNNPACK=ON \
@@ -686,3 +687,32 @@ jobs:
       build-mode: Release
       build-tool: cmake
       docker-image: executorch-ubuntu-22.04-clang12
+
+  unittest-nxp-neutron:
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    permissions:
+      id-token: write
+      contents: read
+    with:
+      runner: linux.2xlarge
+      docker-image: executorch-ubuntu-22.04-clang12
+      submodules: 'recursive'
+      ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }}
+      timeout: 90
+      script: |
+        set -eux
+
+        # The generic Linux job chooses to use base env, not the one setup by the image
+        CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
+        conda activate "${CONDA_ENV}"
+
+        # Build and install Executorch
+        PYTHON_EXECUTABLE=python \
+        CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \
+        .ci/scripts/setup-linux.sh --build-tool "cmake"
+
+        # Install test requirements
+        pip install -r backends/nxp/requirements-tests.txt
+
+        # Run pytest
+        PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh
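
The new unittest-nxp-neutron job can be approximated locally with the commands it runs; this sketch assumes an existing Python environment and an ExecuTorch checkout with submodules initialized, and otherwise reuses the workflow's own steps.

  # Sketch only: build with the NXP Neutron backend enabled and run its unit tests.
  PYTHON_EXECUTABLE=python \
  CMAKE_ARGS="-DEXECUTORCH_BUILD_NXP_NEUTRON=ON" \
  .ci/scripts/setup-linux.sh --build-tool "cmake"

  pip install -r backends/nxp/requirements-tests.txt
  PYTHON_EXECUTABLE=python bash backends/nxp/run_unittests.sh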

.lintrunner.toml

Lines changed: 1 addition & 7 deletions
@@ -386,15 +386,9 @@ exclude_patterns = [
   "third-party/**",
   # TODO: remove exceptions as we migrate
   # backends
-  "backends/vulkan/quantizer/**",
-  "backends/vulkan/test/**",
-  "backends/xnnpack/quantizer/**",
-  "backends/xnnpack/test/**",
-  "exir/tests/test_passes.py",
-  "extension/llm/export/builder.py",
-  "extension/llm/export/quantizer_lib.py",
   "exir/tests/test_memory_planning.py",
   "exir/backend/test/demos/test_xnnpack_qnnpack.py",
+  "backends/xnnpack/test/test_xnnpack_utils.py",
 ]
 
 command = [
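
With these exclude patterns removed, the corresponding files are now covered by the linter. A hedged example of checking two of them locally, assuming lintrunner is already installed and initialized for this repo:

  # Sketch only: lint files that were previously excluded.
  lintrunner extension/llm/export/builder.py extension/llm/export/quantizer_lib.py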

CMakeLists.txt

Lines changed: 6 additions & 31 deletions
@@ -75,9 +75,13 @@ if(NOT PYTHON_EXECUTABLE)
 endif()
 announce_configured_options(PYTHON_EXECUTABLE)
 
+if(NOT BUCK2)
+  resolve_buck2()
+endif()
+announce_configured_options(BUCK2)
+
 announce_configured_options(CMAKE_CXX_COMPILER_ID)
 announce_configured_options(CMAKE_TOOLCHAIN_FILE)
-announce_configured_options(BUCK2)
 
 load_build_preset()
 include(${PROJECT_SOURCE_DIR}/tools/cmake/preset/default.cmake)
@@ -148,37 +152,11 @@ else()
 endif()
 
 if(EXECUTORCH_BUILD_TESTS)
-  set(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
   include(CTest)
 endif()
 
 add_subdirectory(third-party)
 
-if(EXECUTORCH_BUILD_EXTENSION_TRAINING)
-  set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
-  set(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
-  set(EXECUTORCH_BUILD_EXTENSION_MODULE ON)
-  set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
-endif()
-
-if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
-  set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
-endif()
-
-if(EXECUTORCH_BUILD_EXTENSION_MODULE)
-  set(EXECUTORCH_BUILD_EXTENSION_DATA_LOADER ON)
-  set(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR ON)
-endif()
-
-if(EXECUTORCH_BUILD_KERNELS_CUSTOM_AOT)
-  set(EXECUTORCH_BUILD_EXTENSION_TENSOR ON)
-  set(EXECUTORCH_BUILD_KERNELS_CUSTOM ON)
-endif()
-
-if(EXECUTORCH_BUILD_KERNELS_CUSTOM)
-  set(EXECUTORCH_BUILD_KERNELS_OPTIMIZED ON)
-endif()
-
 if(NOT DEFINED FXDIV_SOURCE_DIR)
   set(ORIGINAL_CMAKE_POSITION_INDEPENDENT_CODE_FLAG
       ${CMAKE_POSITION_INDEPENDENT_CODE}
@@ -290,9 +268,6 @@ set(_common_include_directories
 #
 
 if(NOT EXECUTORCH_SRCS_FILE)
-  # Find or download buck2 binary.
-  resolve_buck2()
-
   # A file wasn't provided. Run a script to extract the source lists from the
   # buck2 build system and write them to a file we can include.
   #
@@ -335,7 +310,7 @@ if(EXECUTORCH_USE_CPP_CODE_COVERAGE)
       " -fprofile-instr-generate -fcoverage-mapping"
     )
   else()
-    message(ERROR
+    message(FATAL_ERROR
      "Code coverage for compiler ${CMAKE_CXX_COMPILER_ID} is unsupported"
    )
  endif()
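
Note that the removed block used to turn related options on implicitly (for example, enabling EXECUTORCH_BUILD_EXTENSION_MODULE previously forced EXECUTORCH_BUILD_EXTENSION_DATA_LOADER and EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR on). The CI scripts in this commit now pass those flags explicitly, and builds that relied on the implicit behavior may need to do the same; a hedged sketch, with this particular flag set chosen only as an example:

  # Sketch only: enable the extension libraries explicitly at configure time.
  cmake -S . -B cmake-out \
      -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
      -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
      -DEXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR=ON \
      -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON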

backends/arm/_passes/annotate_channels_last_dim_order_pass.py

Lines changed: 4 additions & 1 deletion
@@ -35,7 +35,10 @@
 def _transpose_impl(*args, **kwargs):
     # Validate length of dim_order array
     dim = args[1]
-    assert len(dim) in (4, 5)
+    if len(dim) != 4 and len(dim) != 5:
+        raise ValueError(
+            f"Dim order length must be either 4 or 5, got {len(dim)}: {dim}"
+        )
     # Pass-through in edge-IR
     return args[0]
 

backends/arm/_passes/convert_split_to_slice.py

Lines changed: 8 additions & 3 deletions
@@ -41,9 +41,14 @@ def call(self, graph_module: torch.fx.GraphModule):
             dim = split_node.args[2] if len(split_node.args) > 2 else 0
             dim = (dim + rank) % rank
 
-            assert (
-                sum(split_lengths) == shape[dim]
-            ), "Given split lengths don't sum up to the size of the dimension."
+            # Validate that split lengths cover the entire dimension
+            length_sum = sum(split_lengths)
+            dim_size = shape[dim]
+            if length_sum != dim_size:
+                raise ValueError(
+                    f"Split sizes {split_lengths} sum to {length_sum}, "
+                    f"but dimension {dim} has size {dim_size}"
+                )
 
             # Convert split argument 'split_lengths' to slice arguments start and end.
             starts = [0] * len(split_lengths)

backends/arm/_passes/fold_qdq_with_annotated_qparams_pass.py

Lines changed: 13 additions & 9 deletions
@@ -120,7 +120,9 @@ def fold_and_annotate_arg(
     if input_qparams is not None:
         node.meta["input_qparams"][i] = input_qparams
     for n in nodes_to_remove:
-        assert n.target == dq_op
+        if n.target != dq_op:
+            raise RuntimeError(f"Expected {dq_op} dq_op, got {n.target}")
+
         n.replace_all_uses_with(n.args[0])  # type: ignore[arg-type]
         graph_module.graph.erase_node(n)
 
@@ -136,14 +138,16 @@ def call(self, graph_module: GraphModule) -> PassResult:
                 continue
 
             # Make sure we haven't already set qparams meta information on the node
-            assert "input_qparams" not in n.meta, (
-                f'Unexpected key "input_qparams" found in meta for node {n}. '
-                "input_qparams should not have been set at this point"
-            )
-            assert "output_qparams" not in n.meta, (
-                f'Unexpected key "output_qparams" found in meta for node {n}. '
-                "output_qparams should not have been set at this point"
-            )
+            if "input_qparams" in n.meta:
+                raise RuntimeError(
+                    f'Unexpected key "input_qparams" found in meta for node {n}. '
+                    "input_qparams should not have been set at this point"
+                )
+            if "output_qparams" in n.meta:
+                raise RuntimeError(
+                    f'Unexpected key "output_qparams" found in meta for node {n}. '
+                    "output_qparams should not have been set at this point"
+                )
 
             # for the inputs and outputs search the graph for quantization info and
             # store the information in a dict with order of the _tensor_ inputs as key,

backends/arm/_passes/insert_table_ops.py

Lines changed: 11 additions & 2 deletions
@@ -240,8 +240,17 @@ def call(self, graph_module: GraphModule) -> PassResult:
                     args=(node.args[0],),
                 )
                 output_node = table_node
-                assert len(input_qparams) == 1
-                assert len(output_qparams) == 1
+                # Expect exactly one quantization parameter for input and output
+                if len(input_qparams) != 1:
+                    raise ValueError(
+                        f"InsertTableOpsPass expected exactly one input quantization parameter, "
+                        f"got {len(input_qparams)} for node {node.name}"
+                    )
+                if len(output_qparams) != 1:
+                    raise ValueError(
+                        f"InsertTableOpsPass expected exactly one output quantization parameter, "
+                        f"got {len(output_qparams)} for node {node.name}"
+                    )
 
                 # Generate table buffer and how much to lshift the table output.
                 buffer, lshift = self.generate_table_values(

backends/arm/_passes/remove_clone_pass.py

Lines changed: 4 additions & 1 deletion
@@ -17,5 +17,8 @@ def call_operator(self, op, args, kwargs, meta):
         if op != exir_ops.edge.aten.clone.default:
             return super().call_operator(op, args, kwargs, meta)
 
-        assert len(args) == 1
+        if len(args) != 1:
+            raise ValueError(
+                f"clone operator expects exactly one argument, got {len(args)}"
+            )
         return args[0]

backends/arm/operators/op_abs.py

Lines changed: 4 additions & 4 deletions
@@ -44,7 +44,7 @@ def define_node(
         import tosa_tools.v0_80.serializer.tosa_serializer as ts  # type: ignore
 
         validate_num_inputs(self.target, inputs, 1)
-        validate_same_dtype(self.target, [*inputs, output])
+        validate_same_dtype(self.target, [*inputs, output], ts)
 
         # Handle int8 (quantized) and int32
         if not (inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]):
@@ -106,7 +106,7 @@ def define_node(
         import tosa_tools.v0_80.serializer.tosa_serializer as ts  # type: ignore
 
         validate_num_inputs(self.target, inputs, 1)
-        validate_same_dtype(self.target, [*inputs, output])
+        validate_same_dtype(self.target, [*inputs, output], ts)
 
         if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]:
             # Call the inherited define_node for handling integers
@@ -153,7 +153,7 @@ def define_node(
         import serializer.tosa_serializer as ts  # type: ignore
 
         validate_num_inputs(self.target, inputs, 1)
-        validate_same_dtype(self.target, [*inputs, output])
+        validate_same_dtype(self.target, [*inputs, output], ts)
 
         # Handle int8 (quantized) and int32
         if not (inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]):
@@ -216,7 +216,7 @@ def define_node(
         import serializer.tosa_serializer as ts  # type: ignore
 
         validate_num_inputs(self.target, inputs, 1)
-        validate_same_dtype(self.target, [*inputs, output])
+        validate_same_dtype(self.target, [*inputs, output], ts)
 
         if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]:
             # Call the inherited define_node for handling integers

backends/arm/operators/op_add.py

Lines changed: 4 additions & 4 deletions
@@ -45,7 +45,7 @@ def define_node(
         import tosa_tools.v0_80.serializer.tosa_serializer as ts  # type: ignore
 
         validate_num_inputs(self.target, inputs, 2)
-        validate_same_dtype(self.target, [*inputs, output])
+        validate_same_dtype(self.target, [*inputs, output], ts)
 
         # Handle int8 (quantized) and int32
         supported_dtypes = [ts.DType.INT8, ts.DType.INT32]
@@ -118,7 +118,7 @@ def define_node(
         import tosa_tools.v0_80.serializer.tosa_serializer as ts  # type: ignore
 
         validate_num_inputs(self.target, inputs, 2)
-        validate_same_dtype(self.target, [*inputs, output])
+        validate_same_dtype(self.target, [*inputs, output], ts)
 
         if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]:
             # Call the inherited define_node for handling integers
@@ -163,7 +163,7 @@ def define_node(
         import serializer.tosa_serializer as ts  # type: ignore
 
         validate_num_inputs(self.target, inputs, 2)
-        validate_same_dtype(self.target, [*inputs, output])
+        validate_same_dtype(self.target, [*inputs, output], ts)
 
         # Handle int8 (quantized) and int32
         supported_dtypes = [ts.DType.INT8, ts.DType.INT32]
@@ -226,7 +226,7 @@ def define_node(
         import serializer.tosa_serializer as ts  # type: ignore
 
         validate_num_inputs(self.target, inputs, 2)
-        validate_same_dtype(self.target, [*inputs, output])
+        validate_same_dtype(self.target, [*inputs, output], ts)
 
         if inputs[0].dtype in [ts.DType.INT8, ts.DType.INT32]:
             # Call the inherited define_node for handling integers
