pytorch
diff --git a/‎.ci/scripts/build_llama_android.sh
Lines changed: 8 additions & 0 deletions b/‎.ci/scripts/build_llama_android.sh
Lines changed: 8 additions & 0 deletions
diff --git a/‎.ci/scripts/test_llama.sh
Lines changed: 1 addition & 0 deletions b/‎.ci/scripts/test_llama.sh
Lines changed: 1 addition & 0 deletions
diff --git a/‎.ci/scripts/test_llava.sh
Lines changed: 7 additions & 3 deletions b/‎.ci/scripts/test_llava.sh
Lines changed: 7 additions & 3 deletions
diff --git a/‎.ci/scripts/test_model.sh
Lines changed: 3 additions & 2 deletions b/‎.ci/scripts/test_model.sh
Lines changed: 3 additions & 2 deletions
diff --git a/‎.ci/scripts/test_phi_3_mini.sh
Lines changed: 4 additions & 0 deletions b/‎.ci/scripts/test_phi_3_mini.sh
Lines changed: 4 additions & 0 deletions
diff --git a/‎.ci/scripts/utils.sh
Lines changed: 1 addition & 0 deletions b/‎.ci/scripts/utils.sh
Lines changed: 1 addition & 0 deletions
diff --git a/‎.github/workflows/android-perf.yml
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/android-perf.yml
Lines changed: 2 additions & 0 deletions
diff --git a/‎.github/workflows/apple-perf.yml
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/apple-perf.yml
Lines changed: 2 additions & 0 deletions
diff --git a/‎.github/workflows/pull.yml
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/pull.yml
Lines changed: 2 additions & 0 deletions
diff --git a/‎.github/workflows/trunk.yml
Lines changed: 2 additions & 0 deletions b/‎.github/workflows/trunk.yml
Lines changed: 2 additions & 0 deletions
diff --git a/‎CMakeLists.txt
Lines changed: 6 additions & 2 deletions b/‎CMakeLists.txt
Lines changed: 6 additions & 2 deletions
diff --git a/‎backends/arm/test/models/test_dl3_arm.py
Lines changed: 92 additions & 0 deletions b/‎backends/arm/test/models/test_dl3_arm.py
Lines changed: 92 additions & 0 deletions
diff --git a/‎backends/cadence/aot/export_example.py
Lines changed: 3 additions & 33 deletions b/‎backends/cadence/aot/export_example.py
Lines changed: 3 additions & 33 deletions
diff --git a/‎backends/cadence/aot/ops_registrations.py
Lines changed: 1 addition & 1 deletion b/‎backends/cadence/aot/ops_registrations.py
Lines changed: 1 addition & 1 deletion
@@ -10,6 +10,12 @@ set -exu
 # shellcheck source=/dev/null
 source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 
+if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then  # caller did not choose an interpreter
+  PYTHON_EXECUTABLE=python3  # fall back to python3
+fi
+which "${PYTHON_EXECUTABLE}"  # print the path of the interpreter that will be used
+CMAKE_PREFIX_PATH="$("${PYTHON_EXECUTABLE}" -c 'import torch as _; print(_.__path__[0])')"  # torch package dir, taken from that same interpreter
+
 install_executorch_and_backend_lib() {
   echo "Installing executorch and xnnpack backend"
   clean_executorch_install_folders
@@ -22,6 +28,7 @@ install_executorch_and_backend_lib() {
     -DANDROID_ABI="${ANDROID_ABI}" \
     -DCMAKE_INSTALL_PREFIX=cmake-android-out \
     -DCMAKE_BUILD_TYPE=Release \
+    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
@@ -47,6 +54,7 @@ build_llama_runner() {
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
+    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -Bcmake-android-out/examples/models/llama examples/models/llama
 
     cmake --build cmake-android-out/examples/models/llama -j4 --config Release
 
@@ -154,6 +154,7 @@ cmake_install_executorch_libraries() {
     rm -rf cmake-out
     retry cmake \
         -DCMAKE_INSTALL_PREFIX=cmake-out \
+        -DCMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" \
         -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
         -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
         -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
 
@@ -30,9 +30,11 @@ fi
 NPROC=8
 if hash nproc &> /dev/null; then NPROC=$(nproc); fi
 
+python_lib=$($PYTHON_EXECUTABLE -c 'import sysconfig; print(sysconfig.get_path("purelib"))')  # site-packages dir; sysconfig replaces distutils, which is gone from the stdlib in Python 3.12+
+CMAKE_PREFIX_PATH="$($PYTHON_EXECUTABLE -c 'import torch as _; print(_.__path__[0])')"  # torch package dir, queried from the same interpreter as python_lib
 EXECUTORCH_COMMON_CMAKE_ARGS="                      \
         -DCMAKE_INSTALL_PREFIX=${BUILD_DIR}         \
-        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}            \
+        -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}      \
         -DEXECUTORCH_ENABLE_LOGGING=ON              \
         -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON      \
         -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
@@ -46,6 +48,7 @@ EXECUTORCH_COMMON_CMAKE_ARGS="                      \
 cmake_install_executorch_libraries() {
     cmake                               \
         ${EXECUTORCH_COMMON_CMAKE_ARGS} \
+        "-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" \
         -B${BUILD_DIR} .
 
     cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE}
@@ -56,6 +59,7 @@ cmake_install_executorch_libraries_for_android() {
         -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
         -DANDROID_ABI=arm64-v8a                                                 \
         ${EXECUTORCH_COMMON_CMAKE_ARGS}                                         \
+        "-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" \
         -B${BUILD_DIR} .
 
     cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE}
@@ -76,7 +80,7 @@ cmake_build_llava_runner() {
 
     cmake                                 \
         ${LLAVA_COMMON_CMAKE_ARGS}        \
-        -DCMAKE_PREFIX_PATH="$python_lib" \
+        -DCMAKE_PREFIX_PATH="$python_lib;${CMAKE_PREFIX_PATH}" \
         -B${BUILD_DIR}/${dir}             \
         ${dir}
 
@@ -92,7 +96,7 @@ cmake_build_llava_runner_for_android() {
         -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
         -DANDROID_ABI=arm64-v8a                                                 \
         ${LLAVA_COMMON_CMAKE_ARGS}                                              \
-        -DCMAKE_PREFIX_PATH="$python_lib"                                       \
+        -DCMAKE_PREFIX_PATH="$python_lib;${CMAKE_PREFIX_PATH}"                  \
         -DLLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE=ON                                  \
         -B${BUILD_DIR}/${dir}                                                   \
         ${dir}
 
@@ -50,10 +50,12 @@ prepare_artifacts_upload() {
 
 build_cmake_executor_runner() {
   echo "Building executor_runner"
+  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
   rm -rf ${CMAKE_OUTPUT_DIR}
   cmake -DCMAKE_BUILD_TYPE=Debug \
       -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
       -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
+      -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
       -B${CMAKE_OUTPUT_DIR} .
 
   cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
@@ -98,8 +100,7 @@ test_model() {
 
 build_cmake_xnn_executor_runner() {
   echo "Building xnn_executor_runner"
-  SITE_PACKAGES="$(${PYTHON_EXECUTABLE} -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
-  CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch"
+  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
 
   (rm -rf ${CMAKE_OUTPUT_DIR} \
     && mkdir ${CMAKE_OUTPUT_DIR} \
 
@@ -22,8 +22,10 @@ NPROC=8
 if hash nproc &> /dev/null; then NPROC=$(nproc); fi
 
 cmake_install_executorch_libraries() {
+  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
   cmake -DPYTHON_EXECUTABLE=python \
       -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
+      -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
       -DEXECUTORCH_ENABLE_LOGGING=1 \
       -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
       -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
@@ -39,8 +41,10 @@ cmake_install_executorch_libraries() {
 }
 
 cmake_build_phi_3_mini() {
+  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
   cmake -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
       -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
+      -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
       -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
       -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
       -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
 
@@ -136,6 +136,7 @@ cmake_install_executorch_lib() {
   clean_executorch_install_folders
   retry cmake -DBUCK2="$BUCK" \
           -DCMAKE_INSTALL_PREFIX=cmake-out \
+          -DCMAKE_PREFIX_PATH="$($PYTHON_EXECUTABLE -c 'import torch as _; print(_.__path__[0])')" \
           -DCMAKE_BUILD_TYPE=Release \
           -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
           -Bcmake-out .
 
@@ -222,6 +222,7 @@ jobs:
                       --preq_mode 8da4w_output_8da8w \
                       --preq_group_size 32 \
                       --max_seq_length 2048 \
+                      --max_context_length 2048 \
                       --output_name "${OUT_ET_MODEL_NAME}.pte" \
                       -kv \
                       -d fp32 \
@@ -253,6 +254,7 @@ jobs:
                       --xnnpack-extended-ops \
                       -d fp32 \
                       --max_seq_length 2048 \
+                      --max_context_length 2048 \
                       --output_name "${OUT_ET_MODEL_NAME}.pte" \
                       --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
                     ls -lh "${OUT_ET_MODEL_NAME}.pte"
 
@@ -233,6 +233,7 @@ jobs:
                 --preq_mode 8da4w_output_8da8w \
                 --preq_group_size 32 \
                 --max_seq_length 2048 \
+                --max_context_length 2048 \
                 --output_name "${OUT_ET_MODEL_NAME}.pte" \
                 -kv \
                 -d fp32 \
@@ -264,6 +265,7 @@ jobs:
                 --xnnpack-extended-ops \
                 -d fp32 \
                 --max_seq_length 2048 \
+                --max_context_length 2048 \
                 --output_name "${OUT_ET_MODEL_NAME}.pte" \
                 --metadata '{"get_bos_id":128000, "get_eos_ids":[128009, 128001]}'
               ls -lh "${OUT_ET_MODEL_NAME}.pte"
 
@@ -147,6 +147,8 @@ jobs:
         CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]")
         conda activate "${CONDA_ENV}"
 
+        source .ci/scripts/utils.sh
+        install_executorch "use-pt-pinned-commit"
         BUILD_TOOL="cmake"
         PYTHON_EXECUTABLE=python \
         bash .ci/scripts/build_llama_android.sh  "${BUILD_TOOL}"
 
@@ -394,6 +394,7 @@ jobs:
         rm -rf cmake-out
         cmake \
             -DCMAKE_INSTALL_PREFIX=cmake-out \
+            -DCMAKE_PREFIX_PATH="$(python -c 'import torch as _; print(_.__path__[0])')" \
             -DCMAKE_BUILD_TYPE=Release \
             -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
             -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
@@ -411,6 +412,7 @@ jobs:
         cmake \
             -DCMAKE_INSTALL_PREFIX=cmake-out \
             -DCMAKE_BUILD_TYPE=Release \
+            -DCMAKE_PREFIX_PATH="$(python -c 'import torch as _; print(_.__path__[0])')" \
             -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
             -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
             -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
 
@@ -596,7 +596,7 @@ endif()
 # any backends.
 #
 add_library(executorch ${_executorch__srcs})
-target_link_libraries(executorch PRIVATE executorch_core)
+target_link_libraries(executorch PUBLIC executorch_core)
 target_include_directories(executorch PUBLIC ${_common_include_directories})
 target_compile_definitions(executorch PUBLIC C10_USING_CUSTOM_GENERATED_MACROS)
 target_compile_options(executorch PUBLIC ${_common_compile_options})
@@ -614,6 +614,8 @@ if(BUILD_EXECUTORCH_PORTABLE_OPS)
 endif()
 
 if(EXECUTORCH_BUILD_KERNELS_OPTIMIZED)
+  # find pytorch lib here to make it available to all sub-directories
+  find_package_torch_headers()
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
 endif()
 
@@ -750,7 +752,9 @@ if(EXECUTORCH_BUILD_PYBIND)
   endif()
 
   # find pytorch lib, to allow pybind to take at::Tensor as input/output
-  find_package(Torch CONFIG REQUIRED)
+  if(NOT TARGET torch)
+    find_package(Torch CONFIG REQUIRED)
+  endif()
   find_library(
     TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib"
   )
 
@@ -0,0 +1,92 @@
+# Copyright 2025 Arm Limited and/or its affiliates.
+
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+import unittest
+
+import pytest
+
+from executorch.backends.arm.test import common, conftest
+
+from executorch.backends.arm.test.tester.arm_tester import ArmTester
+from executorch.examples.models import deeplab_v3
+
+
+class TestDl3(unittest.TestCase):
+    """Tests the DeepLabV3 ResNet-50 example model with the TOSA MI/BI, U55 and U85 compile specs."""
+
+    dl3 = deeplab_v3.DeepLabV3ResNet50Model()  # built once, when the class body executes
+    model_inputs = dl3.get_example_inputs()  # example inputs shared by every test below
+    dl3 = dl3.get_eager_model()  # rebinds dl3 to the object returned by get_eager_model()
+
+    @unittest.expectedFailure  # test is currently expected to fail
+    def test_dl3_tosa_MI(self):  # "TOSA-0.80+MI" spec, no quantize() stage
+        (
+            ArmTester(
+                self.dl3,
+                example_inputs=self.model_inputs,
+                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+MI"),
+            )
+            .export()
+            .to_edge_transform_and_lower()
+            .to_executorch()
+            .run_method_and_compare_outputs(self.model_inputs)
+        )
+
+    @unittest.expectedFailure  # test is currently expected to fail
+    def test_dl3_tosa_BI(self):  # "TOSA-0.80+BI" spec, with a quantize() stage
+        (
+            ArmTester(
+                self.dl3,
+                example_inputs=self.model_inputs,
+                compile_spec=common.get_tosa_compile_spec("TOSA-0.80+BI"),
+            )
+            .quantize()
+            .export()
+            .to_edge_transform_and_lower()
+            .to_executorch()
+            .run_method_and_compare_outputs(atol=1.0, qtol=1, inputs=self.model_inputs)
+        )
+
+    @pytest.mark.slow
+    @pytest.mark.corstone_fvp
+    @unittest.skip  # bare decorator (no reason string): the test is always skipped
+    def test_dl3_u55_BI(self):  # U55 compile spec, quantized, serialized
+        tester = (
+            ArmTester(
+                self.dl3,
+                example_inputs=self.model_inputs,
+                compile_spec=common.get_u55_compile_spec(),
+            )
+            .quantize()
+            .export()
+            .to_edge_transform_and_lower()
+            .to_executorch()
+            .serialize()
+        )
+        if conftest.is_option_enabled("corstone_fvp"):  # outputs are compared only when this option is enabled
+            tester.run_method_and_compare_outputs(
+                atol=1.0, qtol=1, inputs=self.model_inputs
+            )
+
+    @pytest.mark.slow
+    @pytest.mark.corstone_fvp
+    @unittest.skip  # bare decorator (no reason string): the test is always skipped
+    def test_dl3_u85_BI(self):  # U85 compile spec, quantized, serialized
+        tester = (
+            ArmTester(
+                self.dl3,
+                example_inputs=self.model_inputs,
+                compile_spec=common.get_u85_compile_spec(),
+            )
+            .quantize()
+            .export()
+            .to_edge_transform_and_lower()
+            .to_executorch()
+            .serialize()
+        )
+        if conftest.is_option_enabled("corstone_fvp"):  # outputs are compared only when this option is enabled
+            tester.run_method_and_compare_outputs(
+                atol=1.0, qtol=1, inputs=self.model_inputs
+            )
@@ -6,11 +6,11 @@
 
 # Example script for exporting simple models to flatbuffer
 
+# pyre-unsafe
+
 import logging
 import tempfile
 
-import torch
-
 from executorch.backends.cadence.aot.ops_registrations import *  # noqa
 from typing import Any, Tuple
 
@@ -23,38 +23,15 @@
 from executorch.backends.cadence.aot.quantizer.quantizer import CadenceDefaultQuantizer
 from executorch.backends.cadence.runtime import runtime
 from executorch.backends.cadence.runtime.executor import BundledProgramManager
-from executorch.backends.xnnpack.quantizer.xnnpack_quantizer_utils import (
-    QuantizationConfig,
-    QuantizationSpec,
-)
 from executorch.exir import ExecutorchProgramManager
 from torch import nn
-from torch.ao.quantization.observer import HistogramObserver, MinMaxObserver
 
 from .utils import save_bpte_program, save_pte_program
 
 
 FORMAT = "[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s"
 logging.basicConfig(level=logging.INFO, format=FORMAT)
 
-act_qspec = QuantizationSpec(
-    dtype=torch.int8,
-    quant_min=-128,
-    quant_max=127,
-    qscheme=torch.per_tensor_affine,
-    is_dynamic=False,
-    observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12),
-)
-
-wgt_qspec = QuantizationSpec(
-    dtype=torch.int8,
-    quant_min=-128,
-    quant_max=127,
-    qscheme=torch.per_tensor_affine,
-    is_dynamic=False,
-    observer_or_fake_quant_ctr=MinMaxObserver,
-)
-
 
 def export_model(
     model: nn.Module,
@@ -66,15 +43,8 @@ def export_model(
     working_dir = tempfile.mkdtemp(dir="/tmp")
     logging.debug(f"Created work directory {working_dir}")
 
-    qconfig = QuantizationConfig(
-        act_qspec,
-        act_qspec,
-        wgt_qspec,
-        None,
-    )
-
     # Instantiate the quantizer
-    quantizer = CadenceDefaultQuantizer(qconfig)
+    quantizer = CadenceDefaultQuantizer()
 
     # Convert the model
     converted_model = convert_pt2(model, example_inputs, quantizer)
 
@@ -576,7 +576,7 @@ def quantized_relu_per_tensor_meta(
     out_multiplier: int,
     out_shift: int,
 ) -> torch.Tensor:
-    return input.new_empty(input.size(), dtype=torch.uint8)
+    return input.new_empty(input.size(), dtype=input.dtype)
 
 
 @register_fake("cadence::fully_connected")