pytorch
diff --git a/‎.ci/scripts/build_llama_android.sh
Lines changed: 0 additions & 3 deletions b/‎.ci/scripts/build_llama_android.sh
Lines changed: 0 additions & 3 deletions
diff --git a/‎.ci/scripts/test_llama.sh
Lines changed: 0 additions & 1 deletion b/‎.ci/scripts/test_llama.sh
Lines changed: 0 additions & 1 deletion
diff --git a/‎.ci/scripts/test_llava.sh
Lines changed: 2 additions & 5 deletions b/‎.ci/scripts/test_llava.sh
Lines changed: 2 additions & 5 deletions
diff --git a/‎.ci/scripts/test_model.sh
Lines changed: 0 additions & 4 deletions b/‎.ci/scripts/test_model.sh
Lines changed: 0 additions & 4 deletions
diff --git a/‎.ci/scripts/test_phi_3_mini.sh
Lines changed: 0 additions & 4 deletions b/‎.ci/scripts/test_phi_3_mini.sh
Lines changed: 0 additions & 4 deletions
diff --git a/‎.ci/scripts/test_qnn_static_llama.sh
Lines changed: 2 additions & 2 deletions b/‎.ci/scripts/test_qnn_static_llama.sh
Lines changed: 2 additions & 2 deletions
diff --git a/‎.ci/scripts/test_quantized_aot_lib.sh
Lines changed: 0 additions & 3 deletions b/‎.ci/scripts/test_quantized_aot_lib.sh
Lines changed: 0 additions & 3 deletions
diff --git a/‎.ci/scripts/utils.sh
Lines changed: 0 additions & 1 deletion b/‎.ci/scripts/utils.sh
Lines changed: 0 additions & 1 deletion
diff --git a/‎.github/workflows/trunk.yml
Lines changed: 1 addition & 3 deletions b/‎.github/workflows/trunk.yml
Lines changed: 1 addition & 3 deletions
diff --git a/‎CMakeLists.txt
Lines changed: 9 additions & 3 deletions b/‎CMakeLists.txt
Lines changed: 9 additions & 3 deletions
diff --git a/‎backends/apple/coreml/partition/coreml_partitioner.py
Lines changed: 15 additions & 1 deletion b/‎backends/apple/coreml/partition/coreml_partitioner.py
Lines changed: 15 additions & 1 deletion
diff --git a/‎backends/apple/coreml/test/test_coreml_partitioner.py
Lines changed: 46 additions & 0 deletions b/‎backends/apple/coreml/test/test_coreml_partitioner.py
Lines changed: 46 additions & 0 deletions
diff --git a/‎backends/arm/README.md
Lines changed: 2 additions & 2 deletions b/‎backends/arm/README.md
Lines changed: 2 additions & 2 deletions
diff --git a/‎backends/arm/TARGETS
Lines changed: 4 additions & 1 deletion b/‎backends/arm/TARGETS
Lines changed: 4 additions & 1 deletion
@@ -14,7 +14,6 @@ if [[ -z "${PYTHON_EXECUTABLE:-}" ]]; then
   PYTHON_EXECUTABLE=python3
 fi
 which "${PYTHON_EXECUTABLE}"
-CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
 
 install_executorch_and_backend_lib() {
   echo "Installing executorch and xnnpack backend"
@@ -28,7 +27,6 @@ install_executorch_and_backend_lib() {
     -DANDROID_ABI="${ANDROID_ABI}" \
     -DCMAKE_INSTALL_PREFIX=cmake-android-out \
     -DCMAKE_BUILD_TYPE=Release \
-    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
     -DEXECUTORCH_BUILD_EXTENSION_TENSOR=ON \
@@ -54,7 +52,6 @@ build_llama_runner() {
     -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
     -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
-    -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
     -Bcmake-android-out/examples/models/llama examples/models/llama
 
     cmake --build cmake-android-out/examples/models/llama -j4 --config Release
 
@@ -154,7 +154,6 @@ cmake_install_executorch_libraries() {
     rm -rf cmake-out
     retry cmake \
         -DCMAKE_INSTALL_PREFIX=cmake-out \
-        -DCMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')" \
         -DCMAKE_BUILD_TYPE="$CMAKE_BUILD_TYPE" \
         -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
         -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
 
@@ -31,7 +31,6 @@ NPROC=8
 if hash nproc &> /dev/null; then NPROC=$(nproc); fi
 
 python_lib=$($PYTHON_EXECUTABLE -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')
-CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
 EXECUTORCH_COMMON_CMAKE_ARGS="                      \
         -DCMAKE_INSTALL_PREFIX=${BUILD_DIR}         \
         -DCMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE}      \
@@ -48,7 +47,6 @@ EXECUTORCH_COMMON_CMAKE_ARGS="                      \
 cmake_install_executorch_libraries() {
     cmake                               \
         ${EXECUTORCH_COMMON_CMAKE_ARGS} \
-        "-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" \
         -B${BUILD_DIR} .
 
     cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE}
@@ -59,7 +57,6 @@ cmake_install_executorch_libraries_for_android() {
         -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
         -DANDROID_ABI=arm64-v8a                                                 \
         ${EXECUTORCH_COMMON_CMAKE_ARGS}                                         \
-        "-DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH}" \
         -B${BUILD_DIR} .
 
     cmake --build ${BUILD_DIR} -j${NPROC} --target install --config ${CMAKE_BUILD_TYPE}
@@ -80,7 +77,7 @@ cmake_build_llava_runner() {
 
     cmake                                 \
         ${LLAVA_COMMON_CMAKE_ARGS}        \
-        -DCMAKE_PREFIX_PATH="$python_lib;${CMAKE_PREFIX_PATH}" \
+        -DCMAKE_PREFIX_PATH="$python_lib" \
         -B${BUILD_DIR}/${dir}             \
         ${dir}
 
@@ -96,7 +93,7 @@ cmake_build_llava_runner_for_android() {
         -DCMAKE_TOOLCHAIN_FILE=$ANDROID_NDK/build/cmake/android.toolchain.cmake \
         -DANDROID_ABI=arm64-v8a                                                 \
         ${LLAVA_COMMON_CMAKE_ARGS}                                              \
-        -DCMAKE_PREFIX_PATH="$python_lib;${CMAKE_PREFIX_PATH}"                  \
+        -DCMAKE_PREFIX_PATH="$python_lib"                  \
         -DLLAVA_RUNNER_NO_TORCH_DUMMY_IMAGE=ON                                  \
         -B${BUILD_DIR}/${dir}                                                   \
         ${dir}
 
@@ -50,12 +50,10 @@ prepare_artifacts_upload() {
 
 build_cmake_executor_runner() {
   echo "Building executor_runner"
-  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
   rm -rf ${CMAKE_OUTPUT_DIR}
   cmake -DCMAKE_BUILD_TYPE=Debug \
       -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
       -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-      -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
       -B${CMAKE_OUTPUT_DIR} .
 
   cmake --build ${CMAKE_OUTPUT_DIR} -j4 --config Debug
@@ -100,14 +98,12 @@ test_model() {
 
 build_cmake_xnn_executor_runner() {
   echo "Building xnn_executor_runner"
-  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
 
   (rm -rf ${CMAKE_OUTPUT_DIR} \
     && mkdir ${CMAKE_OUTPUT_DIR} \
     && cd ${CMAKE_OUTPUT_DIR} \
     && retry cmake -DCMAKE_BUILD_TYPE=Release \
       -DEXECUTORCH_BUILD_XNNPACK=ON \
-      -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
       -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
 
   cmake --build ${CMAKE_OUTPUT_DIR} -j4
 
@@ -22,10 +22,8 @@ NPROC=8
 if hash nproc &> /dev/null; then NPROC=$(nproc); fi
 
 cmake_install_executorch_libraries() {
-  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
   cmake -DPYTHON_EXECUTABLE=python \
       -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
-      -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
       -DEXECUTORCH_ENABLE_LOGGING=1 \
       -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
       -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
@@ -41,10 +39,8 @@ cmake_install_executorch_libraries() {
 }
 
 cmake_build_phi_3_mini() {
-  CMAKE_PREFIX_PATH="$(python3 -c 'import torch as _; print(_.__path__[0])')"
   cmake -DPYTHON_EXECUTABLE=$PYTHON_EXECUTABLE \
       -DCMAKE_INSTALL_PREFIX=${BUILD_DIR} \
-      -DCMAKE_PREFIX_PATH="${CMAKE_PREFIX_PATH}" \
       -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \
       -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
       -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
 
@@ -34,11 +34,11 @@ $PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o to
 
 set +e
 # Compile only as weight sharing is not applicable on x86
-$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --llama_artifacts . --compile_only
 exit_code1=$?
 
 # Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
-$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --enable_x86_64
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts . --enable_x86_64
 exit_code2=$?
 
 # Check the exit codes and print messages
 
@@ -16,13 +16,10 @@ CMAKE_OUTPUT_DIR=cmake-out
 
 build_cmake_quantized_aot_lib() {
   echo "Building quantized aot lib"
-  SITE_PACKAGES="$(${PYTHON_EXECUTABLE} -c 'from distutils.sysconfig import get_python_lib; print(get_python_lib())')"
-  CMAKE_PREFIX_PATH="${SITE_PACKAGES}/torch"
   (rm -rf ${CMAKE_OUTPUT_DIR} \
     && mkdir ${CMAKE_OUTPUT_DIR} \
     && cd ${CMAKE_OUTPUT_DIR} \
     && retry cmake -DCMAKE_BUILD_TYPE=Release \
-      -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
       -DEXECUTORCH_BUILD_KERNELS_QUANTIZED_AOT=ON \
       -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
 
 
@@ -136,7 +136,6 @@ cmake_install_executorch_lib() {
   clean_executorch_install_folders
   retry cmake -DBUCK2="$BUCK" \
           -DCMAKE_INSTALL_PREFIX=cmake-out \
-          -DCMAKE_PREFIX_PATH="$($PYTHON_EXECUTABLE -c 'import torch as _; print(_.__path__[0])')" \
           -DCMAKE_BUILD_TYPE=Release \
           -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
           -Bcmake-out .
 
@@ -159,7 +159,7 @@ jobs:
         sudo sysctl fs.inotify.max_user_watches=1048576 # 1024 * 1024
 
         # Test ethos-u delegate examples with run.sh
-        backends/arm/test/test_arm_baremetal.sh test_run_ethosu_fvp
+        backends/arm/test/test_arm_baremetal.sh test_full_ethosu_fvp
 
 
   test-arm-reference-delegation:
@@ -394,7 +394,6 @@ jobs:
         rm -rf cmake-out
         cmake \
             -DCMAKE_INSTALL_PREFIX=cmake-out \
-            -DCMAKE_PREFIX_PATH="$(python -c 'import torch as _; print(_.__path__[0])')" \
             -DCMAKE_BUILD_TYPE=Release \
             -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
             -DEXECUTORCH_BUILD_EXTENSION_MODULE=ON \
@@ -412,7 +411,6 @@ jobs:
         cmake \
             -DCMAKE_INSTALL_PREFIX=cmake-out \
             -DCMAKE_BUILD_TYPE=Release \
-            -DCMAKE_PREFIX_PATH="$(python -c 'import torch as _; print(_.__path__[0])')" \
             -DEXECUTORCH_BUILD_KERNELS_CUSTOM=ON \
             -DEXECUTORCH_BUILD_KERNELS_OPTIMIZED=ON \
             -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
 
@@ -186,6 +186,10 @@ option(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR "Build the Flat Tensor extension"
        OFF
 )
 
+option(EXECUTORCH_BUILD_EXTENSION_LLM "Build the LLM extension"
+       OFF
+)
+
 option(EXECUTORCH_BUILD_EXTENSION_MODULE "Build the Module extension" OFF)
 
 option(EXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL "Build the Runner Util extension"
@@ -718,6 +722,10 @@ if(EXECUTORCH_BUILD_EXTENSION_FLAT_TENSOR)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/flat_tensor/serialize)
 endif()
 
+if(EXECUTORCH_BUILD_EXTENSION_LLM)
+  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/llm/tokenizer)
+endif()
+
 if(EXECUTORCH_BUILD_EXTENSION_MODULE)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/extension/module)
 endif()
@@ -753,9 +761,7 @@ if(EXECUTORCH_BUILD_PYBIND)
   endif()
 
   # find pytorch lib, to allow pybind to take at::Tensor as input/output
-  if(NOT TARGET torch)
-    find_package(Torch CONFIG REQUIRED)
-  endif()
+  find_package_torch()
   find_library(
     TORCH_PYTHON_LIBRARY torch_python PATHS "${TORCH_INSTALL_PREFIX}/lib"
   )
 
@@ -3,7 +3,7 @@
 # Please refer to the license found in the LICENSE file in the root directory of the source tree.
 
 import logging
-from typing import List, Optional
+from typing import Callable, List, Optional, Tuple
 
 import coremltools as ct
 
@@ -104,3 +104,17 @@ def partition(self, exported_program: ExportedProgram) -> PartitionResult:
         return PartitionResult(
             tagged_exported_program=exported_program, partition_tags=partition_tags
         )
+
+    def ops_to_not_decompose(
+        self, ep: ExportedProgram
+    ) -> Tuple[List[torch._ops.OpOverload], Optional[Callable[[torch.fx.Node], bool]]]:
+        do_not_decompose = []
+        op_support = OperatorsSupportedForCoreMLBackend()
+        for node in ep.graph.nodes:
+            if (
+                node.op == "call_function"
+                and isinstance(node.target, torch._ops.OpOverload)
+                and op_support.is_node_supported(None, node)
+            ):
+                do_not_decompose.append(node.target)
+        return do_not_decompose, None
@@ -13,6 +13,7 @@
 
 from executorch.backends.apple.coreml.compiler import CoreMLBackend
 from executorch.backends.apple.coreml.partition import CoreMLPartitioner
+from executorch.exir.backend.utils import format_delegated_graph
 
 
 class TestCoreMLPartitioner(unittest.TestCase):
@@ -79,6 +80,50 @@ def test_vit_skip_conv(self):
             "getitem",
         ]
 
+    def test_ops_to_not_decompose(self):
+        class Model(torch.nn.Module):
+            def forward(self, q, k, v, mask):
+                return torch.ops.aten.scaled_dot_product_attention.default(
+                    q, k, v, attn_mask=mask
+                )
+
+        model = Model()
+        model.eval()
+
+        batch_size = 1
+        n_heads = 12
+        seq_len = 1
+        max_seq_length = 32
+        embedding_dim = 16
+        q = torch.randn(batch_size, n_heads, seq_len, embedding_dim)
+        k = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim)
+        v = torch.randn(batch_size, n_heads, max_seq_length, embedding_dim)
+        mask = torch.randn(seq_len, max_seq_length)
+        example_inputs = (q, k, v, mask)
+        ep = torch.export.export(model, example_inputs)
+        coreml_partitioner = CoreMLPartitioner()
+
+        # Using to_edge_transform_and_lower, we expect SDPA will be preserved and show up in delegated graph
+        edge_program_manager = executorch.exir.to_edge_transform_and_lower(
+            ep, partitioner=[coreml_partitioner]
+        )
+        self.assertTrue(
+            "executorch.exir.dialects.edge._ops.aten.scaled_dot_product_attention.default"
+            in format_delegated_graph(
+                edge_program_manager.exported_program().graph_module
+            )
+        )
+
+        # Using to_edge flow, we expect SDPA will be decomposed and not show up in delegated graph
+        edge_program_manager2 = executorch.exir.to_edge(ep)
+        edge_program_manager2.to_backend(coreml_partitioner)
+        self.assertTrue(
+            "executorch.exir.dialects.edge._ops.aten.scaled_dot_product_attention.default"
+            not in format_delegated_graph(
+                edge_program_manager2.exported_program().graph_module
+            )
+        )
+
     def test_buffer(self):
         embedding_dim = 3
         max_seq_len = 2
@@ -129,4 +174,5 @@ def forward(self, q, k_val, input_pos):
     test_runner = TestCoreMLPartitioner()
     test_runner.test_add_sub_skip_mm()
     test_runner.test_vit_skip_conv()
+    test_runner.test_ops_to_not_decompose()
     test_runner.test_buffer()
@@ -55,10 +55,10 @@ To run the unit test suite with Corstone3x0 FVP simulator support use
 backends/arm/test/test_arm_baremetal.sh test_pytest_ethosu_fvp
 ```
 
-You can test to run some models with the run.sh flow
+You can test to run some models with the full fvp test flow
 
 ```
-backends/arm/test/test_arm_baremetal.sh test_run_ethosu_fvp
+backends/arm/test/test_arm_baremetal.sh test_full_ethosu_fvp
 ```
 
 ## Unit tests
 
@@ -4,7 +4,10 @@ load("@fbcode_macros//build_defs:python_library.bzl", "python_library")
 python_library(
     name = "arm_partitioner",
     srcs = [
-        "arm_partitioner.py",
+        "ethosu_backend.py",
+        "ethosu_partitioner.py",
+        "tosa_backend.py",
+        "tosa_partitioner.py",
     ],
     typing = True,
     deps = [