Skip to content

Commit 5fb6cdb

Browse files
committed
[llm] Use new API to register custom ops for llama model
Summary: Test Plan: Reviewers: Subscribers: Tasks: Tags:
1 parent 399482c commit 5fb6cdb

File tree

18 files changed

+283
-361
lines changed

18 files changed

+283
-361
lines changed

.ci/scripts/test_llama.sh

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,8 @@ echo "Exporting ${EXPORTED_MODEL_NAME}"
119119
EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME}"
120120
if [[ "${MODE}" == "xnnpack" ]]; then
121121
EXPORT_ARGS="${EXPORT_ARGS} --pt2e_quantize xnnpack_dynamic"
122+
elif [[ "${MODE}" == "xnnpack+kv+custom" ]]; then
123+
EXPORT_ARGS="${EXPORT_ARGS} --pt2e_quantize xnnpack_dynamic --use_sdpa_with_kv_cache -kv"
122124
fi
123125
$PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS}
124126

.github/workflows/pull.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ jobs:
9090
matrix:
9191
dtype: [fp32]
9292
build-tool: [buck2, cmake]
93-
mode: [portable, xnnpack]
93+
mode: [portable, xnnpack, xnnpack+kv+custom]
9494
fail-fast: false
9595
with:
9696
runner: linux.2xlarge

CMakeLists.txt

Lines changed: 27 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -175,8 +175,9 @@ option(EXECUTORCH_BUILD_VULKAN "Build the Vulkan backend" OFF)
175175
#
176176
# pthreadpool: build pthreadpool library. Disable on unsupported platforms
177177
#
178-
cmake_dependent_option(EXECUTORCH_BUILD_PTHREADPOOL "Build pthreadpool library."
179-
ON "NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF)
178+
cmake_dependent_option(
179+
EXECUTORCH_BUILD_PTHREADPOOL "Build pthreadpool library." ON
180+
"NOT EXECUTORCH_BUILD_ARM_BAREMETAL" OFF)
180181

181182
#
182183
# cpuinfo: build cpuinfo library. Disable on unsupported platforms
@@ -499,25 +500,38 @@ if(EXECUTORCH_BUILD_PYBIND)
499500
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/sdk)
500501
endif()
501502

503+
# find pytorch lib, to allow pybind to take at::Tensor as input/output
504+
find_package(Torch CONFIG REQUIRED)
505+
find_library(TORCH_PYTHON_LIBRARY torch_python
506+
PATHS "${TORCH_INSTALL_PREFIX}/lib")
507+
508+
set(_dep_libs
509+
${TORCH_PYTHON_LIBRARY}
510+
bundled_program
511+
etdump
512+
executorch
513+
extension_data_loader
514+
portable_ops_lib
515+
util
516+
torch)
517+
502518
if(EXECUTORCH_BUILD_COREML)
503-
set(PYBIND_LINK_COREML "coremldelegate")
519+
list(APPEND _dep_libs coremldelegate)
504520
endif()
505521

506522
if(EXECUTORCH_BUILD_MPS)
507-
set(PYBIND_LINK_MPS "mpsdelegate")
523+
list(APPEND _dep_libs mpsdelegate)
508524
endif()
509525

510526
if(EXECUTORCH_BUILD_XNNPACK)
511-
# need to explicitly specify XNNPACK here
512-
# otherwise uses XNNPACK symbols from libtorch_cpu
513-
set(PYBIND_LINK_XNNPACK xnnpack_backend XNNPACK)
527+
# need to explicitly specify XNNPACK here otherwise uses XNNPACK symbols
528+
# from libtorch_cpu
529+
list(APPEND _dep_libs xnnpack_backend XNNPACK)
514530
endif()
515531

516-
# find pytorch lib, to allow pybind to take at::Tensor as input/output
517-
find_package(Torch CONFIG REQUIRED)
518-
find_library(TORCH_PYTHON_LIBRARY torch_python
519-
PATHS "${TORCH_INSTALL_PREFIX}/lib")
520-
532+
if(EXECUTORCH_BUILD_CUSTOM)
533+
list(APPEND _dep_libs custom_ops custom_ops_aot_lib)
534+
endif()
521535
# compile options for pybind
522536

523537
set(_pybind_compile_options -Wno-deprecated-declarations -fPIC -frtti
@@ -539,19 +553,7 @@ if(EXECUTORCH_BUILD_PYBIND)
539553
PUBLIC EXECUTORCH_PYTHON_MODULE_NAME=portable_lib)
540554
target_include_directories(portable_lib PRIVATE ${TORCH_INCLUDE_DIRS})
541555
target_compile_options(portable_lib PUBLIC ${_pybind_compile_options})
542-
target_link_libraries(
543-
portable_lib
544-
PUBLIC ${TORCH_PYTHON_LIBRARY}
545-
bundled_program
546-
etdump
547-
executorch
548-
extension_data_loader
549-
portable_ops_lib
550-
util
551-
torch
552-
${PYBIND_LINK_COREML}
553-
${PYBIND_LINK_MPS}
554-
${PYBIND_LINK_XNNPACK})
556+
target_link_libraries(portable_lib PUBLIC ${_dep_libs})
555557

556558
install(TARGETS portable_lib
557559
LIBRARY DESTINATION executorch/extension/pybindings)

backends/xnnpack/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ target_include_directories(
7272
xnnpack_schema INTERFACE ${_xnnpack_schema__include_dir}
7373
${EXECUTORCH_ROOT}/third-party/flatbuffers/include)
7474

75+
target_compile_options(pthreadpool PUBLIC ${_common_compile_options})
7576
set(xnnpack_third_party pthreadpool cpuinfo)
7677

7778
include(cmake/Dependencies.cmake)

examples/models/llama2/CMakeLists.txt

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -49,22 +49,24 @@ set(_common_compile_options -Wno-deprecated-declarations -fPIC)
4949
# Let files say "include <executorch/path/to/header.h>".
5050
set(_common_include_directories ${EXECUTORCH_ROOT}/..)
5151

52-
# For some reason android build is not able to find where gflags is
53-
# and hence cannot find corresponding .cmake file
52+
# For some reason android build is not able to find where gflags is and hence
53+
# cannot find corresponding .cmake file
5454
set(gflags_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../third-party/gflags)
5555
find_package(gflags REQUIRED)
5656

5757
#
5858
# llama_main: test binary to run llama, with tokenizer and sampler integrated
5959
#
60-
add_executable(llama_main main.cpp
61-
${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/threadpool/cpuinfo_utils.cpp)
60+
add_executable(
61+
llama_main
62+
main.cpp
63+
${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/threadpool/cpuinfo_utils.cpp
64+
)
6265
if(CMAKE_BUILD_TYPE EQUAL "RELEASE")
6366
target_link_options(llama_main PRIVATE "LINKER:--gc-sections")
6467
endif()
6568

66-
# find `executorch` libraries
67-
# Same as for gflags
69+
# find `executorch` libraries Same as for gflags
6870
set(executorch_DIR ${CMAKE_CURRENT_BINARY_DIR}/../../../lib/cmake/ExecuTorch)
6971
find_package(executorch CONFIG REQUIRED)
7072
if(CMAKE_TOOLCHAIN_IOS OR ANDROID)
@@ -77,23 +79,35 @@ add_subdirectory(custom_ops)
7779
# llama_runner library
7880
add_subdirectory(runner)
7981

80-
target_include_directories(llama_main PUBLIC
81-
${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/third-party/cpuinfo/include)
82-
target_include_directories(llama_main PUBLIC
83-
${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/third-party/pthreadpool/include)
82+
target_include_directories(
83+
llama_main
84+
PUBLIC
85+
${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/third-party/cpuinfo/include
86+
)
87+
target_include_directories(
88+
llama_main
89+
PUBLIC
90+
${CMAKE_CURRENT_SOURCE_DIR}/../../../backends/xnnpack/third-party/pthreadpool/include
91+
)
8492

8593
set(link_libraries)
8694

8795
if(EXECUTORCH_BUILD_OPTIMIZED)
88-
list(APPEND link_libraries optimized_native_cpu_ops_lib optimized_kernels
89-
portable_kernels cpublas eigen_blas)
96+
list(
97+
APPEND
98+
link_libraries
99+
optimized_native_cpu_ops_lib
100+
optimized_kernels
101+
portable_kernels
102+
cpublas
103+
eigen_blas)
90104
target_link_options_shared_lib(optimized_native_cpu_ops_lib)
91105
else()
92106
list(APPEND link_libraries portable_ops_lib portable_kernels)
93107
target_link_options_shared_lib(portable_ops_lib)
94108
endif()
95109

96-
target_link_libraries(llama_main PUBLIC gflags llama_runner custom_ops_lib)
110+
target_link_libraries(llama_main PUBLIC gflags llama_runner custom_ops)
97111

98112
# XNNPACK pthreadpool cpuinfo
99113
if(TARGET xnnpack_backend)
@@ -114,14 +128,13 @@ if(TARGET qnn_executorch_backend)
114128
target_link_options_shared_lib(qnn_executorch_backend)
115129
endif()
116130

117-
# This one is needed for cpuinfo where it uses android
118-
# specific log lib
131+
# This one is needed for cpuinfo where it uses android specific log lib
119132
if(ANDROID)
120133
list(APPEND link_libraries log)
121134
endif()
122135

123136
target_compile_options(llama_main PUBLIC ${_common_compile_options}
124-
-DET_USE_THREADPOOL)
137+
-DET_USE_THREADPOOL)
125138
target_link_libraries(llama_main PUBLIC ${link_libraries})
126139

127140
if(APPLE)

examples/models/llama2/TARGETS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ runtime.python_binary(
5252
main_module = "executorch.examples.models.llama2.export_llama",
5353
# visibility = ["//executorch/examples/..."],
5454
preload_deps = [
55+
"//executorch/examples/models/llama2/custom_ops:custom_ops_aot_lib",
5556
"//executorch/kernels/quantized:aot_lib",
5657
],
5758
deps = [

examples/models/llama2/custom_ops/CMakeLists.txt

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -44,21 +44,12 @@ include(${EXECUTORCH_SRCS_FILE})
4444
set(_common_include_directories ${EXECUTORCH_ROOT}/..)
4545

4646
# Custom op libraries
47-
set(custom_ops_libs extension_module)
47+
set(custom_ops_libs extension_module executorch)
4848
list(APPEND custom_ops_libs pthreadpool)
4949
list(APPEND custom_ops_libs cpuinfo)
5050
list(APPEND custom_ops_libs cpublas)
5151
list(APPEND custom_ops_libs eigen_blas)
5252

53-
# Generate C++ bindings to register kernels into both PyTorch (for AOT) and
54-
# Executorch (for runtime). Here select all ops in optimized.yaml
55-
set(_yaml "${CMAKE_CURRENT_LIST_DIR}/custom_ops.yaml")
56-
gen_selected_ops("${_yaml}" "" "")
57-
58-
generate_bindings_for_kernels(FUNCTIONS_YAML
59-
${CMAKE_CURRENT_SOURCE_DIR}/custom_ops.yaml)
60-
message("Generated files ${gen_command_sources}")
61-
6253
list(TRANSFORM _custom_ops__srcs PREPEND "${EXECUTORCH_ROOT}/")
6354

6455
# TODO: Consider moving xnnpack/threadpool in a separate lib since it's now used
@@ -82,7 +73,14 @@ target_link_libraries(custom_ops PUBLIC ${custom_ops_libs})
8273
target_compile_options(custom_ops PUBLIC ${_common_compile_options}
8374
-DET_USE_THREADPOOL)
8475

85-
# Build a library for _custom_ops_srcs
86-
#
87-
# custom_ops_lib: Register optimized ops kernels into Executorch runtime
88-
gen_operators_lib("custom_ops_lib" KERNEL_LIBS custom_ops DEPS executorch)
76+
# Add a AOT library
77+
find_package(Torch CONFIG REQUIRED)
78+
add_library(custom_ops_aot_lib SHARED
79+
${CMAKE_CURRENT_SOURCE_DIR}/op_sdpa_aot.cpp)
80+
target_include_directories(custom_ops_aot_lib
81+
PUBLIC "${_common_include_directories}")
82+
target_include_directories(
83+
custom_ops_aot_lib PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/../../../../include")
84+
target_link_libraries(custom_ops_aot_lib PUBLIC custom_ops torch)
85+
86+
install(TARGETS custom_ops custom_ops_aot_lib DESTINATION lib)

examples/models/llama2/custom_ops/custom_ops.yaml

Lines changed: 0 additions & 14 deletions
This file was deleted.

examples/models/llama2/custom_ops/op_sdpa.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,7 @@
55
* This source code is licensed under the BSD-style license found in the
66
* LICENSE file in the root directory of this source tree.
77
*/
8-
9-
#include <executorch/runtime/kernel/kernel_includes.h>
8+
#include <executorch/examples/models/llama2/custom_ops/op_sdpa.h>
109

1110
#include <executorch/kernels/optimized/blas/CPUBlas.h>
1211
#include <executorch/kernels/optimized/vec/functional.h>
@@ -22,6 +21,7 @@
2221
#include <executorch/backends/xnnpack/threadpool/threadpool.h>
2322
#include <executorch/extension/parallel/thread_parallel.h>
2423
#endif
24+
#include <executorch/extension/kernel_util/make_boxed_from_unboxed_functor.h>
2525

2626
namespace torch {
2727
namespace executor {
@@ -843,3 +843,8 @@ Tensor& sdpa_with_kv_cache_out(
843843
} // namespace native
844844
} // namespace executor
845845
} // namespace torch
846+
847+
EXECUTORCH_LIBRARY(
848+
llama,
849+
"sdpa_with_kv_cache.out",
850+
torch::executor::native::sdpa_with_kv_cache_out);
examples/models/llama2/custom_ops/op_sdpa.h

Lines changed: 45 additions & 0 deletions
This file was added.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
#include <executorch/runtime/kernel/kernel_includes.h>
9+
10+
namespace torch {
11+
namespace executor {
12+
13+
namespace native {
14+
15+
Tensor& sdpa_with_kv_cache_out(
16+
RuntimeContext& ctx,
17+
const Tensor& q_projected,
18+
const Tensor& k_projected,
19+
const Tensor& v_projected,
20+
Tensor& key_cache,
21+
Tensor& value_cache,
22+
const int64_t start_pos,
23+
const int64_t seq_len,
24+
const optional<Tensor>& attn_mask,
25+
const double dropout_p,
26+
const bool is_causal,
27+
// @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy
28+
const optional<double> scale,
29+
Tensor& output);
30+
31+
Tensor& flash_attention_kernel_out(
32+
RuntimeContext& ctx,
33+
const Tensor& query,
34+
const Tensor& key,
35+
const Tensor& value,
36+
const optional<Tensor>& attn_mask,
37+
const double dropout_p,
38+
const bool is_causal,
39+
// @lint-ignore CLANGTIDY facebook-hte-ParameterMightThrowOnCopy
40+
const optional<double> scale,
41+
Tensor& output);
42+
43+
} // namespace native
44+
} // namespace executor
45+
} // namespace torch

0 commit comments

Comments (0)