pytorch
diff --git a/‎CMakeLists.txt
Lines changed: 4 additions & 1 deletion b/‎CMakeLists.txt
Lines changed: 4 additions & 1 deletion
diff --git a/‎build/Codegen.cmake
Lines changed: 52 additions & 11 deletions b/‎build/Codegen.cmake
Lines changed: 52 additions & 11 deletions
diff --git a/‎build/cmake_deps.toml
Lines changed: 16 additions & 0 deletions b/‎build/cmake_deps.toml
Lines changed: 16 additions & 0 deletions
diff --git a/‎codegen/tools/merge_yaml.py
Lines changed: 8 additions & 2 deletions b/‎codegen/tools/merge_yaml.py
Lines changed: 8 additions & 2 deletions
diff --git a/‎configurations/CMakeLists.txt
Lines changed: 51 additions & 0 deletions b/‎configurations/CMakeLists.txt
Lines changed: 51 additions & 0 deletions
diff --git a/‎docs/source/kernel-library-custom-aten-kernel.md
Lines changed: 3 additions & 3 deletions b/‎docs/source/kernel-library-custom-aten-kernel.md
Lines changed: 3 additions & 3 deletions
diff --git a/‎examples/apple/mps/CMakeLists.txt
Lines changed: 5 additions & 2 deletions b/‎examples/apple/mps/CMakeLists.txt
Lines changed: 5 additions & 2 deletions
diff --git a/‎examples/arm/CMakeLists.txt
Lines changed: 4 additions & 2 deletions b/‎examples/arm/CMakeLists.txt
Lines changed: 4 additions & 2 deletions
diff --git a/‎examples/portable/custom_ops/CMakeLists.txt
Lines changed: 6 additions & 2 deletions b/‎examples/portable/custom_ops/CMakeLists.txt
Lines changed: 6 additions & 2 deletions
diff --git a/‎examples/qualcomm/CMakeLists.txt
Lines changed: 4 additions & 2 deletions b/‎examples/qualcomm/CMakeLists.txt
Lines changed: 4 additions & 2 deletions
diff --git a/‎examples/sdk/CMakeLists.txt
Lines changed: 5 additions & 2 deletions b/‎examples/sdk/CMakeLists.txt
Lines changed: 5 additions & 2 deletions
diff --git a/‎examples/selective_build/CMakeLists.txt
Lines changed: 7 additions & 3 deletions b/‎examples/selective_build/CMakeLists.txt
Lines changed: 7 additions & 3 deletions
diff --git a/‎examples/xtensa/ops/CMakeLists.txt
Lines changed: 6 additions & 2 deletions b/‎examples/xtensa/ops/CMakeLists.txt
Lines changed: 6 additions & 2 deletions
@@ -309,6 +309,9 @@ endif()
 # operators necessary for the models that will run.
 #
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
+
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/configurations)
 
 #
 # gflags: Commandline flag host library.
@@ -336,7 +339,7 @@ cmake_dependent_option(
   EXECUTORCH_BUILD_HOST_TARGETS OFF)
 if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
   # Baseline libraries that executor_runner will link against.
-  set(_executor_runner_libs executorch portable_ops_lib gflags)
+  set(_executor_runner_libs executorch optimized_native_cpu_ops_lib gflags)
 
   # Generate lib to register quantized ops
   if(REGISTER_QUANTIZED_OPS)
 
@@ -37,8 +37,21 @@ function(gen_selected_ops ops_schema_yaml root_ops include_all_ops)
 endfunction()
 
 # Codegen for registering kernels. Kernels are defined in functions_yaml and
-# custom_ops_yaml
-function(generate_bindings_for_kernels functions_yaml custom_ops_yaml)
+# custom_ops_yaml.
+#
+# Invoked as
+# generate_bindings_for_kernels(
+#   FUNCTIONS_YAML functions_yaml
+#   CUSTOM_OPS_YAML custom_ops_yaml
+# )
+function(generate_bindings_for_kernels)
+  set(arg_names FUNCTIONS_YAML CUSTOM_OPS_YAML)
+  cmake_parse_arguments(GEN "" "${arg_names}" "" ${ARGN})
+
+  message(STATUS "Generating kernel bindings:")
+  message(STATUS "  FUNCTIONS_YAML: ${GEN_FUNCTIONS_YAML}")
+  message(STATUS "  CUSTOM_OPS_YAML: ${GEN_CUSTOM_OPS_YAML}")
+
   # Command to generate selected_operators.yaml from custom_ops.yaml.
   file(GLOB_RECURSE _codegen_templates "${EXECUTORCH_ROOT}/codegen/templates/*")
   file(GLOB_RECURSE _torchgen_srcs "${TORCH_ROOT}/torchgen/*.py")
@@ -60,11 +73,11 @@ function(generate_bindings_for_kernels functions_yaml custom_ops_yaml)
       ${CMAKE_CURRENT_BINARY_DIR}/Functions.h
       ${CMAKE_CURRENT_BINARY_DIR}/NativeFunctions.h)
 
-  if(functions_yaml)
-    list(APPEND _gen_command --functions-yaml-path=${functions_yaml})
+  if(GEN_FUNCTIONS_YAML)
+    list(APPEND _gen_command --functions-yaml-path=${GEN_FUNCTIONS_YAML})
   endif()
-  if(custom_ops_yaml)
-    list(APPEND _gen_command --custom-ops-yaml-path=${custom_ops_yaml})
+  if(GEN_CUSTOM_OPS_YAML)
+    list(APPEND _gen_command --custom-ops-yaml-path=${GEN_CUSTOM_OPS_YAML})
     list(
       APPEND
       _gen_command_sources
@@ -77,7 +90,7 @@ function(generate_bindings_for_kernels functions_yaml custom_ops_yaml)
     COMMENT "Generating code for kernel registration"
     OUTPUT ${_gen_command_sources}
     COMMAND ${_gen_command}
-    DEPENDS ${_oplist_yaml} ${custom_ops_yaml} ${functions_yaml}
+    DEPENDS ${_oplist_yaml} ${GEN_CUSTOM_OPS_YAML} ${GEN_FUNCTIONS_YAML}
             ${_codegen_templates} ${_torchgen_srcs}
     WORKING_DIRECTORY ${EXECUTORCH_ROOT})
   # Make generated file list available in parent scope
@@ -107,18 +120,46 @@ function(gen_custom_ops_aot_lib lib_name kernel_sources)
 endfunction()
 
 # Generate a runtime lib for registering operators in Executorch
-function(gen_operators_lib lib_name kernel_lib deps)
+function(gen_operators_lib lib_name)
+  set(multi_arg_names KERNEL_LIBS DEPS)
+  cmake_parse_arguments(GEN "" "" "${multi_arg_names}" ${ARGN})
+
+  message(STATUS "Generating operator lib:")
+  message(STATUS "  LIB_NAME: ${lib_name}")
+  message(STATUS "  KERNEL_LIBS: ${GEN_KERNEL_LIBS}")
+  message(STATUS "  DEPS: ${GEN_DEPS}")
+
   add_library(${lib_name})
   target_sources(
     ${lib_name}
     PRIVATE
       ${CMAKE_CURRENT_BINARY_DIR}/RegisterCodegenUnboxedKernelsEverything.cpp
       ${CMAKE_CURRENT_BINARY_DIR}/Functions.h
       ${CMAKE_CURRENT_BINARY_DIR}/NativeFunctions.h)
-  target_link_libraries(${lib_name} PRIVATE ${deps})
-  if(kernel_lib)
-    target_link_libraries(${lib_name} PRIVATE ${kernel_lib})
+  target_link_libraries(${lib_name} PRIVATE ${GEN_DEPS})
+  if(GEN_KERNEL_LIBS)
+    target_link_libraries(${lib_name} PRIVATE ${GEN_KERNEL_LIBS})
   endif()
 
   target_link_options_shared_lib(${lib_name})
 endfunction()
+
+# Merge two kernel yaml files into OUTPUT_DIR/merged.yaml, preferring entries
+# from FUNCTIONS_YAML and using FALLBACK_YAML for ops it does not implement.
+# This corresponds to the merge_yaml buck implementation in codegen/tools.
+function(merge_yaml)
+  set(arg_names FUNCTIONS_YAML FALLBACK_YAML OUTPUT_DIR)
+  cmake_parse_arguments(GEN "" "${arg_names}" "" ${ARGN})
+
+  add_custom_command(
+    COMMENT "Merging kernel yaml files"
+    OUTPUT ${GEN_OUTPUT_DIR}/merged.yaml
+    COMMAND ${PYTHON_EXECUTABLE} -m codegen.tools.merge_yaml
+            --functions_yaml_path=${GEN_FUNCTIONS_YAML}
+            --fallback_yaml_path=${GEN_FALLBACK_YAML}
+            --output_dir=${GEN_OUTPUT_DIR}
+    # Without DEPENDS, edits to the input yaml files leave merged.yaml stale.
+    DEPENDS ${GEN_FUNCTIONS_YAML} ${GEN_FALLBACK_YAML}
+    WORKING_DIRECTORY ${EXECUTORCH_ROOT}
+    VERBATIM)
+endfunction()
@@ -45,6 +45,22 @@ deps = [
   "executorch",
 ]
 
+[targets.optimized_kernels]
+buck_targets = [
+  "//kernels/optimized:generated_lib",
+]
+filters = [
+  ".cpp$",
+]
+excludes = [
+  # Exclude the codegen templates, which are picked up because the buck target
+  # is the generated_lib and not the unwrapped set of kernels.
+  "^codegen/templates",
+]
+deps = [
+  "executorch",
+]
+
 [targets.quantized_kernels]
 buck_targets = [
   "//kernels/quantized:generated_lib",
 
@@ -12,14 +12,20 @@
 
 import yaml
 
-from executorch.codegen.tools.yaml_util import BlankLineDumper
-
 try:
     from yaml import CSafeLoader as Loader
 except ImportError:
     from yaml import SafeLoader as Loader  # type: ignore[misc]
 
 
+class BlankLineDumper(yaml.SafeDumper):
+    def write_line_break(self, data=None):
+        super().write_line_break(data)
+        # insert a new line between entries.
+        if len(self.indents) == 1:
+            super().write_line_break()
+
+
 def merge(functions_yaml_path: str, fallback_yaml_path: Optional[str], output_dir: str):
     output_file = os.path.join(output_dir, "merged.yaml")
 
 
@@ -0,0 +1,51 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+cmake_minimum_required(VERSION 3.19)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+if(NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
+
+if(NOT PYTHON_EXECUTABLE)
+  set(PYTHON_EXECUTABLE python3)
+endif()
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
+endif()
+# Source root directory for pytorch. This is needed for kernel binding.
+if(NOT TORCH_ROOT)
+  set(TORCH_ROOT ${EXECUTORCH_ROOT}/third-party/pytorch)
+endif()
+
+set(_common_compile_options -Wno-deprecated-declarations)
+
+include(${EXECUTORCH_ROOT}/build/Utils.cmake)
+include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
+
+
+# Merge optimized and portable definitions, taking optimized where available.
+merge_yaml(
+    FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/optimized/optimized-oss.yaml
+    FALLBACK_YAML ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml
+    OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}
+)
+
+gen_selected_ops("${CMAKE_CURRENT_BINARY_DIR}/merged.yaml" "" "")
+
+generate_bindings_for_kernels(
+    FUNCTIONS_YAML ${CMAKE_CURRENT_BINARY_DIR}/merged.yaml)
+message("Generated files ${gen_command_sources}")
+
+# optimized_native_cpu_ops_lib: Register optimized op kernels into the runtime
+gen_operators_lib(
+  "optimized_native_cpu_ops_lib"
+  KERNEL_LIBS portable_kernels optimized_kernels
+  DEPS executorch)
+
+install(TARGETS optimized_native_cpu_ops_lib DESTINATION lib)
@@ -143,16 +143,16 @@ We provide build time macros to help users to build their kernel registration li
 
 #### CMake
 
-`generate_bindings_for_kernels(functions_yaml, custom_ops_yaml)` takes a yaml file for core ATen op out variants and also a yaml file for custom ops, generate C++ bindings for kernel registration. It also depends on the selective build artifact generated by `gen_selected_ops()`, see selective build doc for more information. Then `gen_operators_lib` will package those bindings to be a C++ library. As an example:
+`generate_bindings_for_kernels(FUNCTIONS_YAML functions_yaml CUSTOM_OPS_YAML custom_ops_yaml)` takes a yaml file for core ATen op out variants and a yaml file for custom ops, and generates C++ bindings for kernel registration. It also depends on the selective build artifact generated by `gen_selected_ops()`; see the selective build doc for more information. `gen_operators_lib` then packages those bindings into a C++ library. As an example:
 ```cmake
 # SELECT_OPS_LIST: aten::add.out,aten::mm.out
 gen_selected_ops("" "${SELECT_OPS_LIST}" "")
 
 # Look for functions.yaml associated with portable libs and generate C++ bindings
-generate_bindings_for_kernels(${EXECUTORCH_ROOT}/kernels/portable/functions.yaml "")
+generate_bindings_for_kernels(FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml)
 
 # Prepare a C++ library called "generated_lib" with _kernel_lib being the portable library, executorch is a dependency of it.
-gen_operators_lib("generated_lib" ${_kernel_lib} executorch)
+gen_operators_lib("generated_lib" KERNEL_LIBS ${_kernel_lib} DEPS executorch)
 
 # Link "generated_lib" into the application:
 target_link_libraries(executorch_binary generated_lib)
 
@@ -68,9 +68,12 @@ include(${EXECUTORCH_ROOT}/build/Utils.cmake)
 include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
 gen_selected_ops("" "" "ON")
 generate_bindings_for_kernels(
-  ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml ""
+  FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml
 )
-gen_operators_lib("portable_ops_lib" portable_kernels executorch)
+gen_operators_lib(
+  "portable_ops_lib"
+  KERNEL_LIBS portable_kernels
+  DEPS executorch)
 
 set(mps_executor_runner_libs "-framework Foundation"
                               "-weak_framework MetalPerformanceShaders"
 
@@ -44,5 +44,7 @@ include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
 # Executorch (for runtime). Here select all ops in functions.yaml
 gen_selected_ops("" "${EXECUTORCH_SELECT_OPS_LIST}" "")
 generate_bindings_for_kernels(
-  ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml "")
-gen_operators_lib("arm_portable_ops_lib" portable_kernels executorch)
+  FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml)
+gen_operators_lib("arm_portable_ops_lib"
+  KERNEL_LIBS portable_kernels
+  DEPS executorch)
@@ -81,7 +81,8 @@ elseif(REGISTER_EXAMPLE_CUSTOM_OP EQUAL 2)
   gen_selected_ops("" "my_ops::mul4.out" "")
 endif()
 # Expect gen_selected_ops output file to be selected_operators.yaml
-generate_bindings_for_kernels("" ${CMAKE_CURRENT_LIST_DIR}/custom_ops.yaml)
+generate_bindings_for_kernels(
+  CUSTOM_OPS_YAML ${CMAKE_CURRENT_LIST_DIR}/custom_ops.yaml)
 message("Generated files ${gen_command_sources}")
 
 # Prepare for C++ libraries.
@@ -109,7 +110,10 @@ add_library(custom_kernels ${kernel_sources})
 target_link_libraries(custom_kernels PRIVATE executorch)
 target_compile_options(custom_kernels PUBLIC ${_common_compile_options})
 
-gen_operators_lib("custom_ops_lib" custom_kernels executorch)
+gen_operators_lib(
+  "custom_ops_lib"
+  KERNEL_LIBS custom_kernels
+  DEPS executorch)
 
 list(TRANSFORM _executor_runner__srcs PREPEND "${EXECUTORCH_ROOT}/")
 
 
@@ -58,9 +58,11 @@ set(_qnn_executor_runner__srcs ${_executor_runner__srcs})
 include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
 gen_selected_ops("" "" "ON")
 generate_bindings_for_kernels(
-  ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml ""
+  FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml
 )
-gen_operators_lib("full_portable_ops_lib" portable_kernels executorch)
+gen_operators_lib("full_portable_ops_lib"
+  KERNEL_LIBS portable_kernels
+  DEPS executorch)
 target_compile_options(full_portable_ops_lib
     INTERFACE
     -DET_EVENT_TRACER_ENABLED
 
@@ -51,9 +51,12 @@ target_compile_options(executorch INTERFACE -DET_EVENT_TRACER_ENABLED)
 gen_selected_ops("" "" "ON")
 # Expect gen_selected_ops output file to be selected_operators.yaml
 generate_bindings_for_kernels(
-  ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml ""
+  FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml
 )
-gen_operators_lib("portable_ops_lib" portable_kernels executorch)
+gen_operators_lib(
+  "portable_ops_lib"
+  KERNEL_LIBS portable_kernels
+  DEPS executorch)
 
 target_compile_options(portable_ops_lib INTERFACE -DET_EVENT_TRACER_ENABLED)
 target_include_directories(
 
@@ -99,9 +99,13 @@ gen_selected_ops(
   "${EXECUTORCH_SELECT_OPS_LIST}"
   "${EXECUTORCH_SELECT_ALL_OPS}")
 
-generate_bindings_for_kernels(${EXECUTORCH_ROOT}/kernels/portable/functions.yaml
-                              "${_custom_ops_yaml}")
-gen_operators_lib("select_build_lib" ${_kernel_lib} executorch)
+generate_bindings_for_kernels(
+  FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml
+  CUSTOM_OPS_YAML "${_custom_ops_yaml}")
+gen_operators_lib(
+  "select_build_lib"
+  KERNEL_LIBS ${_kernel_lib}
+  DEPS executorch)
 
 list(TRANSFORM _executor_runner__srcs PREPEND "${EXECUTORCH_ROOT}/")
 
 
@@ -42,7 +42,11 @@ target_link_libraries(custom_ops PRIVATE xtensa_kernels)
 # Generate C++ bindings to register kernels into both PyTorch (for AOT) and
 # Executorch (for runtime). Here select all ops in functions.yaml
 gen_selected_ops("${CMAKE_CURRENT_LIST_DIR}/functions.yaml" "" "")
-generate_bindings_for_kernels(${CMAKE_CURRENT_SOURCE_DIR}/functions.yaml "")
+generate_bindings_for_kernels(
+  FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/functions.yaml)
 message("Generated files ${gen_command_sources}")
 
-gen_operators_lib("xtensa_ops_lib" custom_ops aten_ops_xtensa)
+gen_operators_lib(
+  "xtensa_ops_lib"
+  KERNEL_LIBS custom_ops
+  DEPS aten_ops_xtensa)