
Commit 0851b1b

Build optimized library with CMake (#2530)
Summary: Support optimized kernel library in CMake builds. Note that I'm excluding gelu and log_softmax temporarily, as they require sleef. I will add support in a follow-up to build sleef and enable those two ops.
Test Plan: Built executor_runner with optimized_ops_lib and debug logging enabled. Confirmed that optimized kernels were loaded. Ran add example with optimized add kernel.
Differential Revision: D55118200
Pulled By: GregoryComer
1 parent 3152d7f commit 0851b1b
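For downstream CMake users, the practical effect is that a codegen'd optimized kernel-registration library can be linked in place of portable_ops_lib. A minimal sketch of a consumer target, assuming a hypothetical my_runner executable (only the ExecuTorch library names below are defined by this commit):

# Hypothetical application target; my_runner and main.cpp are placeholders.
add_executable(my_runner main.cpp)
# optimized_native_cpu_ops_lib registers optimized kernels where available and
# falls back to portable kernels for everything else (see configurations/CMakeLists.txt).
target_link_libraries(my_runner PRIVATE executorch optimized_native_cpu_ops_lib)

executor_runner itself is switched to this library in the CMakeLists.txt hunk below.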

15 files changed: +559 −26 lines

CMakeLists.txt (4 additions, 1 deletion)

@@ -309,6 +309,9 @@ endif()
 # operators necessary for the models that will run.
 #
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
+
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/configurations)
 
 #
 # gflags: Commandline flag host library.
@@ -336,7 +339,7 @@ cmake_dependent_option(
   EXECUTORCH_BUILD_HOST_TARGETS OFF)
 if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
   # Baseline libraries that executor_runner will link against.
-  set(_executor_runner_libs executorch portable_ops_lib gflags)
+  set(_executor_runner_libs executorch optimized_native_cpu_ops_lib gflags)
 
   # Generate lib to register quantized ops
   if(REGISTER_QUANTIZED_OPS)

build/Codegen.cmake (59 additions, 15 deletions)

@@ -36,9 +36,24 @@ function(gen_selected_ops ops_schema_yaml root_ops include_all_ops)
 
 endfunction()
 
-# Codegen for registering kernels. Kernels are defined in functions_yaml and
-# custom_ops_yaml
-function(generate_bindings_for_kernels functions_yaml custom_ops_yaml)
+# Codegen for registering kernels. Kernels are defined in functions_yaml,
+# custom_ops_yaml, and optionally fallback_yaml.
+#
+# Invoked as
+# generate_bindings_for_kernels(
+#   FUNCTIONS_YAML functions_yaml
+#   FALLBACK_YAML fallback_yaml
+#   CUSTOM_OPS_YAML custom_ops_yaml
+# )
+function(generate_bindings_for_kernels)
+  set(arg_names FUNCTIONS_YAML FALLBACK_YAML CUSTOM_OPS_YAML)
+  cmake_parse_arguments(GEN "" "${arg_names}" "" ${ARGN})
+
+  message(STATUS "Generating kernel bindings:")
+  message(STATUS "  FUNCTIONS_YAML: ${GEN_FUNCTIONS_YAML}")
+  message(STATUS "  FALLBACK_YAML: ${GEN_FALLBACK_YAML}")
+  message(STATUS "  CUSTOM_OPS_YAML: ${GEN_CUSTOM_OPS_YAML}")
+
   # Command to generate selected_operators.yaml from custom_ops.yaml.
   file(GLOB_RECURSE _codegen_templates "${EXECUTORCH_ROOT}/codegen/templates/*")
   file(GLOB_RECURSE _torchgen_srcs "${TORCH_ROOT}/torchgen/*.py")
@@ -60,11 +75,11 @@ function(generate_bindings_for_kernels functions_yaml custom_ops_yaml)
     ${CMAKE_CURRENT_BINARY_DIR}/Functions.h
     ${CMAKE_CURRENT_BINARY_DIR}/NativeFunctions.h)
 
-  if(functions_yaml)
-    list(APPEND _gen_command --functions-yaml-path=${functions_yaml})
+  if(GEN_FUNCTIONS_YAML)
+    list(APPEND _gen_command --functions-yaml-path=${GEN_FUNCTIONS_YAML})
   endif()
-  if(custom_ops_yaml)
-    list(APPEND _gen_command --custom-ops-yaml-path=${custom_ops_yaml})
+  if(GEN_CUSTOM_OPS_YAML)
+    list(APPEND _gen_command --custom-ops-yaml-path=${GEN_CUSTOM_OPS_YAML})
     list(
       APPEND
       _gen_command_sources
@@ -77,7 +92,7 @@ function(generate_bindings_for_kernels functions_yaml custom_ops_yaml)
     COMMENT "Generating code for kernel registration"
     OUTPUT ${_gen_command_sources}
     COMMAND ${_gen_command}
-    DEPENDS ${_oplist_yaml} ${custom_ops_yaml} ${functions_yaml}
+    DEPENDS ${_oplist_yaml} ${GEN_CUSTOM_OPS_YAML} ${GEN_FUNCTIONS_YAML}
             ${_codegen_templates} ${_torchgen_srcs}
     WORKING_DIRECTORY ${EXECUTORCH_ROOT})
   # Make generated file list available in parent scope
@@ -107,18 +122,47 @@ function(gen_custom_ops_aot_lib lib_name kernel_sources)
 endfunction()
 
 # Generate a runtime lib for registering operators in Executorch
-function(gen_operators_lib lib_name kernel_lib deps)
-  add_library(${lib_name})
+function(gen_operators_lib)
+  set(arg_names LIB_NAME)
+  set(multi_arg_names KERNEL_LIBS DEPS)
+  cmake_parse_arguments(GEN "" "${arg_names}" "${multi_arg_names}" ${ARGN})
+
+  message(STATUS "Generating operator lib:")
+  message(STATUS "  LIB_NAME: ${GEN_LIB_NAME}")
+  message(STATUS "  KERNEL_LIBS: ${GEN_KERNEL_LIBS}")
+  message(STATUS "  DEPS: ${GEN_DEPS}")
+
+  add_library(${GEN_LIB_NAME})
   target_sources(
-    ${lib_name}
+    ${GEN_LIB_NAME}
     PRIVATE
     ${CMAKE_CURRENT_BINARY_DIR}/RegisterCodegenUnboxedKernelsEverything.cpp
     ${CMAKE_CURRENT_BINARY_DIR}/Functions.h
     ${CMAKE_CURRENT_BINARY_DIR}/NativeFunctions.h)
-  target_link_libraries(${lib_name} PRIVATE ${deps})
-  if(kernel_lib)
-    target_link_libraries(${lib_name} PRIVATE ${kernel_lib})
+  target_link_libraries(${GEN_LIB_NAME} PRIVATE ${GEN_DEPS})
+  if(GEN_KERNEL_LIBS)
+    target_link_libraries(${GEN_LIB_NAME} PRIVATE ${GEN_KERNEL_LIBS})
   endif()
 
-  target_link_options_shared_lib(${lib_name})
+  target_link_options_shared_lib(${GEN_LIB_NAME})
 endfunction()
+
+# Merge two kernel yaml files, prioritizing functions from FUNCTIONS_YAML
+# and taking functions from FALLBACK_YAML when no implementation is found.
+# This corresponds to the merge_yaml buck implementation in codegen/tools.
+function(merge_yaml)
+  set(arg_names FUNCTIONS_YAML FALLBACK_YAML OUTPUT_DIR)
+  cmake_parse_arguments(GEN "" "${arg_names}" "" ${ARGN})
+
+  set(_gen_command
+      "${PYTHON_EXECUTABLE}" -m codegen.tools.merge_yaml
+      --functions_yaml_path=${GEN_FUNCTIONS_YAML}
+      --fallback_yaml_path=${GEN_FALLBACK_YAML}
+      --output_dir=${GEN_OUTPUT_DIR})
+
+  add_custom_command(
+    COMMENT "Merging kernel yaml files"
+    OUTPUT ${GEN_OUTPUT_DIR}/merged.yaml
+    COMMAND ${_gen_command}
+    WORKING_DIRECTORY ${EXECUTORCH_ROOT})
+endfunction()
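The helpers above now take keyword arguments parsed with cmake_parse_arguments instead of positional parameters. A short usage sketch, assuming the caller has already included build/Codegen.cmake; the paths shown mirror the real invocations added in configurations/CMakeLists.txt later in this commit:

# Merge optimized definitions over the portable fallbacks; writes merged.yaml
# into OUTPUT_DIR via codegen.tools.merge_yaml.
merge_yaml(
  FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/optimized/optimized-oss.yaml
  FALLBACK_YAML ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml
  OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})

# Generate registration code for the merged op list.
generate_bindings_for_kernels(
  FUNCTIONS_YAML ${CMAKE_CURRENT_BINARY_DIR}/merged.yaml)

# Bundle the kernel libraries into a single registration library.
gen_operators_lib(
  LIB_NAME "optimized_native_cpu_ops_lib"
  KERNEL_LIBS portable_kernels optimized_kernels
  DEPS executorch)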

build/cmake_deps.toml (16 additions, 0 deletions)

@@ -45,6 +45,22 @@ deps = [
   "executorch",
 ]
 
+[targets.optimized_kernels]
+buck_targets = [
+  "//kernels/optimized:generated_lib",
+]
+filters = [
+  ".cpp$",
+]
+excludes = [
+  # Exclude the codegen templates, which are picked up because the buck target
+  # is the generated_lib and not the unwrapped set of kernels.
+  "^codegen/templates",
+]
+deps = [
+  "executorch",
+]
+
 [targets.quantized_kernels]
 buck_targets = [
   "//kernels/quantized:generated_lib",

codegen/tools/merge_yaml.py (8 additions, 2 deletions)

@@ -12,14 +12,20 @@
 
 import yaml
 
-from executorch.codegen.tools.yaml_util import BlankLineDumper
-
 try:
     from yaml import CSafeLoader as Loader
 except ImportError:
     from yaml import SafeLoader as Loader  # type: ignore[misc]
 
 
+class BlankLineDumper(yaml.SafeDumper):
+    def write_line_break(self, data=None):
+        super().write_line_break(data)
+        # insert a new line between entries.
+        if len(self.indents) == 1:
+            super().write_line_break()
+
+
 def merge(functions_yaml_path: str, fallback_yaml_path: Optional[str], output_dir: str):
     output_file = os.path.join(output_dir, "merged.yaml")
 
configurations/CMakeLists.txt (new file, 51 additions)

@@ -0,0 +1,51 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+cmake_minimum_required(VERSION 3.19)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+if(NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
+
+if(NOT PYTHON_EXECUTABLE)
+  set(PYTHON_EXECUTABLE python3)
+endif()
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/..)
+endif()
+# Source root directory for pytorch. This is needed for kernel binding.
+if(NOT TORCH_ROOT)
+  set(TORCH_ROOT ${EXECUTORCH_ROOT}/third-party/pytorch)
+endif()
+
+set(_common_compile_options -Wno-deprecated-declarations)
+
+include(${EXECUTORCH_ROOT}/build/Utils.cmake)
+include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
+
+
+# Merge optimized and portable definitions, taking optimized where available.
+merge_yaml(
+  FUNCTIONS_YAML ${EXECUTORCH_ROOT}/kernels/optimized/optimized-oss.yaml
+  FALLBACK_YAML ${EXECUTORCH_ROOT}/kernels/portable/functions.yaml
+  OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}
+)
+
+gen_selected_ops("${CMAKE_CURRENT_BINARY_DIR}/merged.yaml" "" "")
+
+generate_bindings_for_kernels(
+  FUNCTIONS_YAML ${CMAKE_CURRENT_BINARY_DIR}/merged.yaml)
+message("Generated files ${gen_command_sources}")
+
+# optimized_native_cpu_ops_lib: Register optimized op kernels into the runtime
+gen_operators_lib(
+  LIB_NAME "optimized_native_cpu_ops_lib"
+  KERNEL_LIBS portable_kernels optimized_kernels
+  DEPS executorch)
+
+install(TARGETS optimized_native_cpu_ops_lib DESTINATION lib)

kernels/optimized/CMakeLists.txt (new file, 55 additions)

@@ -0,0 +1,55 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Kernel library for optimized kernels. Please keep this file formatted by running:
+# ~~~
+# cmake-format --first-comment-is-literal=True CMakeLists.txt
+# ~~~
+
+cmake_minimum_required(VERSION 3.19)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+if(NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
+
+if(NOT PYTHON_EXECUTABLE)
+  set(PYTHON_EXECUTABLE python3)
+endif()
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+# Source root directory for pytorch. This is needed for kernel binding.
+if(NOT TORCH_ROOT)
+  set(TORCH_ROOT ${EXECUTORCH_ROOT}/third-party/pytorch)
+endif()
+
+set(_common_compile_options -Wno-deprecated-declarations)
+
+include(${EXECUTORCH_ROOT}/build/Utils.cmake)
+include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
+
+# Generate C++ bindings to register kernels into both PyTorch (for AOT) and
+# Executorch (for runtime). Here select all ops in optimized-oss.yaml
+set(_yaml "${CMAKE_CURRENT_LIST_DIR}/optimized-oss.yaml")
+gen_selected_ops("${_yaml}" "" "")
+
+generate_bindings_for_kernels(
+  FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/optimized-oss.yaml)
+message("Generated files ${gen_command_sources}")
+
+list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/")
+add_library(optimized_kernels ${_optimized_kernels__srcs})
+target_link_libraries(optimized_kernels PRIVATE executorch)
+target_compile_options(optimized_kernels PUBLIC ${_common_compile_options})
+# Build a library for _optimized_kernels_srcs
+#
+# optimized_ops_lib: Register optimized ops kernels into Executorch runtime
+gen_operators_lib(
+  LIB_NAME "optimized_ops_lib"
+  KERNEL_LIBS optimized_kernels
+  DEPS executorch)

kernels/optimized/cpu/targets.bzl (5 additions, 3 deletions)

@@ -1,5 +1,5 @@
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
-load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "define_op_target", "op_target")
+load("@fbsource//xplat/executorch/kernels/optimized:op_registration_util.bzl", "define_op_target", "is_op_disabled", "op_target")
 
 _OPTIMIZED_ATEN_OPS = (
     op_target(
@@ -81,11 +81,13 @@ def define_common_targets():
     TARGETS and BUCK files that call this function.
     """
 
+    enabled_ops = [op for op in _OPTIMIZED_ATEN_OPS if not is_op_disabled(op["name"])]
+
     # Define build targets for all operators registered in the tables above.
-    for op in _OPTIMIZED_ATEN_OPS:
+    for op in enabled_ops:
         define_op_target(**op)
 
-    aten_op_targets = [":{}".format(op["name"]) for op in _OPTIMIZED_ATEN_OPS]
+    aten_op_targets = [":{}".format(op["name"]) for op in enabled_ops]
     all_op_targets = aten_op_targets
 
     runtime.cxx_library(

kernels/optimized/op_registration_util.bzl (5 additions, 1 deletion)

@@ -1,5 +1,5 @@
-load("@fbsource//tools/build_defs:selects.bzl", "selects")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load("@fbsource//xplat/executorch/build:selects.bzl", "selects")
 load(
     "@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl",
     "get_vec_android_preprocessor_flags",
@@ -124,3 +124,7 @@ def define_op_target(name, deps):
         name = name,
         deps = deps,
     )
+
+def is_op_disabled(name):
+    # All ops are enabled for internal builds.
+    return False

kernels/optimized/optimized-oss.yaml (new file, 73 additions)

@@ -0,0 +1,73 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This yaml file contains operators that have optimized kernels available.
+
+- op: add.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_add_out
+
+- op: add.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_add_scalar_out
+
+- op: bmm.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_bmm_out
+
+- op: div.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_div_out
+
+- op: div.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_div_scalar_out
+
+- op: exp.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_exp_out
+
+- op: le.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_le_scalar_out
+
+- op: le.Tensor_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_le_tensor_out
+
+- op: mul.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_mul_out
+
+- op: mul.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_mul_scalar_out
+
+- op: native_layer_norm.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_native_layer_norm_out
+
+- op: neg.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_neg_out
+
+- op: sub.out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_sub_out
+
+- op: sub.Scalar_out
+  kernels:
+    - arg_meta: null
+      kernel_name: torch::executor::opt_sub_scalar_out
