Building optimized library with CMake

GregoryComer · GregoryComer · commit c48f51bc48ea · 2024-03-20T04:23:03.000-07:00
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -81,7 +81,7 @@ endif()
 set(EXECUTORCH_LOG_LEVEL "Info" CACHE STRING
       "Build with the given ET_MIN_LOG_LEVEL value")
 string(TOLOWER "${EXECUTORCH_LOG_LEVEL}" LOG_LEVEL_LOWER)
-if (LOG_LEVEL_LOWER STREQUAL "debug")
+if(LOG_LEVEL_LOWER STREQUAL "debug")
   add_definitions(-DET_MIN_LOG_LEVEL=Debug)
 elseif(LOG_LEVEL_LOWER STREQUAL "info")
   add_definitions(-DET_MIN_LOG_LEVEL=Info)
@@ -90,8 +90,9 @@ elseif(LOG_LEVEL_LOWER STREQUAL "error")
 elseif(LOG_LEVEL_LOWER STREQUAL "fatal")
   add_definitions(-DET_MIN_LOG_LEVEL=Fatal)
 else()
-  message(SEND_ERROR 
-      "Unknown log level \"${EXECUTORCH_LOG_LEVEL}\". Expected one of Debug, Info, Error, or Fatal.")
+  message(SEND_ERROR # Arguments are joined verbatim, with no separator.
+      "Unknown log level \"${EXECUTORCH_LOG_LEVEL}\". "
+      "Expected one of Debug, Info, Error, or Fatal.")
 endif()
 
 option(EXECUTORCH_ENABLE_PROGRAM_VERIFICATION
@@ -308,6 +309,7 @@ endif()
 # operators necessary for the models that will run.
 #
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/portable)
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
 
 #
 # gflags: Commandline flag host library.
diff --git a/build/cmake_deps.toml b/build/cmake_deps.toml
@@ -45,6 +45,22 @@ deps = [
   "executorch",
 ]
 
+[targets.optimized_kernels]
+buck_targets = [
+  "//kernels/optimized:generated_lib",
+]
+filters = [
+  ".cpp$",
+]
+excludes = [
+  # Exclude the codegen templates, which are picked up because the buck target
+  # is the generated_lib and not the unwrapped set of kernels.
+  "^codegen/templates",
+]
+deps = [
+  "executorch",
+]
+
 [targets.quantized_kernels]
 buck_targets = [
   "//kernels/quantized:generated_lib",
diff --git a/kernels/optimized/CMakeLists.txt b/kernels/optimized/CMakeLists.txt
@@ -0,0 +1,55 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+# Kernel library for optimized kernels. Keep this file formatted by running:
+# ~~~
+# cmake-format --first-comment-is-literal=True CMakeLists.txt
+# ~~~
+
+cmake_minimum_required(VERSION 3.19)
+
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+if(NOT CMAKE_CXX_STANDARD)
+  set(CMAKE_CXX_STANDARD 17)
+endif()
+
+if(NOT PYTHON_EXECUTABLE)
+  set(PYTHON_EXECUTABLE python3)
+endif()
+# Source root directory for executorch.
+if(NOT EXECUTORCH_ROOT)
+  set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../..)
+endif()
+# Source root directory for pytorch.
+if(NOT TORCH_ROOT)
+  set(TORCH_ROOT ${EXECUTORCH_ROOT}/third-party/pytorch)
+endif()
+
+set(_common_compile_options -Wno-deprecated-declarations)
+
+# Set architecture-dependent flags.
+set(_arch_compile_flags "")
+# TODO
+
+include(${EXECUTORCH_ROOT}/build/Utils.cmake)
+include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
+
+# Generate C++ bindings to register kernels into both PyTorch (for AOT) and
+# Executorch (for runtime). All ops listed in optimized.yaml are selected.
+set(_yaml "${CMAKE_CURRENT_LIST_DIR}/optimized.yaml")
+gen_selected_ops("${_yaml}" "" "")
+
+generate_bindings_for_kernels("${_yaml}" "")
+message(STATUS "Generated files ${gen_command_sources}")
+
+list(TRANSFORM _optimized_kernels__srcs PREPEND "${EXECUTORCH_ROOT}/")
+add_library(optimized_kernels ${_optimized_kernels__srcs})
+target_link_libraries(optimized_kernels PRIVATE executorch)
+target_compile_options(optimized_kernels PUBLIC ${_common_compile_options})
+# Build a library for _optimized_kernels__srcs
+#
+# optimized_ops_lib: Register optimized ops kernels into Executorch runtime
+gen_operators_lib("optimized_ops_lib" optimized_kernels executorch)
diff --git a/kernels/optimized/op_registration_util.bzl b/kernels/optimized/op_registration_util.bzl
@@ -1,5 +1,5 @@
-load("@fbsource//tools/build_defs:selects.bzl", "selects")
 load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load("@fbsource//xplat/executorch/build:selects.bzl", "selects")
 load(
     "@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl",
     "get_vec_android_preprocessor_flags",
diff --git a/shim/tools/build_defs/default_platform_defs.bzl b/shim/tools/build_defs/default_platform_defs.bzl
@@ -0,0 +1 @@
+DEVSERVER_PLATFORM_REGEX = "UNUSED"
diff --git a/shim/xplat/executorch/build/env_interface.bzl b/shim/xplat/executorch/build/env_interface.bzl
@@ -117,7 +117,7 @@ def _remove_platform_specific_args(kwargs):
     """
     keys = []
     for key in kwargs:
-        if key.endswith("_platform_preprocessor_flags") or key.endswith("_platform_deps"):
+        if key.endswith("_platform_preprocessor_flags") or key.endswith("_platform_deps") or key.startswith("fbobjc"):
             keys.append(key)
     for key in keys:
         kwargs.pop(key)
diff --git a/shim/xplat/executorch/kernels/optimized/lib_defs.bzl b/shim/xplat/executorch/kernels/optimized/lib_defs.bzl
@@ -0,0 +1,142 @@
+load("@fbsource//tools/build_defs:default_platform_defs.bzl", "DEVSERVER_PLATFORM_REGEX")
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+
+# Because vec exists as a collection of header files, compile and preprocessor
+# flags applied to the vec target do not have any effect, since no compilation
+# actually occurs for the target.
+#
+# Targets using the vec library must therefore call the get_vec_*_flags
+# functions in order to declare the required compiler flags needed in order to
+# access CPU vector intrinsics.
+
+def get_vec_android_preprocessor_flags():
+    """Returns the (platform regex, flag list) pairs for Android vec users.
+
+    The single entry pairs the android-arm64 regex with the SLEEF vec define.
+    """
+    arm64_vec_flags = ["-DET_BUILD_ARM_VEC256_WITH_SLEEF"]
+
+    return [
+        ("^android-arm64.*$", arm64_vec_flags),
+    ]
+
+def get_vec_cxx_preprocessor_flags():
+    """Returns the (platform regex, flag list) pairs for cxx vec users.
+
+    The single entry pairs DEVSERVER_PLATFORM_REGEX with the AVX2 define.
+    """
+    avx2_flags = ["-DCPU_CAPABILITY_AVX2"]
+
+    return [
+        (DEVSERVER_PLATFORM_REGEX, avx2_flags),
+    ]
+
+def get_vec_fbcode_preprocessor_flags():
+    """Returns the flat preprocessor flag list (AVX2 define) for fbcode."""
+    return [
+        "-DCPU_CAPABILITY_AVX2",
+    ]
+
+# Currently, having a dependency on fbsource//third-party/sleef:sleef may cause
+# duplicate symbol errors when linking fbcode targets in opt mode that also
+# depend on ATen. This is because ATen accesses sleef via the third-party folder
+# in caffe2 (caffe2/third-party//sleef:sleef).
+# TODO(ssjia): Enable -DCPU_CAPABILITY_AVX2 in fbcode, which requires sleef.
+def define_libs():
+    """Defines the libvec, libutils and libblas cxx_library targets.
+
+    libvec and libutils have empty srcs and only export headers; libblas also
+    compiles the sources globbed from blas/.
+    """
+    header_namespace = "executorch/kernels/optimized"
+    visibility = [
+        "//executorch/...",
+        "@EXECUTORCH_CLIENTS",
+    ]
+    android_arm64 = "^android-arm64.*$"
+    sleef_arm_deps = [
+        "fbsource//third-party/sleef:sleef_arm",
+    ]
+
+    # vec is header-only; sleef comes in through the platform deps below.
+    runtime.cxx_library(
+        name = "libvec",
+        srcs = [],
+        exported_headers = native.glob([
+            "vec/**/*.h",
+        ]),
+        header_namespace = header_namespace,
+        visibility = visibility,
+        cxx_platform_deps = select({
+            "DEFAULT": [
+                (
+                    DEVSERVER_PLATFORM_REGEX,
+                    [
+                        "fbsource//third-party/sleef:sleef",
+                    ],
+                ),
+            ],
+            "ovr_config//cpu:arm64": [
+                (
+                    DEVSERVER_PLATFORM_REGEX,
+                    sleef_arm_deps,
+                ),
+            ],
+        }),
+        fbandroid_platform_deps = [
+            (android_arm64, sleef_arm_deps),
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "libutils",
+        srcs = [],
+        exported_headers = native.glob([
+            "utils/**/*.h",
+        ]),
+        header_namespace = header_namespace,
+        visibility = visibility,
+        exported_deps = [
+            # Needed to access the __ET_INLINE macro
+            "//executorch/runtime/platform:compiler",
+        ],
+    )
+
+    runtime.cxx_library(
+        name = "libblas",
+        srcs = native.glob([
+            "blas/**/*.cpp",
+        ]),
+        exported_headers = native.glob([
+            "blas/**/*.h",
+        ]),
+        header_namespace = header_namespace,
+        visibility = visibility,
+        fbandroid_platform_preprocessor_flags = [
+            (
+                android_arm64,
+                [
+                    "-DET_BUILD_WITH_BLAS",
+                ],
+            ),
+        ],
+        fbandroid_platform_deps = [
+            (
+                android_arm64,
+                [
+                    "fbsource//third-party/openblas:openblas",
+                ],
+            ),
+        ],
+        fbobjc_exported_preprocessor_flags = [
+            "-DET_BUILD_WITH_BLAS",
+            "-DET_BUILD_FOR_APPLE",
+        ],
+        fbobjc_frameworks = [
+            "Accelerate",
+        ],
+        exported_deps = [
+            "//executorch/kernels/optimized:libutils",
+            "//executorch/runtime/core/exec_aten:lib",
+        ],
+    )
diff --git a/shim/xplat/executorch/kernels/optimized/op_registration_util.bzl b/shim/xplat/executorch/kernels/optimized/op_registration_util.bzl
@@ -0,0 +1,126 @@
+load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime")
+load("@fbsource//xplat/executorch/build:selects.bzl", "selects")
+load(
+    "@fbsource//xplat/executorch/kernels/optimized:lib_defs.bzl",
+    "get_vec_android_preprocessor_flags",
+)
+
+def op_target(name, deps = []):
+    """Describes one optimized operator overload group for later registration.
+
+    All overloads that share an operator name form one group, and the target is
+    named after that shared operator name.
+
+    For example, the "add" group, registered under the target name "op_add",
+    might implement:
+    - add.Tensor
+    - add_.Tensor
+    - add.out
+    - add.Scalar
+
+    Helpers or utilities shared between op targets belong in a separate
+    cxx_library that each op target lists as a dep.
+
+    Args:
+        name: Name of the operator overload group, e.g. "op_add". A source file
+            called "<name>.cpp" (e.g. "op_add.cpp") must exist in this
+            directory.
+        deps: Optional additional deps for the cxx_library(). Op targets must
+            not depend on other op targets, which keeps the dependencies
+            manageable; shared code goes into a separate runtime.cxx_library
+            that both op targets depend on.
+
+    Returns:
+        A dict with the keys "deps" and "name" holding the given arguments.
+    """
+
+    # No target is defined here; the returned entry is only a record for the
+    # table that is later used to define the target.
+    entry = {}
+    entry["deps"] = deps
+    entry["name"] = name
+    return entry
+
+def _enforce_deps(deps, name):
+    """Fails if any dep refers to another op target (":op_" prefix).
+
+    Args:
+        deps: A list of build target strings.
+        name: The name of the target being checked; e.g., "op_add"
+    """
+    for candidate in deps:
+        if not candidate.startswith(":op_"):
+            continue
+
+        # Op targets may not depend on other op targets, which keeps the
+        # dependencies manageable. If two op targets would like to share code,
+        # they should both depend on a separately defined
+        # runtime.cxx_library instead.
+        template = "op_target {} may not depend on other op_target {}"
+        fail(template.format(name, candidate))
+
+def define_op_library(name, deps):
+    """Declares the cxx_library for one operator overload group.
+
+    The deps are first checked with _enforce_deps and then extended with the
+    kernel includes, libvec and libutils before the target is declared.
+
+    Args:
+        name: The name of the target; e.g., "op_add"
+        deps: List of deps for the target.
+    """
+    selects.apply(
+        obj = deps,
+        function = native.partial(_enforce_deps, name = name),
+    )
+
+    # Every op library additionally depends on the vec and utils libraries.
+    vec_and_utils = [
+        "//executorch/kernels/optimized:libvec",
+        "//executorch/kernels/optimized:libutils",
+    ]
+    kernel_includes = ["//executorch/runtime/kernel:kernel_includes"]
+    library_deps = kernel_includes + (deps + vec_and_utils)
+
+    runtime.cxx_library(
+        name = name,
+        srcs = [name + ".cpp"],
+        visibility = [
+            "//executorch/kernels/portable/test/...",
+            "//executorch/kernels/quantized/test/...",
+            "//executorch/kernels/optimized/test/...",
+            "//executorch/kernels/test/...",
+            "@EXECUTORCH_CLIENTS",
+        ],
+        # Kernels often contain helpers without prototypes. The warning is
+        # disabled because the headers are codegen'd and linked in later.
+        compiler_flags = ["-Wno-missing-prototypes"],
+        deps = library_deps,
+        fbandroid_platform_preprocessor_flags = get_vec_android_preprocessor_flags(),
+        # On Android, sleef has to be a direct dependency of the operator
+        # target or a linker error may occur. The cause is unclear; it seems
+        # that fbandroid_platform_deps of dependencies are not transitive.
+        fbandroid_platform_deps = [
+            ("^android-arm64.*$", ["fbsource//third-party/sleef:sleef_arm"]),
+        ],
+        # Operators register themselves through static initializers that run
+        # at program startup, which is why link_whole is required.
+        # @lint-ignore BUCKLINT link_whole
+        link_whole = True,
+    )
+
+def define_op_target(name, deps):
+    """Defines the cxx_library target for the named operator group.
+
+    Args:
+        name: The base name of the target; e.g., "op_add"
+        deps: List of deps for the targets.
+    """
+
+    # NOTE(review): in ATen mode, ATen-compatible operators are said to use
+    # ATen's implementations, yet no ATen-mode check exists here and the
+    # library is always defined -- TODO confirm whether a guard is missing.
+    define_op_library(
+        name = name,
+        deps = deps,
+    )