
Commit 41abbb5

Update on "Add quantized op support to llama runner"
Summary:
Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:

Differential Revision: [D56197863](https://our.internmc.facebook.com/intern/diff/D56197863)

[ghstack-poisoned]
2 parents 3291cb7 + bfcf4c0

File tree

- .ci/scripts/test_llama.sh
- CMakeLists.txt
- build/Utils.cmake
- build/build_apple_frameworks.sh
- examples/models/llama2/CMakeLists.txt
- examples/models/llama2/runner/targets.bzl

6 files changed, +24 -43 lines changed

.ci/scripts/test_llama.sh

Lines changed: 0 additions & 2 deletions

```diff
@@ -75,7 +75,6 @@ cmake_install_executorch_libraries() {
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
     -DEXECUTORCH_BUILD_OPTIMIZED=ON \
-    -DEXECUTORCH_BUILD_QUANTIZED="$QE" \
     -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out .
@@ -91,7 +90,6 @@ cmake_build_llama_runner() {
     -DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
     -DEXECUTORCH_BUILD_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
-    -DEXECUTORCH_BUILD_QUANTIZED="$QE" \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out/${dir} \
     ${dir}
```
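The net effect on the CI configure step is that quantized kernels no longer need an explicit opt-in. A minimal sketch of the resulting cmake_install_executorch_libraries() invocation after this change, using only flags visible in the hunk above ($CUSTOM, $XNNPACK, and $PYTHON_EXECUTABLE are the script's existing variables):

```bash
# Post-change configure call: no -DEXECUTORCH_BUILD_QUANTIZED flag, since the
# quantized kernels are now always built.
cmake -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
      -DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
      -DEXECUTORCH_BUILD_OPTIMIZED=ON \
      -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
      -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
      -Bcmake-out .
```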

CMakeLists.txt

Lines changed: 2 additions & 11 deletions

```diff
@@ -164,8 +164,6 @@ option(EXECUTORCH_BUILD_QNN "Build the Qualcomm backend" OFF)

 option(EXECUTORCH_BUILD_OPTIMIZED "Build the optimized kernels" OFF)

-option(EXECUTORCH_BUILD_QUANTIZED "Build the quantized kernels" OFF)
-
 option(EXECUTORCH_BUILD_SDK "Build the ExecuTorch SDK")

 option(EXECUTORCH_BUILD_SIZE_TEST "Build the size test" OFF)
@@ -413,9 +411,7 @@ if(EXECUTORCH_BUILD_OPTIMIZED)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
 endif()

-if(EXECUTORCH_BUILD_QUANTIZED)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized)
-endif()
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized)

 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/configurations)

@@ -445,19 +441,14 @@ cmake_dependent_option(
   EXECUTORCH_BUILD_HOST_TARGETS OFF)
 if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
   # Baseline libraries that executor_runner will link against.
-  set(_executor_runner_libs executorch gflags)
+  set(_executor_runner_libs executorch gflags quantized_ops_lib)

   if(EXECUTORCH_BUILD_OPTIMIZED)
     list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
   else()
     list(APPEND _executor_runner_libs portable_ops_lib)
   endif()

-  # Generate lib to register quantized ops
-  if(EXECUTORCH_BUILD_QUANTIZED)
-    list(APPEND _executor_runner_libs quantized_ops_lib)
-  endif()
-
   add_executable(executor_runner ${_executor_runner__srcs})
   if(CMAKE_BUILD_TYPE STREQUAL "Release" AND NOT APPLE)
     target_link_options(executor_runner PRIVATE "LINKER:--gc-sections")
```
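With kernels/quantized added unconditionally and quantized_ops_lib now part of the baseline _executor_runner_libs, a plain runner build links the quantized kernels with no extra flag. A minimal sketch, assuming a top-level configure into cmake-out:

```bash
# Configure and build executor_runner; quantized_ops_lib is linked in by default.
cmake -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=ON -Bcmake-out .
cmake --build cmake-out --target executor_runner
```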

build/Utils.cmake

Lines changed: 0 additions & 2 deletions

```diff
@@ -74,8 +74,6 @@ function(executorch_print_configuration_summary)
     STATUS "  EXECUTORCH_BUILD_QNN : ${EXECUTORCH_BUILD_QNN}")
   message(STATUS "  EXECUTORCH_BUILD_OPTIMIZED : "
                  "${EXECUTORCH_BUILD_OPTIMIZED}")
-  message(STATUS "  EXECUTORCH_BUILD_QUANTIZED : "
-                 "${EXECUTORCH_BUILD_QUANTIZED}")
   message(
     STATUS "  EXECUTORCH_BUILD_SDK : ${EXECUTORCH_BUILD_SDK}")
   message(
```

build/build_apple_frameworks.sh

Lines changed: 2 additions & 5 deletions

```diff
@@ -22,7 +22,7 @@ CUSTOM=OFF
 MPS=OFF
 OPTIMIZED=OFF
 PORTABLE=OFF
-QUANTIZED=OFF
+QUANTIZED=ON
 XNNPACK=OFF
 HEADERS_PATH="include"
 EXECUTORCH_FRAMEWORK="executorch:libexecutorch.a,libexecutorch_no_prim_ops.a,libextension_apple.a,libextension_data_loader.a,libextension_module.a:$HEADERS_PATH"
@@ -51,7 +51,6 @@ usage() {
   echo "  --mps Include this flag to build the Metal Performance Shaders backend."
   echo "  --optimized Include this flag to build the Optimized backend."
   echo "  --portable Include this flag to build the Portable backend."
-  echo "  --quantized Include this flag to build the Quantized backend."
   echo "  --xnnpack Include this flag to build the XNNPACK backend."
   echo
   echo "Example:"
@@ -74,7 +73,6 @@ for arg in "$@"; do
     --mps) MPS=ON ;;
     --optimized) OPTIMIZED=ON ;;
     --portable) PORTABLE=ON ;;
-    --quantized) QUANTIZED=ON ;;
     --xnnpack) XNNPACK=ON ;;
     *)
       if [[ -z "$SOURCE_ROOT_DIR" ]]; then
@@ -137,7 +135,6 @@ cmake_build() {
     -DEXECUTORCH_BUILD_CUSTOM=$CUSTOM \
     -DEXECUTORCH_BUILD_MPS=$MPS \
     -DEXECUTORCH_BUILD_OPTIMIZED=$OPTIMIZED \
-    -DEXECUTORCH_BUILD_QUANTIZED=$QUANTIZED \
     -DEXECUTORCH_BUILD_XNNPACK=$XNNPACK \
     ${platform_flag:+-DIOS_PLATFORM=$platform_flag}
   cmake --build . --config $MODE
@@ -181,7 +178,7 @@ append_framework_flag "$CUSTOM" "$CUSTOM_FRAMEWORK"
 append_framework_flag "$MPS" "$MPS_FRAMEWORK"
 append_framework_flag "$OPTIMIZED" "$OPTIMIZED_FRAMEWORK"
 append_framework_flag "$PORTABLE" "$PORTABLE_FRAMEWORK"
-append_framework_flag "$QUANTIZED" "$QUANTIZED_FRAMEWORK"
+append_framework_flag "ON" "$QUANTIZED_FRAMEWORK"
 append_framework_flag "$XNNPACK" "$XNNPACK_FRAMEWORK"

 "$SOURCE_ROOT_DIR"/build/create_frameworks.sh "${FRAMEWORK_FLAGS[@]}"
```

examples/models/llama2/CMakeLists.txt

Lines changed: 19 additions & 22 deletions

```diff
@@ -120,28 +120,25 @@ else()
   target_link_options_shared_lib(portable_ops_lib)
 endif()

-if(EXECUTORCH_BUILD_QUANTIZED)
-  # TODO(larryliu0820): after we delete llama_quantized ops we should be able to reuse
-  # quantized_kernels and quantized_ops_lib directly.
-  merge_yaml(
-    FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/ops/quantized.yaml
-    FALLBACK_YAML ${EXECUTORCH_ROOT}/kernels/quantized/quantized.yaml
-    OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})
-
-  gen_selected_ops("${CMAKE_CURRENT_BINARY_DIR}/merged.yaml" "" "")
-  generate_bindings_for_kernels(
-    FUNCTIONS_YAML ${CMAKE_CURRENT_BINARY_DIR}/merged.yaml)
-  message("Generated files ${gen_command_sources}")
-
-  # quantized_merge_ops_lib: Register quantized op kernels into the runtime
-  gen_operators_lib(
-    "quantized_merge_ops_lib"
-    KERNEL_LIBS quantized_kernels
-    DEPS executorch)
-  target_include_directories(quantized_merge_ops_lib PUBLIC ${_common_include_directories})
-  target_link_options_shared_lib(quantized_merge_ops_lib)
-  list(APPEND link_libraries quantized_kernels quantized_merge_ops_lib)
-endif()
+# quantized ops yaml file operation
+merge_yaml(
+  FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/ops/quantized.yaml
+  FALLBACK_YAML ${EXECUTORCH_ROOT}/kernels/quantized/quantized.yaml
+  OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})
+
+gen_selected_ops("${CMAKE_CURRENT_BINARY_DIR}/merged.yaml" "" "")
+generate_bindings_for_kernels(
+  FUNCTIONS_YAML ${CMAKE_CURRENT_BINARY_DIR}/merged.yaml)
+message("Generated files ${gen_command_sources}")
+
+# quantized_merge_ops_lib: Register quantized op kernels into the runtime
+gen_operators_lib(
+  "quantized_merge_ops_lib"
+  KERNEL_LIBS quantized_kernels
+  DEPS executorch)
+target_include_directories(quantized_merge_ops_lib PUBLIC ${_common_include_directories})
+target_link_options_shared_lib(quantized_merge_ops_lib)
+list(APPEND link_libraries quantized_kernels quantized_merge_ops_lib)

 if(EXECUTORCH_BUILD_CUSTOM)
   target_link_options_shared_lib(custom_ops)
```
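Because this codegen block now runs unconditionally, quantized_merge_ops_lib is always generated when the llama runner example is configured (as in the cmake_build_llama_runner step shown earlier). A minimal sketch of building just that target, assuming the example was configured into cmake-out/examples/models/llama2:

```bash
# Build only the merged quantized ops library; the target now exists regardless
# of any quantization flag. The build directory below is an assumed example.
cmake --build cmake-out/examples/models/llama2 --target quantized_merge_ops_lib
```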

examples/models/llama2/runner/targets.bzl

Lines changed: 1 addition & 1 deletion

```diff
@@ -4,7 +4,7 @@ def _get_operator_lib(aten = False):
     if aten:
        return ["//executorch/kernels/aten:generated_lib"]
     elif runtime.is_oss:
-        return ["//executorch/kernels/portable:generated_lib", "//executorch/examples/models/llama2/custom_ops:custom_ops"]
+        return ["//executorch/kernels/portable:generated_lib", "//executorch/examples/models/llama2/custom_ops:custom_ops", "//executorch/examples/models/llama2/ops:generated_lib"]
     else:
         return ["//executorch/configurations:optimized_native_cpu_ops", "//executorch/examples/models/llama2/custom_ops:custom_ops", "//executorch/examples/models/llama2/ops:generated_lib"]

```
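In OSS Buck builds, the operator library set returned for the runner now includes the llama2 ops generated_lib. A hypothetical check of that dependency edge (only the dep labels come from the diff; the runner target name is a guess):

```bash
# List the runner's transitive deps and look for the llama2 quantized ops lib.
# "//executorch/examples/models/llama2/runner:runner" is an assumed target name.
buck2 cquery "deps(//executorch/examples/models/llama2/runner:runner)" \
  | grep "examples/models/llama2/ops"
```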
