Add quantized op support to llama runner #3062

Closed · wants to merge 7 commits

23 changes: 17 additions & 6 deletions .ci/scripts/test_llama.sh
@@ -12,7 +12,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
MODEL_NAME=$1 # stories110M.pt
BUILD_TOOL=$2 # buck2 or cmake
DTYPE=$3 # fp16 or fp32
MODE=${4:-"xnnpack"} # portable or xnnpack
MODE=${4:-"xnnpack+custom"} # portable or xnnpack+custom or xnnpack+custom+qe
if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
echo "Expecting atleast 4 positional arguments"
echo "Usage: [...]"
@@ -37,7 +37,7 @@ if [[ -z "${MODE:-}" ]]; then
exit 1
fi

if [[ "${MODE}" =~ xnnpack.* ]]; then
if [[ "${MODE}" =~ .*xnnpack.* ]]; then
XNNPACK=ON
else
XNNPACK=OFF
@@ -49,6 +49,12 @@ else
CUSTOM=OFF
fi

if [[ "${MODE}" =~ .*qe.* ]]; then
QE=ON
else
QE=OFF
fi

if [[ -z "${BUCK:-}" ]]; then
BUCK=buck2
fi
@@ -84,7 +90,6 @@ cmake_build_llama_runner() {
-DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
-DEXECUTORCH_BUILD_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
-DEXECUTORCH_BUILD_OPTIMIZED=ON \
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-Bcmake-out/${dir} \
${dir}
@@ -126,9 +131,15 @@ fi
# Export model.
EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte"
echo "Exporting ${EXPORTED_MODEL_NAME}"
EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME}"
if [[ "${MODE}" == "xnnpack+kv+custom" ]]; then
EXPORT_ARGS="${EXPORT_ARGS} -kv --use_sdpa_with_kv_cache -X -qmode 8da4w -G 128"
EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv"
if [[ "${XNNPACK}" == "ON" ]]; then
EXPORT_ARGS="${EXPORT_ARGS} -X -qmode 8da4w -G 128"
Comment (Contributor): nit: does += operator work?

fi
if [[ "${CUSTOM}" == "ON" ]]; then
EXPORT_ARGS="${EXPORT_ARGS} --use_sdpa_with_kv_cache"
fi
if [[ "${QE}" == "ON" ]]; then
EXPORT_ARGS="${EXPORT_ARGS} --embedding-quantize 8,1024"
Comment (Contributor): thanks for adding tests!

fi
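As an aside on the reviewer's += question above, here is a minimal sketch of how the same assembly could use bash's += string append (illustrative only; the PR keeps the explicit reassignments shown in the diff):

```bash
# Sketch: build EXPORT_ARGS with += instead of full reassignment
EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv"
if [[ "${XNNPACK}" == "ON" ]]; then
  EXPORT_ARGS+=" -X -qmode 8da4w -G 128"
fi
if [[ "${CUSTOM}" == "ON" ]]; then
  EXPORT_ARGS+=" --use_sdpa_with_kv_cache"
fi
if [[ "${QE}" == "ON" ]]; then
  EXPORT_ARGS+=" --embedding-quantize 8,1024"
fi
```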
# Add dynamically linked library location
$PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS}
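For reference, with MODE=xnnpack+custom+qe the assembled call expands to roughly the following (a sketch; PARAMS, DTYPE, and EXPORTED_MODEL_NAME are resolved earlier in the script):

```bash
# Approximate expansion for MODE=xnnpack+custom+qe (variable values are assumptions)
"$PYTHON_EXECUTABLE" -m examples.models.llama2.export_llama \
  -c stories110M.pt -p "${PARAMS}" -d "${DTYPE}" -n "${EXPORTED_MODEL_NAME}" -kv \
  -X -qmode 8da4w -G 128 \
  --use_sdpa_with_kv_cache \
  --embedding-quantize 8,1024
```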
2 changes: 1 addition & 1 deletion .ci/scripts/test_quantized_aot_lib.sh
@@ -24,7 +24,7 @@ build_cmake_quantized_aot_lib() {
&& retry cmake -DBUCK2=buck2 \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
-DEXECUTORCH_BUILD_QUANTIZED=ON \
-DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON \
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)

cmake --build ${CMAKE_OUTPUT_DIR} -j4
2 changes: 1 addition & 1 deletion .github/workflows/pull.yml
@@ -90,7 +90,7 @@ jobs:
matrix:
dtype: [fp32]
build-tool: [buck2, cmake]
mode: [portable, xnnpack+kv+custom]
mode: [portable, xnnpack+custom, xnnpack+custom+qe]
fail-fast: false
with:
runner: linux.2xlarge
13 changes: 2 additions & 11 deletions CMakeLists.txt
@@ -164,8 +164,6 @@ option(EXECUTORCH_BUILD_QNN "Build the Qualcomm backend" OFF)

option(EXECUTORCH_BUILD_OPTIMIZED "Build the optimized kernels" OFF)

option(EXECUTORCH_BUILD_QUANTIZED "Build the quantized kernels" OFF)

option(EXECUTORCH_BUILD_SDK "Build the ExecuTorch SDK")

option(EXECUTORCH_BUILD_SIZE_TEST "Build the size test" OFF)
@@ -413,9 +411,7 @@ if(EXECUTORCH_BUILD_OPTIMIZED)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
endif()

if(EXECUTORCH_BUILD_QUANTIZED)
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized)
endif()
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized)

add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/configurations)

@@ -445,19 +441,14 @@ cmake_dependent_option(
EXECUTORCH_BUILD_HOST_TARGETS OFF)
if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
# Baseline libraries that executor_runner will link against.
set(_executor_runner_libs executorch gflags)
set(_executor_runner_libs executorch gflags quantized_ops_lib)

if(EXECUTORCH_BUILD_OPTIMIZED)
list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
else()
list(APPEND _executor_runner_libs portable_ops_lib)
endif()

# Generate lib to register quantized ops
if(EXECUTORCH_BUILD_QUANTIZED)
list(APPEND _executor_runner_libs quantized_ops_lib)
endif()

add_executable(executor_runner ${_executor_runner__srcs})
if(CMAKE_BUILD_TYPE STREQUAL "Release" AND NOT APPLE)
target_link_options(executor_runner PRIVATE "LINKER:--gc-sections")
2 changes: 0 additions & 2 deletions build/Utils.cmake
@@ -74,8 +74,6 @@ function(executorch_print_configuration_summary)
STATUS " EXECUTORCH_BUILD_QNN : ${EXECUTORCH_BUILD_QNN}")
message(STATUS " EXECUTORCH_BUILD_OPTIMIZED : "
"${EXECUTORCH_BUILD_OPTIMIZED}")
message(STATUS " EXECUTORCH_BUILD_QUANTIZED : "
"${EXECUTORCH_BUILD_QUANTIZED}")
message(
STATUS " EXECUTORCH_BUILD_SDK : ${EXECUTORCH_BUILD_SDK}")
message(
7 changes: 2 additions & 5 deletions build/build_apple_frameworks.sh
@@ -22,7 +22,7 @@ CUSTOM=OFF
MPS=OFF
OPTIMIZED=OFF
PORTABLE=OFF
QUANTIZED=OFF
QUANTIZED=ON
XNNPACK=OFF
HEADERS_PATH="include"
EXECUTORCH_FRAMEWORK="executorch:libexecutorch.a,libexecutorch_no_prim_ops.a,libextension_apple.a,libextension_data_loader.a,libextension_module.a:$HEADERS_PATH"
@@ -51,7 +51,6 @@ usage() {
echo " --mps Include this flag to build the Metal Performance Shaders backend."
echo " --optimized Include this flag to build the Optimized backend."
echo " --portable Include this flag to build the Portable backend."
echo " --quantized Include this flag to build the Quantized backend."
echo " --xnnpack Include this flag to build the XNNPACK backend."
echo
echo "Example:"
@@ -74,7 +73,6 @@ for arg in "$@"; do
--mps) MPS=ON ;;
--optimized) OPTIMIZED=ON ;;
--portable) PORTABLE=ON ;;
--quantized) QUANTIZED=ON ;;
--xnnpack) XNNPACK=ON ;;
*)
if [[ -z "$SOURCE_ROOT_DIR" ]]; then
@@ -137,7 +135,6 @@ cmake_build() {
-DEXECUTORCH_BUILD_CUSTOM=$CUSTOM \
-DEXECUTORCH_BUILD_MPS=$MPS \
-DEXECUTORCH_BUILD_OPTIMIZED=$OPTIMIZED \
-DEXECUTORCH_BUILD_QUANTIZED=$QUANTIZED \
-DEXECUTORCH_BUILD_XNNPACK=$XNNPACK \
${platform_flag:+-DIOS_PLATFORM=$platform_flag}
cmake --build . --config $MODE
@@ -181,7 +178,7 @@ append_framework_flag "$CUSTOM" "$CUSTOM_FRAMEWORK"
append_framework_flag "$MPS" "$MPS_FRAMEWORK"
append_framework_flag "$OPTIMIZED" "$OPTIMIZED_FRAMEWORK"
append_framework_flag "$PORTABLE" "$PORTABLE_FRAMEWORK"
append_framework_flag "$QUANTIZED" "$QUANTIZED_FRAMEWORK"
append_framework_flag "ON" "$QUANTIZED_FRAMEWORK"
append_framework_flag "$XNNPACK" "$XNNPACK_FRAMEWORK"

"$SOURCE_ROOT_DIR"/build/create_frameworks.sh "${FRAMEWORK_FLAGS[@]}"
2 changes: 1 addition & 1 deletion build/executorch-config.cmake
@@ -38,7 +38,7 @@ set(lib_list
etdump bundled_program extension_data_loader ${FLATCCRT_LIB} mpsdelegate
qnn_executorch_backend portable_ops_lib extension_module xnnpack_backend
XNNPACK cpuinfo pthreadpool vulkan_backend optimized_kernels cpublas eigen_blas
optimized_ops_lib optimized_native_cpu_ops_lib
optimized_ops_lib optimized_native_cpu_ops_lib quantized_kernels quantized_ops_lib
)
foreach(lib ${lib_list})
# Name of the variable which stores result of the find_library search
22 changes: 22 additions & 0 deletions examples/models/llama2/CMakeLists.txt
@@ -44,6 +44,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
set(TORCH_ROOT ${EXECUTORCH_ROOT}/third-party/pytorch)

include(${EXECUTORCH_ROOT}/build/Utils.cmake)
include(${EXECUTORCH_ROOT}/build/Codegen.cmake)

if(NOT PYTHON_EXECUTABLE)
resolve_python_executable()
@@ -91,6 +92,7 @@ add_subdirectory(runner)
if(EXECUTORCH_USE_TIKTOKEN)
# find RE2 for tokenizer
set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)
Comment (Contributor): oh we depend on abseil for tiktoken?
Reply (Contributor Author): Yeah, tiktoken -> re2 -> abseil
Comment (Contributor): no tests using this path yet right?
Reply (Contributor Author): Not yet

set(_pic_flag
${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
@@ -118,6 +120,26 @@ else()
target_link_options_shared_lib(portable_ops_lib)
endif()

# quantized ops yaml file operation
merge_yaml(
FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/ops/quantized.yaml
FALLBACK_YAML ${EXECUTORCH_ROOT}/kernels/quantized/quantized.yaml
OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})

gen_selected_ops("${CMAKE_CURRENT_BINARY_DIR}/merged.yaml" "" "")
generate_bindings_for_kernels(
FUNCTIONS_YAML ${CMAKE_CURRENT_BINARY_DIR}/merged.yaml)
message("Generated files ${gen_command_sources}")

# quantized_merge_ops_lib: Register quantized op kernels into the runtime
gen_operators_lib(
"quantized_merge_ops_lib"
KERNEL_LIBS quantized_kernels
DEPS executorch)
target_include_directories(quantized_merge_ops_lib PUBLIC ${_common_include_directories})
target_link_options_shared_lib(quantized_merge_ops_lib)
list(APPEND link_libraries quantized_kernels quantized_merge_ops_lib)

if(EXECUTORCH_BUILD_CUSTOM)
target_link_options_shared_lib(custom_ops)
list(APPEND link_libraries custom_ops)
4 changes: 2 additions & 2 deletions examples/models/llama2/ops/quantized.yaml
@@ -1,10 +1,10 @@
- func: llama_quantized::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)
- func: llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)
variants: function
kernels:
- arg_meta: null
kernel_name: torch::executor::quantized_embedding_byte_out

- func: llama_quantized::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
- func: llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
variants: function
kernels:
- arg_meta: null
22 changes: 14 additions & 8 deletions examples/models/llama2/ops/quantized_ops.py
@@ -15,22 +15,22 @@
"llama_quantized", "DEF"
) # to not be confused with torch.ops.quantized.* ops.
quantized_lib.define(
"embedding_byte(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
"DEPRECATED_DO_NOT_USE_embedding_byte(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
"int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor",
)

quantized_lib.define(
"embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
"DEPRECATED_DO_NOT_USE_embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
"int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)",
)

quantized_lib.define(
"embedding_byte.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
"DEPRECATED_DO_NOT_USE_embedding_byte.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
"int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor",
)

quantized_lib.define(
"embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
"DEPRECATED_DO_NOT_USE_embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
"int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)",
)

@@ -66,7 +66,9 @@ def embedding_byte_weight_checks(weight, weight_scales, weight_zero_points):
), f"Expecting weight_zero_points tensor to be None or have same number of rows as weights, but found {weight.size()} and {weight_zero_points.size()}"


@impl(quantized_lib, "embedding_byte", "CompositeExplicitAutograd")
@impl(
quantized_lib, "DEPRECATED_DO_NOT_USE_embedding_byte", "CompositeExplicitAutograd"
)
def embedding_byte(
weight: torch.Tensor,
weight_scales: torch.Tensor,
@@ -92,7 +94,7 @@ def embedding_byte(
return torch.ops.aten.embedding.default(weight, indices)


@impl_abstract("llama_quantized::embedding_byte.out")
@impl_abstract("llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.out")
def embedding_byte_out_meta(
weight: torch.Tensor,
weight_scales: torch.Tensor,
@@ -112,7 +114,11 @@ def embedding_byte_out_meta(
)


@impl(quantized_lib, "embedding_byte.dtype", "CompositeExplicitAutograd")
@impl(
quantized_lib,
"DEPRECATED_DO_NOT_USE_embedding_byte.dtype",
"CompositeExplicitAutograd",
)
def embedding_byte_dtype(
weight: torch.Tensor,
weight_scales: torch.Tensor,
@@ -140,7 +146,7 @@ def embedding_byte_dtype(
return torch.ops.aten.embedding.default(weight, indices)


@impl_abstract("llama_quantized::embedding_byte.dtype_out")
@impl_abstract("llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.dtype_out")
def embedding_byte_dtype_out_meta(
weight: torch.Tensor,
weight_scales: torch.Tensor,
2 changes: 1 addition & 1 deletion examples/models/llama2/quant_lib.py
@@ -105,7 +105,7 @@ def check_embedding_byte_registered():
'Use `python -c "import torch as _; print(_.__path__)"` to find where torch package is installed.\n'
"Set that as TORCH_PACKAGE_DIR.\n"
"Then from root executorch dir do the following:\n"
"rm -rf cmake-out && mkdir cmake-out && (cd cmake-out && cmake -DBUCK2=<path-to-buck2> -DCMAKE_PREFIX_PATH=$TORCH_PACKAGE_DIR -DEXECUTORCH_BUILD_QUANTIZED=ON ..) && cmake --build . -j16\n"
"rm -rf cmake-out && mkdir cmake-out && (cd cmake-out && cmake -DBUCK2=<path-to-buck2> -DCMAKE_PREFIX_PATH=$TORCH_PACKAGE_DIR -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON ..) && cmake --build . -j16\n"
'To find the location of the lib: find cmake-out -name "libquantized_ops_aot_lib*"\n'
"Then specify the said library via -s <path to libquantized_ops_aot_lib.so\n"
)
2 changes: 1 addition & 1 deletion examples/models/llama2/quantize.py
@@ -377,7 +377,7 @@ def __init__(

@torch.no_grad()
def forward(self, indices: torch.Tensor) -> torch.Tensor:
return torch.ops.llama_quantized.embedding_byte.dtype(
return torch.ops.llama_quantized.DEPRECATED_DO_NOT_USE_embedding_byte.dtype(
self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype
)

2 changes: 1 addition & 1 deletion examples/models/llama2/runner/targets.bzl
@@ -4,7 +4,7 @@ def _get_operator_lib(aten = False):
if aten:
return ["//executorch/kernels/aten:generated_lib"]
elif runtime.is_oss:
return ["//executorch/kernels/portable:generated_lib", "//executorch/examples/models/llama2/custom_ops:custom_ops"]
return ["//executorch/kernels/portable:generated_lib", "//executorch/examples/models/llama2/custom_ops:custom_ops", "//executorch/examples/models/llama2/ops:generated_lib"]
else:
return ["//executorch/configurations:optimized_native_cpu_ops", "//executorch/examples/models/llama2/custom_ops:custom_ops", "//executorch/examples/models/llama2/ops:generated_lib"]

5 changes: 4 additions & 1 deletion kernels/quantized/CMakeLists.txt
@@ -10,6 +10,9 @@
# ~~~
cmake_minimum_required(VERSION 3.19)

option(EXECUTORCH_BUILD_QUANTIZED_OPS_AOT
"Build the optimized ops library for AOT export usage" OFF)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if(NOT CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 17)
@@ -49,7 +52,7 @@ message("Generated files ${gen_command_sources}")
# quantized_ops_aot_lib quantized_ops_lib but none of these is a common
# dependency of the other(s). This is not allowed by the Xcode "new build
# system".
if(NOT CMAKE_GENERATOR STREQUAL "Xcode")
if(NOT CMAKE_GENERATOR STREQUAL "Xcode" AND EXECUTORCH_BUILD_QUANTIZED_OPS_AOT)
# Build a AOT library to register quantized ops into PyTorch. This is a hack.
set(_quantized_sources
${_quantized_kernels__srcs}