
Commit 1f4b631

larryliu0820 authored and facebook-github-bot committed
Add quantized op support to llama runner (#3062)
Summary:
Pull Request resolved: #3062

Reviewed By: lucylq, mikekgfb

Differential Revision: D56197863

fbshipit-source-id: c564a99d10be70fb69e554687bd506d8ff13268e
1 parent f14dc83 commit 1f4b631

14 files changed (+69, -41 lines)

.ci/scripts/test_llama.sh

Lines changed: 17 additions & 6 deletions
@@ -12,7 +12,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 MODEL_NAME=$1 # stories110M.pt
 BUILD_TOOL=$2 # buck2 or cmake
 DTYPE=$3 # fp16 or fp32
-MODE=${4:-"xnnpack"} # portable or xnnpack
+MODE=${4:-"xnnpack+custom"} # portable or xnnpack+custom or xnnpack+custom+qe
 if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
   echo "Expecting atleast 4 positional arguments"
   echo "Usage: [...]"
@@ -37,7 +37,7 @@ if [[ -z "${MODE:-}" ]]; then
   exit 1
 fi

-if [[ "${MODE}" =~ xnnpack.* ]]; then
+if [[ "${MODE}" =~ .*xnnpack.* ]]; then
   XNNPACK=ON
 else
   XNNPACK=OFF
@@ -49,6 +49,12 @@ else
   CUSTOM=OFF
 fi

+if [[ "${MODE}" =~ .*qe.* ]]; then
+  QE=ON
+else
+  QE=OFF
+fi
+
 if [[ -z "${BUCK:-}" ]]; then
   BUCK=buck2
 fi
@@ -84,7 +90,6 @@ cmake_build_llama_runner() {
     -DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
     -DEXECUTORCH_BUILD_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
-    -DEXECUTORCH_BUILD_OPTIMIZED=ON \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out/${dir} \
     ${dir}
@@ -126,9 +131,15 @@ fi
 # Export model.
 EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte"
 echo "Exporting ${EXPORTED_MODEL_NAME}"
-EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME}"
-if [[ "${MODE}" == "xnnpack+kv+custom" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} -kv --use_sdpa_with_kv_cache -X -qmode 8da4w -G 128"
+EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv"
+if [[ "${XNNPACK}" == "ON" ]]; then
+  EXPORT_ARGS="${EXPORT_ARGS} -X -qmode 8da4w -G 128"
+fi
+if [[ "${CUSTOM}" == "ON" ]]; then
+  EXPORT_ARGS="${EXPORT_ARGS} --use_sdpa_with_kv_cache"
+fi
+if [[ "${QE}" == "ON" ]]; then
+  EXPORT_ARGS="${EXPORT_ARGS} --embedding-quantize 8,1024"
 fi
 # Add dynamically linked library location
 $PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS}
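
For context, a minimal sketch of how the updated CI script and the export flow it drives might be invoked locally. The four positional arguments follow the comments in the script; the params.json path is an assumption:

# run the CI script with the new quantized-embedding mode (model, build tool, dtype, mode)
bash .ci/scripts/test_llama.sh stories110M.pt cmake fp32 xnnpack+custom+qe

# roughly the export command that mode assembles
python -m examples.models.llama2.export_llama -c stories110M.pt -p params.json -d fp32 \
  -n stories110M.pte -kv -X -qmode 8da4w -G 128 --use_sdpa_with_kv_cache --embedding-quantize 8,1024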

.ci/scripts/test_quantized_aot_lib.sh

Lines changed: 1 addition & 1 deletion
@@ -24,7 +24,7 @@ build_cmake_quantized_aot_lib() {
   && retry cmake -DBUCK2=buck2 \
     -DCMAKE_BUILD_TYPE=Release \
     -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
-    -DEXECUTORCH_BUILD_QUANTIZED=ON \
+    -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)

  cmake --build ${CMAKE_OUTPUT_DIR} -j4

.github/workflows/pull.yml

Lines changed: 1 addition & 1 deletion
@@ -90,7 +90,7 @@ jobs:
       matrix:
         dtype: [fp32]
         build-tool: [buck2, cmake]
-        mode: [portable, xnnpack+kv+custom]
+        mode: [portable, xnnpack+custom, xnnpack+custom+qe]
       fail-fast: false
     with:
       runner: linux.2xlarge

CMakeLists.txt

Lines changed: 2 additions & 11 deletions
@@ -164,8 +164,6 @@ option(EXECUTORCH_BUILD_QNN "Build the Qualcomm backend" OFF)

 option(EXECUTORCH_BUILD_OPTIMIZED "Build the optimized kernels" OFF)

-option(EXECUTORCH_BUILD_QUANTIZED "Build the quantized kernels" OFF)
-
 option(EXECUTORCH_BUILD_SDK "Build the ExecuTorch SDK")

 option(EXECUTORCH_BUILD_SIZE_TEST "Build the size test" OFF)
@@ -413,9 +411,7 @@ if(EXECUTORCH_BUILD_OPTIMIZED)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
 endif()

-if(EXECUTORCH_BUILD_QUANTIZED)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized)
-endif()
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized)

 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/configurations)

@@ -445,19 +441,14 @@ cmake_dependent_option(
   EXECUTORCH_BUILD_HOST_TARGETS OFF)
 if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
   # Baseline libraries that executor_runner will link against.
-  set(_executor_runner_libs executorch gflags)
+  set(_executor_runner_libs executorch gflags quantized_ops_lib)

   if(EXECUTORCH_BUILD_OPTIMIZED)
     list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
   else()
     list(APPEND _executor_runner_libs portable_ops_lib)
   endif()

-  # Generate lib to register quantized ops
-  if(EXECUTORCH_BUILD_QUANTIZED)
-    list(APPEND _executor_runner_libs quantized_ops_lib)
-  endif()
-
   add_executable(executor_runner ${_executor_runner__srcs})
   if(CMAKE_BUILD_TYPE STREQUAL "Release" AND NOT APPLE)
     target_link_options(executor_runner PRIVATE "LINKER:--gc-sections")
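
With the EXECUTORCH_BUILD_QUANTIZED option removed, the quantized kernels and quantized_ops_lib are built and linked into executor_runner unconditionally. A rough configure/build sketch under that assumption (the build directory, job count, and reliance on EXECUTORCH_BUILD_EXECUTOR_RUNNER being enabled by default are assumptions):

# no -DEXECUTORCH_BUILD_QUANTIZED=ON needed anymore; quantized kernels are always built
cmake -DCMAKE_BUILD_TYPE=Release -DEXECUTORCH_BUILD_OPTIMIZED=ON -Bcmake-out .
cmake --build cmake-out -j8 --target executor_runner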

build/Utils.cmake

Lines changed: 0 additions & 2 deletions
@@ -74,8 +74,6 @@ function(executorch_print_configuration_summary)
     STATUS " EXECUTORCH_BUILD_QNN : ${EXECUTORCH_BUILD_QNN}")
   message(STATUS " EXECUTORCH_BUILD_OPTIMIZED : "
     "${EXECUTORCH_BUILD_OPTIMIZED}")
-  message(STATUS " EXECUTORCH_BUILD_QUANTIZED : "
-    "${EXECUTORCH_BUILD_QUANTIZED}")
   message(
     STATUS " EXECUTORCH_BUILD_SDK : ${EXECUTORCH_BUILD_SDK}")
   message(

build/build_apple_frameworks.sh

Lines changed: 2 additions & 5 deletions
@@ -22,7 +22,7 @@ CUSTOM=OFF
 MPS=OFF
 OPTIMIZED=OFF
 PORTABLE=OFF
-QUANTIZED=OFF
+QUANTIZED=ON
 XNNPACK=OFF
 HEADERS_PATH="include"
 EXECUTORCH_FRAMEWORK="executorch:libexecutorch.a,libexecutorch_no_prim_ops.a,libextension_apple.a,libextension_data_loader.a,libextension_module.a:$HEADERS_PATH"
@@ -51,7 +51,6 @@ usage() {
   echo "  --mps Include this flag to build the Metal Performance Shaders backend."
   echo "  --optimized Include this flag to build the Optimized backend."
   echo "  --portable Include this flag to build the Portable backend."
-  echo "  --quantized Include this flag to build the Quantized backend."
   echo "  --xnnpack Include this flag to build the XNNPACK backend."
   echo
   echo "Example:"
@@ -74,7 +73,6 @@ for arg in "$@"; do
     --mps) MPS=ON ;;
     --optimized) OPTIMIZED=ON ;;
     --portable) PORTABLE=ON ;;
-    --quantized) QUANTIZED=ON ;;
     --xnnpack) XNNPACK=ON ;;
     *)
       if [[ -z "$SOURCE_ROOT_DIR" ]]; then
@@ -137,7 +135,6 @@ cmake_build() {
     -DEXECUTORCH_BUILD_CUSTOM=$CUSTOM \
     -DEXECUTORCH_BUILD_MPS=$MPS \
     -DEXECUTORCH_BUILD_OPTIMIZED=$OPTIMIZED \
-    -DEXECUTORCH_BUILD_QUANTIZED=$QUANTIZED \
     -DEXECUTORCH_BUILD_XNNPACK=$XNNPACK \
     ${platform_flag:+-DIOS_PLATFORM=$platform_flag}
   cmake --build . --config $MODE
@@ -181,7 +178,7 @@ append_framework_flag "$CUSTOM" "$CUSTOM_FRAMEWORK"
 append_framework_flag "$MPS" "$MPS_FRAMEWORK"
 append_framework_flag "$OPTIMIZED" "$OPTIMIZED_FRAMEWORK"
 append_framework_flag "$PORTABLE" "$PORTABLE_FRAMEWORK"
-append_framework_flag "$QUANTIZED" "$QUANTIZED_FRAMEWORK"
+append_framework_flag "ON" "$QUANTIZED_FRAMEWORK"
 append_framework_flag "$XNNPACK" "$XNNPACK_FRAMEWORK"

 "$SOURCE_ROOT_DIR"/build/create_frameworks.sh "${FRAMEWORK_FLAGS[@]}"

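Correspondingly, the Apple frameworks build no longer accepts a --quantized flag; the quantized framework is always appended. A hypothetical invocation (the source root path and the chosen backend flags are placeholders):

# the quantized framework is built unconditionally, so only the other backends are opted into
./build/build_apple_frameworks.sh /path/to/executorch --optimized --portable --xnnpack
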
build/executorch-config.cmake

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ set(lib_list
   etdump bundled_program extension_data_loader ${FLATCCRT_LIB} mpsdelegate
   qnn_executorch_backend portable_ops_lib extension_module xnnpack_backend
   XNNPACK cpuinfo pthreadpool vulkan_backend optimized_kernels cpublas eigen_blas
-  optimized_ops_lib optimized_native_cpu_ops_lib
+  optimized_ops_lib optimized_native_cpu_ops_lib quantized_kernels quantized_ops_lib
 )
 foreach(lib ${lib_list})
   # Name of the variable which stores result of the find_library search

examples/models/llama2/CMakeLists.txt

Lines changed: 22 additions & 0 deletions
@@ -44,6 +44,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
 set(TORCH_ROOT ${EXECUTORCH_ROOT}/third-party/pytorch)

 include(${EXECUTORCH_ROOT}/build/Utils.cmake)
+include(${EXECUTORCH_ROOT}/build/Codegen.cmake)

 if(NOT PYTHON_EXECUTABLE)
   resolve_python_executable()
@@ -91,6 +92,7 @@ add_subdirectory(runner)
 if(EXECUTORCH_USE_TIKTOKEN)
   # find RE2 for tokenizer
   set(ABSL_ENABLE_INSTALL ON)
+  set(ABSL_PROPAGATE_CXX_STD ON)
   set(_pic_flag
       ${CMAKE_POSITION_INDEPENDENT_CODE})
   set(CMAKE_POSITION_INDEPENDENT_CODE ON)
@@ -118,6 +120,26 @@ else()
   target_link_options_shared_lib(portable_ops_lib)
 endif()

+# quantized ops yaml file operation
+merge_yaml(
+  FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/ops/quantized.yaml
+  FALLBACK_YAML ${EXECUTORCH_ROOT}/kernels/quantized/quantized.yaml
+  OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})
+
+gen_selected_ops("${CMAKE_CURRENT_BINARY_DIR}/merged.yaml" "" "")
+generate_bindings_for_kernels(
+  FUNCTIONS_YAML ${CMAKE_CURRENT_BINARY_DIR}/merged.yaml)
+message("Generated files ${gen_command_sources}")
+
+# quantized_merge_ops_lib: Register quantized op kernels into the runtime
+gen_operators_lib(
+  "quantized_merge_ops_lib"
+  KERNEL_LIBS quantized_kernels
+  DEPS executorch)
+target_include_directories(quantized_merge_ops_lib PUBLIC ${_common_include_directories})
+target_link_options_shared_lib(quantized_merge_ops_lib)
+list(APPEND link_libraries quantized_kernels quantized_merge_ops_lib)
+
 if(EXECUTORCH_BUILD_CUSTOM)
   target_link_options_shared_lib(custom_ops)
   list(APPEND link_libraries custom_ops)

examples/models/llama2/ops/quantized.yaml

Lines changed: 2 additions & 2 deletions
@@ -1,10 +1,10 @@
-- func: llama_quantized::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)
+- func: llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
     - arg_meta: null
       kernel_name: torch::executor::quantized_embedding_byte_out

-- func: llama_quantized::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
     - arg_meta: null

examples/models/llama2/ops/quantized_ops.py

Lines changed: 14 additions & 8 deletions
@@ -15,22 +15,22 @@
     "llama_quantized", "DEF"
 )  # to not be confused with torch.ops.quantized.* ops.
 quantized_lib.define(
-    "embedding_byte(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "DEPRECATED_DO_NOT_USE_embedding_byte(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
     "int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor",
 )

 quantized_lib.define(
-    "embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "DEPRECATED_DO_NOT_USE_embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
     "int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)",
 )

 quantized_lib.define(
-    "embedding_byte.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "DEPRECATED_DO_NOT_USE_embedding_byte.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
     "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor",
 )

 quantized_lib.define(
-    "embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "DEPRECATED_DO_NOT_USE_embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
     "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)",
 )

@@ -66,7 +66,9 @@ def embedding_byte_weight_checks(weight, weight_scales, weight_zero_points):
     ), f"Expecting weight_zero_points tensor to be None or have same number of rows as weights, but found {weight.size()} and {weight_zero_points.size()}"


-@impl(quantized_lib, "embedding_byte", "CompositeExplicitAutograd")
+@impl(
+    quantized_lib, "DEPRECATED_DO_NOT_USE_embedding_byte", "CompositeExplicitAutograd"
+)
 def embedding_byte(
     weight: torch.Tensor,
     weight_scales: torch.Tensor,
@@ -92,7 +94,7 @@ def embedding_byte(
     return torch.ops.aten.embedding.default(weight, indices)


-@impl_abstract("llama_quantized::embedding_byte.out")
+@impl_abstract("llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.out")
 def embedding_byte_out_meta(
     weight: torch.Tensor,
     weight_scales: torch.Tensor,
@@ -112,7 +114,11 @@ def embedding_byte_out_meta(
     )


-@impl(quantized_lib, "embedding_byte.dtype", "CompositeExplicitAutograd")
+@impl(
+    quantized_lib,
+    "DEPRECATED_DO_NOT_USE_embedding_byte.dtype",
+    "CompositeExplicitAutograd",
+)
 def embedding_byte_dtype(
     weight: torch.Tensor,
     weight_scales: torch.Tensor,
@@ -140,7 +146,7 @@ def embedding_byte_dtype(
     return torch.ops.aten.embedding.default(weight, indices)


-@impl_abstract("llama_quantized::embedding_byte.dtype_out")
+@impl_abstract("llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.dtype_out")
 def embedding_byte_dtype_out_meta(
     weight: torch.Tensor,
     weight_scales: torch.Tensor,

examples/models/llama2/quant_lib.py

Lines changed: 1 addition & 1 deletion
@@ -105,7 +105,7 @@ def check_embedding_byte_registered():
             'Use `python -c "import torch as _; print(_.__path__)"` to find where torch package is installed.\n'
             "Set that as TORCH_PACKAGE_DIR.\n"
             "Then from root executorch dir do the following:\n"
-            "rm -rf cmake-out && mkdir cmake-out && (cd cmake-out && cmake -DBUCK2=<path-to-buck2> -DCMAKE_PREFIX_PATH=$TORCH_PACKAGE_DIR -DEXECUTORCH_BUILD_QUANTIZED=ON ..) && cmake --build . -j16\n"
+            "rm -rf cmake-out && mkdir cmake-out && (cd cmake-out && cmake -DBUCK2=<path-to-buck2> -DCMAKE_PREFIX_PATH=$TORCH_PACKAGE_DIR -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON ..) && cmake --build . -j16\n"
             'To find the location of the lib: find cmake-out -name "libquantized_ops_aot_lib*"\n'
             "Then specify the said library via -s <path to libquantized_ops_aot_lib.so\n"
         )
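
Expanded for readability, the rebuild recipe embedded in that error message amounts to roughly the following; the buck2 path is a placeholder, and deriving TORCH_PACKAGE_DIR from torch.__path__ follows the hint in the preceding lines of the message:

# point CMake at the installed torch package (placeholder derivation)
export TORCH_PACKAGE_DIR=$(python -c "import torch; print(torch.__path__[0])")
rm -rf cmake-out && mkdir cmake-out
(cd cmake-out && cmake -DBUCK2=<path-to-buck2> \
  -DCMAKE_PREFIX_PATH=$TORCH_PACKAGE_DIR \
  -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON ..)
cmake --build cmake-out -j16
# locate the AOT library to pass to export_llama via -s
find cmake-out -name "libquantized_ops_aot_lib*"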

examples/models/llama2/quantize.py

Lines changed: 1 addition & 1 deletion
@@ -377,7 +377,7 @@ def __init__(

     @torch.no_grad()
     def forward(self, indices: torch.Tensor) -> torch.Tensor:
-        return torch.ops.llama_quantized.embedding_byte.dtype(
+        return torch.ops.llama_quantized.DEPRECATED_DO_NOT_USE_embedding_byte.dtype(
             self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype
         )

examples/models/llama2/runner/targets.bzl

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ def _get_operator_lib(aten = False):
     if aten:
         return ["//executorch/kernels/aten:generated_lib"]
     elif runtime.is_oss:
-        return ["//executorch/kernels/portable:generated_lib", "//executorch/examples/models/llama2/custom_ops:custom_ops"]
+        return ["//executorch/kernels/portable:generated_lib", "//executorch/examples/models/llama2/custom_ops:custom_ops", "//executorch/examples/models/llama2/ops:generated_lib"]
     else:
         return ["//executorch/configurations:optimized_native_cpu_ops", "//executorch/examples/models/llama2/custom_ops:custom_ops", "//executorch/examples/models/llama2/ops:generated_lib"]

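As a quick sanity check that the newly added OSS dependency resolves, the generated_lib target from the list above can be built directly; a hedged sketch, since the exact Buck2 setup may vary:

buck2 build //executorch/examples/models/llama2/ops:generated_lib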

kernels/quantized/CMakeLists.txt

Lines changed: 4 additions & 1 deletion
@@ -10,6 +10,9 @@
 # ~~~
 cmake_minimum_required(VERSION 3.19)

+option(EXECUTORCH_BUILD_QUANTIZED_OPS_AOT
+       "Build the optimized ops library for AOT export usage" OFF)
+
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 if(NOT CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 17)
@@ -49,7 +52,7 @@ message("Generated files ${gen_command_sources}")
 # quantized_ops_aot_lib quantized_ops_lib but none of these is a common
 # dependency of the other(s). This is not allowed by the Xcode "new build
 # system".
-if(NOT CMAKE_GENERATOR STREQUAL "Xcode")
+if(NOT CMAKE_GENERATOR STREQUAL "Xcode" AND EXECUTORCH_BUILD_QUANTIZED_OPS_AOT)
   # Build a AOT library to register quantized ops into PyTorch. This is a hack.
   set(_quantized_sources
     ${_quantized_kernels__srcs}
