
Commit 69551ba

Add quantized op support to llama runner

ghstack-source-id: 5ebb8b6
Pull Request resolved: #3062

1 parent 25a5595

File tree: 7 files changed, +52 -11 lines

.ci/scripts/test_llama.sh
.ci/scripts/test_quantized_aot_lib.sh
.github/workflows/pull.yml
build/executorch-config.cmake
examples/models/llama2/CMakeLists.txt
examples/models/llama2/quant_lib.py
kernels/quantized/CMakeLists.txt

.ci/scripts/test_llama.sh
Lines changed: 19 additions & 6 deletions

```diff
@@ -12,7 +12,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 MODEL_NAME=$1 # stories110M.pt
 BUILD_TOOL=$2 # buck2 or cmake
 DTYPE=$3 # fp16 or fp32
-MODE=${4:-"xnnpack"} # portable or xnnpack
+MODE=${4:-"xnnpack+custom"} # portable or xnnpack+custom or xnnpack+custom+qe
 if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
   echo "Expecting atleast 4 positional arguments"
   echo "Usage: [...]"
@@ -37,7 +37,7 @@ if [[ -z "${MODE:-}" ]]; then
   exit 1
 fi
 
-if [[ "${MODE}" =~ xnnpack.* ]]; then
+if [[ "${MODE}" =~ .*xnnpack.* ]]; then
   XNNPACK=ON
 else
   XNNPACK=OFF
@@ -49,6 +49,12 @@ else
   CUSTOM=OFF
 fi
 
+if [[ "${MODE}" =~ .*qe.* ]]; then
+  QE=ON
+else
+  QE=OFF
+fi
+
 if [[ -z "${BUCK:-}" ]]; then
   BUCK=buck2
 fi
@@ -69,6 +75,7 @@ cmake_install_executorch_libraries() {
     -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
     -DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
     -DEXECUTORCH_BUILD_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_QUANTIZED="$QE" \
     -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out .
@@ -84,7 +91,7 @@ cmake_build_llama_runner() {
     -DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
     -DEXECUTORCH_BUILD_OPTIMIZED=ON \
     -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
-    -DEXECUTORCH_BUILD_OPTIMIZED=ON \
+    -DEXECUTORCH_BUILD_QUANTIZED="$QE" \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
     -Bcmake-out/${dir} \
     ${dir}
@@ -126,9 +133,15 @@ fi
 # Export model.
 EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte"
 echo "Exporting ${EXPORTED_MODEL_NAME}"
-EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME}"
-if [[ "${MODE}" == "xnnpack+kv+custom" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} -kv --use_sdpa_with_kv_cache -X -qmode 8da4w -G 128"
+EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv"
+if [[ "${XNNPACK}" == "ON" ]]; then
+  EXPORT_ARGS="${EXPORT_ARGS} -X -qmode 8da4w -G 128"
+fi
+if [[ "${CUSTOM}" == "ON" ]]; then
+  EXPORT_ARGS="${EXPORT_ARGS} --use_sdpa_with_kv_cache"
+fi
+if [[ "${QE}" == "ON" ]]; then
+  EXPORT_ARGS="${EXPORT_ARGS} --embedding-quantize '8,1024'"
 fi
 # Add dynamically linked library location
 $PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS}
```
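The mode string is matched by substring, so features compose by joining tokens with `+`. As a usage sketch (assuming the repo root as the working directory and `stories110M.pt` already downloaded, as the rest of the script expects):

```bash
# Exercise the full new feature set: XNNPACK delegation, the custom SDPA
# op, and quantized embeddings ("qe").
# Positional args: model, build tool, dtype, mode.
bash .ci/scripts/test_llama.sh stories110M.pt cmake fp32 xnnpack+custom+qe

# Portable mode still works and leaves XNNPACK, CUSTOM, and QE all OFF.
bash .ci/scripts/test_llama.sh stories110M.pt cmake fp32 portable
```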

.ci/scripts/test_quantized_aot_lib.sh
Lines changed: 1 addition & 1 deletion

```diff
@@ -24,7 +24,7 @@ build_cmake_quantized_aot_lib() {
   && retry cmake -DBUCK2=buck2 \
     -DCMAKE_BUILD_TYPE=Release \
     -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
-    -DEXECUTORCH_BUILD_QUANTIZED=ON \
+    -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON \
     -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
 
 cmake --build ${CMAKE_OUTPUT_DIR} -j4
```

.github/workflows/pull.yml
Lines changed: 1 addition & 1 deletion

```diff
@@ -90,7 +90,7 @@ jobs:
       matrix:
         dtype: [fp32]
         build-tool: [buck2, cmake]
-        mode: [portable, xnnpack+kv+custom]
+        mode: [portable, xnnpack+custom, xnnpack+custom+qe]
       fail-fast: false
     with:
       runner: linux.2xlarge
```

build/executorch-config.cmake
Lines changed: 1 addition & 1 deletion

```diff
@@ -38,7 +38,7 @@ set(lib_list
     etdump bundled_program extension_data_loader ${FLATCCRT_LIB} mpsdelegate
     qnn_executorch_backend portable_ops_lib extension_module xnnpack_backend
     XNNPACK cpuinfo pthreadpool vulkan_backend optimized_kernels cpublas eigen_blas
-    optimized_ops_lib optimized_native_cpu_ops_lib
+    optimized_ops_lib optimized_native_cpu_ops_lib quantized_kernels quantized_ops_lib
 )
 foreach(lib ${lib_list})
   # Name of the variable which stores result of the find_library search
```

examples/models/llama2/CMakeLists.txt
Lines changed: 25 additions & 0 deletions

```diff
@@ -44,6 +44,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
 set(TORCH_ROOT ${EXECUTORCH_ROOT}/third-party/pytorch)
 
 include(${EXECUTORCH_ROOT}/build/Utils.cmake)
+include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
 
 if(NOT PYTHON_EXECUTABLE)
   resolve_python_executable()
@@ -91,6 +92,7 @@ add_subdirectory(runner)
 if(EXECUTORCH_USE_TIKTOKEN)
   # find RE2 for tokenizer
   set(ABSL_ENABLE_INSTALL ON)
+  set(ABSL_PROPAGATE_CXX_STD ON)
   set(_pic_flag
       ${CMAKE_POSITION_INDEPENDENT_CODE})
   set(CMAKE_POSITION_INDEPENDENT_CODE ON)
@@ -118,6 +120,29 @@ else()
   target_link_options_shared_lib(portable_ops_lib)
 endif()
 
+if(EXECUTORCH_BUILD_QUANTIZED)
+  # TODO(larryliu0820): after we delete llama_quantized ops we should be able to reuse
+  # quantized_kernels and quantized_ops_lib directly.
+  merge_yaml(
+    FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/ops/quantized.yaml
+    FALLBACK_YAML ${EXECUTORCH_ROOT}/kernels/quantized/quantized.yaml
+    OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})
+
+  gen_selected_ops("${CMAKE_CURRENT_BINARY_DIR}/merged.yaml" "" "")
+  generate_bindings_for_kernels(
+    FUNCTIONS_YAML ${CMAKE_CURRENT_BINARY_DIR}/merged.yaml)
+  message("Generated files ${gen_command_sources}")
+
+  # quantized_merge_ops_lib: Register quantized op kernels into the runtime
+  gen_operators_lib(
+    "quantized_merge_ops_lib"
+    KERNEL_LIBS quantized_kernels
+    DEPS executorch)
+  target_include_directories(quantized_merge_ops_lib PUBLIC ${_common_include_directories})
+  target_link_options_shared_lib(quantized_merge_ops_lib)
+  list(APPEND link_libraries quantized_kernels quantized_merge_ops_lib)
+endif()
+
 if(EXECUTORCH_BUILD_CUSTOM)
   target_link_options_shared_lib(custom_ops)
   list(APPEND link_libraries custom_ops)
```
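Nothing in this file turns `EXECUTORCH_BUILD_QUANTIZED` on by itself; it is passed in from outside, as `test_llama.sh` does above. A minimal sketch of configuring the runner build manually with the quantized branch active follows; the `-B` output path mirrors the CI script and is an assumption, and it presumes the core executorch libraries were already installed by the earlier `cmake_install_executorch_libraries` step:

```bash
# Configure and build the llama runner with the merged quantized ops lib.
# EXECUTORCH_BUILD_QUANTIZED gates the quantized_merge_ops_lib target
# added in the CMakeLists.txt diff above.
cmake -DCMAKE_BUILD_TYPE=Release \
      -DEXECUTORCH_BUILD_QUANTIZED=ON \
      -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
      -Bcmake-out/examples/models/llama2 \
      examples/models/llama2
cmake --build cmake-out/examples/models/llama2 -j4 --config Release
```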

examples/models/llama2/quant_lib.py
Lines changed: 1 addition & 1 deletion

```diff
@@ -105,7 +105,7 @@ def check_embedding_byte_registered():
         'Use `python -c "import torch as _; print(_.__path__)"` to find where torch package is installed.\n'
         "Set that as TORCH_PACKAGE_DIR.\n"
         "Then from root executorch dir do the following:\n"
-        "rm -rf cmake-out && mkdir cmake-out && (cd cmake-out && cmake -DBUCK2=<path-to-buck2> -DCMAKE_PREFIX_PATH=$TORCH_PACKAGE_DIR -DEXECUTORCH_BUILD_QUANTIZED=ON ..) && cmake --build . -j16\n"
+        "rm -rf cmake-out && mkdir cmake-out && (cd cmake-out && cmake -DBUCK2=<path-to-buck2> -DCMAKE_PREFIX_PATH=$TORCH_PACKAGE_DIR -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON ..) && cmake --build . -j16\n"
         'To find the location of the lib: find cmake-out -name "libquantized_ops_aot_lib*"\n'
         "Then specify the said library via -s <path to libquantized_ops_aot_lib.so\n"
     )
```

kernels/quantized/CMakeLists.txt
Lines changed: 4 additions & 1 deletion

```diff
@@ -10,6 +10,9 @@
 # ~~~
 cmake_minimum_required(VERSION 3.19)
 
+option(EXECUTORCH_BUILD_QUANTIZED_OPS_AOT
+       "Build the optimized ops library for AOT export usage" OFF)
+
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 if(NOT CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 17)
@@ -49,7 +52,7 @@ message("Generated files ${gen_command_sources}")
 # quantized_ops_aot_lib quantized_ops_lib but none of these is a common
 # dependency of the other(s). This is not allowed by the Xcode "new build
 # system".
-if(NOT CMAKE_GENERATOR STREQUAL "Xcode")
+if(NOT CMAKE_GENERATOR STREQUAL "Xcode" AND EXECUTORCH_BUILD_QUANTIZED_OPS_AOT)
   # Build a AOT library to register quantized ops into PyTorch. This is a hack.
   set(_quantized_sources
     ${_quantized_kernels__srcs}
```
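With the AOT library now opt-in, the recipe in the `quant_lib.py` error message above is the reference for building it standalone; the sketch below restates it, with `<path-to-buck2>` and `TORCH_PACKAGE_DIR` left as placeholders the caller must fill in:

```bash
# Build only the AOT quantized ops library used at export time.
# EXECUTORCH_BUILD_QUANTIZED_OPS_AOT is the option introduced above;
# the buck2 path and TORCH_PACKAGE_DIR must be supplied by the user.
rm -rf cmake-out && mkdir cmake-out \
  && (cd cmake-out && cmake -DBUCK2=<path-to-buck2> \
        -DCMAKE_PREFIX_PATH=$TORCH_PACKAGE_DIR \
        -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON ..) \
  && cmake --build cmake-out -j16

# Locate the resulting library, then pass it to export_llama via -s:
find cmake-out -name "libquantized_ops_aot_lib*"
```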
