Skip to content

Commit 44f704e

Browse files
committed
Update base for Update on "[ET-VK] Enable additional specialization constants in compute shaders"
## Context Building on top of the previous changeset in the stack, this changeset modifies shader dispatch APIs to accept additional specialization constants for a shader. Differential Revision: [D56225042](https://our.internmc.facebook.com/intern/diff/D56225042/) [ghstack-poisoned]
2 parents 9237abc + bae0387 commit 44f704e

File tree

26 files changed

+417
-54
lines changed

26 files changed

+417
-54
lines changed

.ci/scripts/test_llama.sh

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
1212
MODEL_NAME=$1 # stories110M.pt
1313
BUILD_TOOL=$2 # buck2 or cmake
1414
DTYPE=$3 # fp16 or fp32
15-
MODE=${4:-"xnnpack"} # portable or xnnpack
15+
MODE=${4:-"xnnpack+custom"} # portable or xnnpack+custom or xnnpack+custom+qe
1616
if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
1717
echo "Expecting atleast 4 positional arguments"
1818
echo "Usage: [...]"
@@ -37,7 +37,7 @@ if [[ -z "${MODE:-}" ]]; then
3737
exit 1
3838
fi
3939

40-
if [[ "${MODE}" =~ xnnpack.* ]]; then
40+
if [[ "${MODE}" =~ .*xnnpack.* ]]; then
4141
XNNPACK=ON
4242
else
4343
XNNPACK=OFF
@@ -49,6 +49,12 @@ else
4949
CUSTOM=OFF
5050
fi
5151

52+
if [[ "${MODE}" =~ .*qe.* ]]; then
53+
QE=ON
54+
else
55+
QE=OFF
56+
fi
57+
5258
if [[ -z "${BUCK:-}" ]]; then
5359
BUCK=buck2
5460
fi
@@ -84,7 +90,6 @@ cmake_build_llama_runner() {
8490
-DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
8591
-DEXECUTORCH_BUILD_OPTIMIZED=ON \
8692
-DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
87-
-DEXECUTORCH_BUILD_OPTIMIZED=ON \
8893
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
8994
-Bcmake-out/${dir} \
9095
${dir}
@@ -126,9 +131,15 @@ fi
126131
# Export model.
127132
EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte"
128133
echo "Exporting ${EXPORTED_MODEL_NAME}"
129-
EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME}"
130-
if [[ "${MODE}" == "xnnpack+kv+custom" ]]; then
131-
EXPORT_ARGS="${EXPORT_ARGS} -kv --use_sdpa_with_kv_cache -X -qmode 8da4w -G 128"
134+
EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv"
135+
if [[ "${XNNPACK}" == "ON" ]]; then
136+
EXPORT_ARGS="${EXPORT_ARGS} -X -qmode 8da4w -G 128"
137+
fi
138+
if [[ "${CUSTOM}" == "ON" ]]; then
139+
EXPORT_ARGS="${EXPORT_ARGS} --use_sdpa_with_kv_cache"
140+
fi
141+
if [[ "${QE}" == "ON" ]]; then
142+
EXPORT_ARGS="${EXPORT_ARGS} --embedding-quantize 8,1024"
132143
fi
133144
# Add dynamically linked library location
134145
$PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS}

.ci/scripts/test_quantized_aot_lib.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ build_cmake_quantized_aot_lib() {
2424
&& retry cmake -DBUCK2=buck2 \
2525
-DCMAKE_BUILD_TYPE=Release \
2626
-DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
27-
-DEXECUTORCH_BUILD_QUANTIZED=ON \
27+
-DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON \
2828
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
2929

3030
cmake --build ${CMAKE_OUTPUT_DIR} -j4

.github/workflows/android.yml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,8 @@ on:
1010
- .ci/docker/**
1111
- .github/workflows/android.yml
1212
- install_requirements.sh
13-
- examples/demo-apps/**
13+
- examples/demo-apps/android/**
14+
- extension/android/**
1415
- extension/module/**
1516
workflow_dispatch:
1617

@@ -101,7 +102,7 @@ jobs:
101102
android-app-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/app-debug.apk
102103
android-test-archive: https://gha-artifacts.s3.amazonaws.com/${{ github.repository }}/${{ github.run_id }}/artifact/app-debug-androidTest.apk
103104
# The test spec can be downloaded from https://ossci-assets.s3.amazonaws.com/android-llama2-device-farm-test-spec.yml
104-
test-spec: arn:aws:devicefarm:us-west-2:308535385114:upload:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/414cb54d-4d83-4576-8317-93244e4dc50e
105+
test-spec: arn:aws:devicefarm:us-west-2:308535385114:upload:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/abd86868-fa63-467e-a5c7-218194665a77
105106
# The exported llama2 model and its tokenizer can be downloaded from https://ossci-assets.s3.amazonaws.com/executorch-android-llama2-7b.zip.
106107
# Among the inputs, this is the biggest file, and uploading it to AWS beforehand makes the test run much faster.
107108
extra-data: arn:aws:devicefarm:us-west-2:308535385114:upload:02a2cf0f-6d9b-45ee-ba1a-a086587469e6/bd15825b-ddab-4e47-9fef-a9c8935778dd

.github/workflows/doc-build.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,13 @@ jobs:
6868
make html
6969
cd ..
7070
71+
# If this is the main branch, add a noindex tag to all .html files to exclude them from Google Search indexing.
72+
GITHUB_REF=${{ github.ref }}
73+
echo "GitHub Ref: ${GITHUB_REF}"
74+
if [[ "${{ github.ref }}" == 'refs/heads/main' ]]; then
75+
find docs/_build/html/ -name "*.html" -print0 | xargs -0 sed -i '/<head>/a \ \ <meta name="robots" content="noindex">';
76+
fi
77+
7178
cp -rf docs/_build/html/* "${RUNNER_DOCS_DIR}"
7279
7380
mv docs/_build/html "${RUNNER_ARTIFACT_DIR}"

.github/workflows/pull.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,7 @@ jobs:
9090
matrix:
9191
dtype: [fp32]
9292
build-tool: [buck2, cmake]
93-
mode: [portable, xnnpack+kv+custom]
93+
mode: [portable, xnnpack+custom, xnnpack+custom+qe]
9494
fail-fast: false
9595
with:
9696
runner: linux.2xlarge

CMakeLists.txt

Lines changed: 2 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -164,8 +164,6 @@ option(EXECUTORCH_BUILD_QNN "Build the Qualcomm backend" OFF)
164164

165165
option(EXECUTORCH_BUILD_OPTIMIZED "Build the optimized kernels" OFF)
166166

167-
option(EXECUTORCH_BUILD_QUANTIZED "Build the quantized kernels" OFF)
168-
169167
option(EXECUTORCH_BUILD_SDK "Build the ExecuTorch SDK")
170168

171169
option(EXECUTORCH_BUILD_SIZE_TEST "Build the size test" OFF)
@@ -413,9 +411,7 @@ if(EXECUTORCH_BUILD_OPTIMIZED)
413411
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
414412
endif()
415413

416-
if(EXECUTORCH_BUILD_QUANTIZED)
417-
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized)
418-
endif()
414+
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized)
419415

420416
add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/configurations)
421417

@@ -445,19 +441,14 @@ cmake_dependent_option(
445441
EXECUTORCH_BUILD_HOST_TARGETS OFF)
446442
if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
447443
# Baseline libraries that executor_runner will link against.
448-
set(_executor_runner_libs executorch gflags)
444+
set(_executor_runner_libs executorch gflags quantized_ops_lib)
449445

450446
if(EXECUTORCH_BUILD_OPTIMIZED)
451447
list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
452448
else()
453449
list(APPEND _executor_runner_libs portable_ops_lib)
454450
endif()
455451

456-
# Generate lib to register quantized ops
457-
if(EXECUTORCH_BUILD_QUANTIZED)
458-
list(APPEND _executor_runner_libs quantized_ops_lib)
459-
endif()
460-
461452
add_executable(executor_runner ${_executor_runner__srcs})
462453
if(CMAKE_BUILD_TYPE STREQUAL "Release" AND NOT APPLE)
463454
target_link_options(executor_runner PRIVATE "LINKER:--gc-sections")

backends/vulkan/runtime/api/Pipeline.cpp

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -157,9 +157,7 @@ bool operator==(const SpecVar& lhs, const SpecVar& rhs) {
157157
return false;
158158
}
159159

160-
SpecVarList::SpecVarList() {
161-
vars.reserve(8);
162-
}
160+
SpecVarList::SpecVarList() {}
163161

164162
SpecVarList::SpecVarList(std::initializer_list<SpecVar> init_list) {
165163
vars.resize(init_list.size());
@@ -176,7 +174,7 @@ std::vector<VkSpecializationMapEntry> SpecVarList::generate_map_entries()
176174
map_entries.resize(vars.size());
177175
uint32_t cur_offset = 0u;
178176
for (uint32_t i = 0; i < vars.size(); ++i) {
179-
map_entries[i] = {
177+
map_entries.at(i) = {
180178
i, cur_offset + vars.at(i).val_offset(), vars.at(i).val_size()};
181179
cur_offset += sizeof(SpecVar);
182180
}

backends/vulkan/runtime/api/Pipeline.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,12 +53,17 @@ struct SpecVar final {
5353

5454
bool operator==(const SpecVar& lhs, const SpecVar& rhs);
5555

56-
struct SpecVarList final {
56+
class SpecVarList final {
5757
std::vector<SpecVar> vars;
5858

59+
public:
5960
SpecVarList();
6061
SpecVarList(std::initializer_list<SpecVar> init_list);
6162

63+
inline const SpecVar& at(const size_t index) const {
64+
return vars.at(index);
65+
}
66+
6267
inline const SpecVar* data() const {
6368
return vars.data();
6469
}
@@ -235,7 +240,7 @@ class ComputePipelineCache final {
235240
seed = utils::hash_combine(seed, std::hash<uint32_t>()(spec_vars.size()));
236241

237242
for (int i = 0; i < spec_vars.size(); ++i) {
238-
const SpecVar& spec_var = spec_vars.vars.at(i);
243+
const SpecVar& spec_var = spec_vars.at(i);
239244
size_t new_seed = 0;
240245
switch (spec_var.type) {
241246
case SpecVar::Type::FLOAT:

build/Utils.cmake

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -74,8 +74,6 @@ function(executorch_print_configuration_summary)
7474
STATUS " EXECUTORCH_BUILD_QNN : ${EXECUTORCH_BUILD_QNN}")
7575
message(STATUS " EXECUTORCH_BUILD_OPTIMIZED : "
7676
"${EXECUTORCH_BUILD_OPTIMIZED}")
77-
message(STATUS " EXECUTORCH_BUILD_QUANTIZED : "
78-
"${EXECUTORCH_BUILD_QUANTIZED}")
7977
message(
8078
STATUS " EXECUTORCH_BUILD_SDK : ${EXECUTORCH_BUILD_SDK}")
8179
message(

build/build_apple_frameworks.sh

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ CUSTOM=OFF
2222
MPS=OFF
2323
OPTIMIZED=OFF
2424
PORTABLE=OFF
25-
QUANTIZED=OFF
25+
QUANTIZED=ON
2626
XNNPACK=OFF
2727
HEADERS_PATH="include"
2828
EXECUTORCH_FRAMEWORK="executorch:libexecutorch.a,libexecutorch_no_prim_ops.a,libextension_apple.a,libextension_data_loader.a,libextension_module.a:$HEADERS_PATH"
@@ -51,7 +51,6 @@ usage() {
5151
echo " --mps Include this flag to build the Metal Performance Shaders backend."
5252
echo " --optimized Include this flag to build the Optimized backend."
5353
echo " --portable Include this flag to build the Portable backend."
54-
echo " --quantized Include this flag to build the Quantized backend."
5554
echo " --xnnpack Include this flag to build the XNNPACK backend."
5655
echo
5756
echo "Example:"
@@ -74,7 +73,6 @@ for arg in "$@"; do
7473
--mps) MPS=ON ;;
7574
--optimized) OPTIMIZED=ON ;;
7675
--portable) PORTABLE=ON ;;
77-
--quantized) QUANTIZED=ON ;;
7876
--xnnpack) XNNPACK=ON ;;
7977
*)
8078
if [[ -z "$SOURCE_ROOT_DIR" ]]; then
@@ -137,7 +135,6 @@ cmake_build() {
137135
-DEXECUTORCH_BUILD_CUSTOM=$CUSTOM \
138136
-DEXECUTORCH_BUILD_MPS=$MPS \
139137
-DEXECUTORCH_BUILD_OPTIMIZED=$OPTIMIZED \
140-
-DEXECUTORCH_BUILD_QUANTIZED=$QUANTIZED \
141138
-DEXECUTORCH_BUILD_XNNPACK=$XNNPACK \
142139
${platform_flag:+-DIOS_PLATFORM=$platform_flag}
143140
cmake --build . --config $MODE
@@ -181,7 +178,7 @@ append_framework_flag "$CUSTOM" "$CUSTOM_FRAMEWORK"
181178
append_framework_flag "$MPS" "$MPS_FRAMEWORK"
182179
append_framework_flag "$OPTIMIZED" "$OPTIMIZED_FRAMEWORK"
183180
append_framework_flag "$PORTABLE" "$PORTABLE_FRAMEWORK"
184-
append_framework_flag "$QUANTIZED" "$QUANTIZED_FRAMEWORK"
181+
append_framework_flag "ON" "$QUANTIZED_FRAMEWORK"
185182
append_framework_flag "$XNNPACK" "$XNNPACK_FRAMEWORK"
186183

187184
"$SOURCE_ROOT_DIR"/build/create_frameworks.sh "${FRAMEWORK_FLAGS[@]}"

build/executorch-config.cmake

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ set(lib_list
3838
etdump bundled_program extension_data_loader ${FLATCCRT_LIB} mpsdelegate
3939
qnn_executorch_backend portable_ops_lib extension_module xnnpack_backend
4040
XNNPACK cpuinfo pthreadpool vulkan_backend optimized_kernels cpublas eigen_blas
41-
optimized_ops_lib optimized_native_cpu_ops_lib
41+
optimized_ops_lib optimized_native_cpu_ops_lib quantized_kernels quantized_ops_lib
4242
)
4343
foreach(lib ${lib_list})
4444
# Name of the variable which stores result of the find_library search
Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
version: 0.1
2+
3+
android_test_host: amazon_linux_2
4+
5+
phases:
6+
install:
7+
commands:
8+
9+
pre_test:
10+
commands:
11+
# Prepare the model and the tokenizer
12+
- adb -s $DEVICEFARM_DEVICE_UDID shell "ls -la /sdcard/"
13+
- adb -s $DEVICEFARM_DEVICE_UDID shell "mkdir -p /data/local/tmp/llama/"
14+
- adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/tokenizer.bin /data/local/tmp/llama/tokenizer.bin"
15+
- adb -s $DEVICEFARM_DEVICE_UDID shell "mv /sdcard/xnnpack_llama2.pte /data/local/tmp/llama/xnnpack_llama2.pte"
16+
- adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/llama/tokenizer.bin"
17+
- adb -s $DEVICEFARM_DEVICE_UDID shell "chmod 664 /data/local/tmp/llama/xnnpack_llama2.pte"
18+
- adb -s $DEVICEFARM_DEVICE_UDID shell "ls -la /data/local/tmp/llama/"
19+
20+
test:
21+
commands:
22+
# By default, the following ADB command is used by Device Farm to run your Instrumentation test.
23+
# Please refer to Android's documentation for more options on running instrumentation tests with adb:
24+
# https://developer.android.com/studio/test/command-line#run-tests-with-adb
25+
- echo "Starting the Instrumentation test"
26+
- |
27+
adb -s $DEVICEFARM_DEVICE_UDID shell "am instrument -r -w --no-window-animation \
28+
$DEVICEFARM_TEST_PACKAGE_NAME/$DEVICEFARM_TEST_PACKAGE_RUNNER 2>&1 || echo \": -1\"" |
29+
tee $DEVICEFARM_LOG_DIR/instrument.log
30+
31+
# Parse the results
32+
- |-
33+
INSTRUMENT_LOG="$DEVICEFARM_LOG_DIR/instrument.log"
34+
35+
DID_ANY_TESTS_START=$(grep "INSTRUMENTATION_STATUS_CODE: 1" $INSTRUMENT_LOG | wc -l);
36+
TESTS_PASSED=$(grep "INSTRUMENTATION_STATUS_CODE: 0" $INSTRUMENT_LOG | wc -l);
37+
TESTS_ERRORED=$(grep "INSTRUMENTATION_STATUS_CODE: -1" $INSTRUMENT_LOG | wc -l);
38+
TESTS_FAILED=$(grep "INSTRUMENTATION_STATUS_CODE: -2" $INSTRUMENT_LOG | wc -l);
39+
TESTS_IGNORED=$(grep "INSTRUMENTATION_STATUS_CODE: -3" $INSTRUMENT_LOG | wc -l);
40+
TESTS_ASSUMPTION_FAILED=$(grep "INSTRUMENTATION_STATUS_CODE: -4" $INSTRUMENT_LOG | wc -l);
41+
TESTS_PROCESSES_CRASHED=$(grep "INSTRUMENTATION_RESULT: shortMsg=Process crashed." $INSTRUMENT_LOG | wc -l);
42+
43+
# And print the results so that the CI job can show them later
44+
- |
45+
INSTRUMENT_LOG="$DEVICEFARM_LOG_DIR/instrument.log"
46+
47+
if [ $DID_ANY_TESTS_START -eq 0 ];
48+
then
49+
echo "[PyTorch] Marking the test suite as failed because no tests started!";
50+
false;
51+
elif [ $TESTS_FAILED -ne 0 ];
52+
then
53+
OBSERVED_TPS=$(grep "The observed TPS " $INSTRUMENT_LOG | tail -n 1)
54+
55+
if [ -n "${OBSERVED_TPS}" ];
56+
then
57+
echo "[PyTorch] ${OBSERVED_TPS}";
58+
else
59+
echo "[PyTorch] Marking the test suite as failed because it failed to load the model";
60+
fi
61+
elif [ $TESTS_ERRORED -ne 0 ];
62+
then
63+
echo "[PyTorch] Marking the test suite as failed because $TESTS_ERRORED tests errored!";
64+
false;
65+
elif [ $TESTS_PROCESSES_CRASHED -ne 0 ];
66+
then
67+
echo "[PyTorch] Marking the test suite as failed because the app crashed due to OOM!";
68+
false;
69+
fi;
70+
71+
post_test:
72+
commands:
73+
74+
artifacts:
75+
# By default, Device Farm will collect your artifacts from the $DEVICEFARM_LOG_DIR directory.
76+
- $DEVICEFARM_LOG_DIR

examples/models/llama2/CMakeLists.txt

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
4444
set(TORCH_ROOT ${EXECUTORCH_ROOT}/third-party/pytorch)
4545

4646
include(${EXECUTORCH_ROOT}/build/Utils.cmake)
47+
include(${EXECUTORCH_ROOT}/build/Codegen.cmake)
4748

4849
if(NOT PYTHON_EXECUTABLE)
4950
resolve_python_executable()
@@ -91,6 +92,7 @@ add_subdirectory(runner)
9192
if(EXECUTORCH_USE_TIKTOKEN)
9293
# find RE2 for tokenizer
9394
set(ABSL_ENABLE_INSTALL ON)
95+
set(ABSL_PROPAGATE_CXX_STD ON)
9496
set(_pic_flag
9597
${CMAKE_POSITION_INDEPENDENT_CODE})
9698
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
@@ -118,6 +120,26 @@ else()
118120
target_link_options_shared_lib(portable_ops_lib)
119121
endif()
120122

123+
# Merge the llama2 quantized ops yaml with the fallback quantized kernels yaml
124+
merge_yaml(
125+
FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/ops/quantized.yaml
126+
FALLBACK_YAML ${EXECUTORCH_ROOT}/kernels/quantized/quantized.yaml
127+
OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})
128+
129+
gen_selected_ops("${CMAKE_CURRENT_BINARY_DIR}/merged.yaml" "" "")
130+
generate_bindings_for_kernels(
131+
FUNCTIONS_YAML ${CMAKE_CURRENT_BINARY_DIR}/merged.yaml)
132+
message("Generated files ${gen_command_sources}")
133+
134+
# quantized_merge_ops_lib: Register quantized op kernels into the runtime
135+
gen_operators_lib(
136+
"quantized_merge_ops_lib"
137+
KERNEL_LIBS quantized_kernels
138+
DEPS executorch)
139+
target_include_directories(quantized_merge_ops_lib PUBLIC ${_common_include_directories})
140+
target_link_options_shared_lib(quantized_merge_ops_lib)
141+
list(APPEND link_libraries quantized_kernels quantized_merge_ops_lib)
142+
121143
if(EXECUTORCH_BUILD_CUSTOM)
122144
target_link_options_shared_lib(custom_ops)
123145
list(APPEND link_libraries custom_ops)

0 commit comments

Comments
 (0)