Commit af604d5

ggml-qnn: update script build-run-android.sh to compare performance of ggml-qnn
1 parent 986a37d commit af604d5

File tree: 2 files changed (+56 −7 lines)


scripts/build-run-android.sh

Lines changed: 43 additions & 5 deletions
@@ -7,13 +7,17 @@ ANDROID_PLATFORM=android-34
 ANDROID_NDK=${PWD}/android-ndk-r26c
 REMOTE_PATH=/data/local/tmp/
 GGUF_MODEL_NAME=/sdcard/deepseek-r1-distill-qwen-1.5b-q4_0.gguf
+GGUF_MODEL_NAME=/sdcard/qwen1_5-1_8b-chat-q4_0.gguf
 
 #QNN SDK could be found at:
 #https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk
 #https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools
 QNN_SDK_URL=https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk
 QNN_SDK_PATH=/opt/qcom/aistack/qairt/2.31.0.250130/
 
+#default is QNN NPU
+qnnbackend=2
+
 function dump_vars()
 {
     echo -e "ANDROID_NDK: ${ANDROID_NDK}"
@@ -137,10 +141,28 @@ function run_llamacli()
 
     adb shell "cd ${REMOTE_PATH} \
         && export LD_LIBRARY_PATH=${REMOTE_PATH} \
-        && ${REMOTE_PATH}/llama-cli -mg 2 -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\""
+        && ${REMOTE_PATH}/llama-cli -mg ${qnnbackend} -m ${GGUF_MODEL_NAME} -p \"introduce the movie Once Upon a Time in America briefly.\n\""
 
 }
 
+
+function run_llamabench()
+{
+    check_qnn_libs
+
+    if [ -f ./out/android/bin/libggml-qnn.so ]; then
+        adb push ./out/android/bin/*.so ${REMOTE_PATH}/
+    fi
+    adb push ./out/android/bin/llama-bench ${REMOTE_PATH}/
+    adb shell chmod +x ${REMOTE_PATH}/llama-bench
+
+    adb shell "cd ${REMOTE_PATH} \
+        && export LD_LIBRARY_PATH=${REMOTE_PATH} \
+        && ${REMOTE_PATH}/llama-bench -mg ${qnnbackend} -m ${GGUF_MODEL_NAME}"
+
+}
+
+
 function run_test-backend-ops()
 {
     check_qnn_libs
@@ -163,8 +185,9 @@ function show_usage()
     echo "Usage:"
     echo " $0 build"
     echo " $0 updateqnnlib"
-    echo " $0 run_llamacli"
     echo " $0 run_testop"
+    echo " $0 run_llamacli 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)"
+    echo " $0 run_llamabench 0 (QNN_CPU) / 1 (QNN_GPU) / 2 (QNN_NPU) / 3 (ggml)"
     echo -e "\n\n\n"
 }
 
@@ -186,15 +209,30 @@ elif [ $# == 1 ]; then
     elif [ "$1" == "build" ]; then
         build_ggml_qnn
         exit 0
-    elif [ "$1" == "run_llamacli" ]; then
-        run_llamacli
-        exit 0
+
     elif [ "$1" == "run_testop" ]; then
         run_test-backend-ops
         exit 0
     elif [ "$1" == "updateqnnlib" ]; then
         update_qnn_libs
         exit 0
+    else
+        show_usage
+        exit 1
+    fi
+elif [ $# == 2 ]; then
+    qnnbackend=$2
+    if [ ${qnnbackend} -gt 3 ]; then
+        show_usage
+        exit 1
+    fi
+
+    if [ "$1" == "run_llamacli" ]; then
+        run_llamacli
+        exit 0
+    elif [ "$1" == "run_llamabench" ]; then
+        run_llamabench
+        exit 0
     fi
 else
     show_usage
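
With these pieces in place, comparing ggml-qnn against the stock ggml backend comes down to running llama-bench twice with different backend indices and the same model. A minimal sketch of such a session (assuming a device visible to adb and the GGUF model already pushed to /sdcard; the index values follow the usage text above):

    ./scripts/build-run-android.sh build
    ./scripts/build-run-android.sh updateqnnlib
    ./scripts/build-run-android.sh run_llamabench 2   # QNN NPU
    ./scripts/build-run-android.sh run_llamabench 3   # default ggml backend (baseline)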

src/llama.cpp

Lines changed: 13 additions & 2 deletions
@@ -14,6 +14,10 @@
 #include "ggml-backend.h"
 #include "ggml-cpp.h"
 
+#ifdef GGML_USE_QNN
+#include "ggml-qnn.h"
+#endif
+
 #include <algorithm>
 #include <array>
 #include <cassert>
@@ -9710,11 +9714,18 @@ struct llama_context * llama_init_from_model(
     // add ACCEL backends (such as BLAS)
     for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
         ggml_backend_dev_t dev = ggml_backend_dev_get(i);
+
+#ifdef GGML_USE_QNN // avoid side-effect to other backends
+        if (QNN_BACKEND_GGML == model->params.main_gpu) {
+            break;
+        }
+#endif
         if (ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_ACCEL) {
+            ggml_backend_t backend = nullptr;
 #ifndef GGML_USE_QNN
-            ggml_backend_t backend = ggml_backend_dev_init(dev, nullptr);
+            backend = ggml_backend_dev_init(dev, nullptr);
 #else
-            ggml_backend_t backend = ggml_backend_dev_init(dev, reinterpret_cast<const char *>(model->params.main_gpu));
+            backend = ggml_backend_dev_init(dev, reinterpret_cast<const char *>(model->params.main_gpu));
 #endif
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize %s backend\n", __func__, ggml_backend_dev_name(dev));
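
This guard is what makes backend index 3 meaningful on the llama.cpp side: the index chosen on the script's command line reaches llama-cli/llama-bench via -mg, lands in model->params.main_gpu, and in the QNN build is smuggled into ggml_backend_dev_init() through its const char * params argument via the reinterpret_cast. When the index equals QNN_BACKEND_GGML, the early break skips ACCEL backend initialization entirely, so the run exercises the unmodified ggml path. Presumably QNN_BACKEND_GGML is defined in ggml-qnn.h as the value 3 used by the script; that definition is not shown in this diff.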
