Commit 986a37d

ggml-qnn: sanity check
1 parent 74029f3 commit 986a37d

File tree

1 file changed: +31 -36 lines changed

ggml/src/ggml-qnn/ggml-qnn.cpp

Lines changed: 31 additions & 36 deletions
@@ -1,9 +1,6 @@
 /*
  * Copyright (c) 2023-2024 The ggml authors
  *
- * this is implementation of ggml-qnn(ggml-qnn backend of Qualcomm QNN(Qualcomm Neural Network,
- * aka Qualcomm AI Engine Direct)
- *
  * Qualcomm QNN SDK and reference tech guides could be found at:
  * https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk
  * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools
@@ -17,7 +14,7 @@
  * section-6 does implementation of ggml-qnn backend according to ggml's backend subsystem
  *
  * currently only provide GGML_OP_ADD's QNN backend implementation:
- * - GGML_OP_ADD: this is skeleton, can expand other ggml ops as expertise
+ * - GGML_OP_ADD: this is skeleton, can expand other ggml ops according to expertise
  *
  * of course, can porting ggml-qnn to Windows on ARM as need.
  *
@@ -105,10 +102,6 @@ class qnn_instance;
 struct ggml_backend_qnn_context;
 static int free_qnn_tensor(Qnn_Tensor_t * tensor);
 static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-#if (defined __ANDROID__) || (defined ANDROID)
-extern "C" int __android_log_print(int prio, const char * tag, const char * fmt, ...) __attribute__((__format__(printf, 3, 4)));
-#endif
 static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...);
 
 // =================================================================================================
@@ -142,13 +135,13 @@ static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const
     int len = vsnprintf(s_ggmlqnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args);
     if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) {
 #if (defined __ANDROID__) || (defined ANDROID)
-        //for Android APK
+        //for Android application(standard APP or command line tool)
         __android_log_print(ANDROID_LOG_INFO, "ggml-qnn", "%s\n", s_ggmlqnn_log_internal_buf);
 #endif
 #if (defined __ANDROID__) || (defined ANDROID)
-        //do nothing when running on Android phone
+        //do nothing when running on Snapdragon based Android device
 #else
-        //for Windows on ARM
+        //for Snapdragon based WoA(Windows on ARM) device
         printf("%s\n", s_ggmlqnn_log_internal_buf);
 #endif
     }
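For reference, the routing pattern above (format once with vsnprintf, then send to logcat on Android and stdout elsewhere) looks like this as a minimal standalone sketch; the function name, tag, and buffer size here are illustrative, not part of this commit:

#include <cstdarg>
#include <cstdio>
#if defined(__ANDROID__) || defined(ANDROID)
#include <android/log.h>
#endif

// minimal sketch of the platform-conditional logging used by ggmlqnn_log_internal
static void example_log(const char * format, ...) {
    char buf[4096];                            // illustrative buffer size
    va_list args;
    va_start(args, format);
    vsnprintf(buf, sizeof(buf), format, args); // format into the buffer once
    va_end(args);
#if defined(__ANDROID__) || defined(ANDROID)
    // Android (standard app or command line tool): route to logcat
    __android_log_print(ANDROID_LOG_INFO, "example", "%s\n", buf);
#else
    // everything else, e.g. a Snapdragon based WoA device: plain stdout
    printf("%s\n", buf);
#endif
}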
@@ -851,7 +844,6 @@ static int free_qnn_tensor(Qnn_Tensor_t * tensor) {
             free(src_qparam.bwAxisScaleOffsetEncoding.offsets);
         }
     }
-    //GGMLQNN_LOG_DEBUG("tensor dims %p", QNN_TENSOR_GET_DIMENSIONS(*tensor));
     free(QNN_TENSOR_GET_DIMENSIONS(*tensor));
     free(tensor);
 
@@ -1367,8 +1359,8 @@ static struct qcom_socinfo * qnn_get_socinfo_from_socmodel(uint32_t soc_model) {
     return nullptr;
 }
 
-static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
-                                const ggml_tensor * src1, ggml_tensor * dst) {
+static bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
+                                    const ggml_tensor * src1, ggml_tensor * dst) {
     if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) {
         GGMLQNN_LOG_WARN("invalid params\n");
         return false;
@@ -1383,9 +1375,9 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tenso
     return true;
 }
 
-#define CHECK_PARAMS(ctx, src0, src1, dst) \
+#define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst) \
     do { \
-        if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) { \
+        if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) { \
             return; \
         } \
     } while (0)
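A sketch of how the renamed guard macro is meant to be used at the top of a void op function (the op function below is hypothetical; ggml_qnn_add and ggml_qnn_mul_mat in this commit follow the same shape):

// hypothetical caller: validate params and bail out early on failure
static void example_qnn_op(ggml_backend_qnn_context * ctx, ggml_tensor * op) {
    const ggml_tensor * src0 = op->src[0];
    const ggml_tensor * src1 = op->src[1];
    ggml_tensor * dst = op;
    GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst); // returns from the caller if any pointer is null
    // ... build and execute the QNN graph ...
}

The do { ... } while (0) wrapper makes the macro expand to a single statement, so it composes safely with surrounding if/else at the call site.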
@@ -1516,7 +1508,7 @@ static ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) {
     return GGML_TYPE_COUNT;
 }
 
-//TODO:
+//TODO: add more ops
 static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) {
     switch (ggmlop) {
         case GGML_OP_ADD:
@@ -1540,7 +1532,7 @@ static void append_tensor_dimensions(const ggml_tensor * tensor, std::string & o
     int len = 0;
     switch (ggml_n_dims(tensor)) {
         case 1:
-            len = snprintf(buffer, sizeof(buffer), "%ld%s", (long)tensor->ne[0], type_name);
+            len = snprintf(buffer, sizeof(buffer), "%ldx1%s", (long)tensor->ne[0], type_name);
            break;
         case 2:
             len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name);
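For illustration (values assumed, not from this commit): with tensor->ne[0] == 4096 and type_name == "f32", the old format produced "4096f32" while the new one produces "4096x1f32", so 1-D tensors follow the same NxM shape notation as higher ranks.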
@@ -1913,7 +1905,7 @@ class qnn_instance {
     void unregister_rpcmem();
     void unregister_rpcmem(Qnn_MemHandle_t mem_handle);
 
-    void *alloc_rpcmem(size_t bytes, size_t alignment);
+    void * alloc_rpcmem(size_t bytes, size_t alignment);
 
     void free_rpcmem(void * buf);
 
@@ -2252,7 +2244,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t *
     _loaded_lib_handle[backend_id] = lib_handle;
     _backend_id = backend_id;
 
-#if 0 //not used in PR, keep them here for further use
+#if 0 // keep them here for further use
     QnnSaver_Config_t outputdir_cfg;
     outputdir_cfg.option = QNN_SAVER_CONFIG_OPTION_OUTPUT_DIRECTORY;
     outputdir_cfg.outputDirectory = "/data/local/tmp/";
@@ -2307,8 +2299,8 @@ int qnn_instance::load_system() {
     _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL);
     if (nullptr == _system_lib_handle) {
         GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror());
-        //re-try with Android APK's internal QNN runtime lib path
-        _lib_path = "/data/data/com.cdeos.kantv/qnnlib/";
+        //re-try with default path of QNN binary runtime lib
+        _lib_path = "/data/local/tmp/";
         system_lib_path = _lib_path + "libQnnSystem.so";
         _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL);
         if (nullptr == _system_lib_handle) {
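The retry above follows a common dlopen fallback pattern; a minimal standalone sketch of that idea (function name and error handling illustrative, not from this commit):

#include <dlfcn.h>
#include <cstdio>
#include <string>

// try the configured location first, then fall back to a default path
static void * open_with_fallback(const std::string & primary, const std::string & fallback) {
    void * handle = dlopen(primary.c_str(), RTLD_NOW | RTLD_LOCAL);
    if (nullptr == handle) {
        fprintf(stderr, "dlopen %s failed: %s, retrying %s\n",
                primary.c_str(), dlerror(), fallback.c_str());
        handle = dlopen(fallback.c_str(), RTLD_NOW | RTLD_LOCAL);
    }
    return handle; // caller checks for nullptr and dlclose()s when done
}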
@@ -2604,7 +2596,6 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
     }
     _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info);
 
-    //TODO: faster approach to probe the accurate capacity of QNN RPC ion memory
     size_t candidate_size = 0;
     uint8_t * rpc_buffer = nullptr;
     const int SIZE_IN_MB = (1 << 20);
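The candidate_size loop that follows in the file probes usable RPC buffer capacity by trial allocation; a generic sketch of that probing idea, with hypothetical alloc_fn and free_fn standing in for the rpcmem allocator:

#include <cstddef>

// hedged sketch: find the largest size an allocator will grant by trying
// progressively larger candidates; alloc_fn and free_fn are hypothetical
static size_t probe_capacity_mb(void * (*alloc_fn)(size_t), void (*free_fn)(void *)) {
    const size_t SIZE_IN_MB = 1u << 20;
    size_t granted_mb = 0;
    for (size_t mb = 32; mb <= 1024; mb *= 2) { // illustrative candidate sizes
        void * buf = alloc_fn(mb * SIZE_IN_MB);
        if (nullptr == buf)
            break;                              // allocation refused: stop probing
        granted_mb = mb;
        free_fn(buf);
    }
    return granted_mb;
}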
@@ -2648,7 +2639,7 @@ int qnn_instance::qnn_finalize() {
     //FIXME:should be removed in the future
     reset_idx();
 
-    if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's mobile SoC equipped low-end phone happy
+    if (nullptr != _pfn_rpc_mem_deinit)
         _pfn_rpc_mem_deinit();
 
     if (dlclose(_rpc_lib_handle) != 0) {
@@ -2922,8 +2913,8 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum
     }
 #if 0
     //TODO: offload mul_mat to QNN backend
-    //we need to process type traint in func ggml_qnn_mul_mat(...) with following case:
-    //src0: q4_0, q6_k
+    //need to process type trait in func ggml_qnn_mul_mat(...):
+    //src0: q4_0, q6_k, ...
     //src1: f32
     //dst : f32
     return (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16)
@@ -2959,13 +2950,15 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) {
     Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
     Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
     Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32;
+    const ggml_tensor * src0 = op->src[0];
+    const ggml_tensor * src1 = op->src[1];
+    ggml_tensor * dst = op;
+
+    GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst);
 
     instance = ctx->instance;
     QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
 
-    const ggml_tensor * src0 = op->src[0];
-    const ggml_tensor * src1 = op->src[1];
-    ggml_tensor * dst = op;
     op_perf.start();
 
     std::string map_entry;
@@ -3174,17 +3167,17 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) {
 #endif
 }
 
-//TODO: type trait with op->src[0]
+//TODO:
 /*
- * the procedure of ggml_qnn_mul_mat is similar to ggml_qnn_add,but there are type trait process
- * for ggml_qnn_mul_mat, so it's a standalone function.
+ * the logic of ggml_qnn_mul_mat is similar to ggml_qnn_add,but type trait and matrix transpose are required
+ * for offload mulmat to QNN backend, so it's a standalone function.
  *
  * MUL_MAT take most of the compute time (about 95%).so to speed up llama inference, we should focus on MUL_MAT.
  *
  * we have three kinds of MUL_MAT to compute:
 * mul_mat_f32: both src0 and src1 are F32, this will be naturally handled in QNN backend
 * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1
- * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...) and src1 is F32, quantize in src0 -> f32 in src0', then src0' * src1
+ * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...) and src1 is F32, src0 -> f32 in src0', then src0' * src1
 */
 static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
     Qnn_ErrorHandle_t error = QNN_SUCCESS;
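As a worked sketch of the mul_mat_q_f32 path described in the comment above: dequantize src0 to f32 first, then run a plain f32 matmul. The layout (row-major A; B stored as N rows of length K) and the dequantize callback are illustrative stand-ins, not ggml's actual type-trait API:

#include <cstddef>
#include <vector>

// sketch: src0 (quantized) -> f32 in src0', then src0' * src1
static void mul_mat_q_f32_sketch(const void * a_quant, const float * b, float * c,
                                 int M, int K, int N,
                                 void (*dequantize)(const void * q, float * f, int n)) {
    std::vector<float> a((size_t) M * K);
    dequantize(a_quant, a.data(), M * K);   // step 1: dequantize all of src0
    for (int m = 0; m < M; m++) {           // step 2: naive f32 matmul
        for (int n = 0; n < N; n++) {
            float sum = 0.0f;
            for (int k = 0; k < K; k++) {
                sum += a[(size_t) m * K + k] * b[(size_t) n * K + k];
            }
            c[(size_t) m * N + n] = sum;
        }
    }
}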
@@ -3205,13 +3198,15 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
     Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
     Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
     Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32;
+    const ggml_tensor * src0 = op->src[0];
+    const ggml_tensor * src1 = op->src[1];
+    ggml_tensor * dst = op;
+
+    GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst);
 
     instance = ctx->instance;
     QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
 
-    const ggml_tensor * src0 = op->src[0];
-    const ggml_tensor * src1 = op->src[1];
-    ggml_tensor * dst = op;
     op_perf.start();
 
     std::string map_entry;

0 commit comments
