/*
 * Copyright (c) 2023-2024 The ggml authors
 *
- * this is implementation of ggml-qnn(ggml-qnn backend of Qualcomm QNN(Qualcomm Neural Network,
- * aka Qualcomm AI Engine Direct)
- *
 * Qualcomm QNN SDK and reference tech guides could be found at:
 * https://www.qualcomm.com/developer/software/qualcomm-ai-engine-direct-sdk
 * https://developer.qualcomm.com/software/hexagon-dsp-sdk/tools
 * section-6 implements the ggml-qnn backend according to ggml's backend subsystem
 *
 * currently only GGML_OP_ADD's QNN backend implementation is provided:
- * - GGML_OP_ADD: this is skeleton, can expand other ggml ops as expertise
+ * - GGML_OP_ADD: this is a skeleton; other ggml ops can be added according to expertise
 *
 * of course, ggml-qnn can be ported to Windows on ARM as needed.
 *
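Since the header describes the backend as a skeleton, a rough sketch of how another op would hook in may help: map the new ggml op in `qnn_opname_from_ggmlop()` (see the hunk further down) and route it in a compute dispatch along these lines. The dispatch function below is hypothetical; only `GGML_OP_ADD` and the `mul_mat` draft exist in this PR:

```cpp
// hypothetical dispatch sketch: extending the GGML_OP_ADD skeleton to another
// op means adding a sibling of ggml_qnn_add() and a case here
static void ggml_qnn_compute_forward_sketch(ggml_backend_t backend, ggml_tensor * op) {
    switch (op->op) {
        case GGML_OP_ADD:
            ggml_qnn_add(backend, op);      // provided by this PR
            break;
        case GGML_OP_MUL_MAT:
            ggml_qnn_mul_mat(backend, op);  // standalone because of type traits (see below)
            break;
        default:
            GGML_ASSERT(false && "op not supported by the ggml-qnn skeleton yet");
            break;
    }
}
```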
@@ -105,10 +102,6 @@ class qnn_instance;
 struct ggml_backend_qnn_context;
 static int free_qnn_tensor(Qnn_Tensor_t * tensor);
 static enum ggml_status ggml_backend_qnn_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph);
-
-#if (defined __ANDROID__) || (defined ANDROID)
-extern "C" int __android_log_print(int prio, const char * tag, const char * fmt, ...) __attribute__((__format__(printf, 3, 4)));
-#endif
 static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const char * func, int line, const char * format, ...);

 // =================================================================================================
@@ -142,13 +135,13 @@ static void ggmlqnn_log_internal(ggml_log_level level, const char * file, const
     int len = vsnprintf(s_ggmlqnn_log_internal_buf + len_prefix, GGML_QNN_LOGBUF_LEN - len_prefix, format, args);
     if (len < (GGML_QNN_LOGBUF_LEN - len_prefix)) {
 #if (defined __ANDROID__) || (defined ANDROID)
-        // for Android APK
+        // for Android applications (standard app or command-line tool)
         __android_log_print(ANDROID_LOG_INFO, "ggml-qnn", "%s\n", s_ggmlqnn_log_internal_buf);
 #endif
 #if (defined __ANDROID__) || (defined ANDROID)
-        // do nothing when running on Android phone
+        // do nothing when running on a Snapdragon-based Android device
 #else
-        // for Windows on ARM
+        // for a Snapdragon-based WoA (Windows on ARM) device
         printf("%s\n", s_ggmlqnn_log_internal_buf);
 #endif
     }
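For context, the `GGMLQNN_LOG_*` macros used throughout the diff presumably wrap `ggmlqnn_log_internal()` along these lines (the exact macro bodies sit outside the hunks shown here, so treat this as a sketch):

```cpp
// plausible wrappers feeding ggmlqnn_log_internal() with call-site metadata
#define GGMLQNN_LOG_INFO(...)  ggmlqnn_log_internal(GGML_LOG_LEVEL_INFO,  __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
#define GGMLQNN_LOG_WARN(...)  ggmlqnn_log_internal(GGML_LOG_LEVEL_WARN,  __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)
#define GGMLQNN_LOG_DEBUG(...) ggmlqnn_log_internal(GGML_LOG_LEVEL_DEBUG, __FILE__, __FUNCTION__, __LINE__, __VA_ARGS__)

// usage: routed to logcat on Android, to stdout on Windows on ARM
// GGMLQNN_LOG_INFO("device soc model %d", soc_model);
```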
@@ -851,7 +844,6 @@ static int free_qnn_tensor(Qnn_Tensor_t * tensor) {
             free(src_qparam.bwAxisScaleOffsetEncoding.offsets);
         }
     }
-    // GGMLQNN_LOG_DEBUG("tensor dims %p", QNN_TENSOR_GET_DIMENSIONS(*tensor));
     free(QNN_TENSOR_GET_DIMENSIONS(*tensor));
     free(tensor);
@@ -1367,8 +1359,8 @@ static struct qcom_socinfo * qnn_get_socinfo_from_socmodel(uint32_t soc_model) {
     return nullptr;
 }

-static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
-                                const ggml_tensor * src1, ggml_tensor * dst) {
+static bool ggmlqnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tensor * src0,
+                                    const ggml_tensor * src1, ggml_tensor * dst) {
     if ((nullptr == ctx) || (nullptr == src0) || (nullptr == src1) || (nullptr == dst)) {
         GGMLQNN_LOG_WARN("invalid params\n");
         return false;
@@ -1383,9 +1375,9 @@ static bool qnn_is_valid_params(ggml_backend_qnn_context * ctx, const ggml_tenso
     return true;
 }

-#define CHECK_PARAMS(ctx, src0, src1, dst)                              \
+#define GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst)                      \
     do {                                                                \
-        if (!qnn_is_valid_params((ctx), (src0), (src1), (dst))) {       \
+        if (!ggmlqnn_is_valid_params((ctx), (src0), (src1), (dst))) {   \
             return;                                                     \
         }                                                               \
     } while (0)
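The renamed macro is an early-return guard at the top of each op implementation, exactly as the `ggml_qnn_add` hunk further down uses it. In short (`ggml_qnn_example_op` is a hypothetical name for illustration):

```cpp
// usage sketch: the macro guards every QNN op entry point and simply
// returns on invalid input, so the body never touches null tensors
static void ggml_qnn_example_op(ggml_backend_qnn_context * ctx, ggml_tensor * op) {
    const ggml_tensor * src0 = op->src[0];
    const ggml_tensor * src1 = op->src[1];
    ggml_tensor * dst        = op;

    GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst);  // early return when any param is invalid

    // ... actual op offload follows ...
}
```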
@@ -1516,7 +1508,7 @@ static ggml_type ggml_datatype_from_qnn_datatype(Qnn_DataType_t qnn_type) {
     return GGML_TYPE_COUNT;
 }

-// TODO:
+// TODO: add more ops
 static const char * qnn_opname_from_ggmlop(enum ggml_op ggmlop) {
     switch (ggmlop) {
         case GGML_OP_ADD:
@@ -1540,7 +1532,7 @@ static void append_tensor_dimensions(const ggml_tensor * tensor, std::string & o
     int len = 0;
     switch (ggml_n_dims(tensor)) {
         case 1:
-            len = snprintf(buffer, sizeof(buffer), "%ld%s", (long)tensor->ne[0], type_name);
+            len = snprintf(buffer, sizeof(buffer), "%ldx1%s", (long)tensor->ne[0], type_name);
             break;
         case 2:
             len = snprintf(buffer, sizeof(buffer), "%ldx%ld%s", (long)tensor->ne[0], (long)tensor->ne[1], type_name);
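To make the change concrete, a tiny standalone sketch of the two formats (the values are made up):

```cpp
#include <cstdio>

int main() {
    char buffer[64];
    long ne0 = 4096;
    // old 1-D format "%ld%s"   -> "4096f32"
    // new 1-D format "%ldx1%s" -> "4096x1f32", consistent with the 2-D "%ldx%ld%s" form
    snprintf(buffer, sizeof(buffer), "%ldx1%s", ne0, "f32");
    printf("%s\n", buffer);   // prints: 4096x1f32
    return 0;
}
```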
@@ -1913,7 +1905,7 @@ class qnn_instance {
     void unregister_rpcmem();
     void unregister_rpcmem(Qnn_MemHandle_t mem_handle);

-    void *alloc_rpcmem(size_t bytes, size_t alignment);
+    void * alloc_rpcmem(size_t bytes, size_t alignment);

     void free_rpcmem(void * buf);
@@ -2252,7 +2244,7 @@ int qnn_instance::load_backend(std::string & lib_path, const QnnSaver_Config_t *
     _loaded_lib_handle[backend_id] = lib_handle;
     _backend_id = backend_id;

-#if 0 //not used in PR, keep them here for further use
+#if 0 // keep them here for future use
     QnnSaver_Config_t outputdir_cfg;
     outputdir_cfg.option = QNN_SAVER_CONFIG_OPTION_OUTPUT_DIRECTORY;
     outputdir_cfg.outputDirectory = "/data/local/tmp/";
@@ -2307,8 +2299,8 @@ int qnn_instance::load_system() {
     _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL);
     if (nullptr == _system_lib_handle) {
         GGMLQNN_LOG_WARN("can not open QNN library %s, error: %s\n", system_lib_path.c_str(), dlerror());
-        // re-try with Android APK's internal QNN runtime lib path
-        _lib_path = "/data/data/com.cdeos.kantv/qnnlib/";
+        // retry with the default path of the QNN runtime libs
+        _lib_path = "/data/local/tmp/";
         system_lib_path = _lib_path + "libQnnSystem.so";
         _system_lib_handle = dlopen(system_lib_path.c_str(), RTLD_NOW | RTLD_LOCAL);
         if (nullptr == _system_lib_handle) {
@@ -2604,7 +2596,6 @@ int qnn_instance::qnn_init(const QnnSaver_Config_t ** saver_config) {
     }
     _qnn_raw_interface.deviceFreePlatformInfo(nullptr, p_info);

-    // TODO: faster approach to probe the accurate capacity of QNN RPC ion memory
     size_t candidate_size = 0;
     uint8_t * rpc_buffer = nullptr;
     const int SIZE_IN_MB = (1 << 20);
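The removed TODO concerns how the RPC ion memory capacity is probed. Judging from the surrounding variables (`candidate_size`, `rpc_buffer`, `SIZE_IN_MB`), a trial-allocation loop along these lines is plausible; the probe sizes and the function-pointer indirection are assumptions, not the PR's actual code:

```cpp
#include <cstddef>
#include <cstdint>
#include <initializer_list>

using rpc_alloc_fn = void * (*)(size_t bytes, size_t alignment);
using rpc_free_fn  = void   (*)(void * buf);

// trial-allocation probe sketch: try progressively larger buffers and keep
// the largest size that succeeds as the usable RPC memory capacity
static size_t probe_rpcmem_capacity(rpc_alloc_fn alloc_fn, rpc_free_fn free_fn) {
    const size_t SIZE_IN_MB = 1 << 20;
    size_t candidate_size = 0;   // largest size (in MB) that allocated successfully
    for (size_t probe_mb : { 1, 2, 4, 8, 16, 32, 64, 128, 256 }) {
        uint8_t * rpc_buffer = static_cast<uint8_t *>(alloc_fn(probe_mb * SIZE_IN_MB, 4));
        if (rpc_buffer == nullptr) {
            break;               // allocation failed: the capacity limit was hit
        }
        candidate_size = probe_mb;
        free_fn(rpc_buffer);     // release the trial buffer immediately
    }
    return candidate_size * SIZE_IN_MB;
}
```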
@@ -2648,7 +2639,7 @@ int qnn_instance::qnn_finalize() {
     // FIXME: should be removed in the future
     reset_idx();

-    if (nullptr != _pfn_rpc_mem_deinit) // make Qualcomm's mobile SoC equipped low-end phone happy
+    if (nullptr != _pfn_rpc_mem_deinit)
         _pfn_rpc_mem_deinit();

     if (dlclose(_rpc_lib_handle) != 0) {
@@ -2922,8 +2913,8 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor, bool b_dum
     }
 #if 0
     //TODO: offload mul_mat to QNN backend
-    //we need to process type traint in func ggml_qnn_mul_mat(...) with following case:
-    //src0: q4_0, q6_k
+    //need to process type traits in func ggml_qnn_mul_mat(...):
+    //src0: q4_0, q6_k, ...
     //src1: f32
     //dst : f32
     return (src1->type == GGML_TYPE_F32 || src1->type == GGML_TYPE_F16)
@@ -2959,13 +2950,15 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) {
     Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
     Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
     Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32;
+    const ggml_tensor * src0 = op->src[0];
+    const ggml_tensor * src1 = op->src[1];
+    ggml_tensor * dst = op;
+
+    GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst);

     instance = ctx->instance;
     QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;

-    const ggml_tensor * src0 = op->src[0];
-    const ggml_tensor * src1 = op->src[1];
-    ggml_tensor * dst = op;
     op_perf.start();

     std::string map_entry;
@@ -3174,17 +3167,17 @@ static void ggml_qnn_add(ggml_backend_t backend, ggml_tensor * op) {
 #endif
 }

-// TODO: type trait with op->src[0]
+// TODO:
 /*
- * the procedure of ggml_qnn_mul_mat is similar to ggml_qnn_add,but there are type trait process
- * for ggml_qnn_mul_mat , so it's a standalone function.
+ * the logic of ggml_qnn_mul_mat is similar to ggml_qnn_add, but type traits and a matrix transpose
+ * are required to offload mul_mat to the QNN backend, so it's a standalone function.
  *
  * MUL_MAT takes most of the compute time (about 95%), so to speed up llama inference we should focus on MUL_MAT.
  *
  * we have three kinds of MUL_MAT to compute:
  * mul_mat_f32:     both src0 and src1 are F32, this will be naturally handled in the QNN backend
  * mul_mat_f16_f32: src0 is F16 and src1 is F32, f16 in src0 -> f32 in src0', then src0' * src1
- * mul_mat_q_f32: src0 is quantized (Q4_0, Q4_1, ...) and src1 is F32, quantize in src0 -> f32 in src0', then src0' * src1
+ * mul_mat_q_f32:   src0 is quantized (Q4_0, Q4_1, ...) and src1 is F32, src0 is dequantized to f32 in src0', then src0' * src1
  */
 static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
     Qnn_ErrorHandle_t error = QNN_SUCCESS;
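For the mul_mat_q_f32 case described in the comment above, the dequantize step would presumably lean on ggml's type traits. A minimal sketch, assuming a fully contiguous src0 and the `ggml_internal_get_type_traits()` interface of this era of ggml (the helper name is hypothetical; row-stride handling for non-contiguous tensors is elided):

```cpp
#include "ggml.h"

// dequantize a quantized src0 (q4_0, q6_k, ...) row by row into an f32
// scratch buffer wdata, producing the src0' the comment above refers to
static void dequantize_src0_to_f32(const ggml_tensor * src0, float * wdata) {
    const ggml_type_traits_t traits = ggml_internal_get_type_traits(src0->type);
    const int64_t nrows = ggml_nrows(src0);
    const int64_t ne0   = src0->ne[0];
    for (int64_t r = 0; r < nrows; r++) {
        // nb[1] is the per-row byte stride of the quantized data
        traits.to_float((const char *) src0->data + r * src0->nb[1],
                        wdata + r * ne0, ne0);
    }
}
```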
@@ -3205,13 +3198,15 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
     Qnn_DataType_t src0_qnn_type = QNN_DATATYPE_FLOAT_32;
     Qnn_DataType_t src1_qnn_type = QNN_DATATYPE_FLOAT_32;
     Qnn_DataType_t dst_qnn_type = QNN_DATATYPE_FLOAT_32;
+    const ggml_tensor * src0 = op->src[0];
+    const ggml_tensor * src1 = op->src[1];
+    ggml_tensor * dst = op;
+
+    GGMLQNN_CHECK_PARAMS(ctx, src0, src1, dst);

     instance = ctx->instance;
     QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;

-    const ggml_tensor * src0 = op->src[0];
-    const ggml_tensor * src1 = op->src[1];
-    ggml_tensor * dst = op;
     op_perf.start();

     std::string map_entry;