@@ -1483,15 +1483,13 @@ static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor,
1483
1483
GGMLQNN_LOG_DEBUG (" init_tensor %d" , get_idx ());
1484
1484
inc_idx ();
1485
1485
1486
- // there are different dimension order between ggml tensor and qnn tensor
1487
1486
uint32_t dimensions_transpose[GGML_MAX_DIMS] = {};
1488
1487
uint32_t * tensor_dims = nullptr ;
1489
-
1490
1488
if (nullptr != tensor) {
1491
- dimensions_transpose[ 0 ] = ( uint32_t ) tensor-> ne [ 1 ];
1492
- dimensions_transpose[ 1 ] = ( uint32_t ) tensor-> ne [ 0 ];
1493
- dimensions_transpose[2 ] = (uint32_t ) tensor->ne [2 ];
1494
- dimensions_transpose[ 3 ] = ( uint32_t ) tensor-> ne [ 3 ];
1489
+ // ggml and QNN order tensor dimensions differently, so copy ne[] in reverse order
1490
+ for ( size_t idx = 0 ; idx < rank; idx++) {
1491
+ dimensions_transpose[idx ] = (uint32_t )tensor->ne [rank - 1 - idx ];
1492
+ }
1495
1493
tensor_dims = dimensions_transpose;
1496
1494
}
1497
1495
// re-assign tensor_dims
@@ -2058,7 +2056,7 @@ class qnn_instance {
2058
2056
std::unordered_map<void *, void *> _rpcmem_store_map;
2059
2057
std::unordered_map<void *, size_t > _rpcmem_usage_map;
2060
2058
size_t _rpcmem_capacity = 512 ; // mempool size in Mbytes
2061
- size_t _rpcmem_usage = 0 ; // mempool usage in MBytes
2059
+ size_t _rpcmem_usage = 0 ; // mempool usage in Mbytes
2062
2060
2063
2061
std::string _graph_name;
2064
2062
QNNBackend _device_id;
@@ -2968,33 +2966,27 @@ static void print_tensors_info(const char * func_name, ggml_backend_qnn_context
2968
2966
if (nullptr != func_name && nullptr != ctx) {
2969
2967
GGMLQNN_LOG_DEBUG (" call %s in dev %s\n " , func_name, ctx->name );
2970
2968
}
2971
- GGMLQNN_LOG_DEBUG (" %15s : type = %i (%5s ) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " , nb = (%5zi, %5zi, %5zi) \n " ,
2969
+ GGMLQNN_LOG_DEBUG (" %-6s : type = %i (%s ) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5 " PRIi64 " , nb = (%5zi, %5zi, %5zi, %5zi) " ,
2972
2970
src0->name ,
2973
- src0->type , ggml_type_name (src0->type ), src0->ne [0 ], src0->ne [1 ], src0->ne [2 ],
2974
- src0->nb [0 ], src0->nb [1 ], src0->nb [2 ]);
2975
- GGMLQNN_LOG_DEBUG (" %15s : type = %i (%5s ) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " , nb = (%5zi, %5zi, %5zi) \n " ,
2971
+ src0->type , ggml_type_name (src0->type ), src0->ne [0 ], src0->ne [1 ], src0->ne [2 ], src0-> ne [ 3 ],
2972
+ src0->nb [0 ], src0->nb [1 ], src0->nb [2 ], src0-> nb [ 3 ] );
2973
+ GGMLQNN_LOG_DEBUG (" %-6s : type = %i (%s ) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5 " PRIi64 " , nb = (%5zi, %5zi, %5zi, %5zi) " ,
2976
2974
src1->name ,
2977
- src1->type , ggml_type_name (src1->type ), src1->ne [0 ], src1->ne [1 ], src1->ne [2 ],
2978
- src1->nb [0 ], src1->nb [1 ], src1->nb [2 ]);
2979
- GGMLQNN_LOG_DEBUG (" %15s : type = %i (%5s ) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " , nb = (%5zi, %5zi, %5zi) \n " ,
2975
+ src1->type , ggml_type_name (src1->type ), src1->ne [0 ], src1->ne [1 ], src1->ne [2 ], src1-> ne [ 3 ],
2976
+ src1->nb [0 ], src1->nb [1 ], src1->nb [2 ], src1-> nb [ 3 ] );
2977
+ GGMLQNN_LOG_DEBUG (" %-6s : type = %i (%s ) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5 " PRIi64 " , nb = (%5zi, %5zi, %5zi, %5zi) " ,
2980
2978
dst->name ,
2981
- dst->type , ggml_type_name (dst->type ), dst->ne [0 ], dst->ne [1 ], dst->ne [2 ], dst->nb [0 ],
2982
- dst->nb [1 ], dst->nb [2 ]);
2983
- GGMLQNN_LOG_DEBUG (" %d, %d, %d, %d" , src0->ne [0 ], src0->ne [1 ], src0->ne [2 ], src0->ne [3 ]);
2984
- GGMLQNN_LOG_DEBUG (" tensor0 name %s" , src0->name );
2985
- GGMLQNN_LOG_DEBUG (" tensor1 name %s" , src1->name );
2986
- GGMLQNN_LOG_DEBUG (" tensor2 name %s" , dst->name );
2979
+ dst->type , ggml_type_name (dst->type ), dst->ne [0 ], dst->ne [1 ], dst->ne [2 ], dst->ne [3 ],
2980
+ dst->nb [0 ], dst->nb [1 ], dst->nb [2 ], dst->nb [3 ]);
2981
+ GGMLQNN_LOG_DEBUG (" \n " );
2987
2982
}
2988
2983
2989
- static void dump_tensors_info (const struct ggml_tensor * tensor) {
2984
+ static void dump_op_info (const struct ggml_tensor * tensor) {
2990
2985
// skip sanity check of params
2991
2986
const struct ggml_tensor * src0 = tensor->src [0 ];
2992
- struct ggml_tensor * src1 = tensor->src [1 ];
2993
- struct ggml_tensor * dst = const_cast <ggml_tensor *>(tensor);
2994
- GGMLQNN_LOG_DEBUG (" op name:%s, tensor type:%s" , ggml_op_name (tensor->op ),
2995
- ggml_type_name (tensor->type ));
2996
- GGMLQNN_LOG_DEBUG (" src0 type:%s" , ggml_type_name (tensor->src [0 ]->type ));
2997
- GGMLQNN_LOG_DEBUG (" src1 type:%s" , ggml_type_name (tensor->src [1 ]->type ));
2987
+ struct ggml_tensor * src1 = tensor->src [1 ];
2988
+ struct ggml_tensor * dst = const_cast <ggml_tensor *>(tensor);
2989
+ GGMLQNN_LOG_DEBUG (" op name:%s, tensor type:%s" , ggml_op_name (tensor->op ), ggml_type_name (tensor->type ));
2998
2990
print_tensors_info (nullptr , nullptr , src0, src1, dst);
2999
2991
}
3000
2992
@@ -3008,8 +3000,13 @@ static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, u
3008
3000
GGMLQNN_LOG_WARN (" invalid params" );
3009
3001
return ;
3010
3002
}
3011
- qnn_dimensions[0 ] = ggml_dimensions[1 ];
3012
- qnn_dimensions[1 ] = ggml_dimensions[0 ];
3003
+ for (size_t idx = 0 ; idx < GGML_MAX_DIMS; idx++)
3004
+ qnn_dimensions[idx] = ggml_dimensions[idx];
3005
+
3006
+ if (rank >= 2 ) {
3007
+ qnn_dimensions[rank - 1 ] = ggml_dimensions[rank - 2 ];
3008
+ qnn_dimensions[rank - 2 ] = ggml_dimensions[rank - 1 ];
3009
+ }
3013
3010
}
3014
3011
3015
3012
// =================================================================================================
@@ -3060,9 +3057,16 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) {
3060
3057
}
3061
3058
3062
3059
if (tensor->op == GGML_OP_MUL_MAT) {
3063
- // dump_tensors_info(tensor);
3064
- if ((src0_rank != 2 ) || (src1_rank != 2 )) // TODO: only support offload 2D matrix mulmat to QNN backend
3060
+ dump_op_info (tensor);
3061
+ if (src0_rank != src1_rank) // make QNN SDK happy
3062
+ return false ;
3063
+ if (src0_rank < 2 ) // make QNN SDK happy
3064
+ return false ;
3065
+ if (src0_rank > 3 ) // TODO: 4D matrix
3065
3066
return false ;
3067
+ if ((src1->ne [2 ] != src0->ne [2 ]) || (src1->ne [3 ] != src0->ne [3 ])) // make QNN SDK happy
3068
+ return false ;
3069
+
3066
3070
// TODO: support more data type in func ggml_qnn_mul_mat(...)
3067
3071
// src0: q4_0, q6_k, ...
3068
3072
// src1: f32
@@ -3073,8 +3077,8 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) {
3073
3077
}
3074
3078
3075
3079
if (tensor->op == GGML_OP_MUL) {
3076
- dump_tensors_info (tensor);
3077
- if ((src0_rank != 2 ) || (src1_rank != 2 )) // TODO: only support offload 2D matrix mul to QNN backend
3080
+ // dump_op_info(tensor);
3081
+ if ((src0_rank != 2 ) || (src1_rank != 2 )) // TODO: 3D and 4D matrix
3078
3082
return false ;
3079
3083
return (src0->type == GGML_TYPE_F32)
3080
3084
&& (src1->type == GGML_TYPE_F32)
@@ -3340,6 +3344,11 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
3340
3344
QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface ;
3341
3345
op_perf.start ();
3342
3346
3347
+ uint32_t src0_rank = ggml_get_tensor_rank (src0);
3348
+ uint32_t src1_rank = ggml_get_tensor_rank (src1);
3349
+ GGML_ASSERT (src0_rank == src1_rank);
3350
+ GGML_ASSERT (src0_rank >= 2 ); // QNN SDK's limitation
3351
+
3343
3352
std::string graph_name;
3344
3353
get_graph_key_from_op (op, graph_name);
3345
3354
if (instance->_qnn_graph_map .find (graph_name) != instance->_qnn_graph_map .end ()) {
@@ -3353,12 +3362,12 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
3353
3362
p_param_tensor = tensors[3 ];
3354
3363
p_tensor2_transpose = tensors[4 ];
3355
3364
} else {
3356
- p_tensor0 = ggml_qnn_create_general_tensor (src0, nullptr , QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2 , nullptr , nullptr , 0 );
3357
- p_tensor1 = ggml_qnn_create_general_tensor (src1, nullptr , QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2 , nullptr , nullptr , 0 );
3358
- p_tensor2 = ggml_qnn_create_general_tensor (dst, nullptr , QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, 2 , nullptr , nullptr , 0 );
3365
+ p_tensor0 = ggml_qnn_create_general_tensor (src0, nullptr , QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank , nullptr , nullptr , 0 );
3366
+ p_tensor1 = ggml_qnn_create_general_tensor (src1, nullptr , QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank , nullptr , nullptr , 0 );
3367
+ p_tensor2 = ggml_qnn_create_general_tensor (dst, nullptr , QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, src0_rank , nullptr , nullptr , 0 );
3359
3368
}
3360
3369
3361
- // print_tensors_info(__func__, ctx, src0, src1, dst);
3370
+ print_tensors_info (__func__, ctx, src0, src1, dst);
3362
3371
3363
3372
// ensure QNN tensor has correct tensor type
3364
3373
QNN_VER_PTR (*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE;
@@ -3403,9 +3412,16 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
3403
3412
return ;
3404
3413
}
3405
3414
// step-2: create the "perm" param tensor for the transpose; the permutation is selected by src0_rank
3406
- uint32_t param_tensor_dims[] = {2 };
3407
- uint32_t param_tensor_data[2 ] = {1 , 0 };
3408
- p_param_tensor = ggml_qnn_create_general_tensor (nullptr , " param" , QNN_TENSOR_TYPE_STATIC,QNN_DATATYPE_UINT_32, 1 , param_tensor_dims, param_tensor_data, 8 );
3415
+ const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = {
3416
+ {0 },
3417
+ {1 , 0 },
3418
+ {0 , 2 , 1 },
3419
+ {0 , 1 , 3 , 2 },
3420
+ };
3421
+ uint32_t param_tensor_dims[1 ] = {src0_rank};
3422
+ p_param_tensor = ggml_qnn_create_general_tensor (nullptr , " param" , QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32,
3423
+ 1 , param_tensor_dims,
3424
+ (void *) (param_tensor_data[src0_rank - 1 ]), src0_rank * sizeof (uint32_t ));
3409
3425
CHECK_QNN_API (error, qnn_raw_interface.tensorCreateGraphTensor (graph_handle, p_param_tensor));
3410
3426
3411
3427
// step-3: create compute tensor from ggml tensor
@@ -3419,7 +3435,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
3419
3435
3420
3436
// step-4: create a transpose tensor
3421
3437
uint32_t tensor2_transpose_dims[GGML_MAX_DIMS] = {};
3422
- p_tensor2_transpose = ggml_qnn_create_general_tensor (dst, " transpose" , QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 2 , nullptr , nullptr , 0 );
3438
+ p_tensor2_transpose = ggml_qnn_create_general_tensor (dst, " transpose" , QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank , nullptr , nullptr , 0 );
3423
3439
get_qnn_dimensions_from_ggml_dimensions (tensor2_transpose_dims, tensor_2_dimensions, ggml_get_tensor_rank (dst));
3424
3440
// save pointer because the dimensions of tensor p_tensor2_transpose will be changed later
3425
3441
uint32_t * tensor2_dimensions_transpose = QNN_VER_PTR (*p_tensor2_transpose)->dimensions ;
@@ -3435,7 +3451,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
3435
3451
}
3436
3452
};
3437
3453
3438
- Qnn_Tensor_t out_0_inputs[] = {*p_tensor0,*p_tensor1};
3454
+ Qnn_Tensor_t out_0_inputs[] = {*p_tensor0,*p_tensor1};
3439
3455
Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose};
3440
3456
Qnn_OpConfig_t out_0 = {
3441
3457
QNN_OPCONFIG_VERSION_1, .v1 =
@@ -3455,7 +3471,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
3455
3471
" perm" , .tensorParam = *p_param_tensor
3456
3472
}
3457
3473
};
3458
- Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose};
3474
+ Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose};
3459
3475
Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2};
3460
3476
Qnn_OpConfig_t out_trans1_0 = {
3461
3477
QNN_OPCONFIG_VERSION_1,
@@ -3472,7 +3488,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
3472
3488
3473
3489
// step-6: finalize qnn graph and execute qnn graph
3474
3490
CHECK_QNN_API (error, qnn_raw_interface.graphFinalize (graph_handle, NULL , NULL ));
3475
- Qnn_Tensor_t input_tensors_0[] = {*p_tensor0,*p_tensor1};
3491
+ Qnn_Tensor_t input_tensors_0[] = {*p_tensor0,*p_tensor1};
3476
3492
Qnn_Tensor_t output_tensors_0[] = {*p_tensor2};
3477
3493
CHECK_QNN_API (error, qnn_raw_interface.graphExecute (graph_handle,
3478
3494
input_tensors_0, 2 ,
@@ -3495,9 +3511,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
3495
3511
// restore pointer to avoid memory leak
3496
3512
QNN_VER_PTR (*p_tensor2_transpose)->dimensions = tensor2_dimensions_transpose;
3497
3513
// free_qnn_tensor(p_tensor2_transpose);
3498
-
3499
3514
} else {
3500
-
3501
3515
QNN_VER_PTR (*p_tensor0)->clientBuf = {src0->data , ggml_get_tensor_data_size (src0)};
3502
3516
QNN_VER_PTR (*p_tensor1)->clientBuf = {src1->data , ggml_get_tensor_data_size (src1)};
3503
3517
QNN_VER_PTR (*p_tensor2)->clientBuf = {dst->data , ggml_get_tensor_data_size (dst)};
0 commit comments