Skip to content

Commit c6a04c6

Browse files
committed
ggml-qnn: sync from branch kantvai-ggmlqnn-npurpc
1 parent 84317c7 commit c6a04c6

File tree

1 file changed

+60
-46
lines changed

1 file changed

+60
-46
lines changed

ggml/src/ggml-qnn/ggml-qnn.cpp

Lines changed: 60 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -1483,15 +1483,13 @@ static Qnn_Tensor_t * ggml_qnn_create_general_tensor(const ggml_tensor * tensor,
14831483
GGMLQNN_LOG_DEBUG("init_tensor %d", get_idx());
14841484
inc_idx();
14851485

1486-
//there are different dimension order between ggml tensor and qnn tensor
14871486
uint32_t dimensions_transpose[GGML_MAX_DIMS] = {};
14881487
uint32_t * tensor_dims = nullptr;
1489-
14901488
if (nullptr != tensor) {
1491-
dimensions_transpose[0] = (uint32_t) tensor->ne[1];
1492-
dimensions_transpose[1] = (uint32_t) tensor->ne[0];
1493-
dimensions_transpose[2] = (uint32_t) tensor->ne[2];
1494-
dimensions_transpose[3] = (uint32_t) tensor->ne[3];
1489+
//there are different dimension order between ggml tensor and qnn tensor
1490+
for (size_t idx = 0; idx < rank; idx++) {
1491+
dimensions_transpose[idx] = (uint32_t)tensor->ne[rank - 1 - idx];
1492+
}
14951493
tensor_dims = dimensions_transpose;
14961494
}
14971495
//re-assign tensor_dims
@@ -2058,7 +2056,7 @@ class qnn_instance {
20582056
std::unordered_map<void *, void *> _rpcmem_store_map;
20592057
std::unordered_map<void *, size_t> _rpcmem_usage_map;
20602058
size_t _rpcmem_capacity = 512; // mempool size in Mbytes
2061-
size_t _rpcmem_usage = 0; // mempool usage in MBytes
2059+
size_t _rpcmem_usage = 0; // mempool usage in Mbytes
20622060

20632061
std::string _graph_name;
20642062
QNNBackend _device_id;
@@ -2968,33 +2966,27 @@ static void print_tensors_info(const char * func_name, ggml_backend_qnn_context
29682966
if (nullptr != func_name && nullptr != ctx) {
29692967
GGMLQNN_LOG_DEBUG("call %s in dev %s\n", func_name, ctx->name);
29702968
}
2971-
GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
2969+
GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)",
29722970
src0->name,
2973-
src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2],
2974-
src0->nb[0], src0->nb[1], src0->nb[2]);
2975-
GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
2971+
src0->type, ggml_type_name(src0->type), src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3],
2972+
src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
2973+
GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)",
29762974
src1->name,
2977-
src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2],
2978-
src1->nb[0], src1->nb[1], src1->nb[2]);
2979-
GGMLQNN_LOG_DEBUG("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi)\n",
2975+
src1->type, ggml_type_name(src1->type), src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3],
2976+
src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]);
2977+
GGMLQNN_LOG_DEBUG("%-6s: type = %i (%s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi, %5zi)",
29802978
dst->name,
2981-
dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->nb[0],
2982-
dst->nb[1], dst->nb[2]);
2983-
GGMLQNN_LOG_DEBUG("%d, %d, %d, %d", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
2984-
GGMLQNN_LOG_DEBUG("tensor0 name %s", src0->name);
2985-
GGMLQNN_LOG_DEBUG("tensor1 name %s", src1->name);
2986-
GGMLQNN_LOG_DEBUG("tensor2 name %s", dst->name);
2979+
dst->type, ggml_type_name(dst->type), dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3],
2980+
dst->nb[0], dst->nb[1], dst->nb[2], dst->nb[3]);
2981+
GGMLQNN_LOG_DEBUG("\n");
29872982
}
29882983

2989-
static void dump_tensors_info(const struct ggml_tensor * tensor) {
2984+
static void dump_op_info(const struct ggml_tensor * tensor) {
29902985
//skip sanity check of params
29912986
const struct ggml_tensor * src0 = tensor->src[0];
2992-
struct ggml_tensor * src1 = tensor->src[1];
2993-
struct ggml_tensor * dst = const_cast<ggml_tensor *>(tensor);
2994-
GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op),
2995-
ggml_type_name(tensor->type));
2996-
GGMLQNN_LOG_DEBUG("src0 type:%s", ggml_type_name(tensor->src[0]->type));
2997-
GGMLQNN_LOG_DEBUG("src1 type:%s", ggml_type_name(tensor->src[1]->type));
2987+
struct ggml_tensor * src1 = tensor->src[1];
2988+
struct ggml_tensor * dst = const_cast<ggml_tensor *>(tensor);
2989+
GGMLQNN_LOG_DEBUG("op name:%s, tensor type:%s", ggml_op_name(tensor->op), ggml_type_name(tensor->type));
29982990
print_tensors_info(nullptr, nullptr, src0, src1, dst);
29992991
}
30002992

@@ -3008,8 +3000,13 @@ static void get_qnn_dimensions_from_ggml_dimensions(uint32_t * qnn_dimensions, u
30083000
GGMLQNN_LOG_WARN("invalid params");
30093001
return;
30103002
}
3011-
qnn_dimensions[0] = ggml_dimensions[1];
3012-
qnn_dimensions[1] = ggml_dimensions[0];
3003+
for (size_t idx = 0; idx < GGML_MAX_DIMS; idx++)
3004+
qnn_dimensions[idx] = ggml_dimensions[idx];
3005+
3006+
if (rank >= 2) {
3007+
qnn_dimensions[rank - 1] = ggml_dimensions[rank - 2];
3008+
qnn_dimensions[rank - 2] = ggml_dimensions[rank - 1];
3009+
}
30133010
}
30143011

30153012
// =================================================================================================
@@ -3060,9 +3057,16 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) {
30603057
}
30613058

30623059
if (tensor->op == GGML_OP_MUL_MAT) {
3063-
//dump_tensors_info(tensor);
3064-
if ((src0_rank != 2) || (src1_rank != 2)) //TODO: only support offload 2D matrix mulmat to QNN backend
3060+
dump_op_info(tensor);
3061+
if (src0_rank != src1_rank) // make QNN SDK happy
3062+
return false;
3063+
if (src0_rank < 2) // make QNN SDK happy
3064+
return false;
3065+
if (src0_rank > 3) //TODO: 4D matrix
30653066
return false;
3067+
if ((src1->ne[2] != src0->ne[2]) || (src1->ne[3] != src0->ne[3])) // make QNN SDK happy
3068+
return false;
3069+
30663070
//TODO: support more data type in func ggml_qnn_mul_mat(...)
30673071
//src0: q4_0, q6_k, ...
30683072
//src1: f32
@@ -3073,8 +3077,8 @@ static bool ggml_qnn_can_handle_op(const struct ggml_tensor * tensor) {
30733077
}
30743078

30753079
if (tensor->op == GGML_OP_MUL) {
3076-
dump_tensors_info(tensor);
3077-
if ((src0_rank != 2) || (src1_rank != 2)) //TODO: only support offload 2D matrix mul to QNN backend
3080+
//dump_tensors_info(tensor);
3081+
if ((src0_rank != 2) || (src1_rank != 2)) //TODO: 3D and 4D matrix
30783082
return false;
30793083
return (src0->type == GGML_TYPE_F32)
30803084
&& (src1->type == GGML_TYPE_F32)
@@ -3340,6 +3344,11 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
33403344
QNN_INTERFACE_VER_TYPE qnn_raw_interface = ctx->raw_interface;
33413345
op_perf.start();
33423346

3347+
uint32_t src0_rank = ggml_get_tensor_rank(src0);
3348+
uint32_t src1_rank = ggml_get_tensor_rank(src1);
3349+
GGML_ASSERT(src0_rank == src1_rank);
3350+
GGML_ASSERT(src0_rank >= 2); //QNN SDK's limitation
3351+
33433352
std::string graph_name;
33443353
get_graph_key_from_op(op, graph_name);
33453354
if (instance->_qnn_graph_map.find(graph_name) != instance->_qnn_graph_map.end()) {
@@ -3353,12 +3362,12 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
33533362
p_param_tensor = tensors[3];
33543363
p_tensor2_transpose = tensors[4];
33553364
} else {
3356-
p_tensor0 = ggml_qnn_create_general_tensor(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0);
3357-
p_tensor1 = ggml_qnn_create_general_tensor(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0);
3358-
p_tensor2 = ggml_qnn_create_general_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0);
3365+
p_tensor0 = ggml_qnn_create_general_tensor(src0, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
3366+
p_tensor1 = ggml_qnn_create_general_tensor(src1, nullptr, QNN_TENSOR_TYPE_APP_WRITE,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
3367+
p_tensor2 = ggml_qnn_create_general_tensor(dst, nullptr, QNN_TENSOR_TYPE_APP_READ,QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
33593368
}
33603369

3361-
//print_tensors_info(__func__, ctx, src0, src1, dst);
3370+
print_tensors_info(__func__, ctx, src0, src1, dst);
33623371

33633372
//ensure QNN tensor has correct tensor type
33643373
QNN_VER_PTR(*p_tensor0)->type = QNN_TENSOR_TYPE_APP_WRITE;
@@ -3403,9 +3412,16 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
34033412
return;
34043413
}
34053414
//step-2: create param tensor for mulmat of 2d matrix
3406-
uint32_t param_tensor_dims[] = {2};
3407-
uint32_t param_tensor_data[2] = {1, 0};
3408-
p_param_tensor = ggml_qnn_create_general_tensor(nullptr, "param", QNN_TENSOR_TYPE_STATIC,QNN_DATATYPE_UINT_32, 1, param_tensor_dims, param_tensor_data, 8);
3415+
const uint32_t param_tensor_data[GGML_MAX_DIMS][GGML_MAX_DIMS] = {
3416+
{0},
3417+
{1, 0},
3418+
{0, 2, 1},
3419+
{0, 1, 3, 2},
3420+
};
3421+
uint32_t param_tensor_dims[1] = {src0_rank};
3422+
p_param_tensor = ggml_qnn_create_general_tensor(nullptr, "param", QNN_TENSOR_TYPE_STATIC, QNN_DATATYPE_UINT_32,
3423+
1, param_tensor_dims,
3424+
(void *) (param_tensor_data[src0_rank - 1]), src0_rank * sizeof(uint32_t));
34093425
CHECK_QNN_API(error, qnn_raw_interface.tensorCreateGraphTensor(graph_handle, p_param_tensor));
34103426

34113427
//step-3: create compute tensor from ggml tensor
@@ -3419,7 +3435,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
34193435

34203436
//step-4: create a transpose tensor
34213437
uint32_t tensor2_transpose_dims[GGML_MAX_DIMS] = {};
3422-
p_tensor2_transpose = ggml_qnn_create_general_tensor(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, 2, nullptr, nullptr, 0);
3438+
p_tensor2_transpose = ggml_qnn_create_general_tensor(dst, "transpose", QNN_TENSOR_TYPE_NATIVE, QNN_DATATYPE_FLOAT_32, src0_rank, nullptr, nullptr, 0);
34233439
get_qnn_dimensions_from_ggml_dimensions(tensor2_transpose_dims, tensor_2_dimensions, ggml_get_tensor_rank(dst));
34243440
//save pointer because the dimensions of tensor p_tensor2_transpose will be changed later
34253441
uint32_t * tensor2_dimensions_transpose = QNN_VER_PTR(*p_tensor2_transpose)->dimensions;
@@ -3435,7 +3451,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
34353451
}
34363452
};
34373453

3438-
Qnn_Tensor_t out_0_inputs[] = {*p_tensor0,*p_tensor1};
3454+
Qnn_Tensor_t out_0_inputs[] = {*p_tensor0,*p_tensor1};
34393455
Qnn_Tensor_t out_0_outputs[] = {*p_tensor2_transpose};
34403456
Qnn_OpConfig_t out_0 = {
34413457
QNN_OPCONFIG_VERSION_1, .v1 =
@@ -3455,7 +3471,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
34553471
"perm", .tensorParam = *p_param_tensor
34563472
}
34573473
};
3458-
Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose};
3474+
Qnn_Tensor_t out_trans1_0_inputs[] = {*p_tensor2_transpose};
34593475
Qnn_Tensor_t out_trans1_0_outputs[] = {*p_tensor2};
34603476
Qnn_OpConfig_t out_trans1_0 = {
34613477
QNN_OPCONFIG_VERSION_1,
@@ -3472,7 +3488,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
34723488

34733489
//step-6: finalize qnn graph and execute qnn graph
34743490
CHECK_QNN_API(error, qnn_raw_interface.graphFinalize(graph_handle, NULL, NULL));
3475-
Qnn_Tensor_t input_tensors_0[] = {*p_tensor0,*p_tensor1};
3491+
Qnn_Tensor_t input_tensors_0[] = {*p_tensor0,*p_tensor1};
34763492
Qnn_Tensor_t output_tensors_0[] = {*p_tensor2};
34773493
CHECK_QNN_API(error, qnn_raw_interface.graphExecute(graph_handle,
34783494
input_tensors_0, 2,
@@ -3495,9 +3511,7 @@ static void ggml_qnn_mul_mat(ggml_backend_t backend, ggml_tensor * op) {
34953511
//restore pointer to avoid memory leak
34963512
QNN_VER_PTR(*p_tensor2_transpose)->dimensions = tensor2_dimensions_transpose;
34973513
//free_qnn_tensor(p_tensor2_transpose);
3498-
34993514
} else {
3500-
35013515
QNN_VER_PTR(*p_tensor0)->clientBuf = {src0->data, ggml_get_tensor_data_size(src0)};
35023516
QNN_VER_PTR(*p_tensor1)->clientBuf = {src1->data, ggml_get_tensor_data_size(src1)};
35033517
QNN_VER_PTR(*p_tensor2)->clientBuf = {dst->data, ggml_get_tensor_data_size(dst)};

0 commit comments

Comments
 (0)