
Commit c02b0a8

cann: support q4_0 model (#8822)
1 parent 0d6fb52 commit c02b0a8

7 files changed: +357 −45 lines changed

ggml/src/ggml-cann.cpp

Lines changed: 9 additions & 3 deletions
@@ -627,7 +627,6 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base(
 GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
                                                        const void* src,
                                                        void* dst) {
-    GGML_ASSERT(tensor->op == GGML_OP_NONE);
 
     int64_t n_elems = ggml_nelements(tensor);
     int64_t groups = n_elems / QK4_0;
@@ -679,7 +678,6 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
  */
 GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
     const ggml_tensor* tensor, void* src, void* dst) {
-    GGML_ASSERT(tensor->op == GGML_OP_NONE);
 
     int64_t n_elems = ggml_nelements(tensor);
     int64_t groups = n_elems / QK4_0;
@@ -1666,10 +1664,17 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
         }
         case GGML_OP_MUL_MAT: {
            switch (op->src[0]->type) {
-                // case GGML_TYPE_Q4_0:
                case GGML_TYPE_F16:
                case GGML_TYPE_F32:
                case GGML_TYPE_Q8_0:
+                    // TODO: fix me
+                    // Current groupsize should not be greater than k-1 in
+                    // aclnnWeightQuantBatchMatmulV2GetWorkspaceSize().
+                    if (op->src[0]->ne[0]-1 > QK8_0) {
+                        return true;
+                    }
+                    return false;
+                case GGML_TYPE_Q4_0:
                    return true;
                default:
                    return false;
@@ -1694,6 +1699,7 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
                case GGML_TYPE_F32:
                case GGML_TYPE_F16:
                case GGML_TYPE_Q8_0:
+                case GGML_TYPE_Q4_0:
                    return true;
                default:
                    return false;
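
The two transform hooks at the top of this diff repack ggml's Q4_0 blocks for the device; judging from the quantized mat-mul code later in the commit, the packed 4-bit quants are laid out first and the fp16 group scales at the end of the buffer. A minimal size sketch under that assumption (the helper name is illustrative, not part of the patch):

// Sketch only: transformed Q4_0 buffer = packed int4 quants followed by fp16 scales.
#include <cstddef>
#include <cstdint>

constexpr int64_t QK4_0 = 32;  // ggml Q4_0 group size

static size_t cann_q4_0_buffer_size(int64_t n_elems) {
    int64_t groups      = n_elems / QK4_0;
    size_t  quant_bytes = n_elems / 2;                // ACL_INT4 packs two values per byte
    size_t  scale_bytes = groups * sizeof(uint16_t);  // one fp16 scale per group
    return quant_bytes + scale_bytes;
}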

ggml/src/ggml-cann/acl_tensor.cpp

Lines changed: 4 additions & 27 deletions
@@ -37,6 +37,10 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
            return ACL_INT16;
        case GGML_TYPE_I32:
            return ACL_INT32;
+        case GGML_TYPE_Q4_0:
+            return ACL_INT4;
+        case GGML_TYPE_Q8_0:
+            return ACL_INT8;
        default:
            return ACL_DT_UNDEFINED;
    }
@@ -89,33 +93,6 @@ bool ggml_cann_need_bcast(const ggml_tensor* t0, const ggml_tensor* t1) {
     return false;
 }
 
-aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
-                                   size_t type_size, int64_t* ne, size_t* nb,
-                                   int64_t dims, aclFormat format,
-                                   size_t offset) {
-    int64_t tmp_ne[GGML_MAX_DIMS * 2];
-    int64_t tmp_stride[GGML_MAX_DIMS * 2];
-
-    memcpy(tmp_ne, ne, dims * sizeof(int64_t));
-    for (int i = 0; i < dims; i++) {
-        tmp_stride[i] = nb[i] / type_size;
-    }
-
-    std::reverse(tmp_ne, tmp_ne + dims);
-    std::reverse(tmp_stride, tmp_stride + dims);
-
-    int64_t acl_storage_len = 0;
-    for (int i = 0; i < dims; i++) {
-        acl_storage_len += (ne[i] - 1) * nb[i];
-    }
-
-    aclTensor* acl_tensor =
-        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
-                        format, &acl_storage_len, 1, data_ptr);
-
-    return acl_tensor;
-}
-
 int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
                                   const ggml_tensor* src1,
                                   int64_t* bcast_src0_ne,
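
With the two new cases, quantized ggml weight types map straight to ACL data types; a minimal usage sketch (src0 here is an illustrative weight tensor, not a name from this file):

// Q4_0 -> ACL_INT4, Q8_0 -> ACL_INT8; anything unsupported stays ACL_DT_UNDEFINED.
aclDataType weight_dtype = ggml_cann_type_mapping(src0->type);
GGML_ASSERT(weight_dtype != ACL_DT_UNDEFINED);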

ggml/src/ggml-cann/acl_tensor.h

Lines changed: 32 additions & 4 deletions
@@ -23,6 +23,9 @@
 #ifndef CANN_ACL_TENSOR_H
 #define CANN_ACL_TENSOR_H
 
+#include <algorithm>
+#include <cstring>
+
 #include <aclnn/aclnn_base.h>
 #include "common.h"
 
@@ -65,7 +68,8 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
                                    size_t offset = 0);
 
 /**
- * @brief Creates an ACL tensor from provided parameters.
+ * @brief Template for creating an ACL tensor from provided parameters. typename TYPE
+ *        should be size_t or float.
  *
  * @details This function creates an ACL tensor using the provided data pointer,
  *          data type, dimensions, strides, format, offset, and additional parameters.
@@ -83,10 +87,34 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
  * @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
  * @return Pointer to the created ACL tensor.
  */
+template<typename TYPE>
 aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
-                                   size_t type_size, int64_t* ne, size_t* nb,
-                                   int64_t dims, aclFormat format = ACL_FORMAT_ND,
-                                   size_t offset = 0);
+                                   TYPE type_size, int64_t* ne, TYPE* nb,
+                                   int64_t dims,
+                                   aclFormat format = ACL_FORMAT_ND,
+                                   size_t offset = 0) {
+    int64_t tmp_ne[GGML_MAX_DIMS * 2];
+    int64_t tmp_stride[GGML_MAX_DIMS * 2];
+
+    memcpy(tmp_ne, ne, dims * sizeof(int64_t));
+    for (int i = 0; i < dims; i++) {
+        tmp_stride[i] = nb[i] / type_size;
+    }
+
+    std::reverse(tmp_ne, tmp_ne + dims);
+    std::reverse(tmp_stride, tmp_stride + dims);
+
+    int64_t acl_storage_len = 0;
+    for (int i = 0; i < dims; i++) {
+        acl_storage_len += (ne[i] - 1) * nb[i];
+    }
+
+    aclTensor* acl_tensor =
+        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, offset / type_size,
+                        format, &acl_storage_len, 1, data_ptr);
+
+    return acl_tensor;
+}
 
 /**
  * @brief Checks if tensors require broadcasting based on their shapes.
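
Moving the definition into the header as a template is what allows TYPE to be float as well as size_t: a 4-bit element occupies half a byte, so both the element size and the byte strides may be fractional. A usage sketch under that reading, mirroring how the quantized mat-mul path in aclnn_ops.cpp builds its weight view (K, N and weight_data are placeholders):

// Sketch: a 2-D ACL view over packed 4-bit weights; TYPE is deduced as float,
// so type_size can be 0.5 bytes and nb[] can carry fractional byte strides.
int64_t weight_ne[]      = {N, K};  // transposed view of the K x N weight
float   weight_elem_size = 0.5f;    // two ACL_INT4 values per byte
float   weight_nb[]      = {weight_elem_size * K, weight_elem_size};

aclTensor* acl_weight = ggml_cann_create_tensor(
    weight_data, ACL_INT4, weight_elem_size, weight_ne, weight_nb, 2);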

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 35 additions & 10 deletions
@@ -910,6 +910,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
             ((ggml_tensor*)dst->extra)->ne);
         return;
     }
+    if (dst->type == GGML_TYPE_Q4_0) {
+        aclrtlaunch_ascendc_quantize_f16_to_q4_0(
+            24, ctx.stream(), src->data, dst->data,
+            ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
+            ((ggml_tensor*)dst->extra)->ne);
+        return;
+    }
     if (dst->type == GGML_TYPE_F16) {
         if (ggml_are_same_shape(src, dst)) {
             cann_copy(ctx, acl_src, acl_dst);
@@ -971,6 +978,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
             ((ggml_tensor*)dst->extra)->ne);
         return;
     }
+    if (dst->type == GGML_TYPE_Q4_0) {
+        aclrtlaunch_ascendc_quantize_f32_to_q4_0(
+            24, ctx.stream(), src->data, dst->data,
+            ((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
+            ((ggml_tensor*)dst->extra)->ne);
+        return;
+    }
     if (dst->type == GGML_TYPE_F32) {
         if (ggml_are_same_shape(src, dst)) {
             cann_copy(ctx, acl_src, acl_dst);
@@ -2463,21 +2477,33 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
  * @param dst The destination tensor where the result of the matrix
  *            multiplication will be stored.
  */
-static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
-                                   ggml_tensor* dst) {
+static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
+                                    ggml_tensor* dst,
+                                    const enum ggml_type type) {
     ggml_tensor* src0 = dst->src[0];  // weight
     ggml_tensor* src1 = dst->src[1];  // input
 
     // The shape of the weight is NCHW. Matrix multiplication uses HW dims. HC
     // is regarded as batch. weight need transpose.
     int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
-    size_t weight_elem_size = sizeof(uint8_t);
-    size_t weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
+    float weight_elem_size;
+    if (type == GGML_TYPE_Q4_0) {
+        weight_elem_size = float(sizeof(uint8_t)) / 2;
+    }
+    else if (type == GGML_TYPE_Q8_0) {
+        weight_elem_size = float(sizeof(uint8_t));
+    }
+    else {
+        GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
+    }
+    float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
+
     // size of one matrix is element_size * height * width.
     size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
     size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
 
     // scale stored at the end of weight. Also need transpose.
+    GGML_ASSERT(QK4_0 == QK8_0);
     int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
     size_t scale_elem_size = sizeof(uint16_t);
     size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
@@ -2541,8 +2567,9 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
             (char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
             input_elem_size, input_ne, input_nb, 2);
         aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
-            (char*)src0->data + batch0 * weight_stride, ACL_INT8,
-            weight_elem_size, weight_ne, weight_nb, 2);
+            (char*)src0->data + batch0 * weight_stride,
+            ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
+            weight_nb, 2);
         aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
             scale_offset + batch0 * scale_stride, ACL_FLOAT16,
             scale_elem_size, scale_ne, scale_nb, 2);
@@ -2596,11 +2623,9 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
         case GGML_TYPE_F16:
            ggml_cann_mat_mul_fp(ctx, dst);
            break;
-        // case GGML_TYPE_Q4_0:
-        //     ggml_cann_mul_mat_q4_0(ctx, dst);
-        //     break;
+        case GGML_TYPE_Q4_0:
        case GGML_TYPE_Q8_0:
-            ggml_cann_mul_mat_q8_0(ctx, dst);
+            ggml_cann_mul_mat_quant(ctx, dst, type);
            break;
        default:
            GGML_ABORT("fatal error");

ggml/src/ggml-cann/kernels/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -9,6 +9,7 @@ file(GLOB SRC_FILES
     get_row_q8_0.cpp
     quantize_f32_q8_0.cpp
     quantize_f16_q8_0.cpp
+    quantize_float_to_q4_0.cpp
     dup.cpp
 )
 
@@ -29,4 +30,4 @@ ascendc_library(ascendc_kernels STATIC
     ${SRC_FILES}
 )
 
-#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
+# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)

ggml/src/ggml-cann/kernels/ascendc_kernels.h

Lines changed: 2 additions & 0 deletions
@@ -8,6 +8,8 @@
 
 #include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
 #include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
+#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
+#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
 
 #include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
 #include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"
