Skip to content

Commit 9bc327d

Browse files
committed
cann: support q4_0 model
1 parent 0d6fb52 commit 9bc327d

File tree

8 files changed

+402
-26
lines changed

8 files changed

+402
-26
lines changed

ggml/src/ggml-cann.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -627,7 +627,6 @@ GGML_CALL static void* ggml_backend_cann_buffer_get_base(
627627
GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
628628
const void* src,
629629
void* dst) {
630-
GGML_ASSERT(tensor->op == GGML_OP_NONE);
631630

632631
int64_t n_elems = ggml_nelements(tensor);
633632
int64_t groups = n_elems / QK4_0;
@@ -679,7 +678,6 @@ GGML_CALL static void ggml_backend_cann_transform_q4_0(ggml_tensor* tensor,
679678
*/
680679
GGML_CALL static void ggml_backend_cann_transform_back_q4_0(
681680
const ggml_tensor* tensor, void* src, void* dst) {
682-
GGML_ASSERT(tensor->op == GGML_OP_NONE);
683681

684682
int64_t n_elems = ggml_nelements(tensor);
685683
int64_t groups = n_elems / QK4_0;
@@ -1666,10 +1664,10 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
16661664
}
16671665
case GGML_OP_MUL_MAT: {
16681666
switch (op->src[0]->type) {
1669-
// case GGML_TYPE_Q4_0:
16701667
case GGML_TYPE_F16:
16711668
case GGML_TYPE_F32:
16721669
case GGML_TYPE_Q8_0:
1670+
case GGML_TYPE_Q4_0:
16731671
return true;
16741672
default:
16751673
return false;
@@ -1694,6 +1692,7 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
16941692
case GGML_TYPE_F32:
16951693
case GGML_TYPE_F16:
16961694
case GGML_TYPE_Q8_0:
1695+
case GGML_TYPE_Q4_0:
16971696
return true;
16981697
default:
16991698
return false;

ggml/src/ggml-cann/acl_tensor.cpp

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,10 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
3737
return ACL_INT16;
3838
case GGML_TYPE_I32:
3939
return ACL_INT32;
40+
case GGML_TYPE_Q4_0:
41+
return ACL_INT4;
42+
case GGML_TYPE_Q8_0:
43+
return ACL_INT8;
4044
default:
4145
return ACL_DT_UNDEFINED;
4246
}
@@ -116,6 +120,33 @@ aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
116120
return acl_tensor;
117121
}
118122

123+
/**
 * @brief Creates an ACL tensor from a raw buffer whose element size and byte
 *        strides are given as floats.
 *
 * The dimensions and strides are copied into local arrays, the strides are
 * converted from bytes to elements (nb[i] / type_size), and both arrays are
 * reversed before being handed to aclCreateTensor. The storage length passed
 * to aclCreateTensor is the sum of (ne[i] - 1) * nb[i] over all dimensions.
 *
 * @param data_ptr  Pointer to the data buffer backing the tensor.
 * @param dtype     ACL data type of the tensor.
 * @param type_size Size of one element in bytes; may be fractional.
 * @param ne        Array of `dims` dimension sizes.
 * @param nb        Array of `dims` strides in bytes; may be fractional.
 * @param dims      Number of dimensions; must not exceed GGML_MAX_DIMS * 2,
 *                  the capacity of the local scratch arrays.
 * @param format    ACL tensor format.
 * @param offset    Offset in bytes; converted to elements before use.
 * @return Pointer to the created ACL tensor.
 */
aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
                                   float type_size, int64_t* ne, float* nb,
                                   int64_t dims, aclFormat format,
                                   size_t offset) {
    int64_t tmp_ne[GGML_MAX_DIMS * 2];
    int64_t tmp_stride[GGML_MAX_DIMS * 2];

    memcpy(tmp_ne, ne, dims * sizeof(int64_t));
    for (int64_t i = 0; i < dims; i++) {
        // Byte stride -> element stride; truncation toward zero is intended.
        tmp_stride[i] = static_cast<int64_t>(nb[i] / type_size);
    }

    std::reverse(tmp_ne, tmp_ne + dims);
    std::reverse(tmp_stride, tmp_stride + dims);

    // Accumulate in double rather than float: a float has only 24 bits of
    // mantissa, so mixing the int64 running total with float operands would
    // silently round once the total exceeds 2^24. The running total is
    // truncated after every step, so results are unchanged wherever the
    // single-precision arithmetic was exact.
    int64_t acl_storage_len = 0;
    for (int64_t i = 0; i < dims; i++) {
        const double extent =
            static_cast<double>(ne[i] - 1) * static_cast<double>(nb[i]);
        acl_storage_len = static_cast<int64_t>(
            static_cast<double>(acl_storage_len) + extent);
    }

    // Same precision concern for the byte offset: convert via double so large
    // offsets are not rounded before being divided by the element size.
    const int64_t elem_offset = static_cast<int64_t>(
        static_cast<double>(offset) / static_cast<double>(type_size));

    aclTensor* acl_tensor =
        aclCreateTensor(tmp_ne, dims, dtype, tmp_stride, elem_offset, format,
                        &acl_storage_len, 1, data_ptr);

    return acl_tensor;
}
149+
119150
int64_t ggml_cann_get_bcast_shape(const ggml_tensor* src0,
120151
const ggml_tensor* src1,
121152
int64_t* bcast_src0_ne,

ggml/src/ggml-cann/acl_tensor.h

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,34 @@ aclTensor* ggml_cann_create_tensor(const ggml_tensor* tensor, int64_t* ne = null
8484
* @return Pointer to the created ACL tensor.
8585
*/
8686
aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
87-
size_t type_size, int64_t* ne, size_t* nb,
88-
int64_t dims, aclFormat format = ACL_FORMAT_ND,
89-
size_t offset = 0);
87+
size_t type_size, int64_t* ne, size_t* nb,
88+
int64_t dims, aclFormat format = ACL_FORMAT_ND,
89+
size_t offset = 0);
90+
91+
/**
92+
* @brief Creates an ACL tensor from the provided parameters, supporting a
93+
* fractional (float) type_size/nb for sub-byte types such as int4b_t.
94+
*
95+
* @details This function creates an ACL tensor using the provided data pointer,
96+
* data type, dimensions, strides, format, offset, and additional parameters.
97+
* It calculates necessary dimensions and strides based on the provided ne and nb
98+
* arrays, adjusting them for the ACL tensor creation. The ACL storage length
99+
* is also calculated based on the provided dimensions and strides.
100+
*
101+
* @param data_ptr Pointer to the data buffer for the ACL tensor.
102+
* @param dtype ACL data type of the tensor.
103+
* @param type_size Size in bytes of each element in the tensor data buffer; may be fractional (e.g. 0.5 for 4-bit elements).
104+
* @param ne Pointer to an array containing tensor dimensions.
105+
* @param nb Pointer to an array containing tensor strides in bytes (may be fractional).
106+
* @param dims Number of dimensions of the tensor.
107+
* @param format ACL tensor format. Defaults to ACL_FORMAT_ND.
108+
* @param offset Offset in bytes for the ACL tensor data. Defaults to 0.
109+
* @return Pointer to the created ACL tensor.
110+
*/
111+
aclTensor* ggml_cann_create_tensor(void* data_ptr, aclDataType dtype,
112+
float type_size, int64_t* ne, float* nb,
113+
int64_t dims, aclFormat format = ACL_FORMAT_ND,
114+
size_t offset = 0);
90115

91116
/**
92117
* @brief Checks if tensors require broadcasting based on their shapes.

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 35 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -910,6 +910,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
910910
((ggml_tensor*)dst->extra)->ne);
911911
return;
912912
}
913+
if (dst->type == GGML_TYPE_Q4_0) {
914+
aclrtlaunch_ascendc_quantize_f16_to_q4_0(
915+
24, ctx.stream(), src->data, dst->data,
916+
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
917+
((ggml_tensor*)dst->extra)->ne);
918+
return;
919+
}
913920
if (dst->type == GGML_TYPE_F16) {
914921
if (ggml_are_same_shape(src, dst)) {
915922
cann_copy(ctx, acl_src, acl_dst);
@@ -971,6 +978,13 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
971978
((ggml_tensor*)dst->extra)->ne);
972979
return;
973980
}
981+
if (dst->type == GGML_TYPE_Q4_0) {
982+
aclrtlaunch_ascendc_quantize_f32_to_q4_0(
983+
24, ctx.stream(), src->data, dst->data,
984+
((ggml_tensor*)src->extra)->ne, ((ggml_tensor*)src->extra)->nb,
985+
((ggml_tensor*)dst->extra)->ne);
986+
return;
987+
}
974988
if (dst->type == GGML_TYPE_F32) {
975989
if (ggml_are_same_shape(src, dst)) {
976990
cann_copy(ctx, acl_src, acl_dst);
@@ -2463,21 +2477,33 @@ static void ggml_cann_mat_mul_fp(ggml_backend_cann_context& ctx,
24632477
* @param dst The destination tensor where the result of the matrix
24642478
* multiplication will be stored.
24652479
*/
2466-
static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
2467-
ggml_tensor* dst) {
2480+
static void ggml_cann_mul_mat_quant(ggml_backend_cann_context& ctx,
2481+
ggml_tensor* dst,
2482+
const enum ggml_type type) {
24682483
ggml_tensor* src0 = dst->src[0]; // weight
24692484
ggml_tensor* src1 = dst->src[1]; // input
24702485

24712486
// The shape of the weight is NCHW. Matrix multiplication uses the HW dims;
24722487
// NC is regarded as batch. The weight needs to be transposed.
24732488
int64_t weight_ne[] = {src0->ne[1], src0->ne[0]};
2474-
size_t weight_elem_size = sizeof(uint8_t);
2475-
size_t weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
2489+
float weight_elem_size;
2490+
if (type == GGML_TYPE_Q4_0) {
2491+
weight_elem_size = float(sizeof(uint8_t)) / 2;
2492+
}
2493+
else if (type == GGML_TYPE_Q8_0) {
2494+
weight_elem_size = float(sizeof(uint8_t));
2495+
}
2496+
else {
2497+
GGML_ABORT("Only support Q4_0 and Q8_0 MUL_MAT");
2498+
}
2499+
float weight_nb[] = {weight_elem_size * src0->ne[0], weight_elem_size};
2500+
24762501
// size of one matrix is element_size * height * width.
24772502
size_t weight_stride = weight_elem_size * src0->ne[0] * src0->ne[1];
24782503
size_t weight_size = weight_stride * src0->ne[2] * src0->ne[3];
24792504

24802505
// scale stored at the end of weight. Also need transpose.
2506+
GGML_ASSERT(QK4_0 == QK8_0);
24812507
int64_t scale_ne[] = {src0->ne[1], src0->ne[0] / QK8_0};
24822508
size_t scale_elem_size = sizeof(uint16_t);
24832509
size_t scale_nb[] = {src0->ne[0] / QK8_0 * scale_elem_size,
@@ -2541,8 +2567,9 @@ static void ggml_cann_mul_mat_q8_0(ggml_backend_cann_context& ctx,
25412567
(char*)input_buffer + batch1 * input_stride, ACL_FLOAT16,
25422568
input_elem_size, input_ne, input_nb, 2);
25432569
aclTensor* acl_weight_tensor = ggml_cann_create_tensor(
2544-
(char*)src0->data + batch0 * weight_stride, ACL_INT8,
2545-
weight_elem_size, weight_ne, weight_nb, 2);
2570+
(char*)src0->data + batch0 * weight_stride,
2571+
ggml_cann_type_mapping(type), weight_elem_size, weight_ne,
2572+
weight_nb, 2);
25462573
aclTensor* acl_scale_tensor = ggml_cann_create_tensor(
25472574
scale_offset + batch0 * scale_stride, ACL_FLOAT16,
25482575
scale_elem_size, scale_ne, scale_nb, 2);
@@ -2596,11 +2623,9 @@ void ggml_cann_mul_mat(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
25962623
case GGML_TYPE_F16:
25972624
ggml_cann_mat_mul_fp(ctx, dst);
25982625
break;
2599-
// case GGML_TYPE_Q4_0:
2600-
// ggml_cann_mul_mat_q4_0(ctx, dst);
2601-
// break;
2626+
case GGML_TYPE_Q4_0:
26022627
case GGML_TYPE_Q8_0:
2603-
ggml_cann_mul_mat_q8_0(ctx, dst);
2628+
ggml_cann_mul_mat_quant(ctx, dst, type);
26042629
break;
26052630
default:
26062631
GGML_ABORT("fatal error");

ggml/src/ggml-cann/kernels/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ file(GLOB SRC_FILES
99
get_row_q8_0.cpp
1010
quantize_f32_q8_0.cpp
1111
quantize_f16_q8_0.cpp
12+
quantize_float_to_q4_0.cpp
1213
dup.cpp
1314
)
1415

@@ -29,4 +30,4 @@ ascendc_library(ascendc_kernels STATIC
2930
${SRC_FILES}
3031
)
3132

32-
#ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
33+
# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)

ggml/src/ggml-cann/kernels/ascendc_kernels.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
#include "aclrtlaunch_ascendc_quantize_f32_q8_0.h"
1010
#include "aclrtlaunch_ascendc_quantize_f16_q8_0.h"
11+
#include "aclrtlaunch_ascendc_quantize_f16_to_q4_0.h"
12+
#include "aclrtlaunch_ascendc_quantize_f32_to_q4_0.h"
1113

1214
#include "aclrtlaunch_ascendc_dup_by_rows_fp16.h"
1315
#include "aclrtlaunch_ascendc_dup_by_rows_fp32.h"

0 commit comments

Comments
 (0)