Skip to content

Commit b0700ae

Browse files
committed
Copy operator support 310P
1 parent ecaf193 commit b0700ae

File tree

4 files changed

+35
-18
lines changed

4 files changed

+35
-18
lines changed

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -904,6 +904,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
904904
return;
905905
}
906906
// TODO: simplify
907+
const size_t CANN_DUP_OP_SUPPORTED_MAX_ROWS = 65535;
907908
if (src->type == GGML_TYPE_F16) {
908909
if (dst->type == GGML_TYPE_Q8_0) {
909910
aclrtlaunch_ascendc_quantize_f16_q8_0(
@@ -931,7 +932,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
931932
if (src->nb[0] == src_type_size) {
932933
// src0 is contiguous on first dimension, copy by rows
933934
int64_t rows_num = ggml_nrows(src);
934-
935+
GGML_ASSERT(rows_num <= CANN_DUP_OP_SUPPORTED_MAX_ROWS);
935936
aclrtlaunch_ascendc_dup_by_rows_fp16(
936937
rows_num, ctx.stream(), src->data, dst->data,
937938
((ggml_tensor*)src->extra)->ne,
@@ -956,6 +957,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
956957
if (src->nb[0] == src_type_size) {
957958
// src0 is contiguous on first dimension, copy by rows
958959
int64_t rows_num = ggml_nrows(src);
960+
GGML_ASSERT(rows_num <= CANN_DUP_OP_SUPPORTED_MAX_ROWS);
959961
aclrtlaunch_ascendc_dup_by_rows_fp16_to_fp32(
960962
rows_num, ctx.stream(), src->data, dst->data,
961963
((ggml_tensor*)src->extra)->ne,
@@ -999,6 +1001,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
9991001
if (src->nb[0] == src_type_size) {
10001002
// src0 is contiguous on first dimension, copy by rows
10011003
int64_t rows_num = ggml_nrows(src);
1004+
GGML_ASSERT(rows_num <= CANN_DUP_OP_SUPPORTED_MAX_ROWS);
10021005
aclrtlaunch_ascendc_dup_by_rows_fp32(
10031006
rows_num, ctx.stream(), src->data, dst->data,
10041007
((ggml_tensor*)src->extra)->ne,
@@ -1025,6 +1028,7 @@ void ggml_cann_dup(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
10251028
if (src->nb[0] == src_type_size) {
10261029
// src0 is contiguous on first dimension, copy by rows
10271030
int64_t rows_num = ggml_nrows(src);
1031+
GGML_ASSERT(rows_num <= CANN_DUP_OP_SUPPORTED_MAX_ROWS);
10281032
aclrtlaunch_ascendc_dup_by_rows_fp32_to_fp16(
10291033
rows_num, ctx.stream(), src->data, dst->data,
10301034
((ggml_tensor*)src->extra)->ne,
@@ -2315,8 +2319,6 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
23152319
{
23162320
if ((src0->ne[0] % 8) != 0) {
23172321
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
2318-
/* printf("\n\nggml_cann_get_rows: row elements:%d, src1->ne[0]:%d, src1->ne[1]:%d, src1->ne[2]%d, src0->ne[0]:%d, ggml_type_size(GGML_TYPE_F32):%d, dst_len:%d.\n", src0->ne[0],
2319-
src1->ne[0], src1->ne[1], src1->ne[2], src0->ne[0], ggml_type_size(GGML_TYPE_F32), dst_len); */
23202322
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
23212323
}
23222324
aclrtlaunch_ascendc_get_row_f32(
@@ -2332,8 +2334,6 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
23322334
{
23332335
if ((src0->ne[0] % 16) != 0) {
23342336
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16
2335-
/* printf("\n\nggml_cann_get_rows: row elements:%d, src1->ne[0]:%d, src1->ne[1]:%d, src1->ne[2]:%d, src0->ne[0]:%d, ggml_type_size(GGML_TYPE_F32):%d, dst_len:%d.\n", src0->ne[0],
2336-
src1->ne[0], src1->ne[1], src1->ne[2], src0->ne[0], ggml_type_size(GGML_TYPE_F32), dst_len); */
23372337
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
23382338
}
23392339
aclrtlaunch_ascendc_get_row_f16(

ggml/src/ggml-cann/kernels/CMakeLists.txt

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,4 +30,12 @@ ascendc_library(ascendc_kernels STATIC
3030
${SRC_FILES}
3131
)
3232

33+
string(FIND "${SOC_VERSION}" "ascend310p" FIRST_310P_INDEX)
34+
if(FIRST_310P_INDEX GREATER -1)
35+
ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCEND_310P)
36+
message(STATUS "Compile for Ascend310P.")
37+
else()
38+
message(STATUS "Compile for Ascend910B.")
39+
endif()
40+
3341
# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)

ggml/src/ggml-cann/kernels/dup.cpp

Lines changed: 18 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -51,23 +51,29 @@ class DupByRows {
5151

5252
__aicore__ inline void copy_in() {
5353
LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
54-
55-
DataCopyExtParams dataCopyParams;
56-
dataCopyParams.blockCount = 1;
57-
dataCopyParams.blockLen = num_elem * sizeof(SRC_T);
58-
DataCopyPadExtParams<SRC_T> padParams;
59-
DataCopyPad(src_local, src_gm, dataCopyParams, padParams);
60-
54+
const size_t elem_per_block = 32 / sizeof(SRC_T);
55+
size_t tail = num_elem % elem_per_block;
56+
size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
57+
DataCopy(src_local, src_gm, cpy_elements_len);
6158
src_queue.EnQue(src_local);
6259
}
6360

6461
__aicore__ inline void copy_out() {
6562
LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
66-
67-
DataCopyExtParams dataCopyParams;
68-
dataCopyParams.blockCount = 1;
69-
dataCopyParams.blockLen = num_elem * sizeof(DST_T);
70-
DataCopyPad(dst_gm, dst_local, dataCopyParams);
63+
const size_t elem_per_block = 32 / sizeof(DST_T);
64+
size_t tail = num_elem % elem_per_block;
65+
size_t len = num_elem & ~(elem_per_block - 1);
66+
if (len > 0) {
67+
DataCopy(dst_gm, dst_local, len);
68+
}
69+
if(tail != 0) {
70+
for (size_t i = tail; i < elem_per_block; i++) {
71+
dst_local[len + i].SetValue(0, 0);
72+
}
73+
SetAtomicAdd<float>();
74+
DataCopy(dst_gm[len], dst_local[len], elem_per_block);
75+
SetAtomicNone();
76+
}
7177

7278
dst_queue.FreeTensor(dst_local);
7379
}

ggml/src/ggml-cann/kernels/get_row_q4_0.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,12 @@ class GET_ROW_Q4_0 {
110110
LocalTensor<float> output_local = output_queue.AllocTensor<float>();
111111

112112
// TODO: cast more data to speed up.
113+
#ifdef ASCEND_310P
114+
// TODO: add quantization support for 310P
115+
#else
113116
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
114117
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
115-
118+
#endif
116119
// Only mul need compile by group.
117120
half scale = scale_gm.GetValue(scale_offset);
118121

0 commit comments

Comments
 (0)