Commit c18610b

CANN: Support Ascend310P to accelerate F32 and F16 Model (#10216)
* CANN: Support Ascend310P to accelerate F32 and F16 models
* Add the SoC-type compile option macro ASCEND_310P to the ggml-cann lib
* Remove unused code
* Remove the hard-coded Ascend soc_type compile option in CMakeLists.txt
1 parent a5e4759 commit c18610b

7 files changed (+123, -41 lines)

ggml/src/ggml-cann/CMakeLists.txt

Lines changed: 29 additions & 0 deletions
@@ -3,6 +3,33 @@ if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOM
     message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
 endif()
 
+# Auto-detect the SoC type and SoC version; if detection fails, abort the build.
+set(SOC_VERSION "")
+function(detect_ascend_soc_type SOC_VERSION)
+    execute_process(
+        COMMAND bash -c "npu-smi info|awk -F' ' 'NF > 0 && NR==7 {print $3}'"
+        OUTPUT_VARIABLE npu_info
+        RESULT_VARIABLE npu_result
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+    if("${npu_info}" STREQUAL "" OR ${npu_result})
+        message(FATAL_ERROR "Auto-detecting the Ascend SoC type failed; please specify SOC_TYPE manually or check that the Ascend device is working normally.")
+    endif()
+    set(${SOC_VERSION} "Ascend${npu_info}" PARENT_SCOPE)
+endfunction()
+
+if(NOT SOC_TYPE)
+    detect_ascend_soc_type(SOC_VERSION)
+    set(SOC_TYPE "${SOC_VERSION}")
+    message(STATUS "CANN: SOC_VERSION auto-detected: ${SOC_VERSION}")
+else()
+    string(TOLOWER ${SOC_TYPE} SOC_VERSION)
+endif()
+
+# Construct the SoC-specific compile option ASCEND_<SoC major SN>, e.g. ASCEND_910B, ASCEND_310P.
+string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
+set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")
+
 if (CANN_INSTALL_DIR)
     # Only Support Linux.
     if (NOT UNIX)
@@ -39,6 +66,8 @@ if (CANN_INSTALL_DIR)
     target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS})
     target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)
 
+    target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
+
     message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
     message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
 else()
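
The block above derives a per-SoC compile definition from the detected or user-supplied SOC_TYPE: the regex keeps only the major serial number and prefixes it with ASCEND_. A minimal host-side sketch of that mapping (illustration only; it uses std::regex rather than CMake's regex engine, and the sample SoC strings are hypothetical):

// Sketch (not part of the build): mirrors how the CMake logic above derives the
// SoC compile definition from a SOC_VERSION string.
#include <iostream>
#include <regex>
#include <string>

// Extract the major SoC serial number, e.g. "Ascend310P3" -> "310P", mirroring:
//   string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
static std::string soc_compile_option(const std::string & soc_version) {
    std::smatch m;
    if (std::regex_search(soc_version, m, std::regex("[0-9]+[a-zA-Z]"))) {
        return "ASCEND_" + m.str();   // e.g. "ASCEND_310P"
    }
    return "";                        // no match: no SoC-specific define
}

int main() {
    // Hypothetical sample inputs, as npu-smi or the user might provide them.
    for (const std::string s : {"Ascend310P3", "Ascend910B4"}) {
        std::cout << s << " -> -D" << soc_compile_option(s) << "\n";
    }
}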

ggml/src/ggml-cann/aclnn_ops.cpp

Lines changed: 18 additions & 0 deletions
@@ -2312,6 +2312,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
 
     switch (src0->type) {
         case GGML_TYPE_F32:
+        {
+#ifdef ASCEND_310P
+            // Special handling for the get_row_f32 kernel on 310P: clear the destination buffer when the row is not aligned to 32 bytes.
+            if ((src0->ne[0] % 8) != 0) {
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
+                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            }
+#endif
             aclrtlaunch_ascendc_get_row_f32(
                 24, ctx.stream(), src0->data, src1->data, dst->data,
                 ((ggml_tensor*)src0->extra)->ne,
@@ -2320,7 +2328,16 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                 ((ggml_tensor*)dst->extra)->nb);
             break;
+        }
         case GGML_TYPE_F16:
+        {
+#ifdef ASCEND_310P
+            // Special handling for the get_row_f16 kernel on 310P: clear the destination buffer when the row is not aligned to 32 bytes.
+            if ((src0->ne[0] % 16) != 0) {
+                size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32);  // output is f32 even when the input is f16
+                ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
+            }
+#endif
             aclrtlaunch_ascendc_get_row_f16(
                 24, ctx.stream(), src0->data, src1->data, dst->data,
                 ((ggml_tensor*)src0->extra)->ne,
@@ -2329,6 +2346,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
                 ((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
                 ((ggml_tensor*)dst->extra)->nb);
             break;
+        }
         case GGML_TYPE_Q4_0:
             aclrtlaunch_ascendc_get_row_q4_0(
                 24, ctx.stream(), src0->data, src1->data, dst->data,
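
On 310P the get_row kernels write the unaligned tail of each output row as a full 32-byte block in atomic-add mode (see the kernel diffs below), so the destination must start out zeroed whenever src0->ne[0] is not a multiple of one block: 8 elements for an F32 source row, 16 for an F16 source row. A standalone sketch of the size and alignment arithmetic behind the aclrtMemset calls above (plain C++ stand-ins; toy_tensor and the helper are illustrative, not ggml API):

#include <cstdint>
#include <cstdio>

// Illustrative stand-in for the ggml_tensor fields used above.
struct toy_tensor { int64_t ne[4]; };

// Number of destination bytes the pre-launch memset would clear, or 0 when the
// source row length already ends on a 32-byte boundary and no clear is needed.
// elems_per_32b is 8 for an F32 source row and 16 for an F16 source row; the
// destination is always F32, hence sizeof(float).
static size_t get_rows_clear_bytes(const toy_tensor & src0, const toy_tensor & src1,
                                   int64_t elems_per_32b) {
    if (src0.ne[0] % elems_per_32b == 0) {
        return 0;
    }
    return (size_t)(src1.ne[0] * src1.ne[1] * src1.ne[2] * src0.ne[0]) * sizeof(float);
}

int main() {
    toy_tensor src0 = {{100, 4, 1, 1}};   // row length 100: not a multiple of 8 or 16
    toy_tensor src1 = {{5, 2, 1, 1}};     // 5 * 2 * 1 = 10 rows selected
    std::printf("F32 rows: clear %zu bytes\n", get_rows_clear_bytes(src0, src1, 8));
    std::printf("F16 rows: clear %zu bytes\n", get_rows_clear_bytes(src0, src1, 16));
}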

ggml/src/ggml-cann/kernels/CMakeLists.txt

Lines changed: 2 additions & 5 deletions
@@ -1,7 +1,3 @@
-if (NOT SOC_TYPE)
-    set (SOC_TYPE "Ascend910B3")
-endif()
-
 file(GLOB SRC_FILES
     get_row_f32.cpp
     get_row_f16.cpp
@@ -13,7 +9,6 @@ file(GLOB SRC_FILES
     dup.cpp
 )
 
-string(TOLOWER ${SOC_TYPE} SOC_VERSION)
 set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
 set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")
 
@@ -30,4 +25,6 @@ ascendc_library(ascendc_kernels STATIC
     ${SRC_FILES}
 )
 
+message(STATUS "CANN: compiling Ascend kernels with SOC_VERSION: ${SOC_VERSION}.")
+ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
 # ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
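
With the hard-coded Ascend910B3 default removed, the kernel sources rely entirely on the -D${SOC_TYPE_COMPILE_OPTION} definition passed here (and to ggml-cann above) to select SoC-specific code paths at compile time. A toy translation unit showing the pattern (the messages are placeholders, not the real kernel code); build it with and without -DASCEND_310P to exercise both branches:

#include <cstdio>

// Illustration of the compile-time switch the kernels rely on. Build with
//   g++ -DASCEND_310P toy.cpp     -> 310P path
//   g++ toy.cpp                   -> default path
// The macro name comes from SOC_TYPE_COMPILE_OPTION in the CMake logic above.
int main() {
#ifdef ASCEND_310P
    std::puts("310P build: aligned DataCopy + zero-padded atomic-add tail");
#else
    std::puts("default build: DataCopyPad handles the unaligned tail");
#endif
}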

ggml/src/ggml-cann/kernels/dup.cpp

Lines changed: 23 additions & 9 deletions
@@ -5,6 +5,7 @@
 using namespace AscendC;
 
 #define BUFFER_NUM 2
+const int64_t SUPPORTED_MAX_DIM = 65535;  // currently the max block dim supported by the dup kernel is 65535
 
 template <typename SRC_T, typename DST_T>
 class DupByRows {
@@ -19,6 +20,7 @@ class DupByRows {
         // Input has four dims.
         int64_t op_block_num = GetBlockNum();
         int64_t op_block_idx = GetBlockIdx();
+        assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM);
 
         // param
         num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
@@ -51,24 +53,36 @@ class DupByRows {
 
     __aicore__ inline void copy_in() {
         LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();
-
-        DataCopyExtParams dataCopyParams;
-        dataCopyParams.blockCount = 1;
-        dataCopyParams.blockLen = num_elem * sizeof(SRC_T);
-        DataCopyPadExtParams<SRC_T> padParams;
-        DataCopyPad(src_local, src_gm, dataCopyParams, padParams);
-
+        const size_t elem_per_block = 32 / sizeof(SRC_T);
+        size_t tail = num_elem % elem_per_block;
+        size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
+        DataCopy(src_local, src_gm, cpy_elements_len);
         src_queue.EnQue(src_local);
     }
 
     __aicore__ inline void copy_out() {
         LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();
-
+#ifdef ASCEND_310P
+        const size_t elem_per_block = 32 / sizeof(DST_T);
+        size_t tail = num_elem % elem_per_block;
+        size_t len = num_elem & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(dst_gm, dst_local, len);
+        }
+        if (tail != 0) {
+            for (size_t i = tail; i < elem_per_block; i++) {
+                dst_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(dst_gm[len], dst_local[len], elem_per_block);
+            SetAtomicNone();
+        }
+#else
         DataCopyExtParams dataCopyParams;
         dataCopyParams.blockCount = 1;
         dataCopyParams.blockLen = num_elem * sizeof(DST_T);
         DataCopyPad(dst_gm, dst_local, dataCopyParams);
-
+#endif
         dst_queue.FreeTensor(dst_local);
     }
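
The copy_out rewrite above is the core 310P workaround: the block-aligned prefix is copied as usual, and the last partial 32-byte block is zero-padded in local memory and written as one full block with atomic add enabled, so the padding cannot clobber data that is already in place. A host-side simulation of that pattern in plain C++ (a sketch only: std::memcpy and += stand in for DataCopy and SetAtomicAdd, and it assumes the destination was pre-cleared, as the aclrtMemset added in ggml_cann_get_rows guarantees for the get_row kernels):

#include <cstring>
#include <iostream>
#include <vector>

// Simulate copying one row of num_elem floats the 310P way: copy the aligned
// prefix, then zero-pad the last partial 32-byte block in local memory and
// "atomic-add" it into the destination as a full block. With a zeroed
// destination, the add is a plain store for the tail and a no-op for padding.
static void copy_row_310p_style(const float * src, float * dst, size_t num_elem) {
    const size_t elem_per_block = 32 / sizeof(float);      // 8 floats = 32 bytes
    const size_t len  = num_elem & ~(elem_per_block - 1);  // block-aligned prefix
    const size_t tail = num_elem % elem_per_block;         // leftover elements

    std::memcpy(dst, src, len * sizeof(float));            // plays the role of DataCopy

    if (tail != 0) {
        float block[32 / sizeof(float)] = {};              // local block, padding already zero
        std::memcpy(block, src + len, tail * sizeof(float));
        for (size_t i = 0; i < elem_per_block; ++i) {
            dst[len + i] += block[i];                       // plays the role of SetAtomicAdd + DataCopy
        }
    }
}

int main() {
    const size_t n = 10;                                    // 8 aligned elements + tail of 2
    std::vector<float> src(n), dst(16, 0.0f);               // dst pre-cleared, like the memset
    for (size_t i = 0; i < n; ++i) src[i] = float(i + 1);

    copy_row_310p_style(src.data(), dst.data(), n);

    for (float v : dst) std::cout << v << ' ';              // 1..10 followed by zero padding
    std::cout << '\n';
}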

ggml/src/ggml-cann/kernels/get_row_f16.cpp

Lines changed: 24 additions & 13 deletions
@@ -14,7 +14,7 @@ class GET_ROW_F16 {
                                 int64_t *output_ne_ub, size_t *output_nb_ub) {
         // TODO, use template for F16/f32
         int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
+        op_block_idx = GetBlockIdx();
 
         for (int i = 0; i < 4; i++) {
             input_ne[i] = input_ne_ub[i];
@@ -59,32 +59,42 @@ class GET_ROW_F16 {
     }
 
     __aicore__ inline void copy_in(uint32_t offset, size_t len) {
+        size_t origin_len = len;
         LocalTensor<half> input_local = input_queue.AllocTensor<half>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(input_local, input_gm[offset], len);
+        const size_t elem_per_block = 32 / sizeof(half);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
         if(tail != 0) {
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = tail * sizeof(half);
-            DataCopyPadExtParams<half> padParams;
-            DataCopyPad(input_local[len], input_gm[offset + len],
-                        dataCopyParams, padParams);
+            len += elem_per_block;
         }
+        DataCopy(input_local, input_gm[offset], len);
         input_queue.EnQue(input_local);
     }
 
     __aicore__ inline void copy_out(uint32_t offset, size_t len) {
         LocalTensor<float> output_local = output_queue.DeQue<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(output_gm[offset], output_local, len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+
         if(tail != 0) {
+#ifdef ASCEND_310P
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+#else
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
             dataCopyParams.blockLen = tail * sizeof(float);
            DataCopyPad(output_gm[offset + len], output_local[len],
                        dataCopyParams);
+#endif
         }
         output_queue.FreeTensor(output_local);
     }
@@ -150,6 +160,7 @@ class GET_ROW_F16 {
     GlobalTensor<float> output_gm;
     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
     TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
 };
 
 template <typename T>
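
copy_in no longer pads the tail with DataCopyPad: when a row does not end on a 32-byte boundary it rounds the transfer length up to the next whole block and issues a single DataCopy, over-reading a few source elements that are never consumed. The length arithmetic, extracted into a small standalone helper (a sketch only; float and unsigned short stand in for the device float/half types and are not part of the kernels):

#include <cassert>
#include <cstddef>
#include <cstdio>

// Round a row length (in elements of type T) up to a whole 32-byte block so a
// single block-aligned copy can replace the old DataCopyPad tail transfer.
template <typename T>
static size_t rounded_copy_elems(size_t len) {
    const size_t elem_per_block = 32 / sizeof(T);
    const size_t tail = len % elem_per_block;
    size_t aligned = len & ~(elem_per_block - 1);   // block-aligned part
    if (tail != 0) {
        aligned += elem_per_block;                  // pull in one extra full block
    }
    return aligned;
}

int main() {
    // F32 rows: 8 elements per 32-byte block.
    assert(rounded_copy_elems<float>(100) == 104);
    // F16 rows: 16 elements per 32-byte block (unsigned short as a stand-in for half).
    assert(rounded_copy_elems<unsigned short>(100) == 112);
    std::printf("f32 100 -> %zu, f16 100 -> %zu\n",
                rounded_copy_elems<float>(100), rounded_copy_elems<unsigned short>(100));
}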

ggml/src/ggml-cann/kernels/get_row_f32.cpp

Lines changed: 23 additions & 13 deletions
@@ -13,7 +13,7 @@ class GET_ROW_F32 {
                                 int64_t *indices_ne_ub, size_t *indices_nb_ub,
                                 int64_t *output_ne_ub, size_t *output_nb_ub) {
         int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
+        op_block_idx = GetBlockIdx();
 
         for (int i = 0; i < 4; i++) {
             input_ne[i] = input_ne_ub[i];
@@ -55,31 +55,40 @@ class GET_ROW_F32 {
 
     __aicore__ inline void copy_in(uint32_t offset, size_t len) {
         LocalTensor<float> input_local = input_queue.AllocTensor<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(input_local, input_gm[offset], len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
        if(tail != 0) {
-            DataCopyExtParams dataCopyParams;
-            dataCopyParams.blockCount = 1;
-            dataCopyParams.blockLen = tail * sizeof(float);
-            DataCopyPadExtParams<float> padParams;
-            DataCopyPad(input_local[len], input_gm[offset + len],
-                        dataCopyParams, padParams);
+            len += elem_per_block;
         }
+        DataCopy(input_local, input_gm[offset], len);
         input_queue.EnQue(input_local);
     }
 
     __aicore__ inline void copy_out(uint32_t offset, size_t len) {
         LocalTensor<float> output_local = output_queue.DeQue<float>();
-        size_t tail = len % 32;
-        len = len & ~31;
-        DataCopy(output_gm[offset], output_local, len);
+        const size_t elem_per_block = 32 / sizeof(float);
+        size_t tail = len % elem_per_block;
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+
         if(tail != 0) {
+#ifdef ASCEND_310P
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+#else
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
             dataCopyParams.blockLen = tail * sizeof(float);
             DataCopyPad(output_gm[offset + len], output_local[len],
                         dataCopyParams);
+#endif
         }
         output_queue.FreeTensor(output_local);
     }
@@ -144,6 +153,7 @@ class GET_ROW_F32 {
     GlobalTensor<float> output_gm;
     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
     TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
 };
 
 template <typename T>
template <typename T>

ggml/src/ggml-cann/kernels/get_row_q4_0.cpp

Lines changed: 4 additions & 1 deletion
@@ -110,9 +110,12 @@ class GET_ROW_Q4_0 {
         LocalTensor<float> output_local = output_queue.AllocTensor<float>();
 
         // TODO: cast more data to speed up.
+#ifdef ASCEND_310P
+        // TODO: 310P support quantization
+#else
         Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
         Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);
-
+#endif
         // Only mul need compile by group.
         half scale = scale_gm.GetValue(scale_offset);
