Skip to content

CANN Support Ascend310P to accelerate F32 and F16 LLM Model #10216

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Nov 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions ggml/src/ggml-cann/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,33 @@ if ("cann${CANN_INSTALL_DIR}" STREQUAL "cann" AND DEFINED ENV{ASCEND_TOOLKIT_HOM
message(STATUS "CANN: updated CANN_INSTALL_DIR from ASCEND_TOOLKIT_HOME=$ENV{ASCEND_TOOLKIT_HOME}")
endif()

# Auto-detech Soc type and Soc version, if detect failed, will abort build
set(SOC_VERSION "")
function(detect_ascend_soc_type SOC_VERSION)
execute_process(
COMMAND bash -c "npu-smi info|awk -F' ' 'NF > 0 && NR==7 {print $3}'"
OUTPUT_VARIABLE npu_info
RESULT_VARIABLE npu_result
OUTPUT_STRIP_TRAILING_WHITESPACE
)
if("${npu_info}" STREQUAL "" OR ${npu_result})
message(FATAL_ERROR "Auto-detech ascend soc type failed, please specify manually or check ascend device working normally.")
endif()
set(${SOC_VERSION} "Ascend${npu_info}" PARENT_SCOPE)
endfunction()

if(NOT SOC_TYPE)
detect_ascend_soc_type(SOC_VERSION)
set(SOC_TYPE "${SOC_VERSION}")
message(STATUS "CANN: SOC_VERSION auto-detected is:${SOC_VERSION}")
else()
string(TOLOWER ${SOC_TYPE} SOC_VERSION)
endif()

# Construct Soc specify compile option: ASCEND_#Soc_Major_SN. Such as ASCEND_910B, ASCEND310P.
string(REGEX MATCH "[0-9]+[a-zA-Z]" SOC_TYPE_MAJOR_SN "${SOC_VERSION}")
set(SOC_TYPE_COMPILE_OPTION "ASCEND_${SOC_TYPE_MAJOR_SN}")

if (CANN_INSTALL_DIR)
# Only Support Linux.
if (NOT UNIX)
Expand Down Expand Up @@ -39,6 +66,8 @@ if (CANN_INSTALL_DIR)
target_include_directories(ggml-cann PRIVATE . .. ${CANN_INCLUDE_DIRS})
target_link_directories(ggml-cann PRIVATE ${CANN_INSTALL_DIR}/lib64)

target_compile_definitions(ggml-cann PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")

message(STATUS "CANN: CANN_INCLUDE_DIRS = ${CANN_INCLUDE_DIRS}")
message(STATUS "CANN: CANN_LIBRARIES = ${CANN_LIBRARIES}")
else()
Expand Down
18 changes: 18 additions & 0 deletions ggml/src/ggml-cann/aclnn_ops.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2312,6 +2312,14 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {

switch (src0->type) {
case GGML_TYPE_F32:
{
#ifdef ASCEND_310P
// Special operation for get_row_f32 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
if ((src0->ne[0] % 8) != 0) {
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32);
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
}
#endif
aclrtlaunch_ascendc_get_row_f32(
24, ctx.stream(), src0->data, src1->data, dst->data,
((ggml_tensor*)src0->extra)->ne,
Expand All @@ -2320,7 +2328,16 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
((ggml_tensor*)dst->extra)->nb);
break;
}
case GGML_TYPE_F16:
{
#ifdef ASCEND_310P
// Special operation for get_row_f16 kernel of 310P: clear the content of dest data buffer when row is not aligned to 32 bytes
if ((src0->ne[0] % 16) != 0) {
size_t dst_len = src1->ne[0] * src1->ne[1] * src1->ne[2] * src0->ne[0] * ggml_type_size(GGML_TYPE_F32); // out is also f32, even input is f16
ACL_CHECK(aclrtMemset((char*)dst->data, dst_len, 0, dst_len));
}
#endif
aclrtlaunch_ascendc_get_row_f16(
24, ctx.stream(), src0->data, src1->data, dst->data,
((ggml_tensor*)src0->extra)->ne,
Expand All @@ -2329,6 +2346,7 @@ void ggml_cann_get_rows(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
((ggml_tensor*)src1->extra)->nb, ((ggml_tensor*)dst->extra)->ne,
((ggml_tensor*)dst->extra)->nb);
break;
}
case GGML_TYPE_Q4_0:
aclrtlaunch_ascendc_get_row_q4_0(
24, ctx.stream(), src0->data, src1->data, dst->data,
Expand Down
7 changes: 2 additions & 5 deletions ggml/src/ggml-cann/kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,3 @@
if (NOT SOC_TYPE)
set (SOC_TYPE "Ascend910B3")
endif()

file(GLOB SRC_FILES
get_row_f32.cpp
get_row_f16.cpp
Expand All @@ -13,7 +9,6 @@ file(GLOB SRC_FILES
dup.cpp
)

string(TOLOWER ${SOC_TYPE} SOC_VERSION)
set(ASCEND_CANN_PACKAGE_PATH ${CANN_INSTALL_DIR})
set(RUN_MODE "npu" CACHE STRING "run mode: npu/sim")

Expand All @@ -30,4 +25,6 @@ ascendc_library(ascendc_kernels STATIC
${SRC_FILES}
)

message(STATUS "CANN: compile ascend kernels witch SOC_VERSION:${SOC_VERSION}.")
ascendc_compile_definitions(ascendc_kernels PRIVATE "-D${SOC_TYPE_COMPILE_OPTION}")
# ascendc_compile_definitions(ascendc_kernels PRIVATE -DASCENDC_DUMP)
32 changes: 23 additions & 9 deletions ggml/src/ggml-cann/kernels/dup.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
using namespace AscendC;

#define BUFFER_NUM 2
const int64_t SUPPORTED_MAX_DIM = 65535; // currently the limit of max block dim supportted by dup kernel is 65535template <typename SRC_T, typename DST_T>

template <typename SRC_T, typename DST_T>
class DupByRows {
Expand All @@ -19,6 +20,7 @@ class DupByRows {
// Input has four dims.
int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx();
assert(op_block_idx < SUPPORTED_MAX_DIM && op_block_idx >= 0, "Invalid block index:%d, max is:%d\n", op_block_idx, SUPPORTED_MAX_DIM);

// param
num_rows = input_ne_ub[1] * input_ne_ub[2] * input_ne_ub[3];
Expand Down Expand Up @@ -51,24 +53,36 @@ class DupByRows {

__aicore__ inline void copy_in() {
LocalTensor<SRC_T> src_local = src_queue.AllocTensor<SRC_T>();

DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = num_elem * sizeof(SRC_T);
DataCopyPadExtParams<SRC_T> padParams;
DataCopyPad(src_local, src_gm, dataCopyParams, padParams);

const size_t elem_per_block = 32 / sizeof(SRC_T);
size_t tail = num_elem % elem_per_block;
size_t cpy_elements_len = tail > 0 ? num_elem + 1 : num_elem;
DataCopy(src_local, src_gm, cpy_elements_len);
src_queue.EnQue(src_local);
}

__aicore__ inline void copy_out() {
LocalTensor<DST_T> dst_local = dst_queue.DeQue<DST_T>();

#ifdef ASCEND_310P
const size_t elem_per_block = 32 / sizeof(DST_T);
size_t tail = num_elem % elem_per_block;
size_t len = num_elem & ~(elem_per_block - 1);
if (len > 0) {
DataCopy(dst_gm, dst_local, len);
}
if(tail != 0) {
for (size_t i = tail; i < elem_per_block; i++) {
dst_local[len + i].SetValue(0, 0);
}
SetAtomicAdd<float>();
DataCopy(dst_gm[len], dst_local[len], elem_per_block);
SetAtomicNone();
}
#else
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = num_elem * sizeof(DST_T);
DataCopyPad(dst_gm, dst_local, dataCopyParams);

#endif
dst_queue.FreeTensor(dst_local);
}

Expand Down
37 changes: 24 additions & 13 deletions ggml/src/ggml-cann/kernels/get_row_f16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ class GET_ROW_F16 {
int64_t *output_ne_ub, size_t *output_nb_ub) {
// TODO, use template for F16/f32
int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx();
op_block_idx = GetBlockIdx();

for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i];
Expand Down Expand Up @@ -59,32 +59,42 @@ class GET_ROW_F16 {
}

__aicore__ inline void copy_in(uint32_t offset, size_t len) {
size_t origin_len = len;
LocalTensor<half> input_local = input_queue.AllocTensor<half>();
size_t tail = len % 32;
len = len & ~31;
DataCopy(input_local, input_gm[offset], len);
const size_t elem_per_block = 32 / sizeof(half);
size_t tail = len % elem_per_block;
len = len & ~(elem_per_block - 1);
if(tail != 0) {
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(half);
DataCopyPadExtParams<half> padParams;
DataCopyPad(input_local[len], input_gm[offset + len],
dataCopyParams, padParams);
len += elem_per_block;
}
DataCopy(input_local, input_gm[offset], len);
input_queue.EnQue(input_local);
}

__aicore__ inline void copy_out(uint32_t offset, size_t len) {
LocalTensor<float> output_local = output_queue.DeQue<float>();
size_t tail = len % 32;
len = len & ~31;
DataCopy(output_gm[offset], output_local, len);
const size_t elem_per_block = 32 / sizeof(float);
size_t tail = len % elem_per_block;
len = len & ~(elem_per_block - 1);
if (len > 0) {
DataCopy(output_gm[offset], output_local, len);
}

if(tail != 0) {
#ifdef ASCEND_310P
for (size_t i = tail; i < elem_per_block; i++) {
output_local[len + i].SetValue(0, 0);
}
SetAtomicAdd<float>();
DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
SetAtomicNone();
#else
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPad(output_gm[offset + len], output_local[len],
dataCopyParams);
#endif
}
output_queue.FreeTensor(output_local);
}
Expand Down Expand Up @@ -150,6 +160,7 @@ class GET_ROW_F16 {
GlobalTensor<float> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
int64_t op_block_idx;
};

template <typename T>
Expand Down
36 changes: 23 additions & 13 deletions ggml/src/ggml-cann/kernels/get_row_f32.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class GET_ROW_F32 {
int64_t *indices_ne_ub, size_t *indices_nb_ub,
int64_t *output_ne_ub, size_t *output_nb_ub) {
int64_t op_block_num = GetBlockNum();
int64_t op_block_idx = GetBlockIdx();
op_block_idx = GetBlockIdx();

for (int i = 0; i < 4; i++) {
input_ne[i] = input_ne_ub[i];
Expand Down Expand Up @@ -55,31 +55,40 @@ class GET_ROW_F32 {

__aicore__ inline void copy_in(uint32_t offset, size_t len) {
LocalTensor<float> input_local = input_queue.AllocTensor<float>();
size_t tail = len % 32;
len = len & ~31;
DataCopy(input_local, input_gm[offset], len);
const size_t elem_per_block = 32 / sizeof(float);
size_t tail = len % elem_per_block;
len = len & ~(elem_per_block - 1);
if(tail != 0) {
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPadExtParams<float> padParams;
DataCopyPad(input_local[len], input_gm[offset + len],
dataCopyParams, padParams);
len += elem_per_block;
}
DataCopy(input_local, input_gm[offset], len);
input_queue.EnQue(input_local);
}

__aicore__ inline void copy_out(uint32_t offset, size_t len) {
LocalTensor<float> output_local = output_queue.DeQue<float>();
size_t tail = len % 32;
len = len & ~31;
DataCopy(output_gm[offset], output_local, len);
const size_t elem_per_block = 32 / sizeof(float);
size_t tail = len % elem_per_block;
len = len & ~(elem_per_block - 1);
if (len > 0) {
DataCopy(output_gm[offset], output_local, len);
}

if(tail != 0) {
#ifdef ASCEND_310P
for (size_t i = tail; i < elem_per_block; i++) {
output_local[len + i].SetValue(0, 0);
}
SetAtomicAdd<float>();
DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
SetAtomicNone();
#else
DataCopyExtParams dataCopyParams;
dataCopyParams.blockCount = 1;
dataCopyParams.blockLen = tail * sizeof(float);
DataCopyPad(output_gm[offset + len], output_local[len],
dataCopyParams);
#endif
}
output_queue.FreeTensor(output_local);
}
Expand Down Expand Up @@ -144,6 +153,7 @@ class GET_ROW_F32 {
GlobalTensor<float> output_gm;
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
int64_t op_block_idx;
};

template <typename T>
Expand Down
5 changes: 4 additions & 1 deletion ggml/src/ggml-cann/kernels/get_row_q4_0.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,9 +110,12 @@ class GET_ROW_Q4_0 {
LocalTensor<float> output_local = output_queue.AllocTensor<float>();

// TODO: cast more data to speed up.
#ifdef ASCEND_310P
// TODO: 310P support quantification
#else
Cast(cast_local, input_local, RoundMode::CAST_NONE, QK4_0);
Cast(output_local, cast_local, RoundMode::CAST_NONE, QK4_0);

#endif
// Only mul need compile by group.
half scale = scale_gm.GetValue(scale_offset);

Expand Down
Loading