Skip to content

Commit 41a6055

Browse files
committed
add im2col
1 parent 75bd288 commit 41a6055

File tree

3 files changed

+193
-6
lines changed

3 files changed

+193
-6
lines changed

ggml-cann.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,8 @@ static bool ggml_cann_compute_forward(ggml_backend_cann_context& ctx,
450450
case GGML_OP_ROPE:
451451
case GGML_OP_ALIBI:
452452
case GGML_OP_IM2COL:
453+
ggml_cann_im2col(ctx, dst);
454+
break;
453455
case GGML_OP_POOL_2D:
454456
ggml_cann_pool2d(ctx, dst);
455457
break;
@@ -683,8 +685,9 @@ GGML_CALL static bool ggml_backend_cann_supports_op(ggml_backend_t backend,
683685
case GGML_OP_SOFT_MAX:
684686
case GGML_OP_ROPE:
685687
case GGML_OP_ALIBI:
686-
case GGML_OP_IM2COL:
687688
return false;
689+
case GGML_OP_IM2COL:
690+
return true;
688691
case GGML_OP_POOL_2D:
689692
case GGML_OP_SUM_ROWS:
690693
case GGML_OP_ARGSORT:

ggml-cann/aclnn_ops.cpp

Lines changed: 181 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#include <aclnnop/aclnn_group_norm.h>
88
#include <aclnnop/aclnn_layer_norm.h>
99
#include <aclnnop/aclnn_max_pool.h>
10+
#include <aclnnop/aclnn_permute.h>
1011
#include <aclnnop/aclnn_reduce_sum.h>
1112
#include <aclnnop/aclnn_repeat.h>
1213
#include <aclnnop/aclnn_softmax.h>
@@ -900,4 +901,183 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float
900901
ACL_CHECK(aclDestroyTensor(mask_tensor));
901902
ACL_CHECK(aclDestroyTensor(acl_src));
902903
ACL_CHECK(aclDestroyTensor(acl_dst));
903-
}
904+
}
905+
906+
// Cast `acl_src` into `acl_dst` with dtype `cast_data_type` via aclnnCast.
// Any workspace memory the operator needs is leased from `ctx`, with its
// lifetime bound to `bind_tensor`.
void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
                aclTensor* acl_dst, aclDataType cast_data_type,
                ggml_tensor* bind_tensor) {
    // Two-phase aclnn call: query workspace size first, then launch.
    uint64_t ws_size = 0;
    aclOpExecutor* executor = nullptr;
    ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type, acl_dst,
                                        &ws_size, &executor));

    void* ws_addr = (ws_size > 0) ? ctx.alloc_buffer(bind_tensor, ws_size)
                                  : nullptr;

    ACL_CHECK(aclnnCast(ws_addr, ws_size, executor, ctx.stream()));
}
922+
923+
// Permute `acl_src` into `acl_dst` according to the axis order in `new_dim`
// (an array of `dims` entries). Workspace memory, if required, is leased from
// `ctx` with its lifetime bound to `bind_tensor`.
void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor *acl_src,
                   aclTensor* acl_dst, int64_t* new_dim, uint64_t dims,
                   ggml_tensor* bind_tensor) {
    aclIntArray* axis_order = aclCreateIntArray(new_dim, dims);

    // Two-phase aclnn call: query workspace size first, then launch.
    uint64_t ws_size = 0;
    aclOpExecutor* executor = nullptr;
    ACL_CHECK(aclnnPermuteGetWorkspaceSize(acl_src, axis_order, acl_dst,
                                           &ws_size, &executor));

    void* ws_addr = (ws_size > 0) ? ctx.alloc_buffer(bind_tensor, ws_size)
                                  : nullptr;

    ACL_CHECK(aclnnPermute(ws_addr, ws_size, executor, ctx.stream()));

    ACL_CHECK(aclDestroyIntArray(axis_order));
}
941+
942+
// Forward declarations for the aclnnIm2col operator entry points.
// NOTE(review): these are presumably provided by the CANN runtime library but
// missing from the shipped aclnnop headers, hence the local declaration —
// confirm against the installed CANN version. They must use C linkage to
// match the library's exported symbols.
#ifdef __cplusplus
extern "C" {
#endif
// Phase 1 of the standard two-phase aclnn call: computes the workspace size
// and prepares an executor for the im2col unfold of `self` into `out`.
aclnnStatus aclnnIm2colGetWorkspaceSize(const aclTensor* self,
                                        const aclIntArray* kernelSize,
                                        const aclIntArray* dilation,
                                        const aclIntArray* padding,
                                        const aclIntArray* stride,
                                        aclTensor* out, uint64_t* workspaceSize,
                                        aclOpExecutor** executor);
// Phase 2: launches the prepared executor on `stream` using `workspace`.
aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize,
                        aclOpExecutor* executor, aclrtStream stream);
#ifdef __cplusplus
}
#endif
957+
// Compute GGML_OP_IM2COL on the CANN backend.
//
// dst->src[0] is the convolution kernel (only its geometry is read) and
// dst->src[1] is the input image; dst->op_params packs stride (s0, s1),
// padding (p0, p1), dilation (d0, d1) and an is_2D flag.
//
// Pipeline:
//   1. aclnnIm2col unfolds src1 into [N, IC*KH*KW, OH*OW] in src1's dtype;
//   2. aclnnCast converts to dst's dtype when the two differ;
//   3. aclnnPermute swaps the two inner axes so the result lands in dst with
//      ggml's im2col layout [N, OH*OW, IC*KH*KW].
void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    ggml_tensor* src0 = dst->src[0];  // kernel
    ggml_tensor* src1 = dst->src[1];  // input

    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);

    const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
    const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
    const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
    const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
    const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
    const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
    const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;

    GGML_TENSOR_BINARY_OP_LOCALS;

    // In the 1D case the height dimension collapses to 1.
    const int64_t N  = is_2D ? ne13 : ne12;
    const int64_t IC = is_2D ? ne12 : ne11;
    const int64_t KH = is_2D ? ne01 : 1;
    const int64_t KW = ne00;
    const int64_t OH = is_2D ? ne2 : 1;
    const int64_t OW = ne1;

    GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
    GGML_ASSERT(nb10 == sizeof(float));

    // Step 1: im2col, [N, IC, IH, IW] -> [N, IC * KH * KW, OH * OW].
    aclTensor* acl_src1 = create_acl_tensor(src1);

    // ne[0] is innermost (ggml convention): {OH*OW, IC*KH*KW, N}.
    int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N};

    size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
    tmp_im2col_nb[0] = ggml_type_size(src1->type);
    tmp_im2col_nb[1] = tmp_im2col_nb[0] *
                       (tmp_im2col_ne[0] / ggml_blck_size(src1->type));
    for (int i = 2; i < GGML_MAX_DIMS - 1; i++) {
        tmp_im2col_nb[i] = tmp_im2col_nb[i - 1] * tmp_im2col_ne[i - 1];
    }

    // BUGFIX: the unfolded tensor holds N * IC*KH*KW * OH*OW
    // (= ggml_nelements(dst)) elements, which is larger than src1 whenever
    // kernel windows overlap (stride < kernel); sizing this scratch buffer as
    // ggml_nbytes(src1) would overflow it.
    void* tmp_im2col_buffer = ctx.alloc_buffer(
        dst, ggml_nelements(dst) * ggml_type_size(src1->type));
    aclTensor* tmp_im2col_tensor = create_acl_tensor(tmp_im2col_buffer,
                                                     type_mapping(src1->type),
                                                     ggml_type_size(src1->type),
                                                     tmp_im2col_ne,
                                                     tmp_im2col_nb,
                                                     GGML_MAX_DIMS - 1,
                                                     ACL_FORMAT_ND);

    // Attribute arrays are (H, W)-ordered: d0/s0/p0 apply to the width axis.
    std::vector<int64_t> kernel_dims = {KH, KW};
    std::vector<int64_t> dilation_size = {d1, d0};
    std::vector<int64_t> padding_dims = {p1, p0};
    std::vector<int64_t> stride_dims = {s1, s0};
    auto *kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
    auto *dilations = aclCreateIntArray(dilation_size.data(), 2);
    auto *paddings = aclCreateIntArray(padding_dims.data(), 2);
    auto *strides = aclCreateIntArray(stride_dims.data(), 2);

    uint64_t workspaceSize = 0;
    aclOpExecutor* executor = nullptr;
    void* workspaceAddr = nullptr;
    aclrtStream stream = ctx.stream();

    ACL_CHECK(aclnnIm2colGetWorkspaceSize(acl_src1, kernel_size, dilations,
                                          paddings, strides, tmp_im2col_tensor,
                                          &workspaceSize, &executor));
    if (workspaceSize > 0) {
        workspaceAddr = ctx.alloc_buffer(dst, workspaceSize);
    }

    ACL_CHECK(aclnnIm2col(workspaceAddr, workspaceSize, executor, stream));
    aclrtSynchronizeStream(ctx.stream());

    // Step 2: optional cast to dst's dtype. The cast scratch buffer is only
    // needed when a cast actually happens, so allocate it lazily.
    aclTensor* tmp_cast_tensor = nullptr;
    if (src1->type != dst->type) {
        size_t temp_cast_nb[GGML_MAX_DIMS - 1];
        temp_cast_nb[0] = ggml_type_size(dst->type);
        temp_cast_nb[1] = temp_cast_nb[0] *
                          (tmp_im2col_ne[0] / ggml_blck_size(dst->type));
        for (int i = 2; i < GGML_MAX_DIMS - 1; i++) {
            temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1];
        }

        void* tmp_cast_buffer = ctx.alloc_buffer(dst, ggml_nbytes(dst));
        tmp_cast_tensor = create_acl_tensor(tmp_cast_buffer,
                                            type_mapping(dst->type),
                                            ggml_type_size(dst->type),
                                            tmp_im2col_ne, temp_cast_nb,
                                            GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
        aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor,
                   type_mapping(dst->type), dst);
        aclrtSynchronizeStream(ctx.stream());
    }

    // Step 3: permute [N, IC*KH*KW, OH*OW] -> [N, OH*OW, IC*KH*KW], writing
    // straight into dst viewed as a 3D tensor.
    int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
    size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
    aclTensor* acl_dst = create_acl_tensor(dst, dst_ne, dst_nb,
                                           GGML_MAX_DIMS - 1);

    int64_t permute_dim[] = {0, 2, 1};
    if (tmp_cast_tensor != nullptr) {
        aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3, dst);
    } else {
        aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3, dst);
    }
    aclrtSynchronizeStream(ctx.stream());

    // release
    ACL_CHECK(aclDestroyTensor(acl_src1));
    ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
    if (tmp_cast_tensor != nullptr) {
        // BUGFIX: only destroy the cast tensor when it was created; the
        // original unconditionally passed nullptr here in the no-cast path.
        ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
    }
    ACL_CHECK(aclDestroyTensor(acl_dst));
    ACL_CHECK(aclDestroyIntArray(kernel_size));
    ACL_CHECK(aclDestroyIntArray(dilations));
    ACL_CHECK(aclDestroyIntArray(paddings));
    ACL_CHECK(aclDestroyIntArray(strides));
}

ggml-cann/aclnn_ops.h

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,14 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst);
5454

5555
void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float value);
5656

57+
void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
58+
59+
void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
60+
61+
void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
62+
63+
void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst);
64+
5765
template <aclnnStatus getWorkspaceSize(const aclTensor*, const aclTensor*,
5866
aclTensor*, uint64_t*, aclOpExecutor**),
5967
aclnnStatus execute(void*, uint64_t, aclOpExecutor*, aclrtStream)>
@@ -159,8 +167,4 @@ void ggml_cann_activation(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
159167
void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
160168
ggml_tensor* dst);
161169

162-
void ggml_cann_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
163-
void ggml_cann_avg_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
164-
void ggml_cann_max_pool2d(ggml_backend_cann_context& ctx, ggml_tensor* dst);
165-
166170
#endif // CANN_ACLNN_OPS

0 commit comments

Comments
 (0)