Skip to content

Commit 136775e

Browse files
committed
fix im2col with f16
1 parent f77d82f commit 136775e

File tree

1 file changed

+67
-70
lines changed

1 file changed

+67
-70
lines changed

ggml-cann/aclnn_ops.cpp

Lines changed: 67 additions & 70 deletions
Original file line number | Diff line number | Diff line change
@@ -11,9 +11,9 @@
1111
#include <aclnnop/aclnn_reduce_sum.h>
1212
#include <aclnnop/aclnn_repeat.h>
1313
#include <aclnnop/aclnn_softmax.h>
14-
#include <aclnnop/aclnn_upsample_nearest_2d.h>
1514
#include <aclnnop/aclnn_tril.h>
1615
#include <aclnnop/aclnn_triu.h>
16+
#include <aclnnop/aclnn_upsample_nearest_2d.h>
1717
#include <float.h>
1818

1919
#include <cmath>
@@ -548,8 +548,9 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
548548
ACL_CHECK(aclDestroyTensor(acl_dst));
549549
}
550550

551-
void aclnn_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclTensor* acl_src,
552-
aclTensor* acl_dst, int64_t* paddings, float value = 0.0f) {
551+
void aclnn_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst,
552+
aclTensor* acl_src, aclTensor* acl_dst, int64_t* paddings,
553+
float value = 0.0f) {
553554
aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2);
554555
aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
555556

@@ -772,8 +773,9 @@ aclnnStatus aclnnRmsNorm(void* workspace, uint64_t workspaceSize,
772773
}
773774
#endif
774775

775-
aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, ggml_tensor* dst, int64_t* ne, int64_t dims,
776-
aclDataType type, size_t type_size) {
776+
aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, ggml_tensor* dst,
777+
int64_t* ne, int64_t dims, aclDataType type,
778+
size_t type_size) {
777779
int64_t elements = 1;
778780
for (int i = 0; i < dims; i++) {
779781
elements *= ne[i];
@@ -792,8 +794,9 @@ aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, ggml_tensor* dst, int64_t*
792794
return zero;
793795
}
794796

795-
aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, ggml_tensor* dst, int64_t* ne, int64_t dims,
796-
aclDataType type, size_t type_size, float value = 1.0f) {
797+
aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, ggml_tensor* dst,
798+
int64_t* ne, int64_t dims, aclDataType type,
799+
size_t type_size, float value = 1.0f) {
797800
aclTensor* acl_tensor = aclnn_zero(ctx, dst, ne, dims, type, type_size);
798801
float alpha_host = 1.0f;
799802
aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT);
@@ -830,8 +833,8 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
830833
aclOpExecutor* executor;
831834
void* workspaceAddr = nullptr;
832835

833-
aclTensor* acl_gamma = aclnn_ones(ctx, dst, src->ne, 1, type_mapping(src->type),
834-
ggml_element_size(src));
836+
aclTensor* acl_gamma = aclnn_ones(
837+
ctx, dst, src->ne, 1, type_mapping(src->type), ggml_element_size(src));
835838

836839
int64_t rstd_ne[] = {1, src->ne[1], src->ne[2], src->ne[3]};
837840
aclTensor* acl_rstd =
@@ -855,30 +858,34 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
855858
}
856859

857860
// TODO: performance is low.
858-
void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float value) {
861+
void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
862+
float value) {
859863
ggml_tensor* src = dst->src[0];
860864

861865
aclTensor* acl_src = create_acl_tensor(src);
862866
aclTensor* acl_dst = create_acl_tensor(dst);
863867

864-
const int n_past = ((int32_t *) dst->op_params)[0];
868+
const int n_past = ((int32_t*)dst->op_params)[0];
865869

866870
aclTensor* mask_tensor =
867871
aclnn_ones(ctx, dst, src->ne, GGML_MAX_DIMS, type_mapping(src->type),
868872
ggml_element_size(src), value);
869-
873+
870874
uint64_t workspaceSize = 0;
871875
aclOpExecutor* executor;
872876
void* workspaceAddr = nullptr;
873877

874-
ACL_CHECK(aclnnInplaceTriuGetWorkspaceSize(mask_tensor, n_past+1, &workspaceSize, &executor));
878+
ACL_CHECK(aclnnInplaceTriuGetWorkspaceSize(mask_tensor, n_past + 1,
879+
&workspaceSize, &executor));
875880
if (workspaceSize > 0) {
876881
workspaceAddr = ctx.alloc_buffer(dst, workspaceSize);
877882
}
878883

879-
ACL_CHECK(aclnnInplaceTriu(workspaceAddr, workspaceSize, executor, ctx.stream()));
884+
ACL_CHECK(
885+
aclnnInplaceTriu(workspaceAddr, workspaceSize, executor, ctx.stream()));
880886

881-
ACL_CHECK(aclnnTrilGetWorkspaceSize(acl_src, n_past+1, acl_dst, &workspaceSize, &executor));
887+
ACL_CHECK(aclnnTrilGetWorkspaceSize(acl_src, n_past + 1, acl_dst,
888+
&workspaceSize, &executor));
882889
if (workspaceSize > 0) {
883890
workspaceAddr = ctx.alloc_buffer(dst, workspaceSize);
884891
}
@@ -911,16 +918,16 @@ void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
911918
void* workspaceAddr = nullptr;
912919
aclrtStream stream = ctx.stream();
913920

914-
ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type,
915-
acl_dst, &workspaceSize, &executor));
921+
ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type, acl_dst,
922+
&workspaceSize, &executor));
916923
if (workspaceSize > 0) {
917924
workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
918925
}
919926

920927
ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, stream));
921928
}
922929

923-
void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor *acl_src,
930+
void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src,
924931
aclTensor* acl_dst, int64_t* new_dim, uint64_t dims,
925932
ggml_tensor* bind_tensor) {
926933
aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims);
@@ -929,12 +936,14 @@ void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor *acl_src,
929936
aclOpExecutor* executor;
930937
void* workspaceAddr = nullptr;
931938

932-
ACL_CHECK(aclnnPermuteGetWorkspaceSize(acl_src, acl_dims, acl_dst, &workspaceSize, &executor));
933-
if(workspaceSize > 0) {
939+
ACL_CHECK(aclnnPermuteGetWorkspaceSize(acl_src, acl_dims, acl_dst,
940+
&workspaceSize, &executor));
941+
if (workspaceSize > 0) {
934942
workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
935943
}
936944

937-
ACL_CHECK(aclnnPermute(workspaceAddr, workspaceSize, executor, ctx.stream()));
945+
ACL_CHECK(
946+
aclnnPermute(workspaceAddr, workspaceSize, executor, ctx.stream()));
938947

939948
ACL_CHECK(aclDestroyIntArray(acl_dims));
940949
}
@@ -955,24 +964,24 @@ aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize,
955964
}
956965
#endif
957966
void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
958-
ggml_tensor* src0 = dst->src[0]; // kernel
959-
ggml_tensor* src1 = dst->src[1]; // input
967+
ggml_tensor* src0 = dst->src[0]; // kernel
968+
ggml_tensor* src1 = dst->src[1]; // input
960969

961970
GGML_ASSERT(src0->type == GGML_TYPE_F16);
962971
GGML_ASSERT(src1->type == GGML_TYPE_F32);
963972
GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
964973

965-
const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
966-
const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
967-
const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
968-
const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
969-
const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
970-
const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
971-
const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
974+
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
975+
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
976+
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
977+
const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
978+
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
979+
const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
980+
const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
972981

973982
GGML_TENSOR_BINARY_OP_LOCALS;
974983

975-
const int64_t N = is_2D ? ne13 : ne12;
984+
const int64_t N = is_2D ? ne13 : ne12;
976985
const int64_t IC = is_2D ? ne12 : ne11;
977986
const int64_t IH = is_2D ? ne11 : 1;
978987
const int64_t IW = ne10;
@@ -988,34 +997,31 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
988997

989998
// im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH]
990999
aclTensor* acl_src1 = create_acl_tensor(src1);
991-
992-
int64_t tmp_im2col_ne[] = {OW * OH , IC * KH * KW, N};
993-
1000+
int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N};
9941001
size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
1002+
9951003
tmp_im2col_nb[0] = ggml_type_size(src1->type);
996-
tmp_im2col_nb[1] = tmp_im2col_nb[0] * (tmp_im2col_ne[0] /
997-
ggml_blck_size(src1->type));
998-
for (int i = 2; i < GGML_MAX_DIMS-1; i++) {
999-
tmp_im2col_nb[i] = tmp_im2col_nb[i-1] * tmp_im2col_ne[i-1];
1004+
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
1005+
tmp_im2col_nb[i] = tmp_im2col_nb[i - 1] * tmp_im2col_ne[i - 1];
10001006
}
10011007

1002-
void* tmp_im2col_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src1));
1003-
aclTensor* tmp_im2col_tensor = create_acl_tensor(tmp_im2col_buffer,
1004-
type_mapping(src1->type),
1005-
ggml_type_size(src1->type),
1006-
tmp_im2col_ne,
1007-
tmp_im2col_nb,
1008-
GGML_MAX_DIMS-1,
1009-
ACL_FORMAT_ND);
1008+
// Calculate im2col.
1009+
// If dst is f16, tmp_buffer is f32, so we need to allocate
1010+
// src.typesize * dst.elemcount bytes.
1011+
void* tmp_im2col_buffer =
1012+
ctx.alloc_buffer(dst, ggml_nelements(dst) * ggml_element_size(src1));
1013+
aclTensor* tmp_im2col_tensor = create_acl_tensor(
1014+
tmp_im2col_buffer, type_mapping(src1->type), ggml_type_size(src1->type),
1015+
tmp_im2col_ne, tmp_im2col_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
10101016

10111017
std::vector<int64_t> kernel_dims = {KH, KW};
10121018
std::vector<int64_t> dilation_size = {d1, d0};
10131019
std::vector<int64_t> padding_dims = {p1, p0};
10141020
std::vector<int64_t> stride_dims = {s1, s0};
1015-
auto *kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
1016-
auto *dilations = aclCreateIntArray(dilation_size.data(), 2);
1017-
auto *paddings = aclCreateIntArray(padding_dims.data(), 2);
1018-
auto *strides = aclCreateIntArray(stride_dims.data(), 2);
1021+
auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
1022+
auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
1023+
auto* paddings = aclCreateIntArray(padding_dims.data(), 2);
1024+
auto* strides = aclCreateIntArray(stride_dims.data(), 2);
10191025

10201026
uint64_t workspaceSize = 0;
10211027
aclOpExecutor* executor;
@@ -1031,45 +1037,36 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
10311037
}
10321038

10331039
ACL_CHECK(aclnnIm2col(workspaceAddr, workspaceSize, executor, stream));
1034-
aclrtSynchronizeStream(ctx.stream());
10351040

1036-
// cast
1037-
void* tmp_cast_buffer = ctx.alloc_buffer(dst, ggml_nbytes(dst));
1041+
// Cast if dst is f16.
10381042
aclTensor* tmp_cast_tensor = nullptr;
10391043
if (src1->type != dst->type) {
1040-
1044+
void* tmp_cast_buffer = ctx.alloc_buffer(dst, ggml_nbytes(dst));
10411045
size_t temp_cast_nb[GGML_MAX_DIMS - 1];
10421046
temp_cast_nb[0] = ggml_type_size(dst->type);
1043-
temp_cast_nb[1] = temp_cast_nb[0] * (tmp_im2col_ne[0] /
1044-
ggml_blck_size(dst->type));
1045-
for (int i = 2; i < GGML_MAX_DIMS-1; i++) {
1046-
temp_cast_nb[i] = temp_cast_nb[i-1] * tmp_im2col_ne[i-1];
1047+
for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
1048+
temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1];
10471049
}
10481050

1049-
tmp_cast_tensor = create_acl_tensor(tmp_cast_buffer,
1050-
type_mapping(dst->type),
1051-
ggml_type_size(dst->type),
1052-
tmp_im2col_ne, temp_cast_nb,
1053-
GGML_MAX_DIMS-1, ACL_FORMAT_ND);
1051+
tmp_cast_tensor = create_acl_tensor(
1052+
tmp_cast_buffer, type_mapping(dst->type), ggml_type_size(dst->type),
1053+
tmp_im2col_ne, temp_cast_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
10541054
aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor,
10551055
type_mapping(dst->type), dst);
1056-
aclrtSynchronizeStream(ctx.stream());
10571056
}
10581057

1059-
// permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
1058+
// Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
10601059
int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
10611060
size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
1062-
aclTensor* acl_dst = create_acl_tensor(dst, dst_ne, dst_nb,
1063-
GGML_MAX_DIMS-1);
1061+
aclTensor* acl_dst =
1062+
create_acl_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);
10641063

10651064
int64_t permute_dim[] = {0, 2, 1};
10661065
if (src1->type != dst->type) {
10671066
aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3, dst);
1068-
}
1069-
else {
1067+
} else {
10701068
aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3, dst);
10711069
}
1072-
aclrtSynchronizeStream(ctx.stream());
10731070

10741071
// release
10751072
ACL_CHECK(aclDestroyTensor(acl_src1));

0 commit comments

Comments
 (0)