#include <aclnnop/aclnn_reduce_sum.h>
#include <aclnnop/aclnn_repeat.h>
#include <aclnnop/aclnn_softmax.h>
- #include <aclnnop/aclnn_upsample_nearest_2d.h>
#include <aclnnop/aclnn_tril.h>
#include <aclnnop/aclnn_triu.h>
+ #include <aclnnop/aclnn_upsample_nearest_2d.h>
#include <float.h>

#include <cmath>
@@ -548,8 +548,9 @@ void ggml_cann_upsample_nearest2d(ggml_backend_cann_context& ctx,
    ACL_CHECK(aclDestroyTensor(acl_dst));
}

- void aclnn_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst, aclTensor* acl_src,
-                aclTensor* acl_dst, int64_t* paddings, float value = 0.0f) {
+ void aclnn_pad(ggml_backend_cann_context& ctx, ggml_tensor* dst,
+                aclTensor* acl_src, aclTensor* acl_dst, int64_t* paddings,
+                float value = 0.0f) {
    aclIntArray* acl_pad = aclCreateIntArray(paddings, GGML_MAX_DIMS * 2);
    aclScalar* acl_value = aclCreateScalar(&value, aclDataType::ACL_FLOAT);
@@ -772,8 +773,9 @@ aclnnStatus aclnnRmsNorm(void* workspace, uint64_t workspaceSize,
}
#endif

- aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, ggml_tensor* dst, int64_t* ne, int64_t dims,
-                       aclDataType type, size_t type_size) {
+ aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, ggml_tensor* dst,
+                       int64_t* ne, int64_t dims, aclDataType type,
+                       size_t type_size) {
    int64_t elements = 1;
    for (int i = 0; i < dims; i++) {
        elements *= ne[i];
@@ -792,8 +794,9 @@ aclTensor* aclnn_zero(ggml_backend_cann_context& ctx, ggml_tensor* dst, int64_t*
    return zero;
}

- aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, ggml_tensor* dst, int64_t* ne, int64_t dims,
-                       aclDataType type, size_t type_size, float value = 1.0f) {
+ aclTensor* aclnn_ones(ggml_backend_cann_context& ctx, ggml_tensor* dst,
+                       int64_t* ne, int64_t dims, aclDataType type,
+                       size_t type_size, float value = 1.0f) {
    aclTensor* acl_tensor = aclnn_zero(ctx, dst, ne, dims, type, type_size);
    float alpha_host = 1.0f;
    aclScalar* alpha = aclCreateScalar(&alpha_host, aclDataType::ACL_FLOAT);
@@ -830,8 +833,8 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

-     aclTensor* acl_gamma = aclnn_ones(ctx, dst, src->ne, 1, type_mapping(src->type),
-                                       ggml_element_size(src));
+     aclTensor* acl_gamma = aclnn_ones(
+         ctx, dst, src->ne, 1, type_mapping(src->type), ggml_element_size(src));

    int64_t rstd_ne[] = {1, src->ne[1], src->ne[2], src->ne[3]};
    aclTensor* acl_rstd =
@@ -855,30 +858,34 @@ void ggml_cann_rms_norm(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
}

// TODO: performace is low.
- void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float value) {
+ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst,
+                          float value) {
    ggml_tensor* src = dst->src[0];

    aclTensor* acl_src = create_acl_tensor(src);
    aclTensor* acl_dst = create_acl_tensor(dst);

-     const int n_past = ((int32_t *) dst->op_params)[0];
+     const int n_past = ((int32_t *) dst->op_params)[0];

    aclTensor* mask_tensor =
        aclnn_ones(ctx, dst, src->ne, GGML_MAX_DIMS, type_mapping(src->type),
                   ggml_element_size(src), value);
-
+
    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

-     ACL_CHECK(aclnnInplaceTriuGetWorkspaceSize(mask_tensor, n_past+1, &workspaceSize, &executor));
+     ACL_CHECK(aclnnInplaceTriuGetWorkspaceSize(mask_tensor, n_past + 1,
+                                                &workspaceSize, &executor));
    if (workspaceSize > 0) {
        workspaceAddr = ctx.alloc_buffer(dst, workspaceSize);
    }

-     ACL_CHECK(aclnnInplaceTriu(workspaceAddr, workspaceSize, executor, ctx.stream()));
+     ACL_CHECK(
+         aclnnInplaceTriu(workspaceAddr, workspaceSize, executor, ctx.stream()));

-     ACL_CHECK(aclnnTrilGetWorkspaceSize(acl_src, n_past+1, acl_dst, &workspaceSize, &executor));
+     ACL_CHECK(aclnnTrilGetWorkspaceSize(acl_src, n_past + 1, acl_dst,
+                                         &workspaceSize, &executor));
    if (workspaceSize > 0) {
        workspaceAddr = ctx.alloc_buffer(dst, workspaceSize);
    }
@@ -911,16 +918,16 @@ void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
    void* workspaceAddr = nullptr;
    aclrtStream stream = ctx.stream();

-     ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type,
-                                         acl_dst, &workspaceSize, &executor));
+     ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type, acl_dst,
+                                         &workspaceSize, &executor));
    if (workspaceSize > 0) {
        workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
    }

    ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, stream));
}

- void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor *acl_src,
+ void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src,
                   aclTensor* acl_dst, int64_t* new_dim, uint64_t dims,
                   ggml_tensor* bind_tensor) {
    aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims);
@@ -929,12 +936,14 @@ void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor *acl_src,
    aclOpExecutor* executor;
    void* workspaceAddr = nullptr;

-     ACL_CHECK(aclnnPermuteGetWorkspaceSize(acl_src, acl_dims, acl_dst, &workspaceSize, &executor));
-     if (workspaceSize > 0) {
+     ACL_CHECK(aclnnPermuteGetWorkspaceSize(acl_src, acl_dims, acl_dst,
+                                            &workspaceSize, &executor));
+     if (workspaceSize > 0) {
        workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
    }

-     ACL_CHECK(aclnnPermute(workspaceAddr, workspaceSize, executor, ctx.stream()));
+     ACL_CHECK(
+         aclnnPermute(workspaceAddr, workspaceSize, executor, ctx.stream()));

    ACL_CHECK(aclDestroyIntArray(acl_dims));
}
@@ -955,24 +964,24 @@ aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize,
}
#endif
void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
-     ggml_tensor* src0 = dst->src[0];  // kernel
-     ggml_tensor* src1 = dst->src[1];  // input
+     ggml_tensor* src0 = dst->src[0];  // kernel
+     ggml_tensor* src1 = dst->src[1];  // input

    GGML_ASSERT(src0->type == GGML_TYPE_F16);
    GGML_ASSERT(src1->type == GGML_TYPE_F32);
    GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);

-     const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
-     const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
-     const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
-     const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
-     const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
-     const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
-     const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;
+     const int32_t s0 = ((const int32_t *)(dst->op_params))[0];
+     const int32_t s1 = ((const int32_t *)(dst->op_params))[1];
+     const int32_t p0 = ((const int32_t *)(dst->op_params))[2];
+     const int32_t p1 = ((const int32_t *)(dst->op_params))[3];
+     const int32_t d0 = ((const int32_t *)(dst->op_params))[4];
+     const int32_t d1 = ((const int32_t *)(dst->op_params))[5];
+     const bool is_2D = ((const int32_t *)(dst->op_params))[6] == 1;

    GGML_TENSOR_BINARY_OP_LOCALS;

-     const int64_t N = is_2D ? ne13 : ne12;
+     const int64_t N = is_2D ? ne13 : ne12;
    const int64_t IC = is_2D ? ne12 : ne11;
    const int64_t IH = is_2D ? ne11 : 1;
    const int64_t IW = ne10;
@@ -988,34 +997,31 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH]
    aclTensor* acl_src1 = create_acl_tensor(src1);
-
-     int64_t tmp_im2col_ne[] = {OW * OH , IC * KH * KW, N};
-
+     int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N};
    size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
+
    tmp_im2col_nb[0] = ggml_type_size(src1->type);
-     tmp_im2col_nb[1] = tmp_im2col_nb[0] * (tmp_im2col_ne[0] /
-                        ggml_blck_size(src1->type));
-     for (int i = 2; i < GGML_MAX_DIMS-1; i++) {
-         tmp_im2col_nb[i] = tmp_im2col_nb[i-1] * tmp_im2col_ne[i-1];
+     for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
+         tmp_im2col_nb[i] = tmp_im2col_nb[i - 1] * tmp_im2col_ne[i - 1];
    }

-     void* tmp_im2col_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src1));
-     aclTensor* tmp_im2col_tensor = create_acl_tensor(tmp_im2col_buffer,
-                                                      type_mapping(src1->type),
-                                                      ggml_type_size(src1->type),
-                                                      tmp_im2col_ne,
-                                                      tmp_im2col_nb,
-                                                      GGML_MAX_DIMS-1,
-                                                      ACL_FORMAT_ND);
+     // Calculate im2col.
+     // If dst is f16, tmp_buffer is f32, we need alloc src.typesize *
+     // dst.elemcount.
+     void* tmp_im2col_buffer =
+         ctx.alloc_buffer(dst, ggml_nelements(dst) * ggml_element_size(src1));
+     aclTensor* tmp_im2col_tensor = create_acl_tensor(
+         tmp_im2col_buffer, type_mapping(src1->type), ggml_type_size(src1->type),
+         tmp_im2col_ne, tmp_im2col_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);

    std::vector<int64_t> kernel_dims = {KH, KW};
    std::vector<int64_t> dilation_size = {d1, d0};
    std::vector<int64_t> padding_dims = {p1, p0};
    std::vector<int64_t> stride_dims = {s1, s0};
-     auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
-     auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
-     auto* paddings = aclCreateIntArray(padding_dims.data(), 2);
-     auto* strides = aclCreateIntArray(stride_dims.data(), 2);
+     auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
+     auto* dilations = aclCreateIntArray(dilation_size.data(), 2);
+     auto* paddings = aclCreateIntArray(padding_dims.data(), 2);
+     auto* strides = aclCreateIntArray(stride_dims.data(), 2);

    uint64_t workspaceSize = 0;
    aclOpExecutor* executor;
@@ -1031,45 +1037,36 @@ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
    }

    ACL_CHECK(aclnnIm2col(workspaceAddr, workspaceSize, executor, stream));
-     aclrtSynchronizeStream(ctx.stream());

-     // cast
-     void* tmp_cast_buffer = ctx.alloc_buffer(dst, ggml_nbytes(dst));
+     // Cast if dst is f16.
    aclTensor* tmp_cast_tensor = nullptr;
    if (src1->type != dst->type) {
-
+         void* tmp_cast_buffer = ctx.alloc_buffer(dst, ggml_nbytes(dst));
        size_t temp_cast_nb[GGML_MAX_DIMS - 1];
        temp_cast_nb[0] = ggml_type_size(dst->type);
-         temp_cast_nb[1] = temp_cast_nb[0] * (tmp_im2col_ne[0] /
-                           ggml_blck_size(dst->type));
-         for (int i = 2; i < GGML_MAX_DIMS-1; i++) {
-             temp_cast_nb[i] = temp_cast_nb[i-1] * tmp_im2col_ne[i-1];
+         for (int i = 1; i < GGML_MAX_DIMS - 1; i++) {
+             temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1];
        }

-         tmp_cast_tensor = create_acl_tensor(tmp_cast_buffer,
-                                             type_mapping(dst->type),
-                                             ggml_type_size(dst->type),
-                                             tmp_im2col_ne, temp_cast_nb,
-                                             GGML_MAX_DIMS-1, ACL_FORMAT_ND);
+         tmp_cast_tensor = create_acl_tensor(
+             tmp_cast_buffer, type_mapping(dst->type), ggml_type_size(dst->type),
+             tmp_im2col_ne, temp_cast_nb, GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
        aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor,
                   type_mapping(dst->type), dst);
-         aclrtSynchronizeStream(ctx.stream());
    }

-     // permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
+     // Permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
    int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
    size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
-     aclTensor* acl_dst = create_acl_tensor(dst, dst_ne, dst_nb,
-                                            GGML_MAX_DIMS-1);
+     aclTensor* acl_dst =
+         create_acl_tensor(dst, dst_ne, dst_nb, GGML_MAX_DIMS - 1);

    int64_t permute_dim[] = {0, 2, 1};
    if (src1->type != dst->type) {
        aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3, dst);
-     }
-     else {
+     } else {
        aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3, dst);
    }
-     aclrtSynchronizeStream(ctx.stream());

    // release
    ACL_CHECK(aclDestroyTensor(acl_src1));