#include <aclnnop/aclnn_group_norm.h>
#include <aclnnop/aclnn_layer_norm.h>
#include <aclnnop/aclnn_max_pool.h>
+ #include <aclnnop/aclnn_permute.h>
#include <aclnnop/aclnn_reduce_sum.h>
#include <aclnnop/aclnn_repeat.h>
#include <aclnnop/aclnn_softmax.h>
@@ -900,4 +901,183 @@ void ggml_cann_diag_mask(ggml_backend_cann_context& ctx, ggml_tensor* dst, float
    ACL_CHECK(aclDestroyTensor(mask_tensor));
    ACL_CHECK(aclDestroyTensor(acl_src));
    ACL_CHECK(aclDestroyTensor(acl_dst));
- }
+ }
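+
+ // Helper wrapping aclnnCast: converts acl_src to cast_data_type and writes the
+ // result into acl_dst. Like the other aclnn helpers in this file, it follows the
+ // two-phase ACLNN pattern: query the workspace size, allocate the workspace via
+ // ctx.alloc_buffer() (tracked against bind_tensor), then launch on ctx's stream.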
+ void aclnn_cast(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                 aclTensor* acl_dst, aclDataType cast_data_type,
+                 ggml_tensor* bind_tensor) {
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+     aclrtStream stream = ctx.stream();
+
+     ACL_CHECK(aclnnCastGetWorkspaceSize(acl_src, cast_data_type,
+                                         acl_dst, &workspaceSize, &executor));
+     if (workspaceSize > 0) {
+         workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
+     }
+
+     ACL_CHECK(aclnnCast(workspaceAddr, workspaceSize, executor, stream));
+ }
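+
+ // Helper wrapping aclnnPermute: reorders the axes of acl_src into acl_dst
+ // according to new_dim (an array of `dims` axis indices), using the same
+ // workspace-query/execute pattern as aclnn_cast above.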
+ void aclnn_permute(ggml_backend_cann_context& ctx, aclTensor* acl_src,
+                    aclTensor* acl_dst, int64_t* new_dim, uint64_t dims,
+                    ggml_tensor* bind_tensor) {
+     aclIntArray* acl_dims = aclCreateIntArray(new_dim, dims);
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+
+     ACL_CHECK(aclnnPermuteGetWorkspaceSize(acl_src, acl_dims, acl_dst,
+                                            &workspaceSize, &executor));
+     if (workspaceSize > 0) {
+         workspaceAddr = ctx.alloc_buffer(bind_tensor, workspaceSize);
+     }
+
+     ACL_CHECK(aclnnPermute(workspaceAddr, workspaceSize, executor, ctx.stream()));
+
+     ACL_CHECK(aclDestroyIntArray(acl_dims));
+ }
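+
+ // Forward declarations for the Im2col ACLNN entry points, presumably because no
+ // aclnnop header for them is included here; the extern "C" block keeps the names
+ // matching the library's C symbols.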
+ #ifdef __cplusplus
+ extern "C" {
+ #endif
+ aclnnStatus aclnnIm2colGetWorkspaceSize(const aclTensor* self,
+                                         const aclIntArray* kernelSize,
+                                         const aclIntArray* dilation,
+                                         const aclIntArray* padding,
+                                         const aclIntArray* stride,
+                                         aclTensor* out, uint64_t* workspaceSize,
+                                         aclOpExecutor** executor);
+ aclnnStatus aclnnIm2col(void* workspace, uint64_t workspaceSize,
+                         aclOpExecutor* executor, aclrtStream stream);
+ #ifdef __cplusplus
+ }
+ #endif
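+
+ // GGML_OP_IM2COL for the CANN backend, in three steps: run aclnnIm2col into a
+ // temporary buffer of shape [N, IC * KH * KW, OW * OH], cast the result to
+ // dst->type when it differs from src1->type, then permute the last two axes
+ // into dst's [N, OW * OH, IC * KH * KW] layout.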
+ void ggml_cann_im2col(ggml_backend_cann_context& ctx, ggml_tensor* dst) {
+     ggml_tensor* src0 = dst->src[0];  // kernel
+     ggml_tensor* src1 = dst->src[1];  // input
+
+     GGML_ASSERT(src0->type == GGML_TYPE_F16);
+     GGML_ASSERT(src1->type == GGML_TYPE_F32);
+     GGML_ASSERT(dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32);
+
+     const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
+     const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
+     const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
+     const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
+     const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
+     const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
+     const bool is_2D = ((const int32_t*)(dst->op_params))[6] == 1;
+
+     GGML_TENSOR_BINARY_OP_LOCALS;
+
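+     // Geometry: the input (src1) supplies N, IC, IH and IW, the kernel (src0)
+     // supplies KH and KW, and dst already carries the output extents OH and OW;
+     // in the 1D case (is_2D == false) the height-related extents collapse to 1.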
+     const int64_t N  = is_2D ? ne13 : ne12;
+     const int64_t IC = is_2D ? ne12 : ne11;
+     const int64_t IH = is_2D ? ne11 : 1;
+     const int64_t IW = ne10;
+
+     const int64_t KH = is_2D ? ne01 : 1;
+     const int64_t KW = ne00;
+
+     const int64_t OH = is_2D ? ne2 : 1;
+     const int64_t OW = ne1;
+
+     GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
+     GGML_ASSERT(nb10 == sizeof(float));
+
+     // im2col: [N,C,H,W] -> [N, IC * KH * KW, OW * OH]
+     aclTensor* acl_src1 = create_acl_tensor(src1);
+
+     int64_t tmp_im2col_ne[] = {OW * OH, IC * KH * KW, N};
+
+     size_t tmp_im2col_nb[GGML_MAX_DIMS - 1];
+     tmp_im2col_nb[0] = ggml_type_size(src1->type);
+     tmp_im2col_nb[1] = tmp_im2col_nb[0] * (tmp_im2col_ne[0] /
+                        ggml_blck_size(src1->type));
+     for (int i = 2; i < GGML_MAX_DIMS - 1; i++) {
+         tmp_im2col_nb[i] = tmp_im2col_nb[i - 1] * tmp_im2col_ne[i - 1];
+     }
+
+     void* tmp_im2col_buffer = ctx.alloc_buffer(dst, ggml_nbytes(src1));
+     aclTensor* tmp_im2col_tensor = create_acl_tensor(tmp_im2col_buffer,
+                                                      type_mapping(src1->type),
+                                                      ggml_type_size(src1->type),
+                                                      tmp_im2col_ne,
+                                                      tmp_im2col_nb,
+                                                      GGML_MAX_DIMS - 1,
+                                                      ACL_FORMAT_ND);
+
+     std::vector<int64_t> kernel_dims   = {KH, KW};
+     std::vector<int64_t> dilation_size = {d1, d0};
+     std::vector<int64_t> padding_dims  = {p1, p0};
+     std::vector<int64_t> stride_dims   = {s1, s0};
+     auto* kernel_size = aclCreateIntArray(kernel_dims.data(), 2);
+     auto* dilations   = aclCreateIntArray(dilation_size.data(), 2);
+     auto* paddings    = aclCreateIntArray(padding_dims.data(), 2);
+     auto* strides     = aclCreateIntArray(stride_dims.data(), 2);
+
+     uint64_t workspaceSize = 0;
+     aclOpExecutor* executor;
+     void* workspaceAddr = nullptr;
+     aclrtStream stream = ctx.stream();
+
+     ACL_CHECK(aclnnIm2colGetWorkspaceSize(acl_src1, kernel_size, dilations,
+                                           paddings, strides, tmp_im2col_tensor,
+                                           &workspaceSize, &executor));
+
+     if (workspaceSize > 0) {
+         workspaceAddr = ctx.alloc_buffer(dst, workspaceSize);
+     }
+
+     ACL_CHECK(aclnnIm2col(workspaceAddr, workspaceSize, executor, stream));
+     aclrtSynchronizeStream(ctx.stream());
+
+     // cast: aclnnIm2col writes its result in src1's type (F32 here); if dst is
+     // F16, the temporary buffer is converted to dst->type before the permute.
+     void* tmp_cast_buffer = ctx.alloc_buffer(dst, ggml_nbytes(dst));
+     aclTensor* tmp_cast_tensor = nullptr;
+     if (src1->type != dst->type) {
+         size_t temp_cast_nb[GGML_MAX_DIMS - 1];
+         temp_cast_nb[0] = ggml_type_size(dst->type);
+         temp_cast_nb[1] = temp_cast_nb[0] * (tmp_im2col_ne[0] /
+                           ggml_blck_size(dst->type));
+         for (int i = 2; i < GGML_MAX_DIMS - 1; i++) {
+             temp_cast_nb[i] = temp_cast_nb[i - 1] * tmp_im2col_ne[i - 1];
+         }
+
+         tmp_cast_tensor = create_acl_tensor(tmp_cast_buffer,
+                                             type_mapping(dst->type),
+                                             ggml_type_size(dst->type),
+                                             tmp_im2col_ne, temp_cast_nb,
+                                             GGML_MAX_DIMS - 1, ACL_FORMAT_ND);
+         aclnn_cast(ctx, tmp_im2col_tensor, tmp_cast_tensor,
+                    type_mapping(dst->type), dst);
+         aclrtSynchronizeStream(ctx.stream());
+     }
+
+     // permute: [N, IC * KH * KW, OW * OH] -> [N, OW * OH, IC * KH * KW]
+     int64_t dst_ne[] = {dst->ne[0], dst->ne[1] * dst->ne[2], dst->ne[3]};
+     size_t dst_nb[] = {dst->nb[0], dst->nb[1], dst->nb[3]};
+     aclTensor* acl_dst = create_acl_tensor(dst, dst_ne, dst_nb,
+                                            GGML_MAX_DIMS - 1);
+
+     int64_t permute_dim[] = {0, 2, 1};
+     if (src1->type != dst->type) {
+         aclnn_permute(ctx, tmp_cast_tensor, acl_dst, permute_dim, 3, dst);
+     } else {
+         aclnn_permute(ctx, tmp_im2col_tensor, acl_dst, permute_dim, 3, dst);
+     }
+     aclrtSynchronizeStream(ctx.stream());
+
+     // release
+     ACL_CHECK(aclDestroyTensor(acl_src1));
+     ACL_CHECK(aclDestroyTensor(tmp_im2col_tensor));
+     ACL_CHECK(aclDestroyTensor(tmp_cast_tensor));
+     ACL_CHECK(aclDestroyTensor(acl_dst));
+     ACL_CHECK(aclDestroyIntArray(kernel_size));
+     ACL_CHECK(aclDestroyIntArray(dilations));
+     ACL_CHECK(aclDestroyIntArray(paddings));
+     ACL_CHECK(aclDestroyIntArray(strides));
+ }