@@ -60,32 +60,64 @@ class GET_ROW_F16 {
60
60
61
61
/**
 * Stage `len` half-precision elements starting at input_gm[offset] into a
 * tensor allocated from input_queue, then enqueue that tensor.
 *
 * The copy is issued in units of `elem_per_block` elements (32 bytes worth of
 * half) — presumably because DataCopy needs 32-byte-granular lengths; confirm
 * against the Ascend C DataCopy documentation. When `len` is not a multiple of
 * that unit, one extra whole block is copied for the tail and the elements
 * past `tail` in that block are overwritten with 0.
 *
 * @param offset element offset into input_gm where the read starts
 * @param len    number of valid half elements to stage
 */
__aicore__ inline void copy_in(uint32_t offset, size_t len) {
    LocalTensor<half> input_local = input_queue.AllocTensor<half>();

    const size_t elem_per_block = 32 / sizeof(half);
    const size_t tail = len % elem_per_block;
    // Round down to a whole number of blocks. The previous form,
    // `len & ~elem_per_block`, only cleared a single bit and therefore did not
    // produce a block-aligned length.
    const size_t aligned_len = len - tail;

    // Nothing to move in bulk when the request is shorter than one block.
    if (aligned_len != 0) {
        DataCopy(input_local, input_gm[offset], aligned_len);
    }

    if (tail != 0) {
        // Fetch the whole trailing block; this reads elem_per_block - tail
        // elements beyond the requested range.
        // NOTE(review): confirm those extra elements are always readable.
        DataCopy(input_local[aligned_len], input_gm[offset + aligned_len],
                 elem_per_block);

        // Zero the padding elements that follow the valid tail.
        // NOTE(review): SetValue is issued right after DataCopy with no
        // explicit synchronization visible here — confirm the copy has landed
        // before these writes take effect.
        for (uint32_t i = static_cast<uint32_t>(tail); i < elem_per_block; ++i) {
            input_local[aligned_len].SetValue(i, 0);
        }
    }

    input_queue.EnQue(input_local);
}
76
95
77
96
/**
 * Dequeue a float tensor from output_queue, write `len` elements of it to
 * output_gm[offset], and release the tensor.
 *
 * The write is issued in units of `elem_per_block` elements (32 bytes worth of
 * float) — presumably because DataCopy needs 32-byte-granular lengths; confirm
 * against the Ascend C DataCopy documentation. When `len` is not a multiple of
 * that unit, the length is rounded up to the next whole block and the elements
 * past `tail` in the last block of output_gm are then overwritten with 0.
 *
 * @param offset element offset into output_gm where the write starts
 * @param len    number of valid float elements to write
 */
__aicore__ inline void copy_out(uint32_t offset, size_t len) {
    LocalTensor<float> output_local = output_queue.DeQue<float>();

    const size_t elem_per_block = 32 / sizeof(float);
    const size_t tail = len % elem_per_block;
    // Round up to a whole number of blocks. The previous form,
    // `len & ~elem_per_block`, only cleared a single bit and therefore did not
    // produce a block-aligned base before the extra block was added.
    const size_t padded_len =
        (tail != 0) ? (len - tail + elem_per_block) : len;

    if (padded_len != 0) {
        DataCopy(output_gm[offset], output_local, padded_len);
    }

    if (tail != 0) {
        // First element of the final, partially valid block in output_gm.
        const size_t last_block = offset + padded_len - elem_per_block;

        // Zero the padding elements that follow the valid tail.
        // NOTE(review): this writes elem_per_block - tail elements beyond the
        // requested range in output_gm — confirm nothing else owns that
        // memory, and that the preceding DataCopy has completed before these
        // scalar writes.
        for (uint32_t i = static_cast<uint32_t>(tail); i < elem_per_block; ++i) {
            output_gm[last_block].SetValue(i, 0);
        }
        // Flush the scalar writes above for that block.
        DataCacheCleanAndInvalid<float, CacheLine::SINGLE_CACHE_LINE>(
            output_gm[last_block]);
    }

    output_queue.FreeTensor(output_local);
}
91
123
0 commit comments