@@ -14,7 +14,7 @@ class GET_ROW_F16 {
                                 int64_t *output_ne_ub, size_t *output_nb_ub) {
         // TODO, use template for F16/f32
         int64_t op_block_num = GetBlockNum();
-        int64_t op_block_idx = GetBlockIdx();
+        op_block_idx = GetBlockIdx();
 
         for (int i = 0; i < 4; i++) {
             input_ne[i] = input_ne_ub[i];
@@ -59,65 +59,61 @@ class GET_ROW_F16 {
     }
 
     __aicore__ inline void copy_in(uint32_t offset, size_t len) {
+        size_t origin_len = len;
         LocalTensor<half> input_local = input_queue.AllocTensor<half>();
         const size_t elem_per_block = 32 / sizeof(half);
         size_t tail = len % elem_per_block;
-        len = len & ~elem_per_block;
-        DataCopy(input_local, input_gm[offset], len);
+        len = len & ~(elem_per_block - 1);
         if (tail != 0) {
-            DataCopy(input_local[len], input_gm[offset + len], elem_per_block);
-            // clean
-            for (int i = tail; i < elem_per_block; i++) {
-                input_local[len].SetValue(i, 0);
-            }
-            #if 0
-            const half padVal = 0;
-            uint64_t mask0 = ((uint64_t)1ul << 16) - ((uint64_t)1ul << tail);
-            uint64_t mask[2] = {mask0, 0};
-            Duplicate<half>(input_local[len], padVal, mask, 1 /*no repeat*/, 1 /*no gap in block*/, 8 /*no gap between repeats*/);
-            #endif
-
+            // printf("f16 get_row: copy_in: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", offset, len, origin_len, tail, elem_per_block);
+            // DumpTensor(input_local, 5, elem_per_block);
+            len += elem_per_block;
+            // DataCopy(input_local[len], input_gm[offset + len], elem_per_block);
             #if 0
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
             dataCopyParams.blockLen = tail * sizeof(half);
             DataCopyPadExtParams<half> padParams;
             DataCopyPad(input_local[len], input_gm[offset + len],
                         dataCopyParams, padParams);
-
-            uint16_t rightPadNum = 32 / sizeof(half) - tail;
-            PadParams padParas{0, rightPadNum, 0};
-            Pad(input_local[len], input_gm[offset + len], padParas, tilingData.padTilingData);
             #endif
         }
+        DataCopy(input_local, input_gm[offset], len);
         input_queue.EnQue(input_local);
     }
 
     __aicore__ inline void copy_out(uint32_t offset, size_t len) {
         LocalTensor<float> output_local = output_queue.DeQue<float>();
         const size_t elem_per_block = 32 / sizeof(float);
         size_t tail = len % elem_per_block;
-        len = len & ~elem_per_block;
-        // DataCopy(output_gm[offset], output_local, len);
+        len = len & ~(elem_per_block - 1);
+        if (len > 0) {
+            DataCopy(output_gm[offset], output_local, len);
+        }
+        #if 1
         if (tail != 0) {
-            len += elem_per_block;
+            /* printf("\nf16 BLOCK_IDX:%d get_row: Copy_Out AtomicAdd: offset:%d, len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, tail, elem_per_block);
+            DumpTensor(output_gm, 5, elem_per_block); */
+            for (size_t i = tail; i < elem_per_block; i++) {
+                output_local[len + i].SetValue(0, 0);
+            }
+            // DumpTensor(output_local[len], 5, elem_per_block);
+            SetAtomicAdd<float>();
+            DataCopy(output_gm[offset + len], output_local[len], elem_per_block);
+            SetAtomicNone();
+            // DumpTensor(output_gm, 5, elem_per_block);
+        }
+        #endif
+
         #if 0
+        if (tail != 0) {
             DataCopyExtParams dataCopyParams;
             dataCopyParams.blockCount = 1;
             dataCopyParams.blockLen = tail * sizeof(float);
             DataCopyPad(output_gm[offset + len], output_local[len],
                         dataCopyParams);
-            #endif
-        }
-        DataCopy(output_gm[offset], output_local, len);
-        // clean
-        if (tail != 0) {
-            for (int i = tail; i < elem_per_block; i++) {
-                output_gm[offset + len - elem_per_block].SetValue(i, 0);
-            }
-            DataCacheCleanAndInvalid<float, CacheLine::SINGLE_CACHE_LINE>(output_gm[offset + len - elem_per_block]);
         }
-
+        #endif
         output_queue.FreeTensor(output_local);
     }
 
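The functional core of this hunk is the block-alignment fix in copy_in and copy_out: `len & ~elem_per_block` only clears a single bit of `len`, whereas `len & ~(elem_per_block - 1)` rounds `len` down to a whole number of 32-byte blocks, leaving `tail` elements to be zero-padded and written as one extra block. Below is a minimal host-side sketch of just that index math, in plain C++ rather than AscendC; the names mirror the kernel, but the program is illustrative only and not part of the commit.

    #include <cstdio>
    #include <cstddef>

    int main() {
        const size_t elem_per_block = 32 / 2;  // 32-byte block / 2-byte fp16 = 16 elements
        for (size_t len : {5, 16, 23, 40}) {
            size_t old_aligned = len & ~elem_per_block;        // old mask: clears only bit 4
            size_t new_aligned = len & ~(elem_per_block - 1);  // new mask: rounds down to a multiple of 16
            size_t tail        = len % elem_per_block;         // elements left for the padded tail copy
            printf("len=%zu old=%zu new=%zu tail=%zu\n", len, old_aligned, new_aligned, tail);
        }
        return 0;
    }

For len = 16 the old mask even drops a full block (16 & ~16 = 0), and for len = 23 it yields 7 instead of 16 with tail = 7. The new path zero-fills output_local[len + i] for i in [tail, elem_per_block) and stores that last block under SetAtomicAdd, presumably so the padded elements add zero to data owned by a neighbouring block rather than overwriting it.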
@@ -182,6 +178,7 @@ class GET_ROW_F16 {
     GlobalTensor<float> output_gm;
     TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
     TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
+    int64_t op_block_idx;
 };
 
 template <typename T>