@@ -13,7 +13,7 @@ class GET_ROW_F32 {
13
13
int64_t *indices_ne_ub, size_t *indices_nb_ub,
14
14
int64_t *output_ne_ub, size_t *output_nb_ub) {
15
15
int64_t op_block_num = GetBlockNum ();
16
- int64_t op_block_idx = GetBlockIdx ();
16
+ op_block_idx = GetBlockIdx ();
17
17
18
18
for (int i = 0 ; i < 4 ; i++) {
19
19
input_ne[i] = input_ne_ub[i];
@@ -51,25 +51,38 @@ class GET_ROW_F32 {
51
51
// All data should be aligned to 32. That's OK because all data is aligned to 32.
52
52
pipe.InitBuffer (input_queue, BUFFER_NUM, local_buffer_size);
53
53
pipe.InitBuffer (output_queue, BUFFER_NUM, local_buffer_size);
54
+ // printf("f32 BLOCK_IDX:%d get_row: init: ir:%d, dr:%d, n_elements:%d.\n", op_block_idx, ir, dr, n_elements);
54
55
}
55
56
56
57
__aicore__ inline void copy_in (uint32_t offset, size_t len) {
58
+ size_t origin_len = len;
57
59
LocalTensor<float > input_local = input_queue.AllocTensor <float >();
58
60
const size_t elem_per_block = 32 / sizeof (float );
59
61
size_t tail = len % elem_per_block;
60
- len = len & ~elem_per_block;
61
- DataCopy (input_local, input_gm[offset], len);
62
+ len = len & ~(elem_per_block - 1 );
63
+
64
+ // printf("f32 BLOCK_IDX:%d get_row: Copy_in: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
65
+ if (len > 0 )
66
+ DataCopy (input_local, input_gm[offset], len);
67
+ // printf("f32 BLOCK_IDX:%d get_row: Copy_in executed: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
62
68
if (tail != 0 ) {
69
+ #if 1
70
+ /* //printf("f32 BLOCK_IDX:%d get_row: Copy_in ENTER tail != 0: offset:%d, len:%d, origin_len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, origin_len, tail, elem_per_block);
71
+ for (int i = 0; i < elem_per_block; i++) {
72
+ printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, origin input local val:%f.\n", op_block_idx, i, input_local[len + i].GetValue(0));
73
+ }
74
+ //DumpTensor(input_gm[offset + len], 5, elem_per_block);
75
+ for (int i = 0; i < tail; i++) {
76
+ printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, input local val:%f, input_gm:%f.\n", op_block_idx, len + i, input_local[len + i].GetValue(0), input_gm[offset + len + i]);
77
+ } */
63
78
DataCopy (input_local[len], input_gm[offset + len], elem_per_block);
64
79
// clean
65
- for (int i = tail; i < elem_per_block; i++) {
66
- input_local[len].SetValue (i , 0 );
80
+ /* for (int i = tail; i < elem_per_block; i++) {
81
+ input_local[len + i ].SetValue(0 , 0);
67
82
}
68
- #if 0
69
- const float padVal = 0;
70
- uint64_t mask0 = ((uint64_t)1ul << 8) - ((uint64_t)1ul << tail);
71
- uint64_t mask[2] = {mask0, 0};
72
- Duplicate<float>(input_local[len], padVal, mask, 1 /*no repeat*/, 1/*no gap in block*/, 8/*no gap between repeats*/);
83
+ for (int i = 0; i < elem_per_block; i++) {
84
+ printf("f32 BLOCK_IDX:%d get_row: Copy_in: get value idx:%d, after clean and copy, input local val:%f.\n", op_block_idx, i, input_local[len + i].GetValue(0));
85
+ } */
73
86
#endif
74
87
#if 0
75
88
DataCopyExtParams dataCopyParams;
@@ -87,27 +100,32 @@ class GET_ROW_F32 {
87
100
LocalTensor<float > output_local = output_queue.DeQue <float >();
88
101
const size_t elem_per_block = 32 / sizeof (float );
89
102
size_t tail = len % elem_per_block;
90
- len = len & ~elem_per_block;
91
- // DataCopy(output_gm[offset], output_local, len);
103
+ len = len & ~(elem_per_block - 1 );
104
+ if (len > 0 ) {
105
+ DataCopy (output_gm[offset], output_local, len);
106
+ }
107
+
108
+ #if 1
92
109
if (tail != 0 ) {
93
- len += elem_per_block;
110
+ // printf("\nf32 BLOCK_IDX:%d get_row: Copy_Out AtomicAdd: offset:%d, len:%d, tail:%d, elem_per_block:%d.\n", op_block_idx, offset, len, tail, elem_per_block);
111
+ /* DumpTensor(output_gm[offset + len], 5, elem_per_block);
112
+ DumpTensor(output_local[len], 5, elem_per_block); */
113
+ SetAtomicAdd<float >();
114
+ DataCopy (output_gm[offset + len], output_local[len], elem_per_block);
115
+ SetAtomicNone ();
116
+ /* DumpTensor(output_gm[offset + len], 5, elem_per_block); */
117
+ }
118
+ #endif
94
119
#if 0
120
+ if(tail != 0) {
121
+
95
122
DataCopyExtParams dataCopyParams;
96
123
dataCopyParams.blockCount = 1;
97
124
dataCopyParams.blockLen = tail * sizeof(float);
98
125
DataCopyPad(output_gm[offset + len], output_local[len],
99
126
dataCopyParams);
100
- #endif
101
127
}
102
- DataCopy (output_gm[offset], output_local, len);
103
-
104
- if (tail != 0 ) { // clean
105
- for (int i = tail; i < elem_per_block; i++) {
106
- output_gm[offset + len - elem_per_block].SetValue (i, 0 );
107
- }
108
- DataCacheCleanAndInvalid<float , CacheLine::SINGLE_CACHE_LINE>(output_gm[offset + len - elem_per_block]);
109
- }
110
-
128
+ #endif
111
129
output_queue.FreeTensor (output_local);
112
130
}
113
131
@@ -171,6 +189,7 @@ class GET_ROW_F32 {
171
189
GlobalTensor<float > output_gm;
172
190
TQue<QuePosition::VECIN, BUFFER_NUM> input_queue;
173
191
TQue<QuePosition::VECOUT, BUFFER_NUM> output_queue;
192
+ int64_t op_block_idx;
174
193
};
175
194
176
195
template <typename T>
0 commit comments