@@ -12,6 +12,9 @@ class QUANTIZE_FLOAT_TO_Q4_0 {
12
12
__aicore__ inline void init (GM_ADDR input, GM_ADDR output,
13
13
int64_t *input_ne_ub, size_t *input_nb_ub,
14
14
int64_t *output_ne_ub) {
15
+ // TODO: fix test_case CPY(type_src=f16,type_dst=q4_0,ne=[256,4,4,4],
16
+ // permute=[0,0,0,0]):
17
+ // [CPY] NMSE = 0.000008343 > 0.000001000 FAIL
15
18
int64_t op_block_num = GetBlockNum ();
16
19
int64_t op_block_idx = GetBlockIdx ();
17
20
@@ -61,13 +64,13 @@ class QUANTIZE_FLOAT_TO_Q4_0 {
61
64
pipe.InitBuffer (input_queue, BUFFER_NUM, Group_Size * sizeof (SRC_T));
62
65
pipe.InitBuffer (output_queue, BUFFER_NUM,
63
66
Group_Size * sizeof (int8_t ) / 2 );
64
- pipe.InitBuffer (cast_queue , BUFFER_NUM , Group_Size * sizeof (float ));
65
- pipe.InitBuffer (work_queue, BUFFER_NUM , Group_Size* sizeof (float ));
66
- pipe.InitBuffer (max_queue, BUFFER_NUM , Group_Size* sizeof (float ));
67
- pipe.InitBuffer (min_queue, BUFFER_NUM , Group_Size* sizeof (float ));
68
- pipe.InitBuffer (scale_queue, BUFFER_NUM, 16 * sizeof (half));
69
- pipe.InitBuffer (int8_queue, BUFFER_NUM , Group_Size * sizeof (int8_t ));
70
- pipe.InitBuffer (half_queue, BUFFER_NUM , Group_Size * sizeof (half));
67
+ pipe.InitBuffer (cast_queue , 1 , Group_Size * sizeof (float ));
68
+ pipe.InitBuffer (work_queue, 1 , Group_Size * sizeof (float ));
69
+ pipe.InitBuffer (max_queue, 1 , Group_Size * sizeof (float ));
70
+ pipe.InitBuffer (min_queue, 1 , Group_Size * sizeof (float ));
71
+ pipe.InitBuffer (scale_queue, 1 , Group_Size / 2 * sizeof (half));
72
+ pipe.InitBuffer (int8_queue, 1 , Group_Size * sizeof (int8_t ));
73
+ pipe.InitBuffer (half_queue, 1 , Group_Size * sizeof (half));
71
74
}
72
75
73
76
__aicore__ inline void copy_in (uint32_t offset) {
@@ -178,13 +181,15 @@ class QUANTIZE_FLOAT_TO_Q4_0 {
178
181
for (int64_t j = 0 ; j < group_size_in_row; j++) {
179
182
half scale = calculate_group (i, j);
180
183
scale_local.SetValue (scale_local_offset++, scale);
181
- if (scale_local_offset == 16 ) {
184
+ // Copy Group_Size/2 length data each time.
185
+ if (scale_local_offset == Group_Size / 2 ) {
182
186
scale_local_offset = 0 ;
183
187
// TODO: OPTIMIZE ME
184
188
pipe_barrier (PIPE_ALL);
185
- DataCopy (scale_gm[scale_global_offset], scale_local, 16 );
189
+ DataCopy (scale_gm[scale_global_offset], scale_local,
190
+ Group_Size / 2 );
186
191
pipe_barrier (PIPE_ALL);
187
- scale_global_offset += 16 ;
192
+ scale_global_offset += Group_Size / 2 ;
188
193
}
189
194
}
190
195
}
0 commit comments