@@ -134,3 +134,125 @@ func.func @test_lower_vector_arm_neon_unroll_incompatible_shape(%lhs: vector<4x1
134
134
%res = vector.contract {index ing_maps = [affine_map <(d0 , d1 , d2 ) -> (d0 , d2 )>, affine_map <(d0 , d1 , d2 ) -> (d1 , d2 )>, affine_map <(d0 , d1 , d2 ) -> (d0 , d1 )>], iterator_types = [" parallel" , " parallel" , " reduction" ], kind = #vector.kind <add >} %lhs_extsi , %rhs_extsi , %acc : vector <4 x12 xi32 >, vector <4 x12 xi32 > into vector <4 x4 xi32 >
135
135
return %res : vector <4 x4 xi32 >
136
136
}
137
+
138
+ // -----
139
+
140
+ // CHECK-LABEL: func.func @test_lower_vector_arm_neon_vecmat_unroll(
141
+ // CHECK-SAME: %[[VAL_0:.*]]: vector<8xi8>,
142
+ // CHECK-SAME: %[[VAL_1:.*]]: vector<8x8xi8>,
143
+ // CHECK-SAME: %[[VAL_2:.*]]: vector<8xi32>) -> vector<8xi32> {
144
+ // CHECK: %[[VAL_3:.*]] = arith.constant dense<0> : vector<8xi32>
145
+ // CHECK: %[[VAL_4:.*]] = vector.extract_strided_slice %[[VAL_1]] {offsets = [0, 0], sizes = [2, 8], strides = [1, 1]} : vector<8x8xi8> to vector<2x8xi8>
146
+ // CHECK: %[[VAL_5:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [0], sizes = [2], strides = [1]} : vector<8xi32> to vector<2xi32>
147
+ // CHECK: %[[VAL_6:.*]] = vector.broadcast %[[VAL_0]] : vector<8xi8> to vector<2x8xi8>
148
+ // CHECK: %[[VAL_7:.*]] = vector.broadcast %[[VAL_5]] : vector<2xi32> to vector<2x2xi32>
149
+ // CHECK: %[[VAL_8:.*]] = vector.shape_cast %[[VAL_6]] : vector<2x8xi8> to vector<16xi8>
150
+ // CHECK: %[[VAL_9:.*]] = vector.shape_cast %[[VAL_4]] : vector<2x8xi8> to vector<16xi8>
151
+ // CHECK: %[[VAL_10:.*]] = vector.shape_cast %[[VAL_7]] : vector<2x2xi32> to vector<4xi32>
152
+ // CHECK: %[[VAL_11:.*]] = arm_neon.intr.smmla %[[VAL_10]], %[[VAL_8]], %[[VAL_9]] : vector<16xi8> to vector<4xi32>
153
+ // CHECK: %[[VAL_12:.*]] = vector.shape_cast %[[VAL_11]] : vector<4xi32> to vector<2x2xi32>
154
+ // CHECK: %[[VAL_13:.*]] = vector.extract %[[VAL_12]][0] : vector<2xi32> from vector<2x2xi32>
155
+ // CHECK: %[[VAL_14:.*]] = vector.insert_strided_slice %[[VAL_13]], %[[VAL_3]] {offsets = [0], strides = [1]} : vector<2xi32> into vector<8xi32>
156
+ // CHECK: %[[VAL_15:.*]] = vector.extract_strided_slice %[[VAL_1]] {offsets = [2, 0], sizes = [2, 8], strides = [1, 1]} : vector<8x8xi8> to vector<2x8xi8>
157
+ // CHECK: %[[VAL_16:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [2], sizes = [2], strides = [1]} : vector<8xi32> to vector<2xi32>
158
+ // CHECK: %[[VAL_17:.*]] = vector.broadcast %[[VAL_0]] : vector<8xi8> to vector<2x8xi8>
159
+ // CHECK: %[[VAL_18:.*]] = vector.broadcast %[[VAL_16]] : vector<2xi32> to vector<2x2xi32>
160
+ // CHECK: %[[VAL_19:.*]] = vector.shape_cast %[[VAL_17]] : vector<2x8xi8> to vector<16xi8>
161
+ // CHECK: %[[VAL_20:.*]] = vector.shape_cast %[[VAL_15]] : vector<2x8xi8> to vector<16xi8>
162
+ // CHECK: %[[VAL_21:.*]] = vector.shape_cast %[[VAL_18]] : vector<2x2xi32> to vector<4xi32>
163
+ // CHECK: %[[VAL_22:.*]] = arm_neon.intr.smmla %[[VAL_21]], %[[VAL_19]], %[[VAL_20]] : vector<16xi8> to vector<4xi32>
164
+ // CHECK: %[[VAL_23:.*]] = vector.shape_cast %[[VAL_22]] : vector<4xi32> to vector<2x2xi32>
165
+ // CHECK: %[[VAL_24:.*]] = vector.extract %[[VAL_23]][0] : vector<2xi32> from vector<2x2xi32>
166
+ // CHECK: %[[VAL_25:.*]] = vector.insert_strided_slice %[[VAL_24]], %[[VAL_14]] {offsets = [2], strides = [1]} : vector<2xi32> into vector<8xi32>
167
+ // CHECK: %[[VAL_26:.*]] = vector.extract_strided_slice %[[VAL_1]] {offsets = [4, 0], sizes = [2, 8], strides = [1, 1]} : vector<8x8xi8> to vector<2x8xi8>
168
+ // CHECK: %[[VAL_27:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [4], sizes = [2], strides = [1]} : vector<8xi32> to vector<2xi32>
169
+ // CHECK: %[[VAL_28:.*]] = vector.broadcast %[[VAL_0]] : vector<8xi8> to vector<2x8xi8>
170
+ // CHECK: %[[VAL_29:.*]] = vector.broadcast %[[VAL_27]] : vector<2xi32> to vector<2x2xi32>
171
+ // CHECK: %[[VAL_30:.*]] = vector.shape_cast %[[VAL_28]] : vector<2x8xi8> to vector<16xi8>
172
+ // CHECK: %[[VAL_31:.*]] = vector.shape_cast %[[VAL_26]] : vector<2x8xi8> to vector<16xi8>
173
+ // CHECK: %[[VAL_32:.*]] = vector.shape_cast %[[VAL_29]] : vector<2x2xi32> to vector<4xi32>
174
+ // CHECK: %[[VAL_33:.*]] = arm_neon.intr.smmla %[[VAL_32]], %[[VAL_30]], %[[VAL_31]] : vector<16xi8> to vector<4xi32>
175
+ // CHECK: %[[VAL_34:.*]] = vector.shape_cast %[[VAL_33]] : vector<4xi32> to vector<2x2xi32>
176
+ // CHECK: %[[VAL_35:.*]] = vector.extract %[[VAL_34]][0] : vector<2xi32> from vector<2x2xi32>
177
+ // CHECK: %[[VAL_36:.*]] = vector.insert_strided_slice %[[VAL_35]], %[[VAL_25]] {offsets = [4], strides = [1]} : vector<2xi32> into vector<8xi32>
178
+ // CHECK: %[[VAL_37:.*]] = vector.extract_strided_slice %[[VAL_1]] {offsets = [6, 0], sizes = [2, 8], strides = [1, 1]} : vector<8x8xi8> to vector<2x8xi8>
179
+ // CHECK: %[[VAL_38:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [6], sizes = [2], strides = [1]} : vector<8xi32> to vector<2xi32>
180
+ // CHECK: %[[VAL_39:.*]] = vector.broadcast %[[VAL_0]] : vector<8xi8> to vector<2x8xi8>
181
+ // CHECK: %[[VAL_40:.*]] = vector.broadcast %[[VAL_38]] : vector<2xi32> to vector<2x2xi32>
182
+ // CHECK: %[[VAL_41:.*]] = vector.shape_cast %[[VAL_39]] : vector<2x8xi8> to vector<16xi8>
183
+ // CHECK: %[[VAL_42:.*]] = vector.shape_cast %[[VAL_37]] : vector<2x8xi8> to vector<16xi8>
184
+ // CHECK: %[[VAL_43:.*]] = vector.shape_cast %[[VAL_40]] : vector<2x2xi32> to vector<4xi32>
185
+ // CHECK: %[[VAL_44:.*]] = arm_neon.intr.smmla %[[VAL_43]], %[[VAL_41]], %[[VAL_42]] : vector<16xi8> to vector<4xi32>
186
+ // CHECK: %[[VAL_45:.*]] = vector.shape_cast %[[VAL_44]] : vector<4xi32> to vector<2x2xi32>
187
+ // CHECK: %[[VAL_46:.*]] = vector.extract %[[VAL_45]][0] : vector<2xi32> from vector<2x2xi32>
188
+ // CHECK: %[[VAL_47:.*]] = vector.insert_strided_slice %[[VAL_46]], %[[VAL_36]] {offsets = [6], strides = [1]} : vector<2xi32> into vector<8xi32>
189
+ // CHECK: return %[[VAL_47]] : vector<8xi32>
190
+ // CHECK: }
191
// Vector-times-matrix case with a rank-1 LHS.
//
// Both i8 operands are sign-extended to i32 and combined by a `vector.contract`
// of kind `add` computing
//   res[d0] = acc[d0] + sum over d1 of lhs[d1] * rhs[d0, d1]
// where d0 is the parallel dimension (8) and d1 is the reduction dimension (8).
//
// Per the FileCheck expectations preceding this function, the lowering is
// expected to be unrolled into four `arm_neon.intr.smmla` ops, each consuming a
// 2x8 slice of %rhs (row offsets 0, 2, 4, 6) and a 2-element slice of %acc,
// with the LHS broadcast to 2x8 and row 0 of every 2x2 result inserted back
// into the 8-element result vector.
func.func @test_lower_vector_arm_neon_vecmat_unroll(%lhs: vector<8xi8>, %rhs: vector<8x8xi8>, %acc : vector<8xi32>) -> vector<8xi32> {
  // Widen the operands; the extsi ops feed the contraction operands directly.
  %lhs_extsi = arith.extsi %lhs : vector<8xi8> to vector<8xi32>
  %rhs_extsi = arith.extsi %rhs : vector<8x8xi8> to vector<8x8xi32>
  // Indexing maps, in operand order: lhs -> (d1), rhs -> (d0, d1), acc -> (d0).
  %res = vector.contract {indexing_maps = [affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0)>], iterator_types = ["parallel", "reduction"], kind = #vector.kind<add>} %lhs_extsi, %rhs_extsi, %acc : vector<8xi32>, vector<8x8xi32> into vector<8xi32>
  return %res : vector<8xi32>
}
197
+
198
+ // -----
199
+
200
+
201
+ // CHECK-LABEL: func.func @test_lower_vector_arm_neon_vecmat_unroll_leading_dim(
202
+ // CHECK-SAME: %[[VAL_0:.*]]: vector<1x8xi8>,
203
+ // CHECK-SAME: %[[VAL_1:.*]]: vector<8x8xi8>,
204
+ // CHECK-SAME: %[[VAL_2:.*]]: vector<1x8xi32>) -> vector<1x8xi32> {
205
+ // CHECK: %[[VAL_3:.*]] = arith.constant dense<0> : vector<1x8xi32>
206
+ // CHECK: %[[VAL_4:.*]] = vector.extract_strided_slice %[[VAL_1]] {offsets = [0, 0], sizes = [2, 8], strides = [1, 1]} : vector<8x8xi8> to vector<2x8xi8>
207
+ // CHECK: %[[VAL_5:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [0, 0], sizes = [1, 2], strides = [1, 1]} : vector<1x8xi32> to vector<1x2xi32>
208
+ // CHECK: %[[VAL_6:.*]] = vector.broadcast %[[VAL_0]] : vector<1x8xi8> to vector<2x8xi8>
209
+ // CHECK: %[[VAL_7:.*]] = vector.broadcast %[[VAL_5]] : vector<1x2xi32> to vector<2x2xi32>
210
+ // CHECK: %[[VAL_8:.*]] = vector.shape_cast %[[VAL_6]] : vector<2x8xi8> to vector<16xi8>
211
+ // CHECK: %[[VAL_9:.*]] = vector.shape_cast %[[VAL_4]] : vector<2x8xi8> to vector<16xi8>
212
+ // CHECK: %[[VAL_10:.*]] = vector.shape_cast %[[VAL_7]] : vector<2x2xi32> to vector<4xi32>
213
+ // CHECK: %[[VAL_11:.*]] = arm_neon.intr.smmla %[[VAL_10]], %[[VAL_8]], %[[VAL_9]] : vector<16xi8> to vector<4xi32>
214
+ // CHECK: %[[VAL_12:.*]] = vector.shape_cast %[[VAL_11]] : vector<4xi32> to vector<2x2xi32>
215
+ // CHECK: %[[VAL_13:.*]] = vector.extract %[[VAL_12]][0] : vector<2xi32> from vector<2x2xi32>
216
+ // CHECK: %[[VAL_14:.*]] = vector.insert_strided_slice %[[VAL_13]], %[[VAL_3]] {offsets = [0, 0], strides = [1]} : vector<2xi32> into vector<1x8xi32>
217
+ // CHECK: %[[VAL_15:.*]] = vector.extract_strided_slice %[[VAL_1]] {offsets = [2, 0], sizes = [2, 8], strides = [1, 1]} : vector<8x8xi8> to vector<2x8xi8>
218
+ // CHECK: %[[VAL_16:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [0, 2], sizes = [1, 2], strides = [1, 1]} : vector<1x8xi32> to vector<1x2xi32>
219
+ // CHECK: %[[VAL_17:.*]] = vector.broadcast %[[VAL_0]] : vector<1x8xi8> to vector<2x8xi8>
220
+ // CHECK: %[[VAL_18:.*]] = vector.broadcast %[[VAL_16]] : vector<1x2xi32> to vector<2x2xi32>
221
+ // CHECK: %[[VAL_19:.*]] = vector.shape_cast %[[VAL_17]] : vector<2x8xi8> to vector<16xi8>
222
+ // CHECK: %[[VAL_20:.*]] = vector.shape_cast %[[VAL_15]] : vector<2x8xi8> to vector<16xi8>
223
+ // CHECK: %[[VAL_21:.*]] = vector.shape_cast %[[VAL_18]] : vector<2x2xi32> to vector<4xi32>
224
+ // CHECK: %[[VAL_22:.*]] = arm_neon.intr.smmla %[[VAL_21]], %[[VAL_19]], %[[VAL_20]] : vector<16xi8> to vector<4xi32>
225
+ // CHECK: %[[VAL_23:.*]] = vector.shape_cast %[[VAL_22]] : vector<4xi32> to vector<2x2xi32>
226
+ // CHECK: %[[VAL_24:.*]] = vector.extract %[[VAL_23]][0] : vector<2xi32> from vector<2x2xi32>
227
+ // CHECK: %[[VAL_25:.*]] = vector.insert_strided_slice %[[VAL_24]], %[[VAL_14]] {offsets = [0, 2], strides = [1]} : vector<2xi32> into vector<1x8xi32>
228
+ // CHECK: %[[VAL_26:.*]] = vector.extract_strided_slice %[[VAL_1]] {offsets = [4, 0], sizes = [2, 8], strides = [1, 1]} : vector<8x8xi8> to vector<2x8xi8>
229
+ // CHECK: %[[VAL_27:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [0, 4], sizes = [1, 2], strides = [1, 1]} : vector<1x8xi32> to vector<1x2xi32>
230
+ // CHECK: %[[VAL_28:.*]] = vector.broadcast %[[VAL_0]] : vector<1x8xi8> to vector<2x8xi8>
231
+ // CHECK: %[[VAL_29:.*]] = vector.broadcast %[[VAL_27]] : vector<1x2xi32> to vector<2x2xi32>
232
+ // CHECK: %[[VAL_30:.*]] = vector.shape_cast %[[VAL_28]] : vector<2x8xi8> to vector<16xi8>
233
+ // CHECK: %[[VAL_31:.*]] = vector.shape_cast %[[VAL_26]] : vector<2x8xi8> to vector<16xi8>
234
+ // CHECK: %[[VAL_32:.*]] = vector.shape_cast %[[VAL_29]] : vector<2x2xi32> to vector<4xi32>
235
+ // CHECK: %[[VAL_33:.*]] = arm_neon.intr.smmla %[[VAL_32]], %[[VAL_30]], %[[VAL_31]] : vector<16xi8> to vector<4xi32>
236
+ // CHECK: %[[VAL_34:.*]] = vector.shape_cast %[[VAL_33]] : vector<4xi32> to vector<2x2xi32>
237
+ // CHECK: %[[VAL_35:.*]] = vector.extract %[[VAL_34]][0] : vector<2xi32> from vector<2x2xi32>
238
+ // CHECK: %[[VAL_36:.*]] = vector.insert_strided_slice %[[VAL_35]], %[[VAL_25]] {offsets = [0, 4], strides = [1]} : vector<2xi32> into vector<1x8xi32>
239
+ // CHECK: %[[VAL_37:.*]] = vector.extract_strided_slice %[[VAL_1]] {offsets = [6, 0], sizes = [2, 8], strides = [1, 1]} : vector<8x8xi8> to vector<2x8xi8>
240
+ // CHECK: %[[VAL_38:.*]] = vector.extract_strided_slice %[[VAL_2]] {offsets = [0, 6], sizes = [1, 2], strides = [1, 1]} : vector<1x8xi32> to vector<1x2xi32>
241
+ // CHECK: %[[VAL_39:.*]] = vector.broadcast %[[VAL_0]] : vector<1x8xi8> to vector<2x8xi8>
242
+ // CHECK: %[[VAL_40:.*]] = vector.broadcast %[[VAL_38]] : vector<1x2xi32> to vector<2x2xi32>
243
+ // CHECK: %[[VAL_41:.*]] = vector.shape_cast %[[VAL_39]] : vector<2x8xi8> to vector<16xi8>
244
+ // CHECK: %[[VAL_42:.*]] = vector.shape_cast %[[VAL_37]] : vector<2x8xi8> to vector<16xi8>
245
+ // CHECK: %[[VAL_43:.*]] = vector.shape_cast %[[VAL_40]] : vector<2x2xi32> to vector<4xi32>
246
+ // CHECK: %[[VAL_44:.*]] = arm_neon.intr.smmla %[[VAL_43]], %[[VAL_41]], %[[VAL_42]] : vector<16xi8> to vector<4xi32>
247
+ // CHECK: %[[VAL_45:.*]] = vector.shape_cast %[[VAL_44]] : vector<4xi32> to vector<2x2xi32>
248
+ // CHECK: %[[VAL_46:.*]] = vector.extract %[[VAL_45]][0] : vector<2xi32> from vector<2x2xi32>
249
+ // CHECK: %[[VAL_47:.*]] = vector.insert_strided_slice %[[VAL_46]], %[[VAL_36]] {offsets = [0, 6], strides = [1]} : vector<2xi32> into vector<1x8xi32>
250
+ // CHECK: return %[[VAL_47]] : vector<1x8xi32>
251
+ // CHECK: }
252
+
253
// Vector-times-matrix case where the LHS and accumulator carry a leading unit
// dimension (1x8 rather than 8).
//
// Both i8 operands are sign-extended to i32 and combined by a `vector.contract`
// of kind `add` computing
//   res[d0, d1] = acc[d0, d1] + sum over d2 of lhs[d0, d2] * rhs[d1, d2]
// with d0 = 1 and d1 = 8 parallel, and d2 = 8 the reduction dimension.
//
// Per the FileCheck expectations preceding this function, the lowering is
// expected to be unrolled into four `arm_neon.intr.smmla` ops, each consuming a
// 2x8 slice of %rhs (row offsets 0, 2, 4, 6) and a 1x2 slice of %acc (column
// offsets 0, 2, 4, 6), with row 0 of every 2x2 result inserted back into the
// 1x8 result vector.
func.func @test_lower_vector_arm_neon_vecmat_unroll_leading_dim(%lhs: vector<1x8xi8>, %rhs: vector<8x8xi8>, %acc : vector<1x8xi32>) -> vector<1x8xi32> {
  // Widen the operands; the extsi ops feed the contraction operands directly.
  %lhs_extsi = arith.extsi %lhs : vector<1x8xi8> to vector<1x8xi32>
  %rhs_extsi = arith.extsi %rhs : vector<8x8xi8> to vector<8x8xi32>
  // Indexing maps, in operand order: lhs -> (d0, d2), rhs -> (d1, d2), acc -> (d0, d1).
  %res = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %lhs_extsi, %rhs_extsi, %acc : vector<1x8xi32>, vector<8x8xi32> into vector<1x8xi32>
  return %res : vector<1x8xi32>
}
0 commit comments