@@ -9,11 +9,10 @@ define i32 @test_udot_v8i8(i8* nocapture readonly %a, i8* nocapture readonly %b)
9
9
; CHECK: // %bb.0: // %entry
10
10
; CHECK-NEXT: ldr d0, [x0]
11
11
; CHECK-NEXT: ldr d1, [x1]
12
- ; CHECK-NEXT: dup v2.2s, wzr
12
+ ; CHECK-NEXT: movi v2.2d, #0000000000000000
13
13
; CHECK-NEXT: udot v2.2s, v1.8b, v0.8b
14
14
; CHECK-NEXT: addp v0.2s, v2.2s, v2.2s
15
- ; CHECK-NEXT: fmov x0, d0
16
- ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
15
+ ; CHECK-NEXT: fmov w0, s0
17
16
; CHECK-NEXT: ret
18
17
entry:
19
18
%0 = bitcast i8* %a to <8 x i8>*
@@ -33,7 +32,7 @@ define i32 @test_udot_v8i8_nomla(i8* nocapture readonly %a1) {
33
32
; CHECK-NEXT: ldr d0, [x0]
34
33
; CHECK-NEXT: movi v1.2d, #0000000000000000
35
34
; CHECK-NEXT: movi v2.8b, #1
36
- ; CHECK-NEXT: udot v1.2s, v2.8b, v0.8b
35
+ ; CHECK-NEXT: udot v1.2s, v0.8b, v2.8b
37
36
; CHECK-NEXT: addp v0.2s, v1.2s, v1.2s
38
37
; CHECK-NEXT: fmov w0, s0
39
38
; CHECK-NEXT: ret
@@ -50,11 +49,10 @@ define i32 @test_sdot_v8i8(i8* nocapture readonly %a, i8* nocapture readonly %b)
50
49
; CHECK: // %bb.0: // %entry
51
50
; CHECK-NEXT: ldr d0, [x0]
52
51
; CHECK-NEXT: ldr d1, [x1]
53
- ; CHECK-NEXT: dup v2.2s, wzr
52
+ ; CHECK-NEXT: movi v2.2d, #0000000000000000
54
53
; CHECK-NEXT: sdot v2.2s, v1.8b, v0.8b
55
54
; CHECK-NEXT: addp v0.2s, v2.2s, v2.2s
56
- ; CHECK-NEXT: fmov x0, d0
57
- ; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
55
+ ; CHECK-NEXT: fmov w0, s0
58
56
; CHECK-NEXT: ret
59
57
entry:
60
58
%0 = bitcast i8* %a to <8 x i8>*
@@ -74,7 +72,7 @@ define i32 @test_sdot_v8i8_nomla(i8* nocapture readonly %a1) {
74
72
; CHECK-NEXT: ldr d0, [x0]
75
73
; CHECK-NEXT: movi v1.2d, #0000000000000000
76
74
; CHECK-NEXT: movi v2.8b, #1
77
- ; CHECK-NEXT: sdot v1.2s, v2.8b, v0.8b
75
+ ; CHECK-NEXT: sdot v1.2s, v0.8b, v2.8b
78
76
; CHECK-NEXT: addp v0.2s, v1.2s, v1.2s
79
77
; CHECK-NEXT: fmov w0, s0
80
78
; CHECK-NEXT: ret
@@ -92,7 +90,7 @@ define i32 @test_udot_v16i8(i8* nocapture readonly %a, i8* nocapture readonly %b
92
90
; CHECK: // %bb.0: // %entry
93
91
; CHECK-NEXT: ldr q0, [x0]
94
92
; CHECK-NEXT: ldr q1, [x1]
95
- ; CHECK-NEXT: dup v2.4s, wzr
93
+ ; CHECK-NEXT: movi v2.2d, #0000000000000000
96
94
; CHECK-NEXT: udot v2.4s, v1.16b, v0.16b
97
95
; CHECK-NEXT: addv s0, v2.4s
98
96
; CHECK-NEXT: fmov w8, s0
@@ -117,7 +115,7 @@ define i32 @test_udot_v16i8_nomla(i8* nocapture readonly %a1) {
117
115
; CHECK-NEXT: ldr q0, [x0]
118
116
; CHECK-NEXT: movi v1.16b, #1
119
117
; CHECK-NEXT: movi v2.2d, #0000000000000000
120
- ; CHECK-NEXT: udot v2.4s, v1.16b, v0.16b
118
+ ; CHECK-NEXT: udot v2.4s, v0.16b, v1.16b
121
119
; CHECK-NEXT: addv s0, v2.4s
122
120
; CHECK-NEXT: fmov w0, s0
123
121
; CHECK-NEXT: ret
@@ -134,7 +132,7 @@ define i32 @test_sdot_v16i8(i8* nocapture readonly %a, i8* nocapture readonly %b
134
132
; CHECK: // %bb.0: // %entry
135
133
; CHECK-NEXT: ldr q0, [x0]
136
134
; CHECK-NEXT: ldr q1, [x1]
137
- ; CHECK-NEXT: dup v2.4s, wzr
135
+ ; CHECK-NEXT: movi v2.2d, #0000000000000000
138
136
; CHECK-NEXT: sdot v2.4s, v1.16b, v0.16b
139
137
; CHECK-NEXT: addv s0, v2.4s
140
138
; CHECK-NEXT: fmov w8, s0
@@ -159,7 +157,7 @@ define i32 @test_sdot_v16i8_nomla(i8* nocapture readonly %a1) {
159
157
; CHECK-NEXT: ldr q0, [x0]
160
158
; CHECK-NEXT: movi v1.16b, #1
161
159
; CHECK-NEXT: movi v2.2d, #0000000000000000
162
- ; CHECK-NEXT: sdot v2.4s, v1.16b, v0.16b
160
+ ; CHECK-NEXT: sdot v2.4s, v0.16b, v1.16b
163
161
; CHECK-NEXT: addv s0, v2.4s
164
162
; CHECK-NEXT: fmov w0, s0
165
163
; CHECK-NEXT: ret
@@ -175,20 +173,10 @@ entry:
175
173
define i32 @test_udot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
176
174
; CHECK-LABEL: test_udot_v8i8_double:
177
175
; CHECK: // %bb.0: // %entry
178
- ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
179
- ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
180
- ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
181
- ; CHECK-NEXT: ushll v3.8h, v3.8b, #0
182
- ; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
183
- ; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8
184
- ; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
185
- ; CHECK-NEXT: ext v1.16b, v2.16b, v2.16b, #8
186
- ; CHECK-NEXT: umull v2.4s, v2.4h, v3.4h
187
- ; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #8
188
- ; CHECK-NEXT: umlal v0.4s, v4.4h, v5.4h
189
- ; CHECK-NEXT: umlal v2.4s, v1.4h, v3.4h
190
- ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
191
- ; CHECK-NEXT: addv s0, v0.4s
176
+ ; CHECK-NEXT: movi v4.2d, #0000000000000000
177
+ ; CHECK-NEXT: udot v4.2s, v2.8b, v3.8b
178
+ ; CHECK-NEXT: udot v4.2s, v0.8b, v1.8b
179
+ ; CHECK-NEXT: addp v0.2s, v4.2s, v4.2s
192
180
; CHECK-NEXT: fmov w0, s0
193
181
; CHECK-NEXT: ret
194
182
entry:
@@ -209,8 +197,8 @@ define i32 @test_udot_v8i8_double_nomla(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <
209
197
; CHECK: // %bb.0: // %entry
210
198
; CHECK-NEXT: movi v1.2d, #0000000000000000
211
199
; CHECK-NEXT: movi v3.8b, #1
212
- ; CHECK-NEXT: udot v1.2s, v3.8b, v2.8b
213
- ; CHECK-NEXT: udot v1.2s, v3.8b, v0.8b
200
+ ; CHECK-NEXT: udot v1.2s, v2.8b, v3.8b
201
+ ; CHECK-NEXT: udot v1.2s, v0.8b, v3.8b
214
202
; CHECK-NEXT: addp v0.2s, v1.2s, v1.2s
215
203
; CHECK-NEXT: fmov w0, s0
216
204
; CHECK-NEXT: ret
@@ -226,30 +214,10 @@ entry:
226
214
define i32 @test_udot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
227
215
; CHECK-LABEL: test_udot_v16i8_double:
228
216
; CHECK: // %bb.0: // %entry
229
- ; CHECK-NEXT: ushll2 v4.8h, v0.16b, #0
230
- ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
231
- ; CHECK-NEXT: ushll2 v5.8h, v1.16b, #0
232
- ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
233
- ; CHECK-NEXT: ext v6.16b, v4.16b, v4.16b, #8
234
- ; CHECK-NEXT: ext v7.16b, v5.16b, v5.16b, #8
235
- ; CHECK-NEXT: umull2 v16.4s, v0.8h, v1.8h
236
- ; CHECK-NEXT: umlal v16.4s, v6.4h, v7.4h
237
- ; CHECK-NEXT: ushll2 v6.8h, v2.16b, #0
238
- ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
239
- ; CHECK-NEXT: ushll2 v7.8h, v3.16b, #0
240
- ; CHECK-NEXT: ushll v3.8h, v3.8b, #0
241
- ; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h
242
- ; CHECK-NEXT: ext v1.16b, v6.16b, v6.16b, #8
243
- ; CHECK-NEXT: umlal v0.4s, v4.4h, v5.4h
244
- ; CHECK-NEXT: ext v4.16b, v7.16b, v7.16b, #8
245
- ; CHECK-NEXT: umull v5.4s, v2.4h, v3.4h
246
- ; CHECK-NEXT: umull2 v2.4s, v2.8h, v3.8h
247
- ; CHECK-NEXT: umlal v2.4s, v1.4h, v4.4h
248
- ; CHECK-NEXT: umlal v5.4s, v6.4h, v7.4h
249
- ; CHECK-NEXT: add v0.4s, v0.4s, v16.4s
250
- ; CHECK-NEXT: add v1.4s, v5.4s, v2.4s
251
- ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
252
- ; CHECK-NEXT: addv s0, v0.4s
217
+ ; CHECK-NEXT: movi v4.2d, #0000000000000000
218
+ ; CHECK-NEXT: udot v4.4s, v2.16b, v3.16b
219
+ ; CHECK-NEXT: udot v4.4s, v0.16b, v1.16b
220
+ ; CHECK-NEXT: addv s0, v4.4s
253
221
; CHECK-NEXT: fmov w0, s0
254
222
; CHECK-NEXT: ret
255
223
entry:
@@ -270,8 +238,8 @@ define i32 @test_udot_v16i8_double_nomla(<16 x i8> %a, <16 x i8> %b, <16 x i8> %
270
238
; CHECK: // %bb.0: // %entry
271
239
; CHECK-NEXT: movi v1.16b, #1
272
240
; CHECK-NEXT: movi v3.2d, #0000000000000000
273
- ; CHECK-NEXT: udot v3.4s, v1.16b, v2.16b
274
- ; CHECK-NEXT: udot v3.4s, v1.16b, v0.16b
241
+ ; CHECK-NEXT: udot v3.4s, v2.16b, v1.16b
242
+ ; CHECK-NEXT: udot v3.4s, v0.16b, v1.16b
275
243
; CHECK-NEXT: addv s0, v3.4s
276
244
; CHECK-NEXT: fmov w0, s0
277
245
; CHECK-NEXT: ret
@@ -287,20 +255,10 @@ entry:
287
255
define i32 @test_sdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
288
256
; CHECK-LABEL: test_sdot_v8i8_double:
289
257
; CHECK: // %bb.0: // %entry
290
- ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
291
- ; CHECK-NEXT: sshll v1.8h, v1.8b, #0
292
- ; CHECK-NEXT: sshll v2.8h, v2.8b, #0
293
- ; CHECK-NEXT: sshll v3.8h, v3.8b, #0
294
- ; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #8
295
- ; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #8
296
- ; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
297
- ; CHECK-NEXT: ext v1.16b, v2.16b, v2.16b, #8
298
- ; CHECK-NEXT: smull v2.4s, v2.4h, v3.4h
299
- ; CHECK-NEXT: ext v3.16b, v3.16b, v3.16b, #8
300
- ; CHECK-NEXT: smlal v0.4s, v4.4h, v5.4h
301
- ; CHECK-NEXT: smlal v2.4s, v1.4h, v3.4h
302
- ; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
303
- ; CHECK-NEXT: addv s0, v0.4s
258
+ ; CHECK-NEXT: movi v4.2d, #0000000000000000
259
+ ; CHECK-NEXT: sdot v4.2s, v2.8b, v3.8b
260
+ ; CHECK-NEXT: sdot v4.2s, v0.8b, v1.8b
261
+ ; CHECK-NEXT: addp v0.2s, v4.2s, v4.2s
304
262
; CHECK-NEXT: fmov w0, s0
305
263
; CHECK-NEXT: ret
306
264
entry:
@@ -321,8 +279,8 @@ define i32 @test_sdot_v8i8_double_nomla(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <
321
279
; CHECK: // %bb.0: // %entry
322
280
; CHECK-NEXT: movi v1.2d, #0000000000000000
323
281
; CHECK-NEXT: movi v3.8b, #1
324
- ; CHECK-NEXT: sdot v1.2s, v3.8b, v2.8b
325
- ; CHECK-NEXT: sdot v1.2s, v3.8b, v0.8b
282
+ ; CHECK-NEXT: sdot v1.2s, v2.8b, v3.8b
283
+ ; CHECK-NEXT: sdot v1.2s, v0.8b, v3.8b
326
284
; CHECK-NEXT: addp v0.2s, v1.2s, v1.2s
327
285
; CHECK-NEXT: fmov w0, s0
328
286
; CHECK-NEXT: ret
@@ -338,30 +296,10 @@ entry:
338
296
define i32 @test_sdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
339
297
; CHECK-LABEL: test_sdot_v16i8_double:
340
298
; CHECK: // %bb.0: // %entry
341
- ; CHECK-NEXT: sshll2 v4.8h, v0.16b, #0
342
- ; CHECK-NEXT: sshll v0.8h, v0.8b, #0
343
- ; CHECK-NEXT: sshll2 v5.8h, v1.16b, #0
344
- ; CHECK-NEXT: sshll v1.8h, v1.8b, #0
345
- ; CHECK-NEXT: ext v6.16b, v4.16b, v4.16b, #8
346
- ; CHECK-NEXT: ext v7.16b, v5.16b, v5.16b, #8
347
- ; CHECK-NEXT: smull2 v16.4s, v0.8h, v1.8h
348
- ; CHECK-NEXT: smlal v16.4s, v6.4h, v7.4h
349
- ; CHECK-NEXT: sshll2 v6.8h, v2.16b, #0
350
- ; CHECK-NEXT: sshll v2.8h, v2.8b, #0
351
- ; CHECK-NEXT: sshll2 v7.8h, v3.16b, #0
352
- ; CHECK-NEXT: sshll v3.8h, v3.8b, #0
353
- ; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h
354
- ; CHECK-NEXT: ext v1.16b, v6.16b, v6.16b, #8
355
- ; CHECK-NEXT: smlal v0.4s, v4.4h, v5.4h
356
- ; CHECK-NEXT: ext v4.16b, v7.16b, v7.16b, #8
357
- ; CHECK-NEXT: smull v5.4s, v2.4h, v3.4h
358
- ; CHECK-NEXT: smull2 v2.4s, v2.8h, v3.8h
359
- ; CHECK-NEXT: smlal v2.4s, v1.4h, v4.4h
360
- ; CHECK-NEXT: smlal v5.4s, v6.4h, v7.4h
361
- ; CHECK-NEXT: add v0.4s, v0.4s, v16.4s
362
- ; CHECK-NEXT: add v1.4s, v5.4s, v2.4s
363
- ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s
364
- ; CHECK-NEXT: addv s0, v0.4s
299
+ ; CHECK-NEXT: movi v4.2d, #0000000000000000
300
+ ; CHECK-NEXT: sdot v4.4s, v2.16b, v3.16b
301
+ ; CHECK-NEXT: sdot v4.4s, v0.16b, v1.16b
302
+ ; CHECK-NEXT: addv s0, v4.4s
365
303
; CHECK-NEXT: fmov w0, s0
366
304
; CHECK-NEXT: ret
367
305
entry:
@@ -382,8 +320,8 @@ define i32 @test_sdot_v16i8_double_nomla(<16 x i8> %a, <16 x i8> %b, <16 x i8> %
382
320
; CHECK: // %bb.0: // %entry
383
321
; CHECK-NEXT: movi v1.16b, #1
384
322
; CHECK-NEXT: movi v3.2d, #0000000000000000
385
- ; CHECK-NEXT: sdot v3.4s, v1.16b, v2.16b
386
- ; CHECK-NEXT: sdot v3.4s, v1.16b, v0.16b
323
+ ; CHECK-NEXT: sdot v3.4s, v2.16b, v1.16b
324
+ ; CHECK-NEXT: sdot v3.4s, v0.16b, v1.16b
387
325
; CHECK-NEXT: addv s0, v3.4s
388
326
; CHECK-NEXT: fmov w0, s0
389
327
; CHECK-NEXT: ret
0 commit comments