@@ -12,15 +12,13 @@ define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
12
12
;
13
13
; CHECK-NODOT-LABEL: udot:
14
14
; CHECK-NODOT: // %bb.0:
15
- ; CHECK-NODOT-NEXT: ushll v3.8h, v1.8b, #0
16
- ; CHECK-NODOT-NEXT: ushll v4.8h, v2.8b, #0
17
- ; CHECK-NODOT-NEXT: ushll2 v1.8h, v1.16b, #0
18
- ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
19
- ; CHECK-NODOT-NEXT: umlal v0.4s, v4.4h, v3.4h
20
- ; CHECK-NODOT-NEXT: umull v5.4s, v2.4h, v1.4h
21
- ; CHECK-NODOT-NEXT: umlal2 v0.4s, v2.8h, v1.8h
22
- ; CHECK-NODOT-NEXT: umlal2 v5.4s, v4.8h, v3.8h
23
- ; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
15
+ ; CHECK-NODOT-NEXT: umull v3.8h, v2.8b, v1.8b
16
+ ; CHECK-NODOT-NEXT: umull2 v1.8h, v2.16b, v1.16b
17
+ ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
18
+ ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v3.4h
19
+ ; CHECK-NODOT-NEXT: uaddw2 v2.4s, v2.4s, v3.8h
20
+ ; CHECK-NODOT-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
21
+ ; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
24
22
; CHECK-NODOT-NEXT: ret
25
23
%u.wide = zext <16 x i8 > %u to <16 x i32 >
26
24
%s.wide = zext <16 x i8 > %s to <16 x i32 >
@@ -52,20 +50,18 @@ define <4 x i32> @udot_in_loop(ptr %p1, ptr %p2){
52
50
; CHECK-NODOT-NEXT: mov x8, xzr
53
51
; CHECK-NODOT-NEXT: .LBB1_1: // %vector.body
54
52
; CHECK-NODOT-NEXT: // =>This Inner Loop Header: Depth=1
55
- ; CHECK-NODOT-NEXT: ldr q0, [x1, x8]
56
- ; CHECK-NODOT-NEXT: ldr q2, [x0, x8]
53
+ ; CHECK-NODOT-NEXT: ldr q0, [x0, x8]
54
+ ; CHECK-NODOT-NEXT: ldr q2, [x1, x8]
57
55
; CHECK-NODOT-NEXT: add x8, x8, #16
58
56
; CHECK-NODOT-NEXT: cmp x8, #16
59
- ; CHECK-NODOT-NEXT: ushll2 v3.8h, v0.16b, #0
60
- ; CHECK-NODOT-NEXT: ushll2 v4.8h, v2.16b, #0
61
- ; CHECK-NODOT-NEXT: ushll v5.8h, v0.8b, #0
62
- ; CHECK-NODOT-NEXT: ushll v2.8h, v2.8b, #0
57
+ ; CHECK-NODOT-NEXT: umull v3.8h, v0.8b, v2.8b
58
+ ; CHECK-NODOT-NEXT: umull2 v2.8h, v0.16b, v2.16b
63
59
; CHECK-NODOT-NEXT: mov v0.16b, v1.16b
64
- ; CHECK-NODOT-NEXT: umull v6.4s, v4.4h, v3.4h
65
- ; CHECK-NODOT-NEXT: umlal v1.4s, v2.4h, v5.4h
66
- ; CHECK-NODOT-NEXT: umlal2 v6.4s, v2.8h, v5.8h
67
- ; CHECK-NODOT-NEXT: umlal2 v1.4s, v4.8h, v3.8h
68
- ; CHECK-NODOT-NEXT: add v1.4s, v6.4s, v1.4s
60
+ ; CHECK-NODOT-NEXT: ushll v1.4s, v2.4h, #0
61
+ ; CHECK-NODOT-NEXT: uaddw v4.4s, v0.4s, v3.4h
62
+ ; CHECK-NODOT-NEXT: uaddw2 v1.4s, v1.4s, v3.8h
63
+ ; CHECK-NODOT-NEXT: uaddw2 v2.4s, v4.4s, v2.8h
64
+ ; CHECK-NODOT-NEXT: add v1.4s, v1.4s, v2.4s
69
65
; CHECK-NODOT-NEXT: b.ne .LBB1_1
70
66
; CHECK-NODOT-NEXT: // %bb.2: // %end
71
67
; CHECK-NODOT-NEXT: ret
@@ -99,19 +95,17 @@ define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
99
95
;
100
96
; CHECK-NODOT-LABEL: udot_narrow:
101
97
; CHECK-NODOT: // %bb.0:
102
- ; CHECK-NODOT-NEXT: ushll v1.8h, v1.8b, #0
103
- ; CHECK-NODOT-NEXT: ushll v2.8h, v2.8b, #0
98
+ ; CHECK-NODOT-NEXT: umull v1.8h, v2.8b, v1.8b
104
99
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
105
- ; CHECK-NODOT-NEXT: umull v3.4s, v2.4h, v1.4h
106
- ; CHECK-NODOT-NEXT: umull2 v4.4s, v2.8h, v1.8h
107
- ; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
108
- ; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
109
- ; CHECK-NODOT-NEXT: umlal v0.4s, v2.4h, v1.4h
100
+ ; CHECK-NODOT-NEXT: ushll v2.4s, v1.4h, #0
101
+ ; CHECK-NODOT-NEXT: ushll2 v3.4s, v1.8h, #0
102
+ ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
103
+ ; CHECK-NODOT-NEXT: uaddw v0.4s, v0.4s, v1.4h
110
104
; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
111
- ; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
112
- ; CHECK-NODOT-NEXT: umlal v3.4s, v6.4h, v5.4h
113
- ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
105
+ ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
114
106
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
107
+ ; CHECK-NODOT-NEXT: uaddw v1.4s, v2.4s, v4.4h
108
+ ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
115
109
; CHECK-NODOT-NEXT: ret
116
110
%u.wide = zext <8 x i8 > %u to <8 x i32 >
117
111
%s.wide = zext <8 x i8 > %s to <8 x i32 >
@@ -128,15 +122,13 @@ define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
128
122
;
129
123
; CHECK-NODOT-LABEL: sdot:
130
124
; CHECK-NODOT: // %bb.0:
131
- ; CHECK-NODOT-NEXT: sshll v3.8h, v1.8b, #0
132
- ; CHECK-NODOT-NEXT: sshll v4.8h, v2.8b, #0
133
- ; CHECK-NODOT-NEXT: sshll2 v1.8h, v1.16b, #0
134
- ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
135
- ; CHECK-NODOT-NEXT: smlal v0.4s, v4.4h, v3.4h
136
- ; CHECK-NODOT-NEXT: smull v5.4s, v2.4h, v1.4h
137
- ; CHECK-NODOT-NEXT: smlal2 v0.4s, v2.8h, v1.8h
138
- ; CHECK-NODOT-NEXT: smlal2 v5.4s, v4.8h, v3.8h
139
- ; CHECK-NODOT-NEXT: add v0.4s, v5.4s, v0.4s
125
+ ; CHECK-NODOT-NEXT: smull v3.8h, v2.8b, v1.8b
126
+ ; CHECK-NODOT-NEXT: smull2 v1.8h, v2.16b, v1.16b
127
+ ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
128
+ ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v3.4h
129
+ ; CHECK-NODOT-NEXT: saddw2 v2.4s, v2.4s, v3.8h
130
+ ; CHECK-NODOT-NEXT: saddw2 v0.4s, v0.4s, v1.8h
131
+ ; CHECK-NODOT-NEXT: add v0.4s, v2.4s, v0.4s
140
132
; CHECK-NODOT-NEXT: ret
141
133
%u.wide = sext <16 x i8 > %u to <16 x i32 >
142
134
%s.wide = sext <16 x i8 > %s to <16 x i32 >
@@ -153,19 +145,17 @@ define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
153
145
;
154
146
; CHECK-NODOT-LABEL: sdot_narrow:
155
147
; CHECK-NODOT: // %bb.0:
156
- ; CHECK-NODOT-NEXT: sshll v1.8h, v1.8b, #0
157
- ; CHECK-NODOT-NEXT: sshll v2.8h, v2.8b, #0
148
+ ; CHECK-NODOT-NEXT: smull v1.8h, v2.8b, v1.8b
158
149
; CHECK-NODOT-NEXT: // kill: def $d0 killed $d0 def $q0
159
- ; CHECK-NODOT-NEXT: smull v3.4s, v2.4h, v1.4h
160
- ; CHECK-NODOT-NEXT: smull2 v4.4s, v2.8h, v1.8h
161
- ; CHECK-NODOT-NEXT: ext v5.16b, v1.16b, v1.16b, #8
162
- ; CHECK-NODOT-NEXT: ext v6.16b, v2.16b, v2.16b, #8
163
- ; CHECK-NODOT-NEXT: smlal v0.4s, v2.4h, v1.4h
150
+ ; CHECK-NODOT-NEXT: sshll v2.4s, v1.4h, #0
151
+ ; CHECK-NODOT-NEXT: sshll2 v3.4s, v1.8h, #0
152
+ ; CHECK-NODOT-NEXT: ext v4.16b, v1.16b, v1.16b, #8
153
+ ; CHECK-NODOT-NEXT: saddw v0.4s, v0.4s, v1.4h
164
154
; CHECK-NODOT-NEXT: ext v3.16b, v3.16b, v3.16b, #8
165
- ; CHECK-NODOT-NEXT: ext v1.16b, v4.16b, v4.16b, #8
166
- ; CHECK-NODOT-NEXT: smlal v3.4s, v6.4h, v5.4h
167
- ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
155
+ ; CHECK-NODOT-NEXT: ext v2.16b, v2.16b, v2.16b, #8
168
156
; CHECK-NODOT-NEXT: add v0.2s, v3.2s, v0.2s
157
+ ; CHECK-NODOT-NEXT: saddw v1.4s, v2.4s, v4.4h
158
+ ; CHECK-NODOT-NEXT: add v0.2s, v1.2s, v0.2s
169
159
; CHECK-NODOT-NEXT: ret
170
160
%u.wide = sext <8 x i8 > %u to <8 x i32 >
171
161
%s.wide = sext <8 x i8 > %s to <8 x i32 >
@@ -417,27 +407,19 @@ define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
417
407
;
418
408
; CHECK-NODOT-LABEL: udot_8to64:
419
409
; CHECK-NODOT: // %bb.0: // %entry
420
- ; CHECK-NODOT-NEXT: ushll v4.8h, v3.8b, #0
421
- ; CHECK-NODOT-NEXT: ushll v5.8h, v2.8b, #0
422
- ; CHECK-NODOT-NEXT: ushll2 v3.8h, v3.16b, #0
423
- ; CHECK-NODOT-NEXT: ushll2 v2.8h, v2.16b, #0
424
- ; CHECK-NODOT-NEXT: ushll v6.4s, v4.4h, #0
425
- ; CHECK-NODOT-NEXT: ushll v7.4s, v5.4h, #0
410
+ ; CHECK-NODOT-NEXT: umull v4.8h, v2.8b, v3.8b
411
+ ; CHECK-NODOT-NEXT: umull2 v2.8h, v2.16b, v3.16b
412
+ ; CHECK-NODOT-NEXT: ushll v3.4s, v4.4h, #0
413
+ ; CHECK-NODOT-NEXT: ushll v5.4s, v2.4h, #0
426
414
; CHECK-NODOT-NEXT: ushll2 v4.4s, v4.8h, #0
427
- ; CHECK-NODOT-NEXT: ushll2 v5.4s, v5.8h, #0
428
- ; CHECK-NODOT-NEXT: ushll2 v16.4s, v3.8h, #0
429
- ; CHECK-NODOT-NEXT: ushll2 v17.4s, v2.8h, #0
430
- ; CHECK-NODOT-NEXT: ushll v3.4s, v3.4h, #0
431
- ; CHECK-NODOT-NEXT: ushll v2.4s, v2.4h, #0
432
- ; CHECK-NODOT-NEXT: umlal2 v1.2d, v7.4s, v6.4s
433
- ; CHECK-NODOT-NEXT: umlal v0.2d, v7.2s, v6.2s
434
- ; CHECK-NODOT-NEXT: umull2 v18.2d, v5.4s, v4.4s
435
- ; CHECK-NODOT-NEXT: umull v4.2d, v5.2s, v4.2s
436
- ; CHECK-NODOT-NEXT: umlal2 v1.2d, v17.4s, v16.4s
437
- ; CHECK-NODOT-NEXT: umlal v0.2d, v17.2s, v16.2s
438
- ; CHECK-NODOT-NEXT: umlal2 v18.2d, v2.4s, v3.4s
439
- ; CHECK-NODOT-NEXT: umlal v4.2d, v2.2s, v3.2s
440
- ; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
415
+ ; CHECK-NODOT-NEXT: ushll2 v2.4s, v2.8h, #0
416
+ ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v3.4s
417
+ ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v3.2s
418
+ ; CHECK-NODOT-NEXT: uaddl2 v3.2d, v4.4s, v5.4s
419
+ ; CHECK-NODOT-NEXT: uaddl v4.2d, v4.2s, v5.2s
420
+ ; CHECK-NODOT-NEXT: uaddw2 v1.2d, v1.2d, v2.4s
421
+ ; CHECK-NODOT-NEXT: uaddw v0.2d, v0.2d, v2.2s
422
+ ; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
441
423
; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
442
424
; CHECK-NODOT-NEXT: ret
443
425
entry:
@@ -460,27 +442,19 @@ define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b){
460
442
;
461
443
; CHECK-NODOT-LABEL: sdot_8to64:
462
444
; CHECK-NODOT: // %bb.0: // %entry
463
- ; CHECK-NODOT-NEXT: sshll v4.8h, v3.8b, #0
464
- ; CHECK-NODOT-NEXT: sshll v5.8h, v2.8b, #0
465
- ; CHECK-NODOT-NEXT: sshll2 v3.8h, v3.16b, #0
466
- ; CHECK-NODOT-NEXT: sshll2 v2.8h, v2.16b, #0
467
- ; CHECK-NODOT-NEXT: sshll v6.4s, v4.4h, #0
468
- ; CHECK-NODOT-NEXT: sshll v7.4s, v5.4h, #0
445
+ ; CHECK-NODOT-NEXT: smull v4.8h, v2.8b, v3.8b
446
+ ; CHECK-NODOT-NEXT: smull2 v2.8h, v2.16b, v3.16b
447
+ ; CHECK-NODOT-NEXT: sshll v3.4s, v4.4h, #0
448
+ ; CHECK-NODOT-NEXT: sshll v5.4s, v2.4h, #0
469
449
; CHECK-NODOT-NEXT: sshll2 v4.4s, v4.8h, #0
470
- ; CHECK-NODOT-NEXT: sshll2 v5.4s, v5.8h, #0
471
- ; CHECK-NODOT-NEXT: sshll2 v16.4s, v3.8h, #0
472
- ; CHECK-NODOT-NEXT: sshll2 v17.4s, v2.8h, #0
473
- ; CHECK-NODOT-NEXT: sshll v3.4s, v3.4h, #0
474
- ; CHECK-NODOT-NEXT: sshll v2.4s, v2.4h, #0
475
- ; CHECK-NODOT-NEXT: smlal2 v1.2d, v7.4s, v6.4s
476
- ; CHECK-NODOT-NEXT: smlal v0.2d, v7.2s, v6.2s
477
- ; CHECK-NODOT-NEXT: smull2 v18.2d, v5.4s, v4.4s
478
- ; CHECK-NODOT-NEXT: smull v4.2d, v5.2s, v4.2s
479
- ; CHECK-NODOT-NEXT: smlal2 v1.2d, v17.4s, v16.4s
480
- ; CHECK-NODOT-NEXT: smlal v0.2d, v17.2s, v16.2s
481
- ; CHECK-NODOT-NEXT: smlal2 v18.2d, v2.4s, v3.4s
482
- ; CHECK-NODOT-NEXT: smlal v4.2d, v2.2s, v3.2s
483
- ; CHECK-NODOT-NEXT: add v1.2d, v18.2d, v1.2d
450
+ ; CHECK-NODOT-NEXT: sshll2 v2.4s, v2.8h, #0
451
+ ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v3.4s
452
+ ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v3.2s
453
+ ; CHECK-NODOT-NEXT: saddl2 v3.2d, v4.4s, v5.4s
454
+ ; CHECK-NODOT-NEXT: saddl v4.2d, v4.2s, v5.2s
455
+ ; CHECK-NODOT-NEXT: saddw2 v1.2d, v1.2d, v2.4s
456
+ ; CHECK-NODOT-NEXT: saddw v0.2d, v0.2d, v2.2s
457
+ ; CHECK-NODOT-NEXT: add v1.2d, v3.2d, v1.2d
484
458
; CHECK-NODOT-NEXT: add v0.2d, v4.2d, v0.2d
485
459
; CHECK-NODOT-NEXT: ret
486
460
entry:
@@ -797,10 +771,9 @@ define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a){
797
771
define <4 x i32 > @not_udot (<4 x i32 > %acc , <8 x i8 > %u , <8 x i8 > %s ) #0 {
798
772
; CHECK-LABEL: not_udot:
799
773
; CHECK: // %bb.0:
800
- ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
801
- ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
802
- ; CHECK-NEXT: umlal v0.4s, v2.4h, v1.4h
803
- ; CHECK-NEXT: umlal2 v0.4s, v2.8h, v1.8h
774
+ ; CHECK-NEXT: umull v1.8h, v2.8b, v1.8b
775
+ ; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h
776
+ ; CHECK-NEXT: uaddw2 v0.4s, v0.4s, v1.8h
804
777
; CHECK-NEXT: ret
805
778
%u.wide = zext <8 x i8 > %u to <8 x i32 >
806
779
%s.wide = zext <8 x i8 > %s to <8 x i32 >
0 commit comments