@@ -5,13 +5,8 @@ define <8 x i8> @load4_v4i8_add(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
5
5
; CHECK-LABEL: load4_v4i8_add:
6
6
; CHECK: // %bb.0:
7
7
; CHECK-NEXT: ldp s0, s1, [x0]
8
- ; CHECK-NEXT: ldp s2, s3, [x1]
9
- ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
10
- ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
11
- ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
12
- ; CHECK-NEXT: ushll v3.8h, v3.8b, #0
13
- ; CHECK-NEXT: uzp1 v0.8b, v0.8b, v2.8b
14
- ; CHECK-NEXT: uzp1 v1.8b, v1.8b, v3.8b
8
+ ; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
9
+ ; CHECK-NEXT: ld1 { v1.s }[1], [x1]
15
10
; CHECK-NEXT: add v0.8b, v0.8b, v1.8b
16
11
; CHECK-NEXT: ret
17
12
%la = load <4 x i8>, <4 x i8> *%a
@@ -30,13 +25,8 @@ define <8 x i16> @load4_v4i8_zext_add(float %tmp, <4 x i8> *%a, <4 x i8> *%b) {
30
25
; CHECK-LABEL: load4_v4i8_zext_add:
31
26
; CHECK: // %bb.0:
32
27
; CHECK-NEXT: ldp s0, s1, [x0]
33
- ; CHECK-NEXT: ldp s2, s3, [x1]
34
- ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
35
- ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
36
- ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
37
- ; CHECK-NEXT: ushll v3.8h, v3.8b, #0
38
- ; CHECK-NEXT: uzp1 v0.8b, v0.8b, v2.8b
39
- ; CHECK-NEXT: uzp1 v1.8b, v1.8b, v3.8b
28
+ ; CHECK-NEXT: ld1 { v0.s }[1], [x1], #4
29
+ ; CHECK-NEXT: ld1 { v1.s }[1], [x1]
40
30
; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b
41
31
; CHECK-NEXT: ret
42
32
%la = load <4 x i8>, <4 x i8> *%a
@@ -59,103 +49,49 @@ define i32 @large(i8* nocapture noundef readonly %p1, i32 noundef %st1, i8* noca
59
49
; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1
60
50
; CHECK-NEXT: sxtw x8, w1
61
51
; CHECK-NEXT: // kill: def $w3 killed $w3 def $x3
62
- ; CHECK-NEXT: sxtw x9, w3
63
- ; CHECK-NEXT: ldp s0, s1, [x0]
64
- ; CHECK-NEXT: ldp s2, s3, [x2]
65
- ; CHECK-NEXT: add x10, x0, x8
66
- ; CHECK-NEXT: add x11, x2, x9
67
- ; CHECK-NEXT: ushll v4.8h, v0.8b, #0
68
- ; CHECK-NEXT: ushll v0.8h, v3.8b, #0
69
- ; CHECK-NEXT: ldp s5, s3, [x10]
70
- ; CHECK-NEXT: add x10, x10, x8
52
+ ; CHECK-NEXT: sxtw x11, w3
53
+ ; CHECK-NEXT: add x9, x0, x8
54
+ ; CHECK-NEXT: add x12, x2, x11
55
+ ; CHECK-NEXT: add x10, x9, x8
56
+ ; CHECK-NEXT: add x13, x12, x11
71
57
; CHECK-NEXT: add x8, x10, x8
72
- ; CHECK-NEXT: ldp s6, s7, [x11]
73
- ; CHECK-NEXT: ldp s16, s17, [x10]
74
- ; CHECK-NEXT: ldp s18, s21, [x8]
75
- ; CHECK-NEXT: add x11, x11, x9
76
- ; CHECK-NEXT: add x9, x11, x9
77
- ; CHECK-NEXT: ushll v5.8h, v5.8b, #0
78
- ; CHECK-NEXT: ushll v16.8h, v16.8b, #0
79
- ; CHECK-NEXT: ushll v18.8h, v18.8b, #0
80
- ; CHECK-NEXT: ldp s19, s20, [x11]
81
- ; CHECK-NEXT: uzp1 v16.8b, v18.8b, v16.8b
82
- ; CHECK-NEXT: uzp1 v4.8b, v5.8b, v4.8b
83
- ; CHECK-NEXT: ldp s18, s5, [x9]
84
- ; CHECK-NEXT: ushll v2.8h, v2.8b, #0
85
- ; CHECK-NEXT: ushll v6.8h, v6.8b, #0
86
- ; CHECK-NEXT: ushll v19.8h, v19.8b, #0
87
- ; CHECK-NEXT: ushll v18.8h, v18.8b, #0
88
- ; CHECK-NEXT: uzp1 v2.8b, v6.8b, v2.8b
89
- ; CHECK-NEXT: uzp1 v18.8b, v18.8b, v19.8b
58
+ ; CHECK-NEXT: add x11, x13, x11
59
+ ; CHECK-NEXT: ldp s1, s5, [x9]
60
+ ; CHECK-NEXT: ldp s0, s4, [x8]
61
+ ; CHECK-NEXT: ld1 { v0.s }[1], [x10], #4
62
+ ; CHECK-NEXT: ld1 { v1.s }[1], [x0], #4
63
+ ; CHECK-NEXT: ldp s2, s6, [x11]
64
+ ; CHECK-NEXT: ldp s3, s7, [x12]
65
+ ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
90
66
; CHECK-NEXT: ushll v1.8h, v1.8b, #0
91
- ; CHECK-NEXT: ushll v3.8h, v3.8b, #0
92
- ; CHECK-NEXT: ushll v17.8h, v17.8b, #0
93
- ; CHECK-NEXT: ushll v20.8h, v20.8b, #0
94
- ; CHECK-NEXT: ushll v6.8h, v16.8b, #0
95
- ; CHECK-NEXT: ushll v4.8h, v4.8b, #0
96
- ; CHECK-NEXT: ushll v16.8h, v18.8b, #0
67
+ ; CHECK-NEXT: ld1 { v2.s }[1], [x13], #4
68
+ ; CHECK-NEXT: ld1 { v3.s }[1], [x2], #4
69
+ ; CHECK-NEXT: ld1 { v4.s }[1], [x10]
70
+ ; CHECK-NEXT: ld1 { v5.s }[1], [x0]
71
+ ; CHECK-NEXT: ld1 { v6.s }[1], [x13]
72
+ ; CHECK-NEXT: ld1 { v7.s }[1], [x2]
97
73
; CHECK-NEXT: ushll v2.8h, v2.8b, #0
98
- ; CHECK-NEXT: ushll v19.8h, v21.8b, #0
99
- ; CHECK-NEXT: ushll v5.8h, v5.8b, #0
100
- ; CHECK-NEXT: ushll v7.8h, v7.8b, #0
101
- ; CHECK-NEXT: usubl v18.4s, v6.4h, v16.4h
102
- ; CHECK-NEXT: usubl2 v6.4s, v6.8h, v16.8h
103
- ; CHECK-NEXT: usubl v16.4s, v4.4h, v2.4h
104
- ; CHECK-NEXT: usubl2 v2.4s, v4.8h, v2.8h
105
- ; CHECK-NEXT: uzp1 v4.8b, v19.8b, v17.8b
106
- ; CHECK-NEXT: uzp1 v1.8b, v3.8b, v1.8b
107
- ; CHECK-NEXT: uzp1 v3.8b, v5.8b, v20.8b
108
- ; CHECK-NEXT: uzp1 v0.8b, v7.8b, v0.8b
109
- ; CHECK-NEXT: ushll v4.8h, v4.8b, #0
110
74
; CHECK-NEXT: ushll v3.8h, v3.8b, #0
111
- ; CHECK-NEXT: ushll v1.8h, v1.8b, #0
112
- ; CHECK-NEXT: ushll v0.8h, v0.8b, #0
113
- ; CHECK-NEXT: usubl2 v5.4s, v4.8h, v3.8h
114
- ; CHECK-NEXT: usubl v3.4s, v4.4h, v3.4h
115
- ; CHECK-NEXT: usubl2 v4.4s, v1.8h, v0.8h
116
- ; CHECK-NEXT: usubl v0.4s, v1.4h, v0.4h
117
- ; CHECK-NEXT: shl v1.4s, v3.4s, #16
118
- ; CHECK-NEXT: shl v3.4s, v5.4s, #16
75
+ ; CHECK-NEXT: usubl v16.4s, v0.4h, v2.4h
76
+ ; CHECK-NEXT: usubl2 v0.4s, v0.8h, v2.8h
77
+ ; CHECK-NEXT: usubl v2.4s, v1.4h, v3.4h
78
+ ; CHECK-NEXT: usubl2 v1.4s, v1.8h, v3.8h
79
+ ; CHECK-NEXT: ushll v3.8h, v4.8b, #0
80
+ ; CHECK-NEXT: ushll v4.8h, v5.8b, #0
81
+ ; CHECK-NEXT: ushll v5.8h, v6.8b, #0
82
+ ; CHECK-NEXT: ushll v6.8h, v7.8b, #0
83
+ ; CHECK-NEXT: usubl2 v7.4s, v3.8h, v5.8h
84
+ ; CHECK-NEXT: usubl v3.4s, v3.4h, v5.4h
85
+ ; CHECK-NEXT: usubl2 v5.4s, v4.8h, v6.8h
86
+ ; CHECK-NEXT: usubl v4.4s, v4.4h, v6.4h
87
+ ; CHECK-NEXT: shl v3.4s, v3.4s, #16
88
+ ; CHECK-NEXT: shl v6.4s, v7.4s, #16
89
+ ; CHECK-NEXT: shl v5.4s, v5.4s, #16
119
90
; CHECK-NEXT: shl v4.4s, v4.4s, #16
120
- ; CHECK-NEXT: add v1.4s, v1.4s, v18.4s
121
- ; CHECK-NEXT: shl v0.4s, v0.4s, #16
122
- ; CHECK-NEXT: add v3.4s, v3.4s, v6.4s
91
+ ; CHECK-NEXT: add v0.4s, v6.4s, v0.4s
92
+ ; CHECK-NEXT: add v3.4s, v3.4s, v16.4s
93
+ ; CHECK-NEXT: add v1.4s, v5.4s, v1.4s
123
94
; CHECK-NEXT: add v2.4s, v4.4s, v2.4s
124
- ; CHECK-NEXT: rev64 v4.4s, v3.4s
125
- ; CHECK-NEXT: rev64 v5.4s, v1.4s
126
- ; CHECK-NEXT: add v0.4s, v0.4s, v16.4s
127
- ; CHECK-NEXT: rev64 v6.4s, v2.4s
128
- ; CHECK-NEXT: rev64 v7.4s, v0.4s
129
- ; CHECK-NEXT: add v16.4s, v3.4s, v4.4s
130
- ; CHECK-NEXT: add v17.4s, v1.4s, v5.4s
131
- ; CHECK-NEXT: sub v1.4s, v1.4s, v5.4s
132
- ; CHECK-NEXT: trn2 v5.4s, v16.4s, v17.4s
133
- ; CHECK-NEXT: add v18.4s, v2.4s, v6.4s
134
- ; CHECK-NEXT: add v19.4s, v0.4s, v7.4s
135
- ; CHECK-NEXT: sub v2.4s, v2.4s, v6.4s
136
- ; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
137
- ; CHECK-NEXT: sub v3.4s, v3.4s, v4.4s
138
- ; CHECK-NEXT: trn2 v4.4s, v19.4s, v18.4s
139
- ; CHECK-NEXT: ext v6.16b, v5.16b, v16.16b, #8
140
- ; CHECK-NEXT: zip1 v7.4s, v0.4s, v2.4s
141
- ; CHECK-NEXT: trn2 v16.4s, v17.4s, v16.4s
142
- ; CHECK-NEXT: ext v4.16b, v19.16b, v4.16b, #8
143
- ; CHECK-NEXT: zip1 v20.4s, v3.4s, v1.4s
144
- ; CHECK-NEXT: ext v7.16b, v0.16b, v7.16b, #8
145
- ; CHECK-NEXT: ext v17.16b, v16.16b, v17.16b, #8
146
- ; CHECK-NEXT: zip2 v18.4s, v19.4s, v18.4s
147
- ; CHECK-NEXT: zip2 v1.4s, v3.4s, v1.4s
148
- ; CHECK-NEXT: mov v0.s[3], v2.s[2]
149
- ; CHECK-NEXT: mov v5.d[1], v4.d[1]
150
- ; CHECK-NEXT: mov v20.d[1], v7.d[1]
151
- ; CHECK-NEXT: mov v17.d[1], v18.d[1]
152
- ; CHECK-NEXT: mov v16.d[1], v4.d[1]
153
- ; CHECK-NEXT: mov v1.d[1], v0.d[1]
154
- ; CHECK-NEXT: mov v6.d[1], v18.d[1]
155
- ; CHECK-NEXT: add v0.4s, v17.4s, v16.4s
156
- ; CHECK-NEXT: add v2.4s, v1.4s, v20.4s
157
- ; CHECK-NEXT: sub v3.4s, v5.4s, v6.4s
158
- ; CHECK-NEXT: sub v1.4s, v20.4s, v1.4s
159
95
; CHECK-NEXT: rev64 v4.4s, v0.4s
160
96
; CHECK-NEXT: rev64 v5.4s, v3.4s
161
97
; CHECK-NEXT: rev64 v6.4s, v1.4s
@@ -164,43 +100,77 @@ define i32 @large(i8* nocapture noundef readonly %p1, i32 noundef %st1, i8* noca
164
100
; CHECK-NEXT: add v17.4s, v3.4s, v5.4s
165
101
; CHECK-NEXT: add v18.4s, v1.4s, v6.4s
166
102
; CHECK-NEXT: add v19.4s, v2.4s, v7.4s
167
- ; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s
168
103
; CHECK-NEXT: sub v1.4s, v1.4s, v6.4s
169
- ; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s
104
+ ; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s
170
105
; CHECK-NEXT: sub v0.4s, v0.4s, v4.4s
106
+ ; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s
107
+ ; CHECK-NEXT: trn2 v4.4s, v16.4s, v17.4s
108
+ ; CHECK-NEXT: trn2 v5.4s, v19.4s, v18.4s
109
+ ; CHECK-NEXT: zip1 v7.4s, v2.4s, v1.4s
110
+ ; CHECK-NEXT: trn2 v20.4s, v17.4s, v16.4s
111
+ ; CHECK-NEXT: zip1 v6.4s, v0.4s, v3.4s
112
+ ; CHECK-NEXT: zip2 v18.4s, v19.4s, v18.4s
113
+ ; CHECK-NEXT: ext v5.16b, v19.16b, v5.16b, #8
114
+ ; CHECK-NEXT: ext v16.16b, v4.16b, v16.16b, #8
115
+ ; CHECK-NEXT: ext v7.16b, v2.16b, v7.16b, #8
116
+ ; CHECK-NEXT: ext v17.16b, v20.16b, v17.16b, #8
117
+ ; CHECK-NEXT: zip2 v0.4s, v0.4s, v3.4s
118
+ ; CHECK-NEXT: mov v2.s[3], v1.s[2]
119
+ ; CHECK-NEXT: mov v4.d[1], v5.d[1]
120
+ ; CHECK-NEXT: mov v6.d[1], v7.d[1]
121
+ ; CHECK-NEXT: mov v17.d[1], v18.d[1]
122
+ ; CHECK-NEXT: mov v20.d[1], v5.d[1]
123
+ ; CHECK-NEXT: mov v0.d[1], v2.d[1]
124
+ ; CHECK-NEXT: mov v16.d[1], v18.d[1]
125
+ ; CHECK-NEXT: add v1.4s, v17.4s, v20.4s
126
+ ; CHECK-NEXT: add v2.4s, v0.4s, v6.4s
127
+ ; CHECK-NEXT: sub v3.4s, v4.4s, v16.4s
128
+ ; CHECK-NEXT: sub v0.4s, v6.4s, v0.4s
129
+ ; CHECK-NEXT: rev64 v4.4s, v1.4s
130
+ ; CHECK-NEXT: rev64 v5.4s, v3.4s
131
+ ; CHECK-NEXT: rev64 v6.4s, v0.4s
132
+ ; CHECK-NEXT: rev64 v7.4s, v2.4s
133
+ ; CHECK-NEXT: add v16.4s, v1.4s, v4.4s
134
+ ; CHECK-NEXT: add v17.4s, v3.4s, v5.4s
135
+ ; CHECK-NEXT: add v18.4s, v0.4s, v6.4s
136
+ ; CHECK-NEXT: add v19.4s, v2.4s, v7.4s
137
+ ; CHECK-NEXT: sub v2.4s, v2.4s, v7.4s
138
+ ; CHECK-NEXT: sub v0.4s, v0.4s, v6.4s
139
+ ; CHECK-NEXT: sub v3.4s, v3.4s, v5.4s
140
+ ; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s
171
141
; CHECK-NEXT: ext v4.16b, v2.16b, v19.16b, #12
172
- ; CHECK-NEXT: ext v5.16b, v1.16b, v18.16b, #12
142
+ ; CHECK-NEXT: ext v5.16b, v0.16b, v18.16b, #12
173
143
; CHECK-NEXT: ext v7.16b, v3.16b, v17.16b, #12
174
144
; CHECK-NEXT: rev64 v16.4s, v16.4s
175
145
; CHECK-NEXT: ext v6.16b, v4.16b, v2.16b, #4
176
146
; CHECK-NEXT: ext v17.16b, v4.16b, v4.16b, #8
177
- ; CHECK-NEXT: ext v18.16b, v5.16b, v1.16b, #4
147
+ ; CHECK-NEXT: ext v18.16b, v5.16b, v0.16b, #4
178
148
; CHECK-NEXT: ext v19.16b, v5.16b, v5.16b, #8
179
149
; CHECK-NEXT: ext v20.16b, v7.16b, v3.16b, #4
180
150
; CHECK-NEXT: ext v21.16b, v7.16b, v7.16b, #8
181
151
; CHECK-NEXT: rev64 v7.4s, v7.4s
182
- ; CHECK-NEXT: trn2 v0.4s, v16.4s, v0.4s
152
+ ; CHECK-NEXT: trn2 v1.4s, v16.4s, v1.4s
183
153
; CHECK-NEXT: rev64 v5.4s, v5.4s
184
154
; CHECK-NEXT: rev64 v4.4s, v4.4s
185
155
; CHECK-NEXT: ext v6.16b, v6.16b, v17.16b, #12
186
156
; CHECK-NEXT: ext v17.16b, v18.16b, v19.16b, #12
187
157
; CHECK-NEXT: ext v18.16b, v20.16b, v21.16b, #12
188
158
; CHECK-NEXT: ext v3.16b, v7.16b, v3.16b, #4
189
- ; CHECK-NEXT: ext v7.16b, v0.16b, v0.16b, #8
190
- ; CHECK-NEXT: ext v1.16b, v5.16b, v1.16b, #4
159
+ ; CHECK-NEXT: ext v7.16b, v1.16b, v1.16b, #8
160
+ ; CHECK-NEXT: ext v0.16b, v5.16b, v0.16b, #4
191
161
; CHECK-NEXT: ext v2.16b, v4.16b, v2.16b, #4
192
162
; CHECK-NEXT: add v4.4s, v18.4s, v3.4s
193
- ; CHECK-NEXT: add v5.4s, v0.4s, v7.4s
194
- ; CHECK-NEXT: add v16.4s, v17.4s, v1.4s
163
+ ; CHECK-NEXT: add v5.4s, v1.4s, v7.4s
164
+ ; CHECK-NEXT: add v16.4s, v17.4s, v0.4s
195
165
; CHECK-NEXT: add v19.4s, v6.4s, v2.4s
196
166
; CHECK-NEXT: sub v3.4s, v18.4s, v3.4s
197
- ; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s
167
+ ; CHECK-NEXT: sub v1.4s, v1.4s, v7.4s
198
168
; CHECK-NEXT: sub v2.4s, v6.4s, v2.4s
199
- ; CHECK-NEXT: sub v1.4s, v17.4s, v1.4s
169
+ ; CHECK-NEXT: sub v0.4s, v17.4s, v0.4s
200
170
; CHECK-NEXT: mov v19.d[1], v2.d[1]
201
- ; CHECK-NEXT: mov v16.d[1], v1.d[1]
171
+ ; CHECK-NEXT: mov v16.d[1], v0.d[1]
202
172
; CHECK-NEXT: mov v4.d[1], v3.d[1]
203
- ; CHECK-NEXT: mov v5.d[1], v0.d[1]
173
+ ; CHECK-NEXT: mov v5.d[1], v1.d[1]
204
174
; CHECK-NEXT: movi v0.8h, #1
205
175
; CHECK-NEXT: movi v7.2d, #0x00ffff0000ffff
206
176
; CHECK-NEXT: ushr v1.4s, v4.4s, #15
0 commit comments