@@ -15,30 +15,27 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
15
15
; CHECK-LABEL: complex_mul_v2f64:
16
16
; CHECK: // %bb.0: // %entry
17
17
; CHECK-NEXT: mov z1.d, #0 // =0x0
18
- ; CHECK-NEXT: ptrue p1.b
19
- ; CHECK-NEXT: cntd x9
20
18
; CHECK-NEXT: ptrue p0.d
21
- ; CHECK-NEXT: neg x9, x9
22
- ; CHECK-NEXT: mov w10, #100 // =0x64
23
- ; CHECK-NEXT: mov x8, xzr
24
- ; CHECK-NEXT: and x10, x9, x10
25
- ; CHECK-NEXT: rdvl x11, #2
19
+ ; CHECK-NEXT: cntd x8
20
+ ; CHECK-NEXT: neg x8, x8
21
+ ; CHECK-NEXT: mov w9, #100 // =0x64
22
+ ; CHECK-NEXT: rdvl x10, #2
23
+ ; CHECK-NEXT: and x9, x8, x9
26
24
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
27
25
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
28
26
; CHECK-NEXT: .LBB0_1: // %vector.body
29
27
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
30
- ; CHECK-NEXT: add x12, x0, x8
31
- ; CHECK-NEXT: add x13, x1, x8
32
- ; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, x8]
33
- ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
34
- ; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
35
- ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
36
- ; CHECK-NEXT: adds x10, x10, x9
37
- ; CHECK-NEXT: add x8, x8, x11
38
- ; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
39
- ; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
40
- ; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #90
41
- ; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #90
28
+ ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
29
+ ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
30
+ ; CHECK-NEXT: adds x9, x9, x8
31
+ ; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl]
32
+ ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1]
33
+ ; CHECK-NEXT: add x1, x1, x10
34
+ ; CHECK-NEXT: add x0, x0, x10
35
+ ; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0
36
+ ; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #0
37
+ ; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #90
38
+ ; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #90
42
39
; CHECK-NEXT: b.ne .LBB0_1
43
40
; CHECK-NEXT: // %bb.2: // %exit.block
44
41
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
@@ -105,32 +102,29 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
105
102
; CHECK-NEXT: fmov d0, #1.00000000
106
103
; CHECK-NEXT: mov z1.d, #0 // =0x0
107
104
; CHECK-NEXT: fmov d2, #2.00000000
108
- ; CHECK-NEXT: cntd x9
109
- ; CHECK-NEXT: mov w10, #100 // =0x64
110
- ; CHECK-NEXT: ptrue p1.b
111
- ; CHECK-NEXT: neg x9, x9
112
- ; CHECK-NEXT: mov x8, xzr
113
- ; CHECK-NEXT: and x10, x9, x10
114
- ; CHECK-NEXT: rdvl x11, #2
105
+ ; CHECK-NEXT: cntd x8
106
+ ; CHECK-NEXT: mov w9, #100 // =0x64
107
+ ; CHECK-NEXT: neg x8, x8
108
+ ; CHECK-NEXT: rdvl x10, #2
109
+ ; CHECK-NEXT: and x9, x8, x9
115
110
; CHECK-NEXT: sel z3.d, p0, z0.d, z1.d
116
111
; CHECK-NEXT: mov z1.d, p0/m, z2.d
117
112
; CHECK-NEXT: ptrue p0.d
118
113
; CHECK-NEXT: zip2 z0.d, z1.d, z3.d
119
114
; CHECK-NEXT: zip1 z1.d, z1.d, z3.d
120
115
; CHECK-NEXT: .LBB1_1: // %vector.body
121
116
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
122
- ; CHECK-NEXT: add x12, x0, x8
123
- ; CHECK-NEXT: add x13, x1, x8
124
- ; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, x8]
125
- ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
126
- ; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
127
- ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
128
- ; CHECK-NEXT: adds x10, x10, x9
129
- ; CHECK-NEXT: add x8, x8, x11
130
- ; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
131
- ; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
132
- ; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #90
133
- ; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #90
117
+ ; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
118
+ ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
119
+ ; CHECK-NEXT: adds x9, x9, x8
120
+ ; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl]
121
+ ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1]
122
+ ; CHECK-NEXT: add x1, x1, x10
123
+ ; CHECK-NEXT: add x0, x0, x10
124
+ ; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0
125
+ ; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #0
126
+ ; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #90
127
+ ; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #90
134
128
; CHECK-NEXT: b.ne .LBB1_1
135
129
; CHECK-NEXT: // %bb.2: // %exit.block
136
130
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
@@ -190,45 +184,37 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
190
184
; CHECK-LABEL: complex_mul_v2f64_unrolled:
191
185
; CHECK: // %bb.0: // %entry
192
186
; CHECK-NEXT: mov z1.d, #0 // =0x0
193
- ; CHECK-NEXT: ptrue p1.b
194
- ; CHECK-NEXT: cntw x9
195
187
; CHECK-NEXT: ptrue p0.d
196
- ; CHECK-NEXT: neg x9, x9
197
- ; CHECK-NEXT: mov w10, #1000 // =0x3e8
198
- ; CHECK-NEXT: rdvl x12, #2
199
- ; CHECK-NEXT: mov x8, xzr
200
- ; CHECK-NEXT: and x10, x9, x10
188
+ ; CHECK-NEXT: cntw x8
189
+ ; CHECK-NEXT: neg x8, x8
190
+ ; CHECK-NEXT: mov w9, #1000 // =0x3e8
191
+ ; CHECK-NEXT: rdvl x10, #4
192
+ ; CHECK-NEXT: and x9, x8, x9
201
193
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
202
194
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
203
- ; CHECK-NEXT: add x11, x1, x12
204
- ; CHECK-NEXT: add x12, x0, x12
205
- ; CHECK-NEXT: rdvl x13, #4
206
195
; CHECK-NEXT: mov z2.d, z1.d
207
196
; CHECK-NEXT: mov z3.d, z0.d
208
197
; CHECK-NEXT: .LBB2_1: // %vector.body
209
198
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
210
- ; CHECK-NEXT: add x14, x0, x8
211
- ; CHECK-NEXT: add x15, x12, x8
212
- ; CHECK-NEXT: add x16, x1, x8
213
- ; CHECK-NEXT: add x17, x11, x8
214
- ; CHECK-NEXT: ld1b { z4.b }, p1/z, [x0, x8]
215
- ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x14, #1, mul vl]
216
- ; CHECK-NEXT: ld1b { z6.b }, p1/z, [x12, x8]
217
- ; CHECK-NEXT: ld1b { z7.b }, p1/z, [x1, x8]
218
- ; CHECK-NEXT: ld1d { z16.d }, p0/z, [x16, #1, mul vl]
219
- ; CHECK-NEXT: ld1d { z17.d }, p0/z, [x15, #1, mul vl]
220
- ; CHECK-NEXT: ld1b { z18.b }, p1/z, [x11, x8]
221
- ; CHECK-NEXT: ld1d { z19.d }, p0/z, [x17, #1, mul vl]
222
- ; CHECK-NEXT: adds x10, x10, x9
223
- ; CHECK-NEXT: add x8, x8, x13
224
- ; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #0
225
- ; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #0
226
- ; CHECK-NEXT: fcmla z2.d, p0/m, z18.d, z6.d, #0
227
- ; CHECK-NEXT: fcmla z3.d, p0/m, z19.d, z17.d, #0
228
- ; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #90
229
- ; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #90
230
- ; CHECK-NEXT: fcmla z2.d, p0/m, z18.d, z6.d, #90
231
- ; CHECK-NEXT: fcmla z3.d, p0/m, z19.d, z17.d, #90
199
+ ; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl]
200
+ ; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0]
201
+ ; CHECK-NEXT: adds x9, x9, x8
202
+ ; CHECK-NEXT: ld1d { z6.d }, p0/z, [x0, #3, mul vl]
203
+ ; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1, #1, mul vl]
204
+ ; CHECK-NEXT: ld1d { z16.d }, p0/z, [x1]
205
+ ; CHECK-NEXT: ld1d { z17.d }, p0/z, [x0, #2, mul vl]
206
+ ; CHECK-NEXT: add x0, x0, x10
207
+ ; CHECK-NEXT: ld1d { z18.d }, p0/z, [x1, #3, mul vl]
208
+ ; CHECK-NEXT: ld1d { z19.d }, p0/z, [x1, #2, mul vl]
209
+ ; CHECK-NEXT: add x1, x1, x10
210
+ ; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #0
211
+ ; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0
212
+ ; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #0
213
+ ; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #0
214
+ ; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #90
215
+ ; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #90
216
+ ; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #90
217
+ ; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #90
232
218
; CHECK-NEXT: b.ne .LBB2_1
233
219
; CHECK-NEXT: // %bb.2: // %exit.block
234
220
; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d
0 commit comments