Skip to content

Commit c5c2568

Browse files
committed
[AArch64] Override isLSRCostLess, take number of instructions into account
1 parent f448b8e commit c5c2568

9 files changed

+237
-233
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,9 @@ static cl::opt<unsigned> InlineCallPenaltyChangeSM(
5858
static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
5959
cl::init(true), cl::Hidden);
6060

61+
// Hidden command-line flag, enabled by default, that is checked by
// AArch64TTIImpl::isLSRCostLess to choose between the AArch64-specific
// LSR cost ordering and the base-class implementation.
static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
62+
cl::init(true), cl::Hidden);
63+
6164
namespace {
6265
class TailFoldingOption {
6366
// These bitfields will only ever be set to something non-zero in operator=,
@@ -4152,3 +4155,19 @@ bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
41524155
return true;
41534156
return BaseT::shouldTreatInstructionLikeSelect(I);
41544157
}
4158+
4159+
// Returns true when LSR cost C1 compares less than C2. With EnableLSRCostOpt
// set, the fields are compared lexicographically in the order listed in the
// std::tie calls below; otherwise the decision is delegated to
// TargetTransformInfoImplBase::isLSRCostLess.
bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
4160+
const TargetTransformInfo::LSRCost &C2) {
4161+
// The AArch64-specific part is that the instruction count (Insns) takes
4162+
// part in the comparison, as the second key after NumRegs rather than the
4163+
// first (as some targets do), and that NumBaseAdds has a changed priority.
4164+
// TODO: Maybe a more nuanced tradeoff between instruction count
4165+
// and number of registers? To be investigated at a later date.
4166+
if (EnableLSRCostOpt)
4167+
return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
4168+
C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
4169+
std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
4170+
C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
4171+
4172+
return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
4173+
}

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -424,6 +424,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
424424
}
425425

426426
std::optional<unsigned> getMinPageSize() const { return 4096; }
427+
428+
/// Returns true if LSR cost \p C1 is considered less than \p C2. The
/// ordering is defined out of line in AArch64TargetTransformInfo.cpp.
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
429+
const TargetTransformInfo::LSRCost &C2);
427430
};
428431

429432
} // end namespace llvm

llvm/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,7 @@ entry:
1212

1313
for.body:
1414
; CHECK: for.body
15-
; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}]
15+
; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}]
1616
; CHECK: add x[[REG:[0-9]+]],
1717
; CHECK: x[[REG]], #1, lsl #12
1818
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]

llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -176,13 +176,13 @@ exit:
176176
; CHECK: ********** MI Scheduling **********
177177
; CHECK: LDURDi_LDRDui:%bb.1 vector_body
178178
;
179-
; CHECK: Cluster ld/st SU(2) - SU(6)
180-
; CHECK: Cluster ld/st SU(3) - SU(7)
179+
; CHECK: Cluster ld/st SU(0) - SU(4)
180+
; CHECK: Cluster ld/st SU(1) - SU(5)
181181
;
182-
; CHECK: SU(2): %{{[0-9]+}}:fpr64 = LDURDi
183-
; CHECK: SU(3): %{{[0-9]+}}:fpr64 = LDURDi
184-
; CHECK: SU(6): %{{[0-9]+}}:fpr64 = LDRDui
185-
; CHECK: SU(7): %{{[0-9]+}}:fpr64 = LDRDui
182+
; CHECK: SU(0): %{{[0-9]+}}:fpr64 = LDURDi
183+
; CHECK: SU(1): %{{[0-9]+}}:fpr64 = LDURDi
184+
; CHECK: SU(4): %{{[0-9]+}}:fpr64 = LDRDui
185+
; CHECK: SU(5): %{{[0-9]+}}:fpr64 = LDRDui
186186
;
187187
define void @LDURDi_LDRDui(ptr nocapture readonly %arg) {
188188
entry:

llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll

Lines changed: 44 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -14,31 +14,29 @@ target triple = "aarch64"
1414
define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
1515
; CHECK-LABEL: complex_mul_v2f64:
1616
; CHECK: // %bb.0: // %entry
17-
; CHECK-NEXT: mov w9, #100 // =0x64
17+
; CHECK-NEXT: mov w8, #100 // =0x64
1818
; CHECK-NEXT: mov z1.d, #0 // =0x0
1919
; CHECK-NEXT: ptrue p0.d
20-
; CHECK-NEXT: whilelo p1.d, xzr, x9
21-
; CHECK-NEXT: cntd x10
22-
; CHECK-NEXT: mov x8, xzr
23-
; CHECK-NEXT: rdvl x11, #2
24-
; CHECK-NEXT: mov x12, x10
20+
; CHECK-NEXT: whilelo p1.d, xzr, x8
21+
; CHECK-NEXT: cntd x9
22+
; CHECK-NEXT: rdvl x10, #2
23+
; CHECK-NEXT: mov x11, x9
2524
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
2625
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
2726
; CHECK-NEXT: .LBB0_1: // %vector.body
2827
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
2928
; CHECK-NEXT: zip2 p3.d, p1.d, p1.d
30-
; CHECK-NEXT: add x13, x0, x8
31-
; CHECK-NEXT: add x14, x1, x8
32-
; CHECK-NEXT: zip1 p2.d, p1.d, p1.d
3329
; CHECK-NEXT: mov z6.d, z1.d
3430
; CHECK-NEXT: mov z7.d, z0.d
35-
; CHECK-NEXT: whilelo p1.d, x12, x9
36-
; CHECK-NEXT: add x8, x8, x11
37-
; CHECK-NEXT: add x12, x12, x10
38-
; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl]
39-
; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl]
40-
; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13]
41-
; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14]
31+
; CHECK-NEXT: zip1 p2.d, p1.d, p1.d
32+
; CHECK-NEXT: whilelo p1.d, x11, x8
33+
; CHECK-NEXT: add x11, x11, x9
34+
; CHECK-NEXT: ld1d { z2.d }, p3/z, [x0, #1, mul vl]
35+
; CHECK-NEXT: ld1d { z4.d }, p3/z, [x1, #1, mul vl]
36+
; CHECK-NEXT: ld1d { z3.d }, p2/z, [x0]
37+
; CHECK-NEXT: ld1d { z5.d }, p2/z, [x1]
38+
; CHECK-NEXT: add x1, x1, x10
39+
; CHECK-NEXT: add x0, x0, x10
4240
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
4341
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
4442
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
@@ -115,32 +113,30 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr %
115113
; CHECK: // %bb.0: // %entry
116114
; CHECK-NEXT: mov z1.d, #0 // =0x0
117115
; CHECK-NEXT: ptrue p0.d
118-
; CHECK-NEXT: cntd x10
119-
; CHECK-NEXT: neg x11, x10
120-
; CHECK-NEXT: mov w12, #100 // =0x64
116+
; CHECK-NEXT: cntd x9
117+
; CHECK-NEXT: neg x10, x9
118+
; CHECK-NEXT: mov w11, #100 // =0x64
121119
; CHECK-NEXT: mov x8, xzr
122-
; CHECK-NEXT: mov x9, xzr
123-
; CHECK-NEXT: and x11, x11, x12
124-
; CHECK-NEXT: rdvl x12, #2
120+
; CHECK-NEXT: and x10, x10, x11
121+
; CHECK-NEXT: rdvl x11, #2
125122
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
126123
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
127124
; CHECK-NEXT: .LBB1_1: // %vector.body
128125
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
129-
; CHECK-NEXT: ld1w { z2.d }, p0/z, [x2, x9, lsl #2]
130-
; CHECK-NEXT: add x13, x0, x8
131-
; CHECK-NEXT: add x14, x1, x8
126+
; CHECK-NEXT: ld1w { z2.d }, p0/z, [x2, x8, lsl #2]
132127
; CHECK-NEXT: mov z6.d, z1.d
133128
; CHECK-NEXT: mov z7.d, z0.d
134-
; CHECK-NEXT: add x9, x9, x10
135-
; CHECK-NEXT: add x8, x8, x12
129+
; CHECK-NEXT: add x8, x8, x9
136130
; CHECK-NEXT: cmpne p1.d, p0/z, z2.d, #0
137-
; CHECK-NEXT: cmp x11, x9
131+
; CHECK-NEXT: cmp x10, x8
138132
; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
139133
; CHECK-NEXT: zip1 p1.d, p1.d, p1.d
140-
; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl]
141-
; CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl]
142-
; CHECK-NEXT: ld1d { z3.d }, p1/z, [x13]
143-
; CHECK-NEXT: ld1d { z5.d }, p1/z, [x14]
134+
; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl]
135+
; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl]
136+
; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0]
137+
; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1]
138+
; CHECK-NEXT: add x1, x1, x11
139+
; CHECK-NEXT: add x0, x0, x11
144140
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
145141
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
146142
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
@@ -217,33 +213,33 @@ exit.block: ; preds = %vector.body
217213
define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, ptr %cond) {
218214
; CHECK-LABEL: complex_mul_predicated_x2_v2f64:
219215
; CHECK: // %bb.0: // %entry
220-
; CHECK-NEXT: mov w10, #100 // =0x64
216+
; CHECK-NEXT: mov w8, #100 // =0x64
221217
; CHECK-NEXT: mov z1.d, #0 // =0x0
222218
; CHECK-NEXT: ptrue p0.d
223-
; CHECK-NEXT: whilelo p1.d, xzr, x10
224-
; CHECK-NEXT: mov x8, xzr
225-
; CHECK-NEXT: mov x9, xzr
226-
; CHECK-NEXT: cntd x11
227-
; CHECK-NEXT: rdvl x12, #2
219+
; CHECK-NEXT: whilelo p1.d, xzr, x8
220+
; CHECK-NEXT: cntd x9
221+
; CHECK-NEXT: rdvl x10, #2
222+
; CHECK-NEXT: cnth x11
223+
; CHECK-NEXT: mov x12, x9
228224
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
229225
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
230226
; CHECK-NEXT: .LBB2_1: // %vector.body
231227
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
232-
; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2, x9, lsl #2]
233-
; CHECK-NEXT: add x13, x0, x8
234-
; CHECK-NEXT: add x14, x1, x8
228+
; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2]
235229
; CHECK-NEXT: mov z6.d, z1.d
236230
; CHECK-NEXT: mov z7.d, z0.d
237-
; CHECK-NEXT: add x9, x9, x11
238-
; CHECK-NEXT: add x8, x8, x12
231+
; CHECK-NEXT: add x2, x2, x11
239232
; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
240233
; CHECK-NEXT: zip2 p3.d, p1.d, p1.d
241234
; CHECK-NEXT: zip1 p2.d, p1.d, p1.d
242-
; CHECK-NEXT: whilelo p1.d, x9, x10
243-
; CHECK-NEXT: ld1d { z2.d }, p3/z, [x13, #1, mul vl]
244-
; CHECK-NEXT: ld1d { z4.d }, p3/z, [x14, #1, mul vl]
245-
; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13]
246-
; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14]
235+
; CHECK-NEXT: whilelo p1.d, x12, x8
236+
; CHECK-NEXT: add x12, x12, x9
237+
; CHECK-NEXT: ld1d { z2.d }, p3/z, [x0, #1, mul vl]
238+
; CHECK-NEXT: ld1d { z4.d }, p3/z, [x1, #1, mul vl]
239+
; CHECK-NEXT: ld1d { z3.d }, p2/z, [x0]
240+
; CHECK-NEXT: ld1d { z5.d }, p2/z, [x1]
241+
; CHECK-NEXT: add x1, x1, x10
242+
; CHECK-NEXT: add x0, x0, x10
247243
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
248244
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
249245
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90

llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-scalable.ll

Lines changed: 56 additions & 70 deletions
Original file line number | Diff line number | Diff line change
@@ -15,30 +15,27 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
1515
; CHECK-LABEL: complex_mul_v2f64:
1616
; CHECK: // %bb.0: // %entry
1717
; CHECK-NEXT: mov z1.d, #0 // =0x0
18-
; CHECK-NEXT: ptrue p1.b
19-
; CHECK-NEXT: cntd x9
2018
; CHECK-NEXT: ptrue p0.d
21-
; CHECK-NEXT: neg x9, x9
22-
; CHECK-NEXT: mov w10, #100 // =0x64
23-
; CHECK-NEXT: mov x8, xzr
24-
; CHECK-NEXT: and x10, x9, x10
25-
; CHECK-NEXT: rdvl x11, #2
19+
; CHECK-NEXT: cntd x8
20+
; CHECK-NEXT: neg x8, x8
21+
; CHECK-NEXT: mov w9, #100 // =0x64
22+
; CHECK-NEXT: rdvl x10, #2
23+
; CHECK-NEXT: and x9, x8, x9
2624
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
2725
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
2826
; CHECK-NEXT: .LBB0_1: // %vector.body
2927
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
30-
; CHECK-NEXT: add x12, x0, x8
31-
; CHECK-NEXT: add x13, x1, x8
32-
; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, x8]
33-
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
34-
; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
35-
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
36-
; CHECK-NEXT: adds x10, x10, x9
37-
; CHECK-NEXT: add x8, x8, x11
38-
; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
39-
; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
40-
; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #90
41-
; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #90
28+
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
29+
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
30+
; CHECK-NEXT: adds x9, x9, x8
31+
; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl]
32+
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1]
33+
; CHECK-NEXT: add x1, x1, x10
34+
; CHECK-NEXT: add x0, x0, x10
35+
; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0
36+
; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #0
37+
; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #90
38+
; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #90
4239
; CHECK-NEXT: b.ne .LBB0_1
4340
; CHECK-NEXT: // %bb.2: // %exit.block
4441
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
@@ -105,32 +102,29 @@ define %"class.std::complex" @complex_mul_nonzero_init_v2f64(ptr %a, ptr %b) {
105102
; CHECK-NEXT: fmov d0, #1.00000000
106103
; CHECK-NEXT: mov z1.d, #0 // =0x0
107104
; CHECK-NEXT: fmov d2, #2.00000000
108-
; CHECK-NEXT: cntd x9
109-
; CHECK-NEXT: mov w10, #100 // =0x64
110-
; CHECK-NEXT: ptrue p1.b
111-
; CHECK-NEXT: neg x9, x9
112-
; CHECK-NEXT: mov x8, xzr
113-
; CHECK-NEXT: and x10, x9, x10
114-
; CHECK-NEXT: rdvl x11, #2
105+
; CHECK-NEXT: cntd x8
106+
; CHECK-NEXT: mov w9, #100 // =0x64
107+
; CHECK-NEXT: neg x8, x8
108+
; CHECK-NEXT: rdvl x10, #2
109+
; CHECK-NEXT: and x9, x8, x9
115110
; CHECK-NEXT: sel z3.d, p0, z0.d, z1.d
116111
; CHECK-NEXT: mov z1.d, p0/m, z2.d
117112
; CHECK-NEXT: ptrue p0.d
118113
; CHECK-NEXT: zip2 z0.d, z1.d, z3.d
119114
; CHECK-NEXT: zip1 z1.d, z1.d, z3.d
120115
; CHECK-NEXT: .LBB1_1: // %vector.body
121116
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
122-
; CHECK-NEXT: add x12, x0, x8
123-
; CHECK-NEXT: add x13, x1, x8
124-
; CHECK-NEXT: ld1b { z2.b }, p1/z, [x0, x8]
125-
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x12, #1, mul vl]
126-
; CHECK-NEXT: ld1b { z4.b }, p1/z, [x1, x8]
127-
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x13, #1, mul vl]
128-
; CHECK-NEXT: adds x10, x10, x9
129-
; CHECK-NEXT: add x8, x8, x11
130-
; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #0
131-
; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #0
132-
; CHECK-NEXT: fcmla z1.d, p0/m, z4.d, z2.d, #90
133-
; CHECK-NEXT: fcmla z0.d, p0/m, z5.d, z3.d, #90
117+
; CHECK-NEXT: ld1d { z2.d }, p0/z, [x0, #1, mul vl]
118+
; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0]
119+
; CHECK-NEXT: adds x9, x9, x8
120+
; CHECK-NEXT: ld1d { z4.d }, p0/z, [x1, #1, mul vl]
121+
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x1]
122+
; CHECK-NEXT: add x1, x1, x10
123+
; CHECK-NEXT: add x0, x0, x10
124+
; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #0
125+
; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #0
126+
; CHECK-NEXT: fcmla z1.d, p0/m, z5.d, z3.d, #90
127+
; CHECK-NEXT: fcmla z0.d, p0/m, z4.d, z2.d, #90
134128
; CHECK-NEXT: b.ne .LBB1_1
135129
; CHECK-NEXT: // %bb.2: // %exit.block
136130
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
@@ -190,45 +184,37 @@ define %"class.std::complex" @complex_mul_v2f64_unrolled(ptr %a, ptr %b) {
190184
; CHECK-LABEL: complex_mul_v2f64_unrolled:
191185
; CHECK: // %bb.0: // %entry
192186
; CHECK-NEXT: mov z1.d, #0 // =0x0
193-
; CHECK-NEXT: ptrue p1.b
194-
; CHECK-NEXT: cntw x9
195187
; CHECK-NEXT: ptrue p0.d
196-
; CHECK-NEXT: neg x9, x9
197-
; CHECK-NEXT: mov w10, #1000 // =0x3e8
198-
; CHECK-NEXT: rdvl x12, #2
199-
; CHECK-NEXT: mov x8, xzr
200-
; CHECK-NEXT: and x10, x9, x10
188+
; CHECK-NEXT: cntw x8
189+
; CHECK-NEXT: neg x8, x8
190+
; CHECK-NEXT: mov w9, #1000 // =0x3e8
191+
; CHECK-NEXT: rdvl x10, #4
192+
; CHECK-NEXT: and x9, x8, x9
201193
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
202194
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
203-
; CHECK-NEXT: add x11, x1, x12
204-
; CHECK-NEXT: add x12, x0, x12
205-
; CHECK-NEXT: rdvl x13, #4
206195
; CHECK-NEXT: mov z2.d, z1.d
207196
; CHECK-NEXT: mov z3.d, z0.d
208197
; CHECK-NEXT: .LBB2_1: // %vector.body
209198
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
210-
; CHECK-NEXT: add x14, x0, x8
211-
; CHECK-NEXT: add x15, x12, x8
212-
; CHECK-NEXT: add x16, x1, x8
213-
; CHECK-NEXT: add x17, x11, x8
214-
; CHECK-NEXT: ld1b { z4.b }, p1/z, [x0, x8]
215-
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x14, #1, mul vl]
216-
; CHECK-NEXT: ld1b { z6.b }, p1/z, [x12, x8]
217-
; CHECK-NEXT: ld1b { z7.b }, p1/z, [x1, x8]
218-
; CHECK-NEXT: ld1d { z16.d }, p0/z, [x16, #1, mul vl]
219-
; CHECK-NEXT: ld1d { z17.d }, p0/z, [x15, #1, mul vl]
220-
; CHECK-NEXT: ld1b { z18.b }, p1/z, [x11, x8]
221-
; CHECK-NEXT: ld1d { z19.d }, p0/z, [x17, #1, mul vl]
222-
; CHECK-NEXT: adds x10, x10, x9
223-
; CHECK-NEXT: add x8, x8, x13
224-
; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #0
225-
; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #0
226-
; CHECK-NEXT: fcmla z2.d, p0/m, z18.d, z6.d, #0
227-
; CHECK-NEXT: fcmla z3.d, p0/m, z19.d, z17.d, #0
228-
; CHECK-NEXT: fcmla z1.d, p0/m, z7.d, z4.d, #90
229-
; CHECK-NEXT: fcmla z0.d, p0/m, z16.d, z5.d, #90
230-
; CHECK-NEXT: fcmla z2.d, p0/m, z18.d, z6.d, #90
231-
; CHECK-NEXT: fcmla z3.d, p0/m, z19.d, z17.d, #90
199+
; CHECK-NEXT: ld1d { z4.d }, p0/z, [x0, #1, mul vl]
200+
; CHECK-NEXT: ld1d { z5.d }, p0/z, [x0]
201+
; CHECK-NEXT: adds x9, x9, x8
202+
; CHECK-NEXT: ld1d { z6.d }, p0/z, [x0, #3, mul vl]
203+
; CHECK-NEXT: ld1d { z7.d }, p0/z, [x1, #1, mul vl]
204+
; CHECK-NEXT: ld1d { z16.d }, p0/z, [x1]
205+
; CHECK-NEXT: ld1d { z17.d }, p0/z, [x0, #2, mul vl]
206+
; CHECK-NEXT: add x0, x0, x10
207+
; CHECK-NEXT: ld1d { z18.d }, p0/z, [x1, #3, mul vl]
208+
; CHECK-NEXT: ld1d { z19.d }, p0/z, [x1, #2, mul vl]
209+
; CHECK-NEXT: add x1, x1, x10
210+
; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #0
211+
; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #0
212+
; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #0
213+
; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #0
214+
; CHECK-NEXT: fcmla z1.d, p0/m, z16.d, z5.d, #90
215+
; CHECK-NEXT: fcmla z0.d, p0/m, z7.d, z4.d, #90
216+
; CHECK-NEXT: fcmla z3.d, p0/m, z18.d, z6.d, #90
217+
; CHECK-NEXT: fcmla z2.d, p0/m, z19.d, z17.d, #90
232218
; CHECK-NEXT: b.ne .LBB2_1
233219
; CHECK-NEXT: // %bb.2: // %exit.block
234220
; CHECK-NEXT: uzp1 z4.d, z2.d, z3.d

0 commit comments

Comments
 (0)