Skip to content

Commit e16f2f5

Browse files
authored
[AArch64] Override isLSRCostLess, take number of instructions into account (#84189)
Adds an AArch64-specific version of isLSRCostLess, changing the relative importance of the various terms from the formulae being evaluated. This has been split out from my vscale-aware LSR work; see the RFC for reference: https://discourse.llvm.org/t/rfc-vscale-aware-loopstrengthreduce/77131
1 parent a97871e commit e16f2f5

9 files changed

+248
-244
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,9 @@ static cl::opt<unsigned> InlineCallPenaltyChangeSM(
5858
static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
5959
cl::init(true), cl::Hidden);
6060

61+
static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
62+
cl::init(true), cl::Hidden);
63+
6164
namespace {
6265
class TailFoldingOption {
6366
// These bitfields will only ever be set to something non-zero in operator=,
@@ -4216,3 +4219,19 @@ bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) {
42164219
return true;
42174220
return BaseT::shouldTreatInstructionLikeSelect(I);
42184221
}
4222+
4223+
bool AArch64TTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
4224+
const TargetTransformInfo::LSRCost &C2) {
4225+
// What is AArch64-specific here is adding the number of instructions to the
4226+
// comparison (though not as the first criterion, unlike some other targets),
4227+
// along with changing the priority of the base additions.
4228+
// TODO: Maybe a more nuanced tradeoff between instruction count
4229+
// and number of registers? To be investigated at a later date.
4230+
if (EnableLSRCostOpt)
4231+
return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
4232+
C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
4233+
std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
4234+
C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
4235+
4236+
return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
4237+
}

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,9 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
425425
}
426426

427427
std::optional<unsigned> getMinPageSize() const { return 4096; }
428+
429+
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
430+
const TargetTransformInfo::LSRCost &C2);
428431
};
429432

430433
} // end namespace llvm

llvm/test/CodeGen/AArch64/arm64-2011-10-18-LdStOptBug.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ entry:
1212

1313
for.body:
1414
; CHECK: for.body
15-
; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}, x{{[0-9]+}}]
15+
; CHECK: ldr w{{[0-9]+}}, [x{{[0-9]+}}]
1616
; CHECK: add x[[REG:[0-9]+]],
1717
; CHECK: x[[REG]], #1, lsl #12
1818
%indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]

llvm/test/CodeGen/AArch64/arm64-ldp-cluster.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -176,13 +176,13 @@ exit:
176176
; CHECK: ********** MI Scheduling **********
177177
; CHECK: LDURDi_LDRDui:%bb.1 vector_body
178178
;
179-
; CHECK: Cluster ld/st SU(2) - SU(6)
180-
; CHECK: Cluster ld/st SU(3) - SU(7)
179+
; CHECK: Cluster ld/st SU(0) - SU(4)
180+
; CHECK: Cluster ld/st SU(1) - SU(5)
181181
;
182-
; CHECK: SU(2): %{{[0-9]+}}:fpr64 = LDURDi
183-
; CHECK: SU(3): %{{[0-9]+}}:fpr64 = LDURDi
184-
; CHECK: SU(6): %{{[0-9]+}}:fpr64 = LDRDui
185-
; CHECK: SU(7): %{{[0-9]+}}:fpr64 = LDRDui
182+
; CHECK: SU(0): %{{[0-9]+}}:fpr64 = LDURDi
183+
; CHECK: SU(1): %{{[0-9]+}}:fpr64 = LDURDi
184+
; CHECK: SU(4): %{{[0-9]+}}:fpr64 = LDRDui
185+
; CHECK: SU(5): %{{[0-9]+}}:fpr64 = LDRDui
186186
;
187187
define void @LDURDi_LDRDui(ptr nocapture readonly %arg) {
188188
entry:

llvm/test/CodeGen/AArch64/complex-deinterleaving-reductions-predicated-scalable.ll

Lines changed: 54 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -15,36 +15,34 @@ define %"class.std::complex" @complex_mul_v2f64(ptr %a, ptr %b) {
1515
; CHECK-LABEL: complex_mul_v2f64:
1616
; CHECK: // %bb.0: // %entry
1717
; CHECK-NEXT: mov z1.d, #0 // =0x0
18-
; CHECK-NEXT: mov w9, #100 // =0x64
19-
; CHECK-NEXT: cntd x10
20-
; CHECK-NEXT: whilelo p1.d, xzr, x9
21-
; CHECK-NEXT: mov x8, xzr
22-
; CHECK-NEXT: rdvl x11, #2
18+
; CHECK-NEXT: mov w8, #100 // =0x64
19+
; CHECK-NEXT: cntd x9
20+
; CHECK-NEXT: whilelo p1.d, xzr, x8
21+
; CHECK-NEXT: rdvl x10, #2
22+
; CHECK-NEXT: mov x11, x9
2323
; CHECK-NEXT: ptrue p0.d
24-
; CHECK-NEXT: mov x12, x10
2524
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
2625
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
2726
; CHECK-NEXT: .LBB0_1: // %vector.body
2827
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
2928
; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
30-
; CHECK-NEXT: add x13, x0, x8
31-
; CHECK-NEXT: add x14, x1, x8
32-
; CHECK-NEXT: zip1 p1.d, p1.d, p1.d
3329
; CHECK-NEXT: mov z6.d, z1.d
3430
; CHECK-NEXT: mov z7.d, z0.d
35-
; CHECK-NEXT: ld1d { z2.d }, p2/z, [x13, #1, mul vl]
36-
; CHECK-NEXT: ld1d { z4.d }, p2/z, [x14, #1, mul vl]
37-
; CHECK-NEXT: add x8, x8, x11
38-
; CHECK-NEXT: ld1d { z3.d }, p1/z, [x13]
39-
; CHECK-NEXT: ld1d { z5.d }, p1/z, [x14]
31+
; CHECK-NEXT: zip1 p1.d, p1.d, p1.d
32+
; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl]
33+
; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl]
34+
; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0]
35+
; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1]
36+
; CHECK-NEXT: add x1, x1, x10
37+
; CHECK-NEXT: add x0, x0, x10
4038
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
4139
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
4240
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
4341
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
4442
; CHECK-NEXT: mov z0.d, p2/m, z7.d
4543
; CHECK-NEXT: mov z1.d, p1/m, z6.d
46-
; CHECK-NEXT: whilelo p1.d, x12, x9
47-
; CHECK-NEXT: add x12, x12, x10
44+
; CHECK-NEXT: whilelo p1.d, x11, x8
45+
; CHECK-NEXT: add x11, x11, x9
4846
; CHECK-NEXT: b.mi .LBB0_1
4947
; CHECK-NEXT: // %bb.2: // %exit.block
5048
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
@@ -114,39 +112,37 @@ define %"class.std::complex" @complex_mul_predicated_v2f64(ptr %a, ptr %b, ptr %
114112
; CHECK-LABEL: complex_mul_predicated_v2f64:
115113
; CHECK: // %bb.0: // %entry
116114
; CHECK-NEXT: mov z1.d, #0 // =0x0
117-
; CHECK-NEXT: cntd x10
118-
; CHECK-NEXT: mov w12, #100 // =0x64
119-
; CHECK-NEXT: neg x11, x10
115+
; CHECK-NEXT: cntd x9
116+
; CHECK-NEXT: mov w11, #100 // =0x64
117+
; CHECK-NEXT: neg x10, x9
120118
; CHECK-NEXT: ptrue p0.d
121119
; CHECK-NEXT: mov x8, xzr
122-
; CHECK-NEXT: mov x9, xzr
123-
; CHECK-NEXT: and x11, x11, x12
124-
; CHECK-NEXT: rdvl x12, #2
120+
; CHECK-NEXT: and x10, x10, x11
121+
; CHECK-NEXT: rdvl x11, #2
125122
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
126123
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
127124
; CHECK-NEXT: .LBB1_1: // %vector.body
128125
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
129-
; CHECK-NEXT: ld1w { z2.d }, p0/z, [x2, x9, lsl #2]
130-
; CHECK-NEXT: add x13, x0, x8
131-
; CHECK-NEXT: add x14, x1, x8
126+
; CHECK-NEXT: ld1w { z2.d }, p0/z, [x2, x8, lsl #2]
132127
; CHECK-NEXT: mov z6.d, z1.d
133128
; CHECK-NEXT: mov z7.d, z0.d
134-
; CHECK-NEXT: add x9, x9, x10
135-
; CHECK-NEXT: add x8, x8, x12
136-
; CHECK-NEXT: cmpne p2.d, p0/z, z2.d, #0
137-
; CHECK-NEXT: cmp x11, x9
138-
; CHECK-NEXT: zip2 p1.d, p2.d, p2.d
139-
; CHECK-NEXT: zip1 p2.d, p2.d, p2.d
140-
; CHECK-NEXT: ld1d { z2.d }, p1/z, [x13, #1, mul vl]
141-
; CHECK-NEXT: ld1d { z4.d }, p1/z, [x14, #1, mul vl]
142-
; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13]
143-
; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14]
129+
; CHECK-NEXT: add x8, x8, x9
130+
; CHECK-NEXT: cmpne p1.d, p0/z, z2.d, #0
131+
; CHECK-NEXT: cmp x10, x8
132+
; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
133+
; CHECK-NEXT: zip1 p1.d, p1.d, p1.d
134+
; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl]
135+
; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl]
136+
; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0]
137+
; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1]
138+
; CHECK-NEXT: add x1, x1, x11
139+
; CHECK-NEXT: add x0, x0, x11
144140
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
145141
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
146142
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
147143
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
148-
; CHECK-NEXT: mov z0.d, p1/m, z7.d
149-
; CHECK-NEXT: mov z1.d, p2/m, z6.d
144+
; CHECK-NEXT: mov z0.d, p2/m, z7.d
145+
; CHECK-NEXT: mov z1.d, p1/m, z6.d
150146
; CHECK-NEXT: b.ne .LBB1_1
151147
; CHECK-NEXT: // %bb.2: // %exit.block
152148
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d
@@ -218,38 +214,38 @@ define %"class.std::complex" @complex_mul_predicated_x2_v2f64(ptr %a, ptr %b, pt
218214
; CHECK-LABEL: complex_mul_predicated_x2_v2f64:
219215
; CHECK: // %bb.0: // %entry
220216
; CHECK-NEXT: mov z1.d, #0 // =0x0
221-
; CHECK-NEXT: mov w10, #100 // =0x64
217+
; CHECK-NEXT: mov w8, #100 // =0x64
218+
; CHECK-NEXT: cntd x9
219+
; CHECK-NEXT: whilelo p1.d, xzr, x8
220+
; CHECK-NEXT: rdvl x10, #2
221+
; CHECK-NEXT: cnth x11
222222
; CHECK-NEXT: ptrue p0.d
223-
; CHECK-NEXT: whilelo p1.d, xzr, x10
224-
; CHECK-NEXT: mov x8, xzr
225-
; CHECK-NEXT: mov x9, xzr
226-
; CHECK-NEXT: cntd x11
227-
; CHECK-NEXT: rdvl x12, #2
223+
; CHECK-NEXT: mov x12, x9
228224
; CHECK-NEXT: zip2 z0.d, z1.d, z1.d
229225
; CHECK-NEXT: zip1 z1.d, z1.d, z1.d
230226
; CHECK-NEXT: .LBB2_1: // %vector.body
231227
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
232-
; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2, x9, lsl #2]
233-
; CHECK-NEXT: add x13, x0, x8
234-
; CHECK-NEXT: add x14, x1, x8
228+
; CHECK-NEXT: ld1w { z2.d }, p1/z, [x2]
235229
; CHECK-NEXT: mov z6.d, z1.d
236230
; CHECK-NEXT: mov z7.d, z0.d
237-
; CHECK-NEXT: add x9, x9, x11
238-
; CHECK-NEXT: add x8, x8, x12
239-
; CHECK-NEXT: cmpne p2.d, p1/z, z2.d, #0
240-
; CHECK-NEXT: zip2 p1.d, p2.d, p2.d
241-
; CHECK-NEXT: zip1 p2.d, p2.d, p2.d
242-
; CHECK-NEXT: ld1d { z2.d }, p1/z, [x13, #1, mul vl]
243-
; CHECK-NEXT: ld1d { z4.d }, p1/z, [x14, #1, mul vl]
244-
; CHECK-NEXT: ld1d { z3.d }, p2/z, [x13]
245-
; CHECK-NEXT: ld1d { z5.d }, p2/z, [x14]
231+
; CHECK-NEXT: add x2, x2, x11
232+
; CHECK-NEXT: cmpne p1.d, p1/z, z2.d, #0
233+
; CHECK-NEXT: zip2 p2.d, p1.d, p1.d
234+
; CHECK-NEXT: zip1 p1.d, p1.d, p1.d
235+
; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #1, mul vl]
236+
; CHECK-NEXT: ld1d { z4.d }, p2/z, [x1, #1, mul vl]
237+
; CHECK-NEXT: ld1d { z3.d }, p1/z, [x0]
238+
; CHECK-NEXT: ld1d { z5.d }, p1/z, [x1]
239+
; CHECK-NEXT: add x1, x1, x10
240+
; CHECK-NEXT: add x0, x0, x10
246241
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #0
247242
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #0
248243
; CHECK-NEXT: fcmla z7.d, p0/m, z4.d, z2.d, #90
249244
; CHECK-NEXT: fcmla z6.d, p0/m, z5.d, z3.d, #90
250-
; CHECK-NEXT: mov z0.d, p1/m, z7.d
251-
; CHECK-NEXT: whilelo p1.d, x9, x10
252-
; CHECK-NEXT: mov z1.d, p2/m, z6.d
245+
; CHECK-NEXT: mov z0.d, p2/m, z7.d
246+
; CHECK-NEXT: mov z1.d, p1/m, z6.d
247+
; CHECK-NEXT: whilelo p1.d, x12, x8
248+
; CHECK-NEXT: add x12, x12, x9
253249
; CHECK-NEXT: b.mi .LBB2_1
254250
; CHECK-NEXT: // %bb.2: // %exit.block
255251
; CHECK-NEXT: uzp1 z2.d, z1.d, z0.d

0 commit comments

Comments
 (0)