Skip to content

Commit e947f95

Browse files
committed
[LSR][TTI][RISCV] Enable terminator folding for RISC-V
If looking for a miscompile revert candidate, look here! The transform being enabled prefers comparing to a loop invariant exit value for a secondary IV over using an otherwise dead primary IV. This increases register pressure (by requiring the exit value to be live through the loop), but reduces the number of instructions within the loop by one. On RISC-V, which has a large number of scalar registers, this is generally a profitable transform. We lose the ability to use a beqz on what is typically a count down IV, and pay the cost of computing the exit value on the secondary IV in the loop preheader, but save an add or sub in the loop body. For anything except an extremely short running loop, or one with extreme register pressure, this is profitable. On spec2017, we see a 0.42% geomean improvement in dynamic icount, with no individual workload regressing by more than 0.25%. Code size wise, we trade a (possibly compressible) beqz and a (possibly compressible) addi for an uncompressible beq. We also add instructions in the preheader. Net result is a slight regression overall, but neutral or better inside the loop. Previous versions of this transform had numerous corner-case correctness bugs. All of the ones I can spot by inspection have been fixed, and I have run this through all of spec2017, but there may be further issues lurking. Adding uses to an IV is a fraught thing to do given poison semantics, so this transform is somewhat inherently risky. This patch is a reworked version of D134893 by @eop. That patch has been abandoned since May, so I picked it up, reworked it a bit, and am landing it.
1 parent 002c54a commit e947f95

14 files changed

+759
-734
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -718,6 +718,11 @@ class TargetTransformInfo {
718718
/// cost should return false, otherwise return true.
719719
bool isNumRegsMajorCostOfLSR() const;
720720

721+
/// Return true if LSR should attempt to replace a use of an otherwise dead
722+
/// primary IV in the latch condition with another IV available in the loop.
723+
/// When successful, makes the primary IV dead.
724+
bool shouldFoldTerminatingConditionAfterLSR() const;
725+
721726
/// \returns true if LSR should not optimize a chain that includes \p I.
722727
bool isProfitableLSRChainElement(Instruction *I) const;
723728

@@ -1786,6 +1791,7 @@ class TargetTransformInfo::Concept {
17861791
virtual bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
17871792
const TargetTransformInfo::LSRCost &C2) = 0;
17881793
virtual bool isNumRegsMajorCostOfLSR() = 0;
1794+
virtual bool shouldFoldTerminatingConditionAfterLSR() const = 0;
17891795
virtual bool isProfitableLSRChainElement(Instruction *I) = 0;
17901796
virtual bool canMacroFuseCmp() = 0;
17911797
virtual bool canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
@@ -2239,6 +2245,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
22392245
bool isNumRegsMajorCostOfLSR() override {
22402246
return Impl.isNumRegsMajorCostOfLSR();
22412247
}
2248+
bool shouldFoldTerminatingConditionAfterLSR() const override {
2249+
return Impl.shouldFoldTerminatingConditionAfterLSR();
2250+
}
22422251
bool isProfitableLSRChainElement(Instruction *I) override {
22432252
return Impl.isProfitableLSRChainElement(I);
22442253
}

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -235,6 +235,8 @@ class TargetTransformInfoImplBase {
235235

236236
bool isNumRegsMajorCostOfLSR() const { return true; }
237237

238+
bool shouldFoldTerminatingConditionAfterLSR() const { return false; }
239+
238240
bool isProfitableLSRChainElement(Instruction *I) const { return false; }
239241

240242
bool canMacroFuseCmp() const { return false; }

llvm/include/llvm/CodeGen/BasicTTIImpl.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -382,6 +382,11 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase<T> {
382382
return TargetTransformInfoImplBase::isNumRegsMajorCostOfLSR();
383383
}
384384

385+
bool shouldFoldTerminatingConditionAfterLSR() const {
386+
return TargetTransformInfoImplBase::
387+
shouldFoldTerminatingConditionAfterLSR();
388+
}
389+
385390
bool isProfitableLSRChainElement(Instruction *I) {
386391
return TargetTransformInfoImplBase::isProfitableLSRChainElement(I);
387392
}

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -413,6 +413,10 @@ bool TargetTransformInfo::isNumRegsMajorCostOfLSR() const {
413413
return TTIImpl->isNumRegsMajorCostOfLSR();
414414
}
415415

416+
bool TargetTransformInfo::shouldFoldTerminatingConditionAfterLSR() const {
417+
return TTIImpl->shouldFoldTerminatingConditionAfterLSR();
418+
}
419+
416420
bool TargetTransformInfo::isProfitableLSRChainElement(Instruction *I) const {
417421
return TTIImpl->isProfitableLSRChainElement(I);
418422
}

llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -357,6 +357,10 @@ class RISCVTTIImpl : public BasicTTIImplBase<RISCVTTIImpl> {
357357

358358
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
359359
const TargetTransformInfo::LSRCost &C2);
360+
361+
bool shouldFoldTerminatingConditionAfterLSR() const {
362+
return true;
363+
}
360364
};
361365

362366
} // end namespace llvm

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -188,8 +188,8 @@ static cl::opt<unsigned> SetupCostDepthLimit(
188188
"lsr-setupcost-depth-limit", cl::Hidden, cl::init(7),
189189
cl::desc("The limit on recursion depth for LSRs setup cost"));
190190

191-
static cl::opt<bool> AllowTerminatingConditionFoldingAfterLSR(
192-
"lsr-term-fold", cl::Hidden, cl::init(false),
191+
static cl::opt<cl::boolOrDefault> AllowTerminatingConditionFoldingAfterLSR(
192+
"lsr-term-fold", cl::Hidden,
193193
cl::desc("Attempt to replace primary IV with other IV."));
194194

195195
static cl::opt<bool> AllowDropSolutionIfLessProfitable(
@@ -6938,7 +6938,18 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
69386938
}
69396939
}
69406940

6941-
if (AllowTerminatingConditionFoldingAfterLSR) {
6941+
const bool EnableFormTerm = [&] {
6942+
switch (AllowTerminatingConditionFoldingAfterLSR) {
6943+
case cl::BOU_TRUE:
6944+
return true;
6945+
case cl::BOU_FALSE:
6946+
return false;
6947+
case cl::BOU_UNSET:
6948+
return TTI.shouldFoldTerminatingConditionAfterLSR();
6949+
}
6950+
}();
6951+
6952+
if (EnableFormTerm) {
69426953
if (auto Opt = canFoldTermCondOfLoop(L, SE, DT, LI)) {
69436954
auto [ToFold, ToHelpFold, TermValueS, MustDrop] = *Opt;
69446955

llvm/test/CodeGen/RISCV/branch-on-zero.ll

Lines changed: 25 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -120,36 +120,45 @@ define i32 @test_lshr2(ptr nocapture %x, ptr nocapture readonly %y, i32 %n) {
120120
; RV32-LABEL: test_lshr2:
121121
; RV32: # %bb.0: # %entry
122122
; RV32-NEXT: srli a2, a2, 2
123-
; RV32-NEXT: beqz a2, .LBB3_2
124-
; RV32-NEXT: .LBB3_1: # %while.body
123+
; RV32-NEXT: beqz a2, .LBB3_3
124+
; RV32-NEXT: # %bb.1: # %while.body.preheader
125+
; RV32-NEXT: slli a2, a2, 2
126+
; RV32-NEXT: add a2, a1, a2
127+
; RV32-NEXT: .LBB3_2: # %while.body
125128
; RV32-NEXT: # =>This Inner Loop Header: Depth=1
126129
; RV32-NEXT: lw a3, 0(a1)
127-
; RV32-NEXT: addi a1, a1, 4
130+
; RV32-NEXT: addi a4, a1, 4
128131
; RV32-NEXT: slli a3, a3, 1
129-
; RV32-NEXT: addi a4, a0, 4
130-
; RV32-NEXT: addi a2, a2, -1
132+
; RV32-NEXT: addi a1, a0, 4
131133
; RV32-NEXT: sw a3, 0(a0)
132-
; RV32-NEXT: mv a0, a4
133-
; RV32-NEXT: bnez a2, .LBB3_1
134-
; RV32-NEXT: .LBB3_2: # %while.end
134+
; RV32-NEXT: mv a0, a1
135+
; RV32-NEXT: mv a1, a4
136+
; RV32-NEXT: bne a4, a2, .LBB3_2
137+
; RV32-NEXT: .LBB3_3: # %while.end
135138
; RV32-NEXT: li a0, 0
136139
; RV32-NEXT: ret
137140
;
138141
; RV64-LABEL: test_lshr2:
139142
; RV64: # %bb.0: # %entry
140143
; RV64-NEXT: srliw a2, a2, 2
141-
; RV64-NEXT: beqz a2, .LBB3_2
142-
; RV64-NEXT: .LBB3_1: # %while.body
144+
; RV64-NEXT: beqz a2, .LBB3_3
145+
; RV64-NEXT: # %bb.1: # %while.body.preheader
146+
; RV64-NEXT: addi a2, a2, -1
147+
; RV64-NEXT: slli a2, a2, 32
148+
; RV64-NEXT: srli a2, a2, 30
149+
; RV64-NEXT: add a2, a2, a1
150+
; RV64-NEXT: addi a2, a2, 4
151+
; RV64-NEXT: .LBB3_2: # %while.body
143152
; RV64-NEXT: # =>This Inner Loop Header: Depth=1
144153
; RV64-NEXT: lw a3, 0(a1)
145-
; RV64-NEXT: addi a1, a1, 4
154+
; RV64-NEXT: addi a4, a1, 4
146155
; RV64-NEXT: slli a3, a3, 1
147-
; RV64-NEXT: addi a4, a0, 4
148-
; RV64-NEXT: addiw a2, a2, -1
156+
; RV64-NEXT: addi a1, a0, 4
149157
; RV64-NEXT: sw a3, 0(a0)
150-
; RV64-NEXT: mv a0, a4
151-
; RV64-NEXT: bnez a2, .LBB3_1
152-
; RV64-NEXT: .LBB3_2: # %while.end
158+
; RV64-NEXT: mv a0, a1
159+
; RV64-NEXT: mv a1, a4
160+
; RV64-NEXT: bne a4, a2, .LBB3_2
161+
; RV64-NEXT: .LBB3_3: # %while.end
153162
; RV64-NEXT: li a0, 0
154163
; RV64-NEXT: ret
155164
entry:

llvm/test/CodeGen/RISCV/riscv-codegenprepare-asm.ll

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,19 @@
88
define void @test1(ptr nocapture noundef %a, i32 noundef signext %n) {
99
; CHECK-LABEL: test1:
1010
; CHECK: # %bb.0: # %entry
11-
; CHECK-NEXT: blez a1, .LBB0_2
12-
; CHECK-NEXT: .LBB0_1: # %for.body
11+
; CHECK-NEXT: blez a1, .LBB0_3
12+
; CHECK-NEXT: # %bb.1: # %for.body.preheader
13+
; CHECK-NEXT: slli a1, a1, 32
14+
; CHECK-NEXT: srli a1, a1, 30
15+
; CHECK-NEXT: add a1, a0, a1
16+
; CHECK-NEXT: .LBB0_2: # %for.body
1317
; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
1418
; CHECK-NEXT: lw a2, 0(a0)
1519
; CHECK-NEXT: addi a2, a2, 4
1620
; CHECK-NEXT: sw a2, 0(a0)
17-
; CHECK-NEXT: addi a1, a1, -1
1821
; CHECK-NEXT: addi a0, a0, 4
19-
; CHECK-NEXT: bnez a1, .LBB0_1
20-
; CHECK-NEXT: .LBB0_2: # %for.cond.cleanup
22+
; CHECK-NEXT: bne a0, a1, .LBB0_2
23+
; CHECK-NEXT: .LBB0_3: # %for.cond.cleanup
2124
; CHECK-NEXT: ret
2225
entry:
2326
%cmp3 = icmp sgt i32 %n, 0

llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll

Lines changed: 16 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -206,33 +206,19 @@ define <8 x float> @splat_idx_v8f32(<8 x float> %v, i64 %idx) {
206206

207207
; Test that we pull the vlse of the constant pool out of the loop.
208208
define dso_local void @splat_load_licm(float* %0) {
209-
; RV32-LABEL: splat_load_licm:
210-
; RV32: # %bb.0:
211-
; RV32-NEXT: li a1, 1024
212-
; RV32-NEXT: lui a2, 263168
213-
; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
214-
; RV32-NEXT: vmv.v.x v8, a2
215-
; RV32-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
216-
; RV32-NEXT: vse32.v v8, (a0)
217-
; RV32-NEXT: addi a1, a1, -4
218-
; RV32-NEXT: addi a0, a0, 16
219-
; RV32-NEXT: bnez a1, .LBB12_1
220-
; RV32-NEXT: # %bb.2:
221-
; RV32-NEXT: ret
222-
;
223-
; RV64-LABEL: splat_load_licm:
224-
; RV64: # %bb.0:
225-
; RV64-NEXT: li a1, 1024
226-
; RV64-NEXT: lui a2, 263168
227-
; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma
228-
; RV64-NEXT: vmv.v.x v8, a2
229-
; RV64-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
230-
; RV64-NEXT: vse32.v v8, (a0)
231-
; RV64-NEXT: addiw a1, a1, -4
232-
; RV64-NEXT: addi a0, a0, 16
233-
; RV64-NEXT: bnez a1, .LBB12_1
234-
; RV64-NEXT: # %bb.2:
235-
; RV64-NEXT: ret
209+
; CHECK-LABEL: splat_load_licm:
210+
; CHECK: # %bb.0:
211+
; CHECK-NEXT: lui a1, 1
212+
; CHECK-NEXT: add a1, a0, a1
213+
; CHECK-NEXT: lui a2, 263168
214+
; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma
215+
; CHECK-NEXT: vmv.v.x v8, a2
216+
; CHECK-NEXT: .LBB12_1: # =>This Inner Loop Header: Depth=1
217+
; CHECK-NEXT: vse32.v v8, (a0)
218+
; CHECK-NEXT: addi a0, a0, 16
219+
; CHECK-NEXT: bne a0, a1, .LBB12_1
220+
; CHECK-NEXT: # %bb.2:
221+
; CHECK-NEXT: ret
236222
br label %2
237223

238224
2: ; preds = %2, %1
@@ -1408,3 +1394,6 @@ define <2 x double> @vid_step2_v2f64() {
14081394
; CHECK-NEXT: ret
14091395
ret <2 x double> <double 0.0, double 2.0>
14101396
}
1397+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
1398+
; RV32: {{.*}}
1399+
; RV64: {{.*}}

0 commit comments

Comments
 (0)