Skip to content

Commit a152314

Browse files
committed
[LoopVectorize] Refine runtime memory check costs when there is an outer loop
When we generate runtime memory checks for an inner loop, it is possible that these checks are invariant in the outer loop and so will get hoisted out. In such cases, the effective cost of the checks should be reduced to reflect the outer loop trip count. This fixes a 25% performance regression introduced by commit 49b0e6d when building the SPEC2017 x264 benchmark with PGO, where we decided the inner loop trip count wasn't high enough to warrant the (incorrect) high cost of the runtime checks. Also, when runtime memory checks consist entirely of diff checks, these are likely to be outer loop invariant.
1 parent a4caa47 commit a152314

File tree

2 files changed

+39
-10
lines changed

2 files changed

+39
-10
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2070,7 +2070,7 @@ class GeneratedRTChecks {
20702070
}
20712071
}
20722072

2073-
InstructionCost getCost() {
2073+
InstructionCost getCost(Loop *OuterLoop) {
20742074
if (SCEVCheckBlock || MemCheckBlock)
20752075
LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
20762076

@@ -2091,16 +2091,45 @@ class GeneratedRTChecks {
20912091
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
20922092
RTCheckCost += C;
20932093
}
2094-
if (MemCheckBlock)
2094+
if (MemCheckBlock) {
2095+
InstructionCost MemCheckCost = 0;
20952096
for (Instruction &I : *MemCheckBlock) {
20962097
if (MemCheckBlock->getTerminator() == &I)
20972098
continue;
20982099
InstructionCost C =
20992100
TTI->getInstructionCost(&I, TTI::TCK_RecipThroughput);
21002101
LLVM_DEBUG(dbgs() << " " << C << " for " << I << "\n");
2101-
RTCheckCost += C;
2102+
MemCheckCost += C;
2103+
}
2104+
2105+
// If the runtime memory checks are being created inside an outer loop
2106+
// we should find out if these checks are outer loop invariant. If so,
2107+
// the checks will be hoisted out and so the effective cost will reduce
2108+
// according to the outer loop trip count.
2109+
if (OuterLoop) {
2110+
ScalarEvolution *SE = MemCheckExp.getSE();
2111+
const SCEV *Cond = SE->getSCEV(MemRuntimeCheckCond);
2112+
if (SE->isLoopInvariant(Cond, OuterLoop)) {
2113+
if (std::optional<unsigned> OuterTC =
2114+
getSmallBestKnownTC(*SE, OuterLoop))
2115+
MemCheckCost /= *OuterTC;
2116+
else {
2117+
// It seems reasonable to assume that we can reduce the effective
2118+
// cost of the checks even when we know nothing about the trip
2119+
// count. Here I've assumed that the outer loop executes at least
2120+
// twice.
2121+
MemCheckCost /= 2;
2122+
}
2123+
2124+
// Let's ensure the cost is always at least 1.
2125+
if (MemCheckCost == 0)
2126+
MemCheckCost = 1;
2127+
}
21022128
}
21032129

2130+
RTCheckCost += MemCheckCost;
2131+
}
2132+
21042133
if (SCEVCheckBlock || MemCheckBlock)
21052134
LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
21062135
<< "\n");
@@ -9754,7 +9783,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
97549783
std::optional<unsigned> VScale, Loop *L,
97559784
ScalarEvolution &SE,
97569785
ScalarEpilogueLowering SEL) {
9757-
InstructionCost CheckCost = Checks.getCost();
9786+
InstructionCost CheckCost = Checks.getCost(L->getParentLoop());
97589787
if (!CheckCost.isValid())
97599788
return false;
97609789

llvm/test/Transforms/LoopVectorize/AArch64/low_trip_memcheck_cost.ll

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ target triple = "aarch64-unknown-linux-gnu"
77
define void @outer_no_tc(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
88
; CHECK-LABEL: LV: Checking a loop in 'outer_no_tc'
99
; CHECK: Calculating cost of runtime checks:
10-
; CHECK: Total cost of runtime checks: 6
10+
; CHECK: Total cost of runtime checks: 3
1111
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
1212
entry:
1313
br label %outer.loop
@@ -43,7 +43,7 @@ outer.exit:
4343
define void @outer_known_tc3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
4444
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3'
4545
; CHECK: Calculating cost of runtime checks:
46-
; CHECK: Total cost of runtime checks: 6
46+
; CHECK: Total cost of runtime checks: 2
4747
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
4848
entry:
4949
br label %outer.loop
@@ -79,7 +79,7 @@ outer.exit:
7979
define void @outer_known_tc64(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %n) {
8080
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc64'
8181
; CHECK: Calculating cost of runtime checks:
82-
; CHECK: Total cost of runtime checks: 6
82+
; CHECK: Total cost of runtime checks: 1
8383
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
8484
entry:
8585
br label %outer.loop
@@ -115,7 +115,7 @@ outer.exit:
115115
define void @outer_pgo_3(ptr nocapture noundef %a, ptr nocapture noundef readonly %b, i64 noundef %m, i64 noundef %n) {
116116
; CHECK-LABEL: LV: Checking a loop in 'outer_pgo_3'
117117
; CHECK: Calculating cost of runtime checks:
118-
; CHECK: Total cost of runtime checks: 6
118+
; CHECK: Total cost of runtime checks: 2
119119
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:16
120120
entry:
121121
br label %outer.loop
@@ -151,8 +151,8 @@ outer.exit:
151151
define void @outer_known_tc3_full_range_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %n) {
152152
; CHECK-LABEL: LV: Checking a loop in 'outer_known_tc3_full_range_checks'
153153
; CHECK: Calculating cost of runtime checks:
154-
; CHECK: Total cost of runtime checks: 6
155-
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:8
154+
; CHECK: Total cost of runtime checks: 2
155+
; CHECK-NEXT: LV: Minimum required TC for runtime checks to be profitable:4
156156
entry:
157157
br label %outer.loop
158158

0 commit comments

Comments
 (0)