Skip to content

Commit 629331a

Browse files
committed
[LoopVectorize] Further improve cost model for early exit loops
Following on from #125058, this patch takes into account the work done in the vector early exit block when assessing the profitability of vectorising the loop. I have renamed areRuntimeChecksProfitable to isOutsideLoopWorkProfitable, and we now pass in the early exit costs. As part of this, I have added the ExtractFirstActive opcode to VPInstruction::computeCost. It's worth pointing out that when we assess the profitability of the loop, we calculate a minimum trip count and compare it against the *maximum* trip count. However, since the loop has an early exit, the runtime trip count can still end up being less than that minimum. Alternatively, we may never take the early exit at all at runtime, in which case we have the opposite problem of over-estimating the cost of the loop. The loop vectoriser cannot simultaneously take two contradictory positions, so I feel the only sensible thing to do is to be conservative and assume the loop will be more expensive than loops without early exits.
1 parent 52db30e commit 629331a

File tree

4 files changed

+105
-12
lines changed

4 files changed

+105
-12
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10077,19 +10077,46 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
1007710077
}
1007810078
}
1007910079

10080-
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10081-
VectorizationFactor &VF, Loop *L,
10082-
PredicatedScalarEvolution &PSE,
10083-
ScalarEpilogueLowering SEL,
10084-
std::optional<unsigned> VScale) {
10080+
/// Sum the cost, at vectorization factor \p VF, of every recipe in the blocks
/// of \p Plan that branch into an exit block without being the middle block,
/// i.e. the work done in the vector early exit block.
/// \returns the accumulated cost; 0 if no such predecessor block exists.
static InstructionCost calculateEarlyExitCost(LoopVectorizationCostModel &CM,
10081+
VPlan &Plan, ElementCount VF) {
10082+
InstructionCost Cost = 0;
10083+
// Build the cost context from the cost model's own TTI, TLI, widest
// induction type and cost kind.
VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(), CM,
10084+
CM.CostKind);
10085+
LLVM_DEBUG(
10086+
dbgs() << "Calculating cost of work in vector early exit block:\n");
10087+
// Visit each exit block of the plan and look at the blocks feeding it.
for (auto *ExitVPBB : Plan.getExitBlocks()) {
10088+
for (auto *PredVPBB : ExitVPBB->getPredecessors())
10089+
// The middle block is excluded: only the other predecessors of an exit
// block contribute to the cost computed here.
if (PredVPBB != Plan.getMiddleBlock())
10090+
for (auto &R : *(cast<VPBasicBlock>(PredVPBB)))
10091+
Cost += R.cost(VF, CostCtx);
10092+
}
10093+
return Cost;
10094+
}
10095+
10096+
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
10097+
VectorizationFactor &VF, Loop *L,
10098+
const TargetTransformInfo &TTI,
10099+
PredicatedScalarEvolution &PSE,
10100+
ScalarEpilogueLowering SEL,
10101+
std::optional<unsigned> VScale,
10102+
InstructionCost EarlyExitCost) {
1008510103
InstructionCost CheckCost = Checks.getCost();
10086-
if (!CheckCost.isValid())
10104+
if (!CheckCost.isValid() && !EarlyExitCost.isValid())
1008710105
return false;
1008810106

10107+
InstructionCost TotalCost = 0;
10108+
if (CheckCost.isValid())
10109+
TotalCost += CheckCost;
10110+
10111+
// Add on the cost of work required in the vector early exit block, if one
10112+
// exists.
10113+
if (EarlyExitCost.isValid())
10114+
TotalCost += EarlyExitCost;
10115+
1008910116
// When interleaving only scalar and vector cost will be equal, which in turn
1009010117
// would lead to a divide by 0. Fall back to hard threshold.
1009110118
if (VF.Width.isScalar()) {
10092-
if (CheckCost > VectorizeMemoryCheckThreshold) {
10119+
if (TotalCost > VectorizeMemoryCheckThreshold) {
1009310120
LLVM_DEBUG(
1009410121
dbgs()
1009510122
<< "LV: Interleaving only is not profitable due to runtime checks\n");
@@ -10132,7 +10159,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
1013210159
// the computations are performed on doubles, not integers and the result
1013310160
// is rounded up, hence we get an upper estimate of the TC.
1013410161
unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
10135-
uint64_t RtC = *CheckCost.getValue();
10162+
uint64_t RtC = *TotalCost.getValue();
1013610163
uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
1013710164
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
1013810165

@@ -10468,8 +10495,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1046810495
// iteration count is low. However, setting the epilogue policy to
1046910496
// `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
1047010497
// with runtime checks. It's more effective to let
10471-
// `areRuntimeChecksProfitable` determine if vectorization is beneficial
10472-
// for the loop.
10498+
// `isOutsideLoopWorkProfitable` determine if vectorization is
10499+
// beneficial for the loop.
1047310500
if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
1047410501
SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
1047510502
} else {
@@ -10564,12 +10591,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1056410591
if (VF.Width.isVector() || SelectedIC > 1)
1056510592
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
1056610593

10594+
InstructionCost EarlyExitCost = InstructionCost::getInvalid();
10595+
if (VF.Width.isVector() && LVL.hasUncountableEarlyExit())
10596+
EarlyExitCost =
10597+
calculateEarlyExitCost(CM, LVP.getPlanFor(VF.Width), VF.Width);
10598+
1056710599
// Check if it is profitable to vectorize with runtime checks.
1056810600
bool ForceVectorization =
1056910601
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
1057010602
if (!ForceVectorization &&
10571-
!areRuntimeChecksProfitable(Checks, VF, L, PSE, SEL,
10572-
CM.getVScaleForTuning())) {
10603+
!isOutsideLoopWorkProfitable(Checks, VF, L, *TTI, PSE, SEL,
10604+
CM.getVScaleForTuning(), EarlyExitCost)) {
1057310605
ORE->emit([&]() {
1057410606
return OptimizationRemarkAnalysisAliasing(
1057510607
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -726,6 +726,19 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
726726
return Ctx.TTI.getArithmeticReductionCost(
727727
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
728728
}
729+
case VPInstruction::ExtractFirstActive: {
730+
// Calculate the cost of determining the lane index.
731+
auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(1)), VF);
732+
IntrinsicCostAttributes Attrs(
733+
Intrinsic::experimental_cttz_elts, Type::getInt64Ty(Ctx.LLVMCtx),
734+
{PoisonValue::get(PredTy), ConstantInt::getTrue(Ctx.LLVMCtx)});
735+
InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
736+
// Add on the cost of extracting the element.
737+
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
738+
Cost += Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
739+
Ctx.CostKind);
740+
return Cost;
741+
}
729742
default:
730743
// TODO: Fill out other opcodes!
731744
return 0;
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2+
; REQUIRES: asserts
3+
; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -disable-output \
4+
; RUN: -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK
5+
6+
target triple = "aarch64-unknown-linux-gnu"
7+
8+
declare void @init_mem(ptr, i64);
9+
10+
; Loop with two exits that both branch to %loop.end: an early exit taken from
; %loop when the bytes loaded from %p1 and %p2 differ, and the latch exit
; taken from %loop.inc once %index.next equals 67. Both %index and %index2
; are used after the loop through the phis in %loop.end.
define i64 @same_exit_block_pre_inc_use1() #1 {
11+
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1'
12+
; CHECK: LV: Selecting VF: vscale x 16
13+
; CHECK: Calculating cost of work in vector early exit block:
14+
; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active
15+
; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active
16+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:32
17+
entry:
18+
%p1 = alloca [1024 x i8]
19+
%p2 = alloca [1024 x i8]
20+
call void @init_mem(ptr %p1, i64 1024)
21+
call void @init_mem(ptr %p2, i64 1024)
22+
br label %loop
23+
24+
loop:
25+
; Primary induction variable: starts at 3 and steps by 1 (see %loop.inc).
%index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
26+
; Second induction variable: starts at 15 and steps by 2 (see %loop.inc).
%index2 = phi i64 [ %index2.next, %loop.inc ], [ 15, %entry ]
27+
%arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
28+
%ld1 = load i8, ptr %arrayidx, align 1
29+
%arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
30+
%ld2 = load i8, ptr %arrayidx1, align 1
31+
%cmp3 = icmp eq i8 %ld1, %ld2
32+
; Early exit: leave the loop as soon as the two loaded bytes differ.
br i1 %cmp3, label %loop.inc, label %loop.end
33+
34+
loop.inc:
35+
%index.next = add i64 %index, 1
36+
%index2.next = add i64 %index2, 2
37+
; Latch exit: keep looping while %index.next is not 67.
%exitcond = icmp ne i64 %index.next, 67
38+
br i1 %exitcond, label %loop, label %loop.end
39+
40+
loop.end:
41+
; On the early exit the phis take the current %index / %index2 values; on the
; latch exit they take the constants 67 and 98.
%val1 = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
42+
%val2 = phi i64 [ %index2, %loop ], [ 98, %loop.inc ]
43+
%retval = add i64 %val1, %val2
44+
ret i64 %retval
45+
}
46+
47+
attributes #1 = { "target-features"="+sve" vscale_range(1,16) }

llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@ define i64 @loop_contains_safe_div() #1 {
274274
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
275275
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
276276
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
277+
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.umax.i64(i64 12, i64 [[TMP12]])
277278
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
278279
; CHECK: vector.ph:
279280
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()

0 commit comments

Comments
 (0)