Skip to content

Commit 942ba8a

Browse files
committed
[LoopVectorize] Further improve cost model for early exit loops
Following on from llvm#125058, this patch takes into account the work done in the vector early exit block when assessing the profitability of vectorising the loop. I have renamed areRuntimeChecksProfitable to isOutsideLoopWorkProfitable and we now pass in the early exit costs. As part of this, I have added the ExtractFirstActive opcode to VPInstruction::computeCost. It's worth pointing out that when we assess profitability of the loop we calculate a minimum trip count and compare that against the *maximum* trip count. However, since the loop has an early exit the runtime trip count can still end up being less than the minimum. Alternatively, we may never take the early exit at all at runtime and so we have the opposite problem of over-estimating the cost of the loop. The loop vectoriser cannot simultaneously take two contradictory positions and so I feel the only sensible thing to do is be conservative and assume the loop will be more expensive than loops without early exits. We may find in future that we need to adjust the cost according to the probability of taking the early exit. This will become even more important once we support multiple early exits. However, we have to start somewhere and we can always revisit this later.
1 parent 85cf958 commit 942ba8a

File tree

4 files changed

+105
-12
lines changed

4 files changed

+105
-12
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 44 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -10165,19 +10165,46 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
1016510165
}
1016610166
}
1016710167

10168-
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10169-
VectorizationFactor &VF, Loop *L,
10170-
PredicatedScalarEvolution &PSE,
10171-
ScalarEpilogueLowering SEL,
10172-
std::optional<unsigned> VScale) {
10168+
/// Returns the accumulated cost, at vectorization factor \p VF, of every
/// recipe in each block of \p Plan that is a predecessor of one of the plan's
/// exit blocks and is not the plan's middle block.
///
/// \param CM   Cost model supplying the TTI, TLI, legality info and cost kind
///             used to build the VPCostContext for the recipe cost queries.
/// \param Plan The VPlan whose exit blocks are inspected.
/// \param VF   The vectorization factor passed to each recipe's cost().
/// \returns The sum of the recipe costs; stays 0 if no exit block has a
///          predecessor other than the middle block.
static InstructionCost calculateEarlyExitCost(LoopVectorizationCostModel &CM,
10169+
VPlan &Plan, ElementCount VF) {
10170+
InstructionCost Cost = 0;
10171+
VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(), CM,
10172+
CM.CostKind);
10173+
LLVM_DEBUG(
10174+
dbgs() << "Calculating cost of work in vector early exit block:\n");
10175+
for (auto *ExitVPBB : Plan.getExitBlocks()) {
10176+
for (auto *PredVPBB : ExitVPBB->getPredecessors())
10177+
// Skip the middle block: only the other predecessors of an exit block are
// costed here. NOTE(review): per the debug message above these are
// presumably the vector early exit block(s) -- confirm. The cast assumes
// every such predecessor is a VPBasicBlock.
if (PredVPBB != Plan.getMiddleBlock())
10178+
for (auto &R : *(cast<VPBasicBlock>(PredVPBB)))
10179+
Cost += R.cost(VF, CostCtx);
10180+
}
10181+
return Cost;
10182+
}
10183+
10184+
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
10185+
VectorizationFactor &VF, Loop *L,
10186+
const TargetTransformInfo &TTI,
10187+
PredicatedScalarEvolution &PSE,
10188+
ScalarEpilogueLowering SEL,
10189+
std::optional<unsigned> VScale,
10190+
InstructionCost EarlyExitCost) {
1017310191
InstructionCost CheckCost = Checks.getCost();
10174-
if (!CheckCost.isValid())
10192+
if (!CheckCost.isValid() && !EarlyExitCost.isValid())
1017510193
return false;
1017610194

10195+
InstructionCost TotalCost = 0;
10196+
if (CheckCost.isValid())
10197+
TotalCost += CheckCost;
10198+
10199+
// Add on the cost of work required in the vector early exit block, if one
10200+
// exists.
10201+
if (EarlyExitCost.isValid())
10202+
TotalCost += EarlyExitCost;
10203+
1017710204
// When interleaving only scalar and vector cost will be equal, which in turn
1017810205
// would lead to a divide by 0. Fall back to hard threshold.
1017910206
if (VF.Width.isScalar()) {
10180-
if (CheckCost > VectorizeMemoryCheckThreshold) {
10207+
if (TotalCost > VectorizeMemoryCheckThreshold) {
1018110208
LLVM_DEBUG(
1018210209
dbgs()
1018310210
<< "LV: Interleaving only is not profitable due to runtime checks\n");
@@ -10220,7 +10247,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
1022010247
// the computations are performed on doubles, not integers and the result
1022110248
// is rounded up, hence we get an upper estimate of the TC.
1022210249
unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
10223-
uint64_t RtC = *CheckCost.getValue();
10250+
uint64_t RtC = *TotalCost.getValue();
1022410251
uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
1022510252
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
1022610253

@@ -10548,8 +10575,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1054810575
// iteration count is low. However, setting the epilogue policy to
1054910576
// `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
1055010577
// with runtime checks. It's more effective to let
10551-
// `areRuntimeChecksProfitable` determine if vectorization is beneficial
10552-
// for the loop.
10578+
// `isOutsideLoopWorkProfitable` determine if vectorization is
10579+
// beneficial for the loop.
1055310580
if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
1055410581
SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
1055510582
} else {
@@ -10644,12 +10671,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1064410671
if (VF.Width.isVector() || SelectedIC > 1)
1064510672
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
1064610673

10674+
InstructionCost EarlyExitCost = InstructionCost::getInvalid();
10675+
if (VF.Width.isVector() && LVL.hasUncountableEarlyExit())
10676+
EarlyExitCost =
10677+
calculateEarlyExitCost(CM, LVP.getPlanFor(VF.Width), VF.Width);
10678+
1064710679
// Check if it is profitable to vectorize with runtime checks.
1064810680
bool ForceVectorization =
1064910681
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
1065010682
if (!ForceVectorization &&
10651-
!areRuntimeChecksProfitable(Checks, VF, L, PSE, SEL,
10652-
CM.getVScaleForTuning())) {
10683+
!isOutsideLoopWorkProfitable(Checks, VF, L, *TTI, PSE, SEL,
10684+
CM.getVScaleForTuning(), EarlyExitCost)) {
1065310685
ORE->emit([&]() {
1065410686
return OptimizationRemarkAnalysisAliasing(
1065510687
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -739,6 +739,19 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
739739
return Ctx.TTI.getArithmeticReductionCost(
740740
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
741741
}
742+
case VPInstruction::ExtractFirstActive: {
743+
// Calculate the cost of determining the lane index.
744+
auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(1)), VF);
745+
IntrinsicCostAttributes Attrs(
746+
Intrinsic::experimental_cttz_elts, Type::getInt64Ty(Ctx.LLVMCtx),
747+
{PoisonValue::get(PredTy), ConstantInt::getTrue(Ctx.LLVMCtx)});
748+
InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
749+
// Add on the cost of extracting the element.
750+
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
751+
Cost += Ctx.TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy,
752+
Ctx.CostKind);
753+
return Cost;
754+
}
742755
default:
743756
// TODO: Compute cost other VPInstructions once the legacy cost model has
744757
// been retired.
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
2+
; REQUIRES: asserts
3+
; RUN: opt -S < %s -p loop-vectorize -enable-early-exit-vectorization -disable-output \
4+
; RUN: -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefixes=CHECK
5+
6+
target triple = "aarch64-unknown-linux-gnu"
7+
8+
declare void @init_mem(ptr, i64);
9+
10+
define i64 @same_exit_block_pre_inc_use1() #1 {
11+
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1'
12+
; CHECK: LV: Selecting VF: vscale x 16
13+
; CHECK: Calculating cost of work in vector early exit block:
14+
; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active
15+
; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active
16+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:32
17+
entry:
18+
%p1 = alloca [1024 x i8]
19+
%p2 = alloca [1024 x i8]
20+
call void @init_mem(ptr %p1, i64 1024)
21+
call void @init_mem(ptr %p2, i64 1024)
22+
br label %loop
23+
24+
loop:
25+
%index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
26+
%index2 = phi i64 [ %index2.next, %loop.inc ], [ 15, %entry ]
27+
%arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
28+
%ld1 = load i8, ptr %arrayidx, align 1
29+
%arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
30+
%ld2 = load i8, ptr %arrayidx1, align 1
31+
%cmp3 = icmp eq i8 %ld1, %ld2
32+
br i1 %cmp3, label %loop.inc, label %loop.end
33+
34+
loop.inc:
35+
%index.next = add i64 %index, 1
36+
%index2.next = add i64 %index2, 2
37+
%exitcond = icmp ne i64 %index.next, 67
38+
br i1 %exitcond, label %loop, label %loop.end
39+
40+
loop.end:
41+
%val1 = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
42+
%val2 = phi i64 [ %index2, %loop ], [ 98, %loop.inc ]
43+
%retval = add i64 %val1, %val2
44+
ret i64 %retval
45+
}
46+
47+
attributes #1 = { "target-features"="+sve" vscale_range(1,16) }

llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,7 @@ define i64 @loop_contains_safe_div() #1 {
274274
; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
275275
; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
276276
; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 4
277+
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.umax.i64(i64 12, i64 [[TMP12]])
277278
; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
278279
; CHECK: vector.ph:
279280
; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()

0 commit comments

Comments
 (0)