Skip to content

Commit 4ca3e8e

Browse files
committed
Address review comments
1 parent 942ba8a commit 4ca3e8e

File tree

3 files changed

+78
-29
lines changed

3 files changed

+78
-29
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 34 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -10165,6 +10165,12 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
1016510165
}
1016610166
}
1016710167

10168+
/// For loops with uncountable early exits, find the cost of doing work when
10169+
/// exiting the loop early, such as calculating the final exit values of
10170+
/// variables used outside the loop.
10171+
/// TODO: This is currently overly pessimistic because the loop may not take
10172+
/// the early exit, but better to keep this conservative for now. In future,
10173+
/// it might be possible to relax this by using branch probabilities.
1016810174
static InstructionCost calculateEarlyExitCost(LoopVectorizationCostModel &CM,
1016910175
VPlan &Plan, ElementCount VF) {
1017010176
InstructionCost Cost = 0;
@@ -10173,37 +10179,44 @@ static InstructionCost calculateEarlyExitCost(LoopVectorizationCostModel &CM,
1017310179
LLVM_DEBUG(
1017410180
dbgs() << "Calculating cost of work in vector early exit block:\n");
1017510181
for (auto *ExitVPBB : Plan.getExitBlocks()) {
10176-
for (auto *PredVPBB : ExitVPBB->getPredecessors())
10182+
for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
10183+
// If the predecessor is not the middle.block, then it must be the
10184+
// vector.early.exit block, which may contain work to calculate the exit
10185+
// values of variables used outside the loop.
1017710186
if (PredVPBB != Plan.getMiddleBlock())
1017810187
for (auto &R : *(cast<VPBasicBlock>(PredVPBB)))
1017910188
Cost += R.cost(VF, CostCtx);
10189+
}
1018010190
}
1018110191
return Cost;
1018210192
}
1018310193

10194+
/// This function determines whether or not it's still profitable to vectorize
10195+
/// the loop given the extra work we have to do outside of the loop:
10196+
/// 1. Perform the runtime checks before entering the loop to ensure it's safe
10197+
/// to vectorize.
10198+
/// 2. In the case of loops with uncountable early exits, we may have to do
10199+
/// extra work when exiting the loop early, such as calculating the final
10200+
/// exit values of variables used outside the loop.
1018410201
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
10185-
VectorizationFactor &VF, Loop *L,
10186-
const TargetTransformInfo &TTI,
10202+
VectorizationFactor &VF,
10203+
LoopVectorizationCostModel &CM,
1018710204
PredicatedScalarEvolution &PSE,
10188-
ScalarEpilogueLowering SEL,
10189-
std::optional<unsigned> VScale,
10190-
InstructionCost EarlyExitCost) {
10191-
InstructionCost CheckCost = Checks.getCost();
10192-
if (!CheckCost.isValid() && !EarlyExitCost.isValid())
10205+
VPlan &Plan,
10206+
ScalarEpilogueLowering SEL) {
10207+
InstructionCost TotalCost = Checks.getCost();
10208+
if (!TotalCost.isValid())
1019310209
return false;
1019410210

10195-
InstructionCost TotalCost = 0;
10196-
if (CheckCost.isValid())
10197-
TotalCost += CheckCost;
10198-
1019910211
// Add on the cost of work required in the vector early exit block, if one
1020010212
// exists.
10201-
if (EarlyExitCost.isValid())
10202-
TotalCost += EarlyExitCost;
10213+
if (CM.Legal->hasUncountableEarlyExit())
10214+
TotalCost += calculateEarlyExitCost(CM, Plan, VF.Width);
1020310215

1020410216
// When only interleaving (i.e. the VF is scalar), the scalar and vector costs
1020510217
// are equal, which would lead to a division by zero. Fall back to a hard threshold.
1020610218
if (VF.Width.isScalar()) {
10219+
// TODO: Should we rename VectorizeMemoryCheckThreshold?
1020710220
if (TotalCost > VectorizeMemoryCheckThreshold) {
1020810221
LLVM_DEBUG(
1020910222
dbgs()
@@ -10229,7 +10242,9 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
1022910242
// The total cost of the vector loop is
1023010243
// RtC + VecC * (TC / VF) + EpiC
1023110244
// where
10232-
// * RtC is the cost of the generated runtime checks
10245+
// * RtC is the cost of the generated runtime checks plus the cost of
10246+
// performing any additional work in the vector.early.exit block for loops
10247+
// with uncountable early exits.
1023310248
// * VecC is the cost of a single vector iteration.
1023410249
// * TC is the actual trip count of the loop
1023510250
// * VF is the vectorization factor
@@ -10246,7 +10261,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
1024610261
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
1024710262
// the computations below use integer arithmetic, not floating point, and the
1024810263
// division is rounded up (divideCeil), hence we get an upper estimate of the TC.
10249-
unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
10264+
unsigned IntVF = getEstimatedRuntimeVF(VF.Width, CM.getVScaleForTuning());
1025010265
uint64_t RtC = *TotalCost.getValue();
1025110266
uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
1025210267
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
@@ -10274,7 +10289,7 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
1027410289

1027510290
// Skip vectorization if the expected trip count is less than the minimum
1027610291
// required trip count.
10277-
if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
10292+
if (auto ExpectedTC = getSmallBestKnownTC(PSE, CM.TheLoop)) {
1027810293
if (ElementCount::isKnownLT(ElementCount::getFixed(*ExpectedTC),
1027910294
VF.MinProfitableTripCount)) {
1028010295
LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
@@ -10671,17 +10686,12 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1067110686
if (VF.Width.isVector() || SelectedIC > 1)
1067210687
Checks.create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC);
1067310688

10674-
InstructionCost EarlyExitCost = InstructionCost::getInvalid();
10675-
if (VF.Width.isVector() && LVL.hasUncountableEarlyExit())
10676-
EarlyExitCost =
10677-
calculateEarlyExitCost(CM, LVP.getPlanFor(VF.Width), VF.Width);
10678-
1067910689
// Check if it is profitable to vectorize with runtime checks.
1068010690
bool ForceVectorization =
1068110691
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
1068210692
if (!ForceVectorization &&
10683-
!isOutsideLoopWorkProfitable(Checks, VF, L, *TTI, PSE, SEL,
10684-
CM.getVScaleForTuning(), EarlyExitCost)) {
10693+
!isOutsideLoopWorkProfitable(Checks, VF, CM, PSE,
10694+
LVP.getPlanFor(VF.Width), SEL)) {
1068510695
ORE->emit([&]() {
1068610696
return OptimizationRemarkAnalysisAliasing(
1068710697
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -742,9 +742,9 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
742742
case VPInstruction::ExtractFirstActive: {
743743
// Calculate the cost of determining the lane index.
744744
auto *PredTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(1)), VF);
745-
IntrinsicCostAttributes Attrs(
746-
Intrinsic::experimental_cttz_elts, Type::getInt64Ty(Ctx.LLVMCtx),
747-
{PoisonValue::get(PredTy), ConstantInt::getTrue(Ctx.LLVMCtx)});
745+
IntrinsicCostAttributes Attrs(Intrinsic::experimental_cttz_elts,
746+
Type::getInt64Ty(Ctx.LLVMCtx),
747+
{PredTy, Type::getInt1Ty(Ctx.LLVMCtx)});
748748
InstructionCost Cost = Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
749749
// Add on the cost of extracting the element.
750750
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);

llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,8 +7,8 @@ target triple = "aarch64-unknown-linux-gnu"
77

88
declare void @init_mem(ptr, i64);
99

10-
define i64 @same_exit_block_pre_inc_use1() #1 {
11-
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1'
10+
define i64 @same_exit_block_pre_inc_use1_sve() #1 {
11+
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_sve'
1212
; CHECK: LV: Selecting VF: vscale x 16
1313
; CHECK: Calculating cost of work in vector early exit block:
1414
; CHECK-NEXT: Cost of 6 for VF vscale x 16: EMIT vp<{{.*}}> = extract-first-active
@@ -44,4 +44,43 @@ loop.end:
4444
ret i64 %retval
4545
}
4646

47+
define i64 @same_exit_block_pre_inc_use1_nosve() {
48+
; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_nosve'
49+
; CHECK: LV: Selecting VF: 16
50+
; CHECK: Calculating cost of work in vector early exit block:
51+
; CHECK-NEXT: Cost of 50 for VF 16: EMIT vp<{{.*}}> = extract-first-active
52+
; CHECK-NEXT: Cost of 50 for VF 16: EMIT vp<{{.*}}> = extract-first-active
53+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:176
54+
; CHECK-NEXT: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (64 < 176)
55+
; CHECK-NEXT: LV: Too many memory checks needed.
56+
entry:
57+
%p1 = alloca [1024 x i8]
58+
%p2 = alloca [1024 x i8]
59+
call void @init_mem(ptr %p1, i64 1024)
60+
call void @init_mem(ptr %p2, i64 1024)
61+
br label %loop
62+
63+
loop:
64+
%index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
65+
%index2 = phi i64 [ %index2.next, %loop.inc ], [ 15, %entry ]
66+
%arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
67+
%ld1 = load i8, ptr %arrayidx, align 1
68+
%arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
69+
%ld2 = load i8, ptr %arrayidx1, align 1
70+
%cmp3 = icmp eq i8 %ld1, %ld2
71+
br i1 %cmp3, label %loop.inc, label %loop.end
72+
73+
loop.inc:
74+
%index.next = add i64 %index, 1
75+
%index2.next = add i64 %index2, 2
76+
%exitcond = icmp ne i64 %index.next, 67
77+
br i1 %exitcond, label %loop, label %loop.end
78+
79+
loop.end:
80+
%val1 = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
81+
%val2 = phi i64 [ %index2, %loop ], [ 98, %loop.inc ]
82+
%retval = add i64 %val1, %val2
83+
ret i64 %retval
84+
}
85+
4786
attributes #1 = { "target-features"="+sve" vscale_range(1,16) }

0 commit comments

Comments
 (0)