Skip to content

Commit 90fd99c

Browse files
committed
Recommit "[VPlan] First step towards VPlan cost modeling. (#92555)"
This reverts commit 46080ab. Extra tests have been added in 52d29eb. Original message: This adds a new interface to compute the cost of recipes, VPBasicBlocks, VPRegionBlocks and VPlan, initially falling back to the legacy cost model for all recipes. Follow-up patches will gradually migrate recipes to compute their own costs step-by-step. It also adds a getBestPlan function to LVP which computes the cost of all VPlans and picks the most profitable one together with the most profitable VF. The VPlan selected by the VPlan cost model is executed and there is an assert to catch cases where the VPlan cost model and the legacy cost model disagree. Even though I checked a number of different build configurations on AArch64 and X86, there may be some differences that have been missed. Additional discussions and context can be found in @arcbbb's #67647 and #67934, which is an earlier version of the current PR. PR: #92555
1 parent b650764 commit 90fd99c

File tree

8 files changed

+414
-27
lines changed

8 files changed

+414
-27
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -344,6 +344,16 @@ class LoopVectorizationPlanner {
344344
/// A builder used to construct the current plan.
345345
VPBuilder Builder;
346346

347+
/// Computes the cost of \p Plan for vectorization factor \p VF.
348+
///
349+
/// The current implementation requires access to the
350+
/// LoopVectorizationLegality to handle inductions and reductions, which is
351+
/// why it is kept separate from the VPlan-only cost infrastructure.
352+
///
353+
/// TODO: Move to VPlan::cost once the use of LoopVectorizationLegality has
354+
/// been retired.
355+
InstructionCost cost(VPlan &Plan, ElementCount VF) const;
356+
347357
public:
348358
LoopVectorizationPlanner(
349359
Loop *L, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI,
@@ -365,6 +375,9 @@ class LoopVectorizationPlanner {
365375
/// Return the best VPlan for \p VF.
366376
VPlan &getBestPlanFor(ElementCount VF) const;
367377

378+
/// Return the most profitable plan and fix its VF to the most profitable one.
379+
VPlan &getBestPlan() const;
380+
368381
/// Generate the IR code for the vectorized loop captured in VPlan \p BestPlan
369382
/// according to the best selected \p VF and \p UF.
370383
///
@@ -443,7 +456,9 @@ class LoopVectorizationPlanner {
443456
ElementCount MinVF);
444457

445458
/// \return The most profitable vectorization factor and the cost of that VF.
446-
/// This method checks every VF in \p CandidateVFs.
459+
/// This method checks every VF in \p CandidateVFs. This is now only used to
460+
/// verify the decisions by the new VPlan-based cost-model and will be retired
461+
/// once the VPlan-based cost-model is stabilized.
447462
VectorizationFactor
448463
selectVectorizationFactor(const ElementCountSet &CandidateVFs);
449464

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 200 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -290,7 +290,7 @@ static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
290290
cl::desc("A flag that overrides the target's max interleave factor for "
291291
"vectorized loops."));
292292

293-
static cl::opt<unsigned> ForceTargetInstructionCost(
293+
cl::opt<unsigned> ForceTargetInstructionCost(
294294
"force-target-instruction-cost", cl::init(0), cl::Hidden,
295295
cl::desc("A flag that overrides the target's expected cost for "
296296
"an instruction to a single constant value. Mostly "
@@ -412,14 +412,6 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
412412
return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
413413
}
414414

415-
/// A helper function that returns the reciprocal of the block probability of
416-
/// predicated blocks. If we return X, we are assuming the predicated block
417-
/// will execute once for every X iterations of the loop header.
418-
///
419-
/// TODO: We should use actual block probability here, if available. Currently,
420-
/// we always assume predicated blocks have a 50% chance of executing.
421-
static unsigned getReciprocalPredBlockProb() { return 2; }
422-
423415
/// Returns "best known" trip count for the specified loop \p L as defined by
424416
/// the following procedure:
425417
/// 1) Returns exact trip count if it is known.
@@ -1621,6 +1613,16 @@ class LoopVectorizationCostModel {
16211613
/// \p VF is the vectorization factor chosen for the original loop.
16221614
bool isEpilogueVectorizationProfitable(const ElementCount VF) const;
16231615

1616+
/// Return the cost of instructions in an inloop reduction pattern, if I is
1617+
/// part of that pattern.
1618+
std::optional<InstructionCost>
1619+
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1620+
TTI::TargetCostKind CostKind) const;
1621+
1622+
/// Returns the execution time cost of an instruction for a given vector
1623+
/// width. Vector width of one means scalar.
1624+
VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1625+
16241626
private:
16251627
unsigned NumPredStores = 0;
16261628

@@ -1646,21 +1648,11 @@ class LoopVectorizationCostModel {
16461648
/// of elements.
16471649
ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);
16481650

1649-
/// Returns the execution time cost of an instruction for a given vector
1650-
/// width. Vector width of one means scalar.
1651-
VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
1652-
16531651
/// The cost-computation logic from getInstructionCost which provides
16541652
/// the vector type as an output parameter.
16551653
InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
16561654
Type *&VectorTy);
16571655

1658-
/// Return the cost of instructions in an inloop reduction pattern, if I is
1659-
/// part of that pattern.
1660-
std::optional<InstructionCost>
1661-
getReductionPatternCost(Instruction *I, ElementCount VF, Type *VectorTy,
1662-
TTI::TargetCostKind CostKind) const;
1663-
16641656
/// Calculate vectorization cost of memory instruction \p I.
16651657
InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);
16661658

@@ -7288,7 +7280,10 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
72887280
if (!MaxFactors.hasVector())
72897281
return VectorizationFactor::Disabled();
72907282

7291-
// Select the optimal vectorization factor.
7283+
// Select the optimal vectorization factor according to the legacy cost-model.
7284+
// This is now only used to verify the decisions by the new VPlan-based
7285+
// cost-model and will be retired once the VPlan-based cost-model is
7286+
// stabilized.
72927287
VectorizationFactor VF = selectVectorizationFactor(VFCandidates);
72937288
assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero.");
72947289
if (!hasPlanWithVF(VF.Width)) {
@@ -7299,6 +7294,182 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
72997294
return VF;
73007295
}
73017296

7297+
/// Returns the cost the legacy cost model (\p CM) assigns to \p UI for
/// vectorization factor \p VF. Only the cost component (the first member of
/// the value returned by getInstructionCost) is forwarded.
InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
                                             ElementCount VF) const {
  auto LegacyResult = CM.getInstructionCost(UI, VF);
  return LegacyResult.first;
}
7301+
7302+
/// Returns true if the cost of \p UI must not be computed again: either it
/// has been registered in SkipCostComputation, or \p IsVector is set and the
/// legacy cost model lists \p UI in VecValuesToIgnore.
bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
  if (SkipCostComputation.contains(UI))
    return true;
  if (!IsVector)
    return false;
  return CM.VecValuesToIgnore.contains(UI);
}
7306+
7307+
/// Computes the cost of \p Plan for vectorization factor \p VF.
///
/// The costs of induction increments, optimizable induction truncates, loop
/// exit conditions and in-loop/AnyOf reductions are pre-computed with the
/// legacy cost model. Every instruction costed that way is recorded in
/// CostCtx.SkipCostComputation so it is accounted for only once. The
/// remaining cost is obtained from Plan.cost() and added on top.
///
/// \returns the sum of the pre-computed legacy costs and the VPlan-based cost.
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                               ElementCount VF) const {
  InstructionCost Cost = 0;
  LLVMContext &LLVMCtx = OrigLoop->getHeader()->getContext();
  VPCostContext CostCtx(CM.TTI, Legal->getWidestInductionType(), LLVMCtx, CM);

  // Cost modeling for inductions is inaccurate in the legacy cost model
  // compared to the recipes that are generated. To match here initially during
  // VPlan cost model bring up directly use the induction costs from the legacy
  // cost model. Note that we do this as pre-processing; the VPlan may not have
  // any recipes associated with the original induction increment instruction
  // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
  // the cost of both induction increment instructions that are represented by
  // recipes and those that are not, to avoid distinguishing between them here,
  // and skip all recipes that represent induction increments (the former case)
  // later on, if they exist, to avoid counting them twice. Similarly we
  // pre-compute the cost of any optimized truncates.
  // TODO: Switch to more accurate costing based on VPlan.
  for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
    Instruction *IVInc = cast<Instruction>(
        IV->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
    // Several inductions may share one increment; only cost it the first time.
    if (CostCtx.SkipCostComputation.insert(IVInc).second) {
      InstructionCost InductionCost = CostCtx.getLegacyCost(IVInc, VF);
      LLVM_DEBUG(dbgs() << "Cost of " << InductionCost << " for VF " << VF
                        << ":\n induction increment " << *IVInc << "\n");
      Cost += InductionCost;
    }
    for (User *U : IV->users()) {
      auto *CI = cast<Instruction>(U);
      if (!CostCtx.CM.isOptimizableIVTruncate(CI, VF))
        continue;
      assert(!CostCtx.SkipCostComputation.contains(CI) &&
             "Same cast for multiple inductions?");
      CostCtx.SkipCostComputation.insert(CI);
      InstructionCost CastCost = CostCtx.getLegacyCost(CI, VF);
      LLVM_DEBUG(dbgs() << "Cost of " << CastCost << " for VF " << VF
                        << ":\n induction cast " << *CI << "\n");
      Cost += CastCost;
    }
  }

  // Compute the cost of all exiting conditions of the loop using the legacy
  // cost model. This is to match the legacy behavior, which adds the cost of
  // all exit conditions. Note that this over-estimates the cost, as there will
  // be a single condition to control the vector loop.
  SmallVector<BasicBlock *> Exiting;
  CM.TheLoop->getExitingBlocks(Exiting);
  for (BasicBlock *EB : Exiting) {
    auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
    if (!Term)
      continue;
    auto *CondI = dyn_cast<Instruction>(Term->getOperand(0));
    // Nothing to add if the condition is not an instruction or is defined
    // outside the loop (it is not part of the loop body being costed). A
    // condition may also be shared by several exiting blocks or may already
    // have been pre-computed above; count each instruction only once instead
    // of asserting.
    if (!CondI || !CM.TheLoop->contains(CondI) ||
        !CostCtx.SkipCostComputation.insert(CondI).second)
      continue;
    Cost += CostCtx.getLegacyCost(CondI, VF);
  }

  // The legacy cost model has special logic to compute the cost of in-loop
  // reductions, which may be smaller than the sum of all instructions involved
  // in the reduction. For AnyOf reductions, VPlan codegen may remove the select
  // which the legacy cost model uses to assign cost. Pre-compute their costs
  // for now.
  // TODO: Switch to costing based on VPlan once the logic has been ported.
  for (const auto &[RedPhi, RdxDesc] : Legal->getReductionVars()) {
    const bool IsAnyOf = RecurrenceDescriptor::isAnyOfRecurrenceKind(
        RdxDesc.getRecurrenceKind());
    if (!CM.isInLoopReduction(RedPhi) && !IsAnyOf)
      continue;

    // AnyOf reduction codegen may remove the select. To match the legacy cost
    // model, pre-compute the cost for AnyOf reductions here.
    if (IsAnyOf) {
      auto PhiUsers = RedPhi->users();
      auto SelectIt =
          find_if(PhiUsers, [](User *U) { return isa<SelectInst>(U); });
      // Guard the dereference below: without a select user the iterator is
      // the end iterator.
      assert(SelectIt != PhiUsers.end() &&
             "AnyOf reduction phi without a select user");
      auto *Select = cast<SelectInst>(*SelectIt);
      assert(!CostCtx.SkipCostComputation.contains(Select) &&
             "reduction op visited multiple times");
      CostCtx.SkipCostComputation.insert(Select);
      auto ReductionCost = CostCtx.getLegacyCost(Select, VF);
      LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
                        << ":\n any-of reduction " << *Select << "\n");
      Cost += ReductionCost;
      continue;
    }

    const auto &ChainOps = RdxDesc.getReductionOpChain(RedPhi, OrigLoop);
    SetVector<Instruction *> ChainOpsAndOperands(ChainOps.begin(),
                                                 ChainOps.end());
    // Also include the operands of instructions in the chain, as the cost-model
    // may mark extends as free.
    for (auto *ChainOp : ChainOps) {
      for (Value *Op : ChainOp->operands()) {
        if (auto *I = dyn_cast<Instruction>(Op))
          ChainOpsAndOperands.insert(I);
      }
    }

    // Pre-compute the cost for I, if it has a reduction pattern cost.
    for (Instruction *I : ChainOpsAndOperands) {
      auto ReductionCost = CM.getReductionPatternCost(
          I, VF, ToVectorTy(I->getType(), VF), TTI::TCK_RecipThroughput);
      if (!ReductionCost)
        continue;

      assert(!CostCtx.SkipCostComputation.contains(I) &&
             "reduction op visited multiple times");
      CostCtx.SkipCostComputation.insert(I);
      LLVM_DEBUG(dbgs() << "Cost of " << ReductionCost << " for VF " << VF
                        << ":\n in-loop reduction " << *I << "\n");
      Cost += *ReductionCost;
    }
  }

  // Now compute and add the VPlan-based cost.
  Cost += Plan.cost(VF, CostCtx);
  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost << "\n");
  return Cost;
}
7434+
7435+
/// Returns the most profitable plan and fixes its VF to the most profitable
/// one.
///
/// The cost of every non-scalar VF of every plan in VPlans is computed with
/// cost() and compared against the current best via isMoreProfitable(),
/// starting from the scalar VF. When vectorization is forced through the loop
/// hints, the initial (scalar) cost is set to the maximum so that the scalar
/// VF loses against a vector VF during the comparison.
VPlan &LoopVectorizationPlanner::getBestPlan() const {
  assert(!VPlans.empty() && "Expected at least one VPlan");

  // If there is a single VPlan with a single VF, return it directly.
  VPlan &FirstPlan = *VPlans[0];
  if (VPlans.size() == 1 && size(FirstPlan.vectorFactors()) == 1)
    return FirstPlan;

  ElementCount ScalarVF = ElementCount::getFixed(1);
  assert(hasPlanWithVF(ScalarVF) &&
         "More than a single plan/VF w/o any plan having scalar VF");

  // Start from the plan that actually contains the scalar VF, so that the
  // final setVF() call is applied to a plan holding the selected width even
  // when no vector VF turns out to be more profitable.
  VPlan &ScalarPlan = getBestPlanFor(ScalarVF);
  VPlan *BestPlan = &ScalarPlan;
  InstructionCost ScalarCost = cost(ScalarPlan, ScalarVF);
  VectorizationFactor BestFactor(ScalarVF, ScalarCost, ScalarCost);

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    BestFactor.Cost = InstructionCost::getMax();
  }

  for (auto &P : VPlans) {
    for (ElementCount VF : P->vectorFactors()) {
      // The scalar VF is the baseline and has been costed above.
      if (VF.isScalar())
        continue;
      InstructionCost Cost = cost(*P, VF);
      VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);
      if (isMoreProfitable(CurrentFactor, BestFactor)) {
        BestFactor = CurrentFactor;
        BestPlan = &*P;
      }
    }
  }
  BestPlan->setVF(BestFactor.Width);
  return *BestPlan;
}
7472+
73027473
VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const {
73037474
assert(count_if(VPlans,
73047475
[VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) ==
@@ -10157,8 +10328,15 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1015710328
VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
1015810329
PSI, Checks);
1015910330

10160-
VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
10161-
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
10331+
VPlan &BestPlan = LVP.getBestPlan();
10332+
assert(size(BestPlan.vectorFactors()) == 1 &&
10333+
"Plan should have a single VF");
10334+
ElementCount Width = *BestPlan.vectorFactors().begin();
10335+
LLVM_DEBUG(dbgs() << "VF picked by VPlan cost model: " << Width
10336+
<< "\n");
10337+
assert(VF.Width == Width &&
10338+
"VPlan cost model and legacy cost model disagreed");
10339+
LVP.executePlan(Width, IC, BestPlan, LB, DT, false);
1016210340
++LoopsVectorized;
1016310341

1016410342
// Add metadata to disable runtime unrolling a scalar loop when there

0 commit comments

Comments
 (0)