Skip to content

Commit 1218071

Browse files
authored
[NFC][LoopVectorize] Introduce new getEstimatedRuntimeVF function (#116247)
There are lots of places where we try to estimate the runtime vectorisation factor based on the getVScaleForTuning TTI hook. I've added a new getEstimatedRuntimeVF function and taught several places in the vectoriser to use this new function.
1 parent 55fad5e commit 1218071

File tree

1 file changed

+36
-45
lines changed

1 file changed

+36
-45
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 36 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1520,7 +1520,7 @@ class LoopVectorizationCostModel {
15201520
/// \p Multiplier is an additional scaling factor applied to VF before
15211521
/// comparing to EpilogueVectorizationMinVF.
15221522
bool isEpilogueVectorizationProfitable(const ElementCount VF,
1523-
const unsigned Multiplier) const;
1523+
const unsigned IC) const;
15241524

15251525
/// Returns the execution time cost of an instruction for a given vector
15261526
/// width. Vector width of one means scalar.
@@ -4292,6 +4292,21 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
42924292
return TTI.getVScaleForTuning();
42934293
}
42944294

4295+
/// This function attempts to return a value that represents the vectorization
4296+
/// factor at runtime. For fixed-width VFs we know this precisely at compile
4297+
/// time, but for scalable VFs we calculate it based on an estimate of the
4298+
/// vscale value.
4299+
static unsigned getEstimatedRuntimeVF(const Loop *L,
4300+
const TargetTransformInfo &TTI,
4301+
ElementCount VF) {
4302+
unsigned EstimatedVF = VF.getKnownMinValue();
4303+
if (VF.isScalable())
4304+
if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
4305+
EstimatedVF *= *VScale;
4306+
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4307+
return EstimatedVF;
4308+
}
4309+
42954310
bool LoopVectorizationPlanner::isMoreProfitable(
42964311
const VectorizationFactor &A, const VectorizationFactor &B,
42974312
const unsigned MaxTripCount) const {
@@ -4594,17 +4609,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
45944609
InstructionCost C = CM.expectedCost(VF);
45954610
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
45964611

4597-
unsigned AssumedMinimumVscale =
4598-
getVScaleForTuning(OrigLoop, TTI).value_or(1);
4599-
unsigned Width =
4600-
Candidate.Width.isScalable()
4601-
? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
4602-
: Candidate.Width.getFixedValue();
4612+
unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
46034613
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
46044614
<< " costs: " << (Candidate.Cost / Width));
46054615
if (VF.isScalable())
46064616
LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4607-
<< AssumedMinimumVscale << ")");
4617+
<< getVScaleForTuning(OrigLoop, TTI).value_or(1)
4618+
<< ")");
46084619
LLVM_DEBUG(dbgs() << ".\n");
46094620

46104621
if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
@@ -4670,7 +4681,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
46704681
}
46714682

46724683
bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4673-
const ElementCount VF, const unsigned Multiplier) const {
4684+
const ElementCount VF, const unsigned IC) const {
46744685
// FIXME: We need a much better cost-model to take different parameters such
46754686
// as register pressure, code size increase and cost of extra branches into
46764687
// account. For now we apply a very crude heuristic and only consider loops
@@ -4685,9 +4696,13 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
46854696
if (TTI.getMaxInterleaveFactor(VF) <= 1)
46864697
return false;
46874698

4688-
if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
4689-
return true;
4690-
return false;
4699+
// TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4700+
// VFs when deciding profitability.
4701+
// See related "TODO: extend to support scalable VFs." in
4702+
// selectEpilogueVectorizationFactor.
4703+
unsigned Multiplier = VF.isFixed() ? IC : 1;
4704+
return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >=
4705+
EpilogueVectorizationMinVF;
46914706
}
46924707

46934708
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
@@ -4730,11 +4745,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47304745
return Result;
47314746
}
47324747

4733-
unsigned Multiplier = IC;
4734-
if (MainLoopVF.isScalable())
4735-
Multiplier = getVScaleForTuning(OrigLoop, TTI).value_or(1);
4736-
4737-
if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, Multiplier)) {
4748+
if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
47384749
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
47394750
"this loop\n");
47404751
return Result;
@@ -4743,12 +4754,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47434754
// If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
47444755
// the main loop handles 8 lanes per iteration. We could still benefit from
47454756
// vectorizing the epilogue loop with VF=4.
4746-
ElementCount EstimatedRuntimeVF = MainLoopVF;
4747-
if (MainLoopVF.isScalable()) {
4748-
EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
4749-
if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
4750-
EstimatedRuntimeVF *= *VScale;
4751-
}
4757+
ElementCount EstimatedRuntimeVF =
4758+
ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
47524759

47534760
ScalarEvolution &SE = *PSE.getSE();
47544761
Type *TCType = Legal->getWidestInductionType();
@@ -4988,13 +4995,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
49884995
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
49894996
}
49904997

4991-
unsigned EstimatedVF = VF.getKnownMinValue();
4992-
if (VF.isScalable()) {
4993-
if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
4994-
EstimatedVF *= *VScale;
4995-
}
4996-
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4997-
4998+
unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
49984999
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
49995000
if (KnownTC > 0) {
50005001
// At least one iteration must be scalar when this constraint holds. So the
@@ -7426,10 +7427,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
74267427
// Now compute and add the VPlan-based cost.
74277428
Cost += Plan.cost(VF, CostCtx);
74287429
#ifndef NDEBUG
7429-
unsigned EstimatedWidth = VF.getKnownMinValue();
7430-
if (VF.isScalable())
7431-
if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
7432-
EstimatedWidth *= *VScale;
7430+
unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
74337431
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
74347432
<< " (Estimated cost per lane: ");
74357433
if (Cost.isValid()) {
@@ -9811,8 +9809,8 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
98119809
}
98129810

98139811
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9814-
VectorizationFactor &VF,
9815-
std::optional<unsigned> VScale, Loop *L,
9812+
VectorizationFactor &VF, Loop *L,
9813+
const TargetTransformInfo &TTI,
98169814
PredicatedScalarEvolution &PSE,
98179815
ScalarEpilogueLowering SEL) {
98189816
InstructionCost CheckCost = Checks.getCost();
@@ -9864,13 +9862,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
98649862
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
98659863
// the computations are performed on doubles, not integers and the result
98669864
// is rounded up, hence we get an upper estimate of the TC.
9867-
unsigned IntVF = VF.Width.getKnownMinValue();
9868-
if (VF.Width.isScalable()) {
9869-
unsigned AssumedMinimumVscale = 1;
9870-
if (VScale)
9871-
AssumedMinimumVscale = *VScale;
9872-
IntVF *= AssumedMinimumVscale;
9873-
}
9865+
unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
98749866
uint64_t RtC = *CheckCost.getValue();
98759867
uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
98769868
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
@@ -10119,8 +10111,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1011910111
bool ForceVectorization =
1012010112
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
1012110113
if (!ForceVectorization &&
10122-
!areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
10123-
PSE, SEL)) {
10114+
!areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
1012410115
ORE->emit([&]() {
1012510116
return OptimizationRemarkAnalysisAliasing(
1012610117
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),

0 commit comments

Comments
 (0)