Skip to content

Commit d992006

Browse files
committed
[NFC][LoopVectorize] Introduce new getEstimatedRuntimeVF function
There are lots of places where we try to estimate the runtime vectorisation factor based on the getVScaleForTuning TTI hook. I've added a new getEstimatedRuntimeVF function and taught several places in the vectoriser to use this new function.
1 parent 6a12b43 commit d992006

File tree

1 file changed

+35
-41
lines changed

1 file changed

+35
-41
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 35 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -1519,7 +1519,7 @@ class LoopVectorizationCostModel {
15191519
/// \p Multiplier is an additional scaling factor applied to VF before
15201520
/// comparing to EpilogueVectorizationMinVF.
15211521
bool isEpilogueVectorizationProfitable(const ElementCount VF,
1522-
const unsigned Multiplier) const;
1522+
const unsigned IC) const;
15231523

15241524
/// Returns the execution time cost of an instruction for a given vector
15251525
/// width. Vector width of one means scalar.
@@ -4291,6 +4291,21 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
42914291
return TTI.getVScaleForTuning();
42924292
}
42934293

4294+
/// Returns an estimate of the vectorization factor that will be in effect at
4295+
/// runtime. For fixed-width VFs this is known precisely at compile time; for
4296+
/// scalable VFs the known minimum value is multiplied by the vscale estimate
4297+
/// returned by getVScaleForTuning, if any (otherwise it is left unscaled).
4298+
static unsigned getEstimatedRuntimeVF(const Loop *L,
4299+
const TargetTransformInfo &TTI,
4300+
ElementCount VF) {
4301+
unsigned EstimatedVF = VF.getKnownMinValue();
4302+
if (VF.isScalable())
4303+
if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
4304+
EstimatedVF *= *VScale;
4305+
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4306+
return EstimatedVF;
4307+
}
4308+
42944309
bool LoopVectorizationPlanner::isMoreProfitable(
42954310
const VectorizationFactor &A, const VectorizationFactor &B,
42964311
const unsigned MaxTripCount) const {
@@ -4593,17 +4608,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
45934608
InstructionCost C = CM.expectedCost(VF);
45944609
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
45954610

4596-
unsigned AssumedMinimumVscale =
4597-
getVScaleForTuning(OrigLoop, TTI).value_or(1);
4598-
unsigned Width =
4599-
Candidate.Width.isScalable()
4600-
? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
4601-
: Candidate.Width.getFixedValue();
4611+
unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
46024612
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
46034613
<< " costs: " << (Candidate.Cost / Width));
46044614
if (VF.isScalable())
46054615
LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4606-
<< AssumedMinimumVscale << ")");
4616+
<< getVScaleForTuning(OrigLoop, TTI).value_or(1)
4617+
<< ")");
46074618
LLVM_DEBUG(dbgs() << ".\n");
46084619

46094620
if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
@@ -4669,7 +4680,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
46694680
}
46704681

46714682
bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4672-
const ElementCount VF, const unsigned Multiplier) const {
4683+
const ElementCount VF, const unsigned IC) const {
46734684
// FIXME: We need a much better cost-model to take different parameters such
46744685
// as register pressure, code size increase and cost of extra branches into
46754686
// account. For now we apply a very crude heuristic and only consider loops
@@ -4684,9 +4695,13 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
46844695
if (TTI.getMaxInterleaveFactor(VF) <= 1)
46854696
return false;
46864697

4687-
if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
4688-
return true;
4689-
return false;
4698+
// TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4699+
// VFs when deciding profitability.
4700+
// See related "TODO: extend to support scalable VFs." in
4701+
// selectEpilogueVectorizationFactor.
4702+
unsigned Multiplier = VF.isFixed() ? IC : 1;
4703+
return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >=
4704+
EpilogueVectorizationMinVF;
46904705
}
46914706

46924707
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
@@ -4729,11 +4744,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47294744
return Result;
47304745
}
47314746

4732-
unsigned Multiplier = IC;
4733-
if (MainLoopVF.isScalable())
4734-
Multiplier = getVScaleForTuning(OrigLoop, TTI).value_or(1);
4735-
4736-
if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, Multiplier)) {
4747+
if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
47374748
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
47384749
"this loop\n");
47394750
return Result;
@@ -4742,12 +4753,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47424753
// If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
47434754
// the main loop handles 8 lanes per iteration. We could still benefit from
47444755
// vectorizing the epilogue loop with VF=4.
4745-
ElementCount EstimatedRuntimeVF = MainLoopVF;
4746-
if (MainLoopVF.isScalable()) {
4747-
EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
4748-
if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
4749-
EstimatedRuntimeVF *= *VScale;
4750-
}
4756+
ElementCount EstimatedRuntimeVF =
4757+
ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
47514758

47524759
ScalarEvolution &SE = *PSE.getSE();
47534760
Type *TCType = Legal->getWidestInductionType();
@@ -4987,13 +4994,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
49874994
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
49884995
}
49894996

4990-
unsigned EstimatedVF = VF.getKnownMinValue();
4991-
if (VF.isScalable()) {
4992-
if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
4993-
EstimatedVF *= *VScale;
4994-
}
4995-
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
4996-
4997+
unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
49974998
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
49984999
if (KnownTC > 0) {
49995000
// At least one iteration must be scalar when this constraint holds. So the
@@ -9797,8 +9798,8 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
97979798
}
97989799

97999800
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9800-
VectorizationFactor &VF,
9801-
std::optional<unsigned> VScale, Loop *L,
9801+
VectorizationFactor &VF, Loop *L,
9802+
const TargetTransformInfo &TTI,
98029803
PredicatedScalarEvolution &PSE,
98039804
ScalarEpilogueLowering SEL) {
98049805
InstructionCost CheckCost = Checks.getCost();
@@ -9850,13 +9851,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
98509851
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
98519852
// the computations are performed on doubles, not integers and the result
98529853
// is rounded up, hence we get an upper estimate of the TC.
9853-
unsigned IntVF = VF.Width.getKnownMinValue();
9854-
if (VF.Width.isScalable()) {
9855-
unsigned AssumedMinimumVscale = 1;
9856-
if (VScale)
9857-
AssumedMinimumVscale = *VScale;
9858-
IntVF *= AssumedMinimumVscale;
9859-
}
9854+
unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
98609855
uint64_t RtC = *CheckCost.getValue();
98619856
uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
98629857
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
@@ -10105,8 +10100,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1010510100
bool ForceVectorization =
1010610101
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
1010710102
if (!ForceVectorization &&
10108-
!areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
10109-
PSE, SEL)) {
10103+
!areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
1011010104
ORE->emit([&]() {
1011110105
return OptimizationRemarkAnalysisAliasing(
1011210106
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),

0 commit comments

Comments
 (0)