Skip to content

[NFC][LoopVectorize] Introduce new getEstimatedRuntimeVF function #116247

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Nov 19, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 36 additions & 45 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1520,7 +1520,7 @@ class LoopVectorizationCostModel {
/// \p Multiplier is an additional scaling factor applied to VF before
/// comparing to EpilogueVectorizationMinVF.
bool isEpilogueVectorizationProfitable(const ElementCount VF,
const unsigned Multiplier) const;
const unsigned IC) const;

/// Returns the execution time cost of an instruction for a given vector
/// width. Vector width of one means scalar.
Expand Down Expand Up @@ -4292,6 +4292,21 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
return TTI.getVScaleForTuning();
}

/// Estimates how many elements a vector of factor \p VF holds when the code
/// actually runs. A fixed-width \p VF already is that number. A scalable
/// \p VF only fixes a minimum element count, which is multiplied by the
/// vscale value getVScaleForTuning reports for \p L and \p TTI; when no such
/// value is available the minimum element count is returned unscaled.
static unsigned getEstimatedRuntimeVF(const Loop *L,
                                      const TargetTransformInfo &TTI,
                                      ElementCount VF) {
  // Only scalable VFs are scaled, so the tuning vscale is queried solely for
  // them; a factor of 1 leaves the known minimum untouched.
  unsigned VScaleFactor = 1;
  if (VF.isScalable())
    VScaleFactor = getVScaleForTuning(L, TTI).value_or(1);

  const unsigned RuntimeVF = VF.getKnownMinValue() * VScaleFactor;
  assert(RuntimeVF >= 1 && "Estimated VF shouldn't be less than 1");
  return RuntimeVF;
}

bool LoopVectorizationPlanner::isMoreProfitable(
const VectorizationFactor &A, const VectorizationFactor &B,
const unsigned MaxTripCount) const {
Expand Down Expand Up @@ -4594,17 +4609,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
InstructionCost C = CM.expectedCost(VF);
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);

unsigned AssumedMinimumVscale =
getVScaleForTuning(OrigLoop, TTI).value_or(1);
unsigned Width =
Candidate.Width.isScalable()
? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
: Candidate.Width.getFixedValue();
unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
<< " costs: " << (Candidate.Cost / Width));
if (VF.isScalable())
LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
<< AssumedMinimumVscale << ")");
<< getVScaleForTuning(OrigLoop, TTI).value_or(1)
<< ")");
LLVM_DEBUG(dbgs() << ".\n");

if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
Expand Down Expand Up @@ -4670,7 +4681,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
}

bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
const ElementCount VF, const unsigned Multiplier) const {
const ElementCount VF, const unsigned IC) const {
// FIXME: We need a much better cost-model to take different parameters such
// as register pressure, code size increase and cost of extra branches into
// account. For now we apply a very crude heuristic and only consider loops
Expand All @@ -4685,9 +4696,13 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
if (TTI.getMaxInterleaveFactor(VF) <= 1)
return false;

if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF)
return true;
return false;
// TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
// VFs when deciding profitability.
// See related "TODO: extend to support scalable VFs." in
// selectEpilogueVectorizationFactor.
unsigned Multiplier = VF.isFixed() ? IC : 1;
return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >=
EpilogueVectorizationMinVF;
}

VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
Expand Down Expand Up @@ -4730,11 +4745,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
return Result;
}

unsigned Multiplier = IC;
if (MainLoopVF.isScalable())
Multiplier = getVScaleForTuning(OrigLoop, TTI).value_or(1);

if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, Multiplier)) {
if (!CM.isEpilogueVectorizationProfitable(MainLoopVF, IC)) {
LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
"this loop\n");
return Result;
Expand All @@ -4743,12 +4754,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
// If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
// the main loop handles 8 lanes per iteration. We could still benefit from
// vectorizing the epilogue loop with VF=4.
ElementCount EstimatedRuntimeVF = MainLoopVF;
if (MainLoopVF.isScalable()) {
EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue());
if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
EstimatedRuntimeVF *= *VScale;
}
ElementCount EstimatedRuntimeVF =
ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));

ScalarEvolution &SE = *PSE.getSE();
Type *TCType = Legal->getWidestInductionType();
Expand Down Expand Up @@ -4988,13 +4995,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}

unsigned EstimatedVF = VF.getKnownMinValue();
if (VF.isScalable()) {
if (std::optional<unsigned> VScale = getVScaleForTuning(TheLoop, TTI))
EstimatedVF *= *VScale;
}
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");

unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
if (KnownTC > 0) {
// At least one iteration must be scalar when this constraint holds. So the
Expand Down Expand Up @@ -7426,10 +7427,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
// Now compute and add the VPlan-based cost.
Cost += Plan.cost(VF, CostCtx);
#ifndef NDEBUG
unsigned EstimatedWidth = VF.getKnownMinValue();
if (VF.isScalable())
if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI))
EstimatedWidth *= *VScale;
unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
<< " (Estimated cost per lane: ");
if (Cost.isValid()) {
Expand Down Expand Up @@ -9811,8 +9809,8 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
}

static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
VectorizationFactor &VF,
std::optional<unsigned> VScale, Loop *L,
VectorizationFactor &VF, Loop *L,
const TargetTransformInfo &TTI,
PredicatedScalarEvolution &PSE,
ScalarEpilogueLowering SEL) {
InstructionCost CheckCost = Checks.getCost();
Expand Down Expand Up @@ -9864,13 +9862,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
// the computations are performed on doubles, not integers and the result
// is rounded up, hence we get an upper estimate of the TC.
unsigned IntVF = VF.Width.getKnownMinValue();
if (VF.Width.isScalable()) {
unsigned AssumedMinimumVscale = 1;
if (VScale)
AssumedMinimumVscale = *VScale;
IntVF *= AssumedMinimumVscale;
}
unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
uint64_t RtC = *CheckCost.getValue();
uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
Expand Down Expand Up @@ -10119,8 +10111,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
if (!ForceVectorization &&
!areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L,
PSE, SEL)) {
!areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
ORE->emit([&]() {
return OptimizationRemarkAnalysisAliasing(
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
Expand Down
Loading