Skip to content

[LoopVectorize][NFC] Cache the result of getVScaleForTuning #124732

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Feb 3, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 47 additions & 35 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -978,7 +978,10 @@ class LoopVectorizationCostModel {
InterleavedAccessInfo &IAI)
: ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}
Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {
if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
initializeVScaleForTuning();
}

/// \return An upper bound for the vectorization factors (both fixed and
/// scalable). If the factors are 0, vectorization and interleaving should be
Expand Down Expand Up @@ -1554,9 +1557,34 @@ class LoopVectorizationCostModel {
/// trivially hoistable.
bool shouldConsiderInvariant(Value *Op);

/// \return The cached vscale estimate the cost model tunes against, or
/// std::nullopt when no estimate has been stored in VScaleForTuning.
std::optional<unsigned> getVScaleForTuning() const {
  return VScaleForTuning;
}

private:
unsigned NumPredStores = 0;

/// Cached value of vscale used for tuning the cost model. The constructor
/// fills it in via initializeVScaleForTuning() only when the target supports
/// scalable vectors (or that support is forced); otherwise the constructor
/// leaves it default-constructed, i.e. std::nullopt.
std::optional<unsigned> VScaleForTuning;

/// Initializes the value of vscale used for tuning the cost model. If
/// vscale_range.min == vscale_range.max then return vscale_range.max, else
/// return the value returned by the corresponding TTI method.
void initializeVScaleForTuning() {
const Function *Fn = TheLoop->getHeader()->getParent();
if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
auto Min = Attr.getVScaleRangeMin();
auto Max = Attr.getVScaleRangeMax();
if (Max && Min == Max) {
VScaleForTuning = Max;
return;
}
}

VScaleForTuning = TTI.getVScaleForTuning();
}

/// \return An upper bound for the vectorization factors for both
/// fixed and scalable vectorization, where the minimum-known number of
/// elements is a power-of-2 larger than zero. If scalable vectorization is
Expand Down Expand Up @@ -4231,33 +4259,15 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
return MaxVF;
}

/// Convenience function that returns the value of vscale_range iff
/// vscale_range.min == vscale_range.max or otherwise returns the value
/// returned by the corresponding TTI method.
static std::optional<unsigned>
getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
const Function *Fn = L->getHeader()->getParent();
if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
auto Min = Attr.getVScaleRangeMin();
auto Max = Attr.getVScaleRangeMax();
if (Max && Min == Max)
return Max;
}

return TTI.getVScaleForTuning();
}

/// This function attempts to return a value that represents the vectorization
/// factor at runtime. For fixed-width VFs we know this precisely at compile
/// time, but for scalable VFs we calculate it based on an estimate of the
/// vscale value.
static unsigned getEstimatedRuntimeVF(const Loop *L,
const TargetTransformInfo &TTI,
ElementCount VF) {
static unsigned getEstimatedRuntimeVF(ElementCount VF,
std::optional<unsigned> VScale) {
unsigned EstimatedVF = VF.getKnownMinValue();
if (VF.isScalable())
if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
if (VScale)
EstimatedVF *= *VScale;
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
return EstimatedVF;
Expand All @@ -4272,7 +4282,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(
// Improve estimate for the vector width if it is scalable.
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
if (A.Width.isScalable())
EstimatedWidthA *= *VScale;
if (B.Width.isScalable())
Expand Down Expand Up @@ -4565,13 +4575,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
InstructionCost C = CM.expectedCost(VF);
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);

unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
unsigned Width =
getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
<< " costs: " << (Candidate.Cost / Width));
if (VF.isScalable())
LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
<< getVScaleForTuning(OrigLoop, TTI).value_or(1)
<< ")");
<< CM.getVScaleForTuning().value_or(1) << ")");
LLVM_DEBUG(dbgs() << ".\n");

if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
Expand Down Expand Up @@ -4660,7 +4670,8 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
? EpilogueVectorizationMinVF
: TTI.getEpilogueVectorizationMinVF();
return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >=
MinVFThreshold;
}

VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
Expand Down Expand Up @@ -4712,8 +4723,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
// If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
// the main loop handles 8 lanes per iteration. We could still benefit from
// vectorizing the epilogue loop with VF=4.
ElementCount EstimatedRuntimeVF =
ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning()));

ScalarEvolution &SE = *PSE.getSE();
Type *TCType = Legal->getWidestInductionType();
Expand Down Expand Up @@ -4959,7 +4970,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
}

unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning);
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
if (KnownTC > 0) {
// At least one iteration must be scalar when this constraint holds. So the
Expand Down Expand Up @@ -7388,7 +7399,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
// Now compute and add the VPlan-based cost.
Cost += Plan.cost(VF, CostCtx);
#ifndef NDEBUG
unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
<< " (Estimated cost per lane: ");
if (Cost.isValid()) {
Expand Down Expand Up @@ -10033,9 +10044,9 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {

static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
VectorizationFactor &VF, Loop *L,
const TargetTransformInfo &TTI,
PredicatedScalarEvolution &PSE,
ScalarEpilogueLowering SEL) {
ScalarEpilogueLowering SEL,
std::optional<unsigned> VScale) {
InstructionCost CheckCost = Checks.getCost();
if (!CheckCost.isValid())
return false;
Expand Down Expand Up @@ -10085,7 +10096,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
// the computations are performed on doubles, not integers and the result
// is rounded up, hence we get an upper estimate of the TC.
unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
uint64_t RtC = *CheckCost.getValue();
uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
Expand Down Expand Up @@ -10522,7 +10533,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
bool ForceVectorization =
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
if (!ForceVectorization &&
!areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
!areRuntimeChecksProfitable(Checks, VF, L, PSE, SEL,
CM.getVScaleForTuning())) {
ORE->emit([&]() {
return OptimizationRemarkAnalysisAliasing(
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
Expand Down