Skip to content

Commit 50d5d06

Browse files
authored
[LoopVectorize][NFC] Cache the result of getVScaleForTuning (#124732)
getVScaleForTuning is currently called in many places, repeating the same work to compute the same answer each time. This patch computes the value once, when LoopVectorizationCostModel is constructed and the target supports scalable vectors (or such support is forced), and makes all callers read the cached value from the cost model instead.
1 parent 6303563 commit 50d5d06

File tree

1 file changed

+47
-35
lines changed

1 file changed

+47
-35
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 47 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -989,7 +989,10 @@ class LoopVectorizationCostModel {
989989
InterleavedAccessInfo &IAI)
990990
: ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
991991
TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
992-
Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}
992+
Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {
993+
if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
994+
initializeVScaleForTuning();
995+
}
993996

994997
/// \return An upper bound for the vectorization factors (both fixed and
995998
/// scalable). If the factors are 0, vectorization and interleaving should be
@@ -1565,9 +1568,34 @@ class LoopVectorizationCostModel {
15651568
/// trivially hoistable.
15661569
bool shouldConsiderInvariant(Value *Op);
15671570

1571+
/// \return The cached vscale used for tuning the cost model, if any.
1572+
std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1573+
15681574
private:
15691575
unsigned NumPredStores = 0;
15701576

1577+
/// Cached value of vscale used for tuning the cost model. It is set during
1578+
/// construction only if scalable vectors are supported (or forced on).
1579+
std::optional<unsigned> VScaleForTuning;
1580+
1581+
/// Initializes the cached vscale used for tuning the cost model. If
1582+
/// vscale_range.min == vscale_range.max then vscale_range.max is stored,
1583+
/// else the value returned by the corresponding TTI method is stored.
1584+
void initializeVScaleForTuning() {
1585+
const Function *Fn = TheLoop->getHeader()->getParent();
1586+
if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
1587+
auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
1588+
auto Min = Attr.getVScaleRangeMin();
1589+
auto Max = Attr.getVScaleRangeMax();
1590+
if (Max && Min == Max) { // The attribute pins vscale to a single value.
1591+
VScaleForTuning = Max;
1592+
return;
1593+
}
1594+
}
1595+
1596+
VScaleForTuning = TTI.getVScaleForTuning(); // Otherwise defer to the target.
1597+
}
1598+
15711599
/// \return An upper bound for the vectorization factors for both
15721600
/// fixed and scalable vectorization, where the minimum-known number of
15731601
/// elements is a power-of-2 larger than zero. If scalable vectorization is
@@ -4242,33 +4270,15 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
42424270
return MaxVF;
42434271
}
42444272

4245-
/// Convenience function that returns the value of vscale_range iff
4246-
/// vscale_range.min == vscale_range.max or otherwise returns the value
4247-
/// returned by the corresponding TTI method.
4248-
static std::optional<unsigned>
4249-
getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4250-
const Function *Fn = L->getHeader()->getParent();
4251-
if (Fn->hasFnAttribute(Attribute::VScaleRange)) {
4252-
auto Attr = Fn->getFnAttribute(Attribute::VScaleRange);
4253-
auto Min = Attr.getVScaleRangeMin();
4254-
auto Max = Attr.getVScaleRangeMax();
4255-
if (Max && Min == Max)
4256-
return Max;
4257-
}
4258-
4259-
return TTI.getVScaleForTuning();
4260-
}
4261-
42624273
/// This function attempts to return a value that represents the vectorization
42634274
/// factor at runtime. For fixed-width VFs we know this precisely at compile
42644275
/// time, but for scalable VFs we calculate it based on an estimate of the
42654276
/// vscale value.
4266-
static unsigned getEstimatedRuntimeVF(const Loop *L,
4267-
const TargetTransformInfo &TTI,
4268-
ElementCount VF) {
4277+
static unsigned getEstimatedRuntimeVF(ElementCount VF,
4278+
std::optional<unsigned> VScale) {
42694279
unsigned EstimatedVF = VF.getKnownMinValue();
42704280
if (VF.isScalable())
4271-
if (std::optional<unsigned> VScale = getVScaleForTuning(L, TTI))
4281+
if (VScale)
42724282
EstimatedVF *= *VScale;
42734283
assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
42744284
return EstimatedVF;
@@ -4283,7 +4293,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(
42834293
// Improve estimate for the vector width if it is scalable.
42844294
unsigned EstimatedWidthA = A.Width.getKnownMinValue();
42854295
unsigned EstimatedWidthB = B.Width.getKnownMinValue();
4286-
if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, TTI)) {
4296+
if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
42874297
if (A.Width.isScalable())
42884298
EstimatedWidthA *= *VScale;
42894299
if (B.Width.isScalable())
@@ -4576,13 +4586,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
45764586
InstructionCost C = CM.expectedCost(VF);
45774587
VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
45784588

4579-
unsigned Width = getEstimatedRuntimeVF(OrigLoop, TTI, Candidate.Width);
4589+
unsigned Width =
4590+
getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
45804591
LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
45814592
<< " costs: " << (Candidate.Cost / Width));
45824593
if (VF.isScalable())
45834594
LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4584-
<< getVScaleForTuning(OrigLoop, TTI).value_or(1)
4585-
<< ")");
4595+
<< CM.getVScaleForTuning().value_or(1) << ")");
45864596
LLVM_DEBUG(dbgs() << ".\n");
45874597

45884598
if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
@@ -4671,7 +4681,8 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
46714681
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
46724682
? EpilogueVectorizationMinVF
46734683
: TTI.getEpilogueVectorizationMinVF();
4674-
return getEstimatedRuntimeVF(TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4684+
return getEstimatedRuntimeVF(VF * Multiplier, VScaleForTuning) >=
4685+
MinVFThreshold;
46754686
}
46764687

46774688
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
@@ -4723,8 +4734,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
47234734
// If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
47244735
// the main loop handles 8 lanes per iteration. We could still benefit from
47254736
// vectorizing the epilogue loop with VF=4.
4726-
ElementCount EstimatedRuntimeVF =
4727-
ElementCount::getFixed(getEstimatedRuntimeVF(OrigLoop, TTI, MainLoopVF));
4737+
ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
4738+
getEstimatedRuntimeVF(MainLoopVF, CM.getVScaleForTuning()));
47284739

47294740
ScalarEvolution &SE = *PSE.getSE();
47304741
Type *TCType = Legal->getWidestInductionType();
@@ -4970,7 +4981,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
49704981
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
49714982
}
49724983

4973-
unsigned EstimatedVF = getEstimatedRuntimeVF(TheLoop, TTI, VF);
4984+
unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScaleForTuning);
49744985
unsigned KnownTC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
49754986
if (KnownTC > 0) {
49764987
// At least one iteration must be scalar when this constraint holds. So the
@@ -7399,7 +7410,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
73997410
// Now compute and add the VPlan-based cost.
74007411
Cost += Plan.cost(VF, CostCtx);
74017412
#ifndef NDEBUG
7402-
unsigned EstimatedWidth = getEstimatedRuntimeVF(OrigLoop, CM.TTI, VF);
7413+
unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
74037414
LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
74047415
<< " (Estimated cost per lane: ");
74057416
if (Cost.isValid()) {
@@ -10063,9 +10074,9 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
1006310074

1006410075
static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
1006510076
VectorizationFactor &VF, Loop *L,
10066-
const TargetTransformInfo &TTI,
1006710077
PredicatedScalarEvolution &PSE,
10068-
ScalarEpilogueLowering SEL) {
10078+
ScalarEpilogueLowering SEL,
10079+
std::optional<unsigned> VScale) {
1006910080
InstructionCost CheckCost = Checks.getCost();
1007010081
if (!CheckCost.isValid())
1007110082
return false;
@@ -10115,7 +10126,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
1011510126
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
1011610127
// the computations are performed on doubles, not integers and the result
1011710128
// is rounded up, hence we get an upper estimate of the TC.
10118-
unsigned IntVF = getEstimatedRuntimeVF(L, TTI, VF.Width);
10129+
unsigned IntVF = getEstimatedRuntimeVF(VF.Width, VScale);
1011910130
uint64_t RtC = *CheckCost.getValue();
1012010131
uint64_t Div = ScalarC * IntVF - *VF.Cost.getValue();
1012110132
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(RtC * IntVF, Div);
@@ -10552,7 +10563,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1055210563
bool ForceVectorization =
1055310564
Hints.getForce() == LoopVectorizeHints::FK_Enabled;
1055410565
if (!ForceVectorization &&
10555-
!areRuntimeChecksProfitable(Checks, VF, L, *TTI, PSE, SEL)) {
10566+
!areRuntimeChecksProfitable(Checks, VF, L, PSE, SEL,
10567+
CM.getVScaleForTuning())) {
1055610568
ORE->emit([&]() {
1055710569
return OptimizationRemarkAnalysisAliasing(
1055810570
DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),

0 commit comments

Comments
 (0)