@@ -989,7 +989,10 @@ class LoopVectorizationCostModel {
989
989
InterleavedAccessInfo &IAI)
990
990
: ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
991
991
TTI (TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
992
- Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {}
992
+ Hints(Hints), InterleaveInfo(IAI), CostKind(TTI::TCK_RecipThroughput) {
993
+ if (TTI.supportsScalableVectors () || ForceTargetSupportsScalableVectors)
994
+ initializeVScaleForTuning ();
995
+ }
993
996
994
997
// / \return An upper bound for the vectorization factors (both fixed and
995
998
// / scalable). If the factors are 0, vectorization and interleaving should be
@@ -1565,9 +1568,34 @@ class LoopVectorizationCostModel {
1565
1568
// / trivially hoistable.
1566
1569
bool shouldConsiderInvariant (Value *Op);
1567
1570
1571
+ /// \return The stored value of vscale used for tuning the cost model (VScaleForTuning); empty if no value has been stored.
1572
+ std::optional<unsigned > getVScaleForTuning () const { return VScaleForTuning; }
1573
+
1568
1574
private:
1569
1575
unsigned NumPredStores = 0 ;
1570
1576
1577
+ /// The value of vscale used for tuning the cost model. The constructor sets it
1578
+ /// only when TTI.supportsScalableVectors() or ForceTargetSupportsScalableVectors holds; otherwise it stays empty.
1579
+ std::optional<unsigned > VScaleForTuning;
1580
+
1581
+ /// Sets VScaleForTuning, the vscale used for tuning the cost model: stores
1582
+ /// vscale_range.max when the function's vscale_range has a max equal to its
1583
+ /// min, otherwise stores whatever TTI.getVScaleForTuning() returns.
1584
+ void initializeVScaleForTuning () {
1585
+ const Function *Fn = TheLoop->getHeader ()->getParent (); // Function containing the loop header.
1586
+ if (Fn->hasFnAttribute (Attribute::VScaleRange)) {
1587
+ auto Attr = Fn->getFnAttribute (Attribute::VScaleRange);
1588
+ auto Min = Attr.getVScaleRangeMin ();
1589
+ auto Max = Attr.getVScaleRangeMax ();
1590
+ if (Max && Min == Max) { // Max is tested for presence first; only a range with min == max is used.
1591
+ VScaleForTuning = Max;
1592
+ return ;
1593
+ }
1594
+ }
1595
+
1596
+ VScaleForTuning = TTI.getVScaleForTuning (); // No usable vscale_range: fall back to the TTI value.
1597
+ }
1598
+
1571
1599
// / \return An upper bound for the vectorization factors for both
1572
1600
// / fixed and scalable vectorization, where the minimum-known number of
1573
1601
// / elements is a power-of-2 larger than zero. If scalable vectorization is
@@ -4242,33 +4270,15 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4242
4270
return MaxVF;
4243
4271
}
4244
4272
4245
- // / Convenience function that returns the value of vscale_range iff
4246
- // / vscale_range.min == vscale_range.max or otherwise returns the value
4247
- // / returned by the corresponding TTI method.
4248
- static std::optional<unsigned >
4249
- getVScaleForTuning (const Loop *L, const TargetTransformInfo &TTI) {
4250
- const Function *Fn = L->getHeader ()->getParent ();
4251
- if (Fn->hasFnAttribute (Attribute::VScaleRange)) {
4252
- auto Attr = Fn->getFnAttribute (Attribute::VScaleRange);
4253
- auto Min = Attr.getVScaleRangeMin ();
4254
- auto Max = Attr.getVScaleRangeMax ();
4255
- if (Max && Min == Max)
4256
- return Max;
4257
- }
4258
-
4259
- return TTI.getVScaleForTuning ();
4260
- }
4261
-
4262
4273
// / This function attempts to return a value that represents the vectorization
4263
4274
// / factor at runtime. For fixed-width VFs we know this precisely at compile
4264
4275
// / time, but for scalable VFs we calculate it based on an estimate of the
4265
4276
// / vscale value.
4266
- static unsigned getEstimatedRuntimeVF (const Loop *L,
4267
- const TargetTransformInfo &TTI,
4268
- ElementCount VF) {
4277
+ static unsigned getEstimatedRuntimeVF (ElementCount VF,
4278
+ std::optional<unsigned > VScale) {
4269
4279
unsigned EstimatedVF = VF.getKnownMinValue ();
4270
4280
if (VF.isScalable ())
4271
- if (std::optional< unsigned > VScale = getVScaleForTuning (L, TTI) )
4281
+ if (VScale)
4272
4282
EstimatedVF *= *VScale;
4273
4283
assert (EstimatedVF >= 1 && " Estimated VF shouldn't be less than 1" );
4274
4284
return EstimatedVF;
@@ -4283,7 +4293,7 @@ bool LoopVectorizationPlanner::isMoreProfitable(
4283
4293
// Improve estimate for the vector width if it is scalable.
4284
4294
unsigned EstimatedWidthA = A.Width .getKnownMinValue ();
4285
4295
unsigned EstimatedWidthB = B.Width .getKnownMinValue ();
4286
- if (std::optional<unsigned > VScale = getVScaleForTuning (OrigLoop, TTI )) {
4296
+ if (std::optional<unsigned > VScale = CM. getVScaleForTuning ()) {
4287
4297
if (A.Width .isScalable ())
4288
4298
EstimatedWidthA *= *VScale;
4289
4299
if (B.Width .isScalable ())
@@ -4576,13 +4586,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4576
4586
InstructionCost C = CM.expectedCost (VF);
4577
4587
VectorizationFactor Candidate (VF, C, ScalarCost.ScalarCost );
4578
4588
4579
- unsigned Width = getEstimatedRuntimeVF (OrigLoop, TTI, Candidate.Width );
4589
+ unsigned Width =
4590
+ getEstimatedRuntimeVF (Candidate.Width , CM.getVScaleForTuning ());
4580
4591
LLVM_DEBUG (dbgs () << " LV: Vector loop of width " << VF
4581
4592
<< " costs: " << (Candidate.Cost / Width));
4582
4593
if (VF.isScalable ())
4583
4594
LLVM_DEBUG (dbgs () << " (assuming a minimum vscale of "
4584
- << getVScaleForTuning (OrigLoop, TTI).value_or (1 )
4585
- << " )" );
4595
+ << CM.getVScaleForTuning ().value_or (1 ) << " )" );
4586
4596
LLVM_DEBUG (dbgs () << " .\n " );
4587
4597
4588
4598
if (!ForceVectorization && !willGenerateVectors (*P, VF, TTI)) {
@@ -4671,7 +4681,8 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4671
4681
unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences () > 0
4672
4682
? EpilogueVectorizationMinVF
4673
4683
: TTI.getEpilogueVectorizationMinVF ();
4674
- return getEstimatedRuntimeVF (TheLoop, TTI, VF * Multiplier) >= MinVFThreshold;
4684
+ return getEstimatedRuntimeVF (VF * Multiplier, VScaleForTuning) >=
4685
+ MinVFThreshold;
4675
4686
}
4676
4687
4677
4688
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor (
@@ -4723,8 +4734,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4723
4734
// If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4724
4735
// the main loop handles 8 lanes per iteration. We could still benefit from
4725
4736
// vectorizing the epilogue loop with VF=4.
4726
- ElementCount EstimatedRuntimeVF =
4727
- ElementCount::getFixed ( getEstimatedRuntimeVF (OrigLoop, TTI, MainLoopVF ));
4737
+ ElementCount EstimatedRuntimeVF = ElementCount::getFixed (
4738
+ getEstimatedRuntimeVF (MainLoopVF, CM. getVScaleForTuning () ));
4728
4739
4729
4740
ScalarEvolution &SE = *PSE.getSE ();
4730
4741
Type *TCType = Legal->getWidestInductionType ();
@@ -4970,7 +4981,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4970
4981
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4971
4982
}
4972
4983
4973
- unsigned EstimatedVF = getEstimatedRuntimeVF (TheLoop, TTI, VF );
4984
+ unsigned EstimatedVF = getEstimatedRuntimeVF (VF, VScaleForTuning );
4974
4985
unsigned KnownTC = PSE.getSE ()->getSmallConstantTripCount (TheLoop);
4975
4986
if (KnownTC > 0 ) {
4976
4987
// At least one iteration must be scalar when this constraint holds. So the
@@ -7399,7 +7410,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7399
7410
// Now compute and add the VPlan-based cost.
7400
7411
Cost += Plan.cost (VF, CostCtx);
7401
7412
#ifndef NDEBUG
7402
- unsigned EstimatedWidth = getEstimatedRuntimeVF (OrigLoop , CM.TTI , VF );
7413
+ unsigned EstimatedWidth = getEstimatedRuntimeVF (VF , CM.getVScaleForTuning () );
7403
7414
LLVM_DEBUG (dbgs () << " Cost for VF " << VF << " : " << Cost
7404
7415
<< " (Estimated cost per lane: " );
7405
7416
if (Cost.isValid ()) {
@@ -10063,9 +10074,9 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
10063
10074
10064
10075
static bool areRuntimeChecksProfitable (GeneratedRTChecks &Checks,
10065
10076
VectorizationFactor &VF, Loop *L,
10066
- const TargetTransformInfo &TTI,
10067
10077
PredicatedScalarEvolution &PSE,
10068
- ScalarEpilogueLowering SEL) {
10078
+ ScalarEpilogueLowering SEL,
10079
+ std::optional<unsigned > VScale) {
10069
10080
InstructionCost CheckCost = Checks.getCost ();
10070
10081
if (!CheckCost.isValid ())
10071
10082
return false ;
@@ -10115,7 +10126,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
10115
10126
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
10116
10127
// the computations are performed on doubles, not integers and the result
10117
10128
// is rounded up, hence we get an upper estimate of the TC.
10118
- unsigned IntVF = getEstimatedRuntimeVF (L, TTI, VF.Width );
10129
+ unsigned IntVF = getEstimatedRuntimeVF (VF.Width , VScale );
10119
10130
uint64_t RtC = *CheckCost.getValue ();
10120
10131
uint64_t Div = ScalarC * IntVF - *VF.Cost .getValue ();
10121
10132
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil (RtC * IntVF, Div);
@@ -10552,7 +10563,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10552
10563
bool ForceVectorization =
10553
10564
Hints.getForce () == LoopVectorizeHints::FK_Enabled;
10554
10565
if (!ForceVectorization &&
10555
- !areRuntimeChecksProfitable (Checks, VF, L, *TTI, PSE, SEL)) {
10566
+ !areRuntimeChecksProfitable (Checks, VF, L, PSE, SEL,
10567
+ CM.getVScaleForTuning ())) {
10556
10568
ORE->emit ([&]() {
10557
10569
return OptimizationRemarkAnalysisAliasing (
10558
10570
DEBUG_TYPE, " CantReorderMemOps" , L->getStartLoc (),
0 commit comments