@@ -1519,7 +1519,7 @@ class LoopVectorizationCostModel {
1519
1519
// / \p Multiplier is an additional scaling factor applied to VF before
1520
1520
// / comparing to EpilogueVectorizationMinVF.
1521
1521
bool isEpilogueVectorizationProfitable (const ElementCount VF,
1522
- const unsigned Multiplier ) const ;
1522
+ const unsigned IC ) const ;
1523
1523
1524
1524
// / Returns the execution time cost of an instruction for a given vector
1525
1525
// / width. Vector width of one means scalar.
@@ -4291,6 +4291,21 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4291
4291
return TTI.getVScaleForTuning ();
4292
4292
}
4293
4293
4294
+ // / This function attempts to return a value that represents the vectorization
4295
+ // / factor at runtime. For fixed-width VFs we know this precisely at compile
4296
+ // / time, but for scalable VFs we calculate it based on an estimate of the
4297
+ // / vscale value. The estimate is obtained from getVScaleForTuning(L, TTI); when that returns no value, the known minimum of \p VF is returned unscaled.
4298
+ static unsigned getEstimatedRuntimeVF (const Loop *L,
4299
+ const TargetTransformInfo &TTI,
4300
+ ElementCount VF) {
4301
+ unsigned EstimatedVF = VF.getKnownMinValue (); // Start from the known minimum element count of VF.
4302
+ if (VF.isScalable ())
4303
+ if (std::optional<unsigned > VScale = getVScaleForTuning (L, TTI)) // Scale only when a vscale estimate is available.
4304
+ EstimatedVF *= *VScale;
4305
+ assert (EstimatedVF >= 1 && " Estimated VF shouldn't be less than 1" );
4306
+ return EstimatedVF;
4307
+ }
4308
+
4294
4309
bool LoopVectorizationPlanner::isMoreProfitable (
4295
4310
const VectorizationFactor &A, const VectorizationFactor &B,
4296
4311
const unsigned MaxTripCount) const {
@@ -4593,17 +4608,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4593
4608
InstructionCost C = CM.expectedCost (VF);
4594
4609
VectorizationFactor Candidate (VF, C, ScalarCost.ScalarCost );
4595
4610
4596
- unsigned AssumedMinimumVscale =
4597
- getVScaleForTuning (OrigLoop, TTI).value_or (1 );
4598
- unsigned Width =
4599
- Candidate.Width .isScalable ()
4600
- ? Candidate.Width .getKnownMinValue () * AssumedMinimumVscale
4601
- : Candidate.Width .getFixedValue ();
4611
+ unsigned Width = getEstimatedRuntimeVF (OrigLoop, TTI, Candidate.Width );
4602
4612
LLVM_DEBUG (dbgs () << " LV: Vector loop of width " << VF
4603
4613
<< " costs: " << (Candidate.Cost / Width));
4604
4614
if (VF.isScalable ())
4605
4615
LLVM_DEBUG (dbgs () << " (assuming a minimum vscale of "
4606
- << AssumedMinimumVscale << " )" );
4616
+ << getVScaleForTuning (OrigLoop, TTI).value_or (1 )
4617
+ << " )" );
4607
4618
LLVM_DEBUG (dbgs () << " .\n " );
4608
4619
4609
4620
if (!ForceVectorization && !willGenerateVectors (*P, VF, TTI)) {
@@ -4669,7 +4680,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4669
4680
}
4670
4681
4671
4682
bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable (
4672
- const ElementCount VF, const unsigned Multiplier ) const {
4683
+ const ElementCount VF, const unsigned IC ) const {
4673
4684
// FIXME: We need a much better cost-model to take different parameters such
4674
4685
// as register pressure, code size increase and cost of extra branches into
4675
4686
// account. For now we apply a very crude heuristic and only consider loops
@@ -4684,9 +4695,13 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4684
4695
if (TTI.getMaxInterleaveFactor (VF) <= 1 )
4685
4696
return false ;
4686
4697
4687
- if ((Multiplier * VF.getKnownMinValue ()) >= EpilogueVectorizationMinVF)
4688
- return true ;
4689
- return false ;
4698
+ // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4699
+ // VFs when deciding profitability.
4700
+ // See related "TODO: extend to support scalable VFs." in
4701
+ // selectEpilogueVectorizationFactor.
4702
+ unsigned Multiplier = VF.isFixed () ? IC : 1 ;
4703
+ return getEstimatedRuntimeVF (TheLoop, TTI, VF * Multiplier) >=
4704
+ EpilogueVectorizationMinVF;
4690
4705
}
4691
4706
4692
4707
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor (
@@ -4729,11 +4744,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4729
4744
return Result;
4730
4745
}
4731
4746
4732
- unsigned Multiplier = IC;
4733
- if (MainLoopVF.isScalable ())
4734
- Multiplier = getVScaleForTuning (OrigLoop, TTI).value_or (1 );
4735
-
4736
- if (!CM.isEpilogueVectorizationProfitable (MainLoopVF, Multiplier)) {
4747
+ if (!CM.isEpilogueVectorizationProfitable (MainLoopVF, IC)) {
4737
4748
LLVM_DEBUG (dbgs () << " LEV: Epilogue vectorization is not profitable for "
4738
4749
" this loop\n " );
4739
4750
return Result;
@@ -4742,12 +4753,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4742
4753
// If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4743
4754
// the main loop handles 8 lanes per iteration. We could still benefit from
4744
4755
// vectorizing the epilogue loop with VF=4.
4745
- ElementCount EstimatedRuntimeVF = MainLoopVF;
4746
- if (MainLoopVF.isScalable ()) {
4747
- EstimatedRuntimeVF = ElementCount::getFixed (MainLoopVF.getKnownMinValue ());
4748
- if (std::optional<unsigned > VScale = getVScaleForTuning (OrigLoop, TTI))
4749
- EstimatedRuntimeVF *= *VScale;
4750
- }
4756
+ ElementCount EstimatedRuntimeVF =
4757
+ ElementCount::getFixed (getEstimatedRuntimeVF (OrigLoop, TTI, MainLoopVF));
4751
4758
4752
4759
ScalarEvolution &SE = *PSE.getSE ();
4753
4760
Type *TCType = Legal->getWidestInductionType ();
@@ -4987,13 +4994,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4987
4994
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4988
4995
}
4989
4996
4990
- unsigned EstimatedVF = VF.getKnownMinValue ();
4991
- if (VF.isScalable ()) {
4992
- if (std::optional<unsigned > VScale = getVScaleForTuning (TheLoop, TTI))
4993
- EstimatedVF *= *VScale;
4994
- }
4995
- assert (EstimatedVF >= 1 && " Estimated VF shouldn't be less than 1" );
4996
-
4997
+ unsigned EstimatedVF = getEstimatedRuntimeVF (TheLoop, TTI, VF);
4997
4998
unsigned KnownTC = PSE.getSE ()->getSmallConstantTripCount (TheLoop);
4998
4999
if (KnownTC > 0 ) {
4999
5000
// At least one iteration must be scalar when this constraint holds. So the
@@ -9797,8 +9798,8 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9797
9798
}
9798
9799
9799
9800
static bool areRuntimeChecksProfitable (GeneratedRTChecks &Checks,
9800
- VectorizationFactor &VF,
9801
- std::optional< unsigned > VScale, Loop *L ,
9801
+ VectorizationFactor &VF, Loop *L,
9802
+ const TargetTransformInfo &TTI ,
9802
9803
PredicatedScalarEvolution &PSE,
9803
9804
ScalarEpilogueLowering SEL) {
9804
9805
InstructionCost CheckCost = Checks.getCost ();
@@ -9850,13 +9851,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9850
9851
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9851
9852
// the computations are performed on doubles, not integers and the result
9852
9853
// is rounded up, hence we get an upper estimate of the TC.
9853
- unsigned IntVF = VF.Width .getKnownMinValue ();
9854
- if (VF.Width .isScalable ()) {
9855
- unsigned AssumedMinimumVscale = 1 ;
9856
- if (VScale)
9857
- AssumedMinimumVscale = *VScale;
9858
- IntVF *= AssumedMinimumVscale;
9859
- }
9854
+ unsigned IntVF = getEstimatedRuntimeVF (L, TTI, VF.Width );
9860
9855
uint64_t RtC = *CheckCost.getValue ();
9861
9856
uint64_t Div = ScalarC * IntVF - *VF.Cost .getValue ();
9862
9857
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil (RtC * IntVF, Div);
@@ -10105,8 +10100,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10105
10100
bool ForceVectorization =
10106
10101
Hints.getForce () == LoopVectorizeHints::FK_Enabled;
10107
10102
if (!ForceVectorization &&
10108
- !areRuntimeChecksProfitable (Checks, VF, getVScaleForTuning (L, *TTI), L,
10109
- PSE, SEL)) {
10103
+ !areRuntimeChecksProfitable (Checks, VF, L, *TTI, PSE, SEL)) {
10110
10104
ORE->emit ([&]() {
10111
10105
return OptimizationRemarkAnalysisAliasing (
10112
10106
DEBUG_TYPE, " CantReorderMemOps" , L->getStartLoc (),
0 commit comments