@@ -1520,7 +1520,7 @@ class LoopVectorizationCostModel {
1520
1520
// / \p IC is an additional scaling factor applied to fixed-width VFs before
1521
1521
// / comparing to EpilogueVectorizationMinVF.
1522
1522
bool isEpilogueVectorizationProfitable (const ElementCount VF,
1523
- const unsigned Multiplier ) const ;
1523
+ const unsigned IC ) const ;
1524
1524
1525
1525
// / Returns the execution time cost of an instruction for a given vector
1526
1526
// / width. Vector width of one means scalar.
@@ -4292,6 +4292,21 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4292
4292
return TTI.getVScaleForTuning ();
4293
4293
}
4294
4294
4295
+ // / Returns an estimate of the number of elements one vector iteration
4296
+ // / processes at runtime for \p VF. The starting point is the known minimum
4297
+ // / element count of \p VF. For a scalable \p VF that count is multiplied by
4298
+ // / the value from getVScaleForTuning(L, TTI) when one is available; otherwise, and for every non-scalable VF, the known minimum is returned unchanged.
4299
+ static unsigned getEstimatedRuntimeVF (const Loop *L,
4300
+ const TargetTransformInfo &TTI,
4301
+ ElementCount VF) {
4302
+ unsigned EstimatedVF = VF.getKnownMinValue (); // Baseline: known minimum element count.
4303
+ if (VF.isScalable ())
4304
+ if (std::optional<unsigned > VScale = getVScaleForTuning (L, TTI)) // Scale only when a tuning vscale is reported.
4305
+ EstimatedVF *= *VScale;
4306
+ assert (EstimatedVF >= 1 && " Estimated VF shouldn't be less than 1" );
4307
+ return EstimatedVF;
4308
+ }
4309
+
4295
4310
bool LoopVectorizationPlanner::isMoreProfitable (
4296
4311
const VectorizationFactor &A, const VectorizationFactor &B,
4297
4312
const unsigned MaxTripCount) const {
@@ -4594,17 +4609,13 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4594
4609
InstructionCost C = CM.expectedCost (VF);
4595
4610
VectorizationFactor Candidate (VF, C, ScalarCost.ScalarCost );
4596
4611
4597
- unsigned AssumedMinimumVscale =
4598
- getVScaleForTuning (OrigLoop, TTI).value_or (1 );
4599
- unsigned Width =
4600
- Candidate.Width .isScalable ()
4601
- ? Candidate.Width .getKnownMinValue () * AssumedMinimumVscale
4602
- : Candidate.Width .getFixedValue ();
4612
+ unsigned Width = getEstimatedRuntimeVF (OrigLoop, TTI, Candidate.Width );
4603
4613
LLVM_DEBUG (dbgs () << " LV: Vector loop of width " << VF
4604
4614
<< " costs: " << (Candidate.Cost / Width));
4605
4615
if (VF.isScalable ())
4606
4616
LLVM_DEBUG (dbgs () << " (assuming a minimum vscale of "
4607
- << AssumedMinimumVscale << " )" );
4617
+ << getVScaleForTuning (OrigLoop, TTI).value_or (1 )
4618
+ << " )" );
4608
4619
LLVM_DEBUG (dbgs () << " .\n " );
4609
4620
4610
4621
if (!ForceVectorization && !willGenerateVectors (*P, VF, TTI)) {
@@ -4670,7 +4681,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4670
4681
}
4671
4682
4672
4683
bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable (
4673
- const ElementCount VF, const unsigned Multiplier ) const {
4684
+ const ElementCount VF, const unsigned IC ) const {
4674
4685
// FIXME: We need a much better cost-model to take different parameters such
4675
4686
// as register pressure, code size increase and cost of extra branches into
4676
4687
// account. For now we apply a very crude heuristic and only consider loops
@@ -4685,9 +4696,13 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4685
4696
if (TTI.getMaxInterleaveFactor (VF) <= 1 )
4686
4697
return false ;
4687
4698
4688
- if ((Multiplier * VF.getKnownMinValue ()) >= EpilogueVectorizationMinVF)
4689
- return true ;
4690
- return false ;
4699
+ // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4700
+ // VFs when deciding profitability.
4701
+ // See related "TODO: extend to support scalable VFs." in
4702
+ // selectEpilogueVectorizationFactor.
4703
+ unsigned Multiplier = VF.isFixed () ? IC : 1 ;
4704
+ return getEstimatedRuntimeVF (TheLoop, TTI, VF * Multiplier) >=
4705
+ EpilogueVectorizationMinVF;
4691
4706
}
4692
4707
4693
4708
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor (
@@ -4730,11 +4745,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4730
4745
return Result;
4731
4746
}
4732
4747
4733
- unsigned Multiplier = IC;
4734
- if (MainLoopVF.isScalable ())
4735
- Multiplier = getVScaleForTuning (OrigLoop, TTI).value_or (1 );
4736
-
4737
- if (!CM.isEpilogueVectorizationProfitable (MainLoopVF, Multiplier)) {
4748
+ if (!CM.isEpilogueVectorizationProfitable (MainLoopVF, IC)) {
4738
4749
LLVM_DEBUG (dbgs () << " LEV: Epilogue vectorization is not profitable for "
4739
4750
" this loop\n " );
4740
4751
return Result;
@@ -4743,12 +4754,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4743
4754
// If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
4744
4755
// the main loop handles 8 lanes per iteration. We could still benefit from
4745
4756
// vectorizing the epilogue loop with VF=4.
4746
- ElementCount EstimatedRuntimeVF = MainLoopVF;
4747
- if (MainLoopVF.isScalable ()) {
4748
- EstimatedRuntimeVF = ElementCount::getFixed (MainLoopVF.getKnownMinValue ());
4749
- if (std::optional<unsigned > VScale = getVScaleForTuning (OrigLoop, TTI))
4750
- EstimatedRuntimeVF *= *VScale;
4751
- }
4757
+ ElementCount EstimatedRuntimeVF =
4758
+ ElementCount::getFixed (getEstimatedRuntimeVF (OrigLoop, TTI, MainLoopVF));
4752
4759
4753
4760
ScalarEvolution &SE = *PSE.getSE ();
4754
4761
Type *TCType = Legal->getWidestInductionType ();
@@ -4988,13 +4995,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
4988
4995
MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
4989
4996
}
4990
4997
4991
- unsigned EstimatedVF = VF.getKnownMinValue ();
4992
- if (VF.isScalable ()) {
4993
- if (std::optional<unsigned > VScale = getVScaleForTuning (TheLoop, TTI))
4994
- EstimatedVF *= *VScale;
4995
- }
4996
- assert (EstimatedVF >= 1 && " Estimated VF shouldn't be less than 1" );
4997
-
4998
+ unsigned EstimatedVF = getEstimatedRuntimeVF (TheLoop, TTI, VF);
4998
4999
unsigned KnownTC = PSE.getSE ()->getSmallConstantTripCount (TheLoop);
4999
5000
if (KnownTC > 0 ) {
5000
5001
// At least one iteration must be scalar when this constraint holds. So the
@@ -7426,10 +7427,7 @@ InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
7426
7427
// Now compute and add the VPlan-based cost.
7427
7428
Cost += Plan.cost (VF, CostCtx);
7428
7429
#ifndef NDEBUG
7429
- unsigned EstimatedWidth = VF.getKnownMinValue ();
7430
- if (VF.isScalable ())
7431
- if (std::optional<unsigned > VScale = getVScaleForTuning (OrigLoop, TTI))
7432
- EstimatedWidth *= *VScale;
7430
+ unsigned EstimatedWidth = getEstimatedRuntimeVF (OrigLoop, CM.TTI , VF);
7433
7431
LLVM_DEBUG (dbgs () << " Cost for VF " << VF << " : " << Cost
7434
7432
<< " (Estimated cost per lane: " );
7435
7433
if (Cost.isValid ()) {
@@ -9811,8 +9809,8 @@ static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9811
9809
}
9812
9810
9813
9811
static bool areRuntimeChecksProfitable (GeneratedRTChecks &Checks,
9814
- VectorizationFactor &VF,
9815
- std::optional< unsigned > VScale, Loop *L ,
9812
+ VectorizationFactor &VF, Loop *L,
9813
+ const TargetTransformInfo &TTI ,
9816
9814
PredicatedScalarEvolution &PSE,
9817
9815
ScalarEpilogueLowering SEL) {
9818
9816
InstructionCost CheckCost = Checks.getCost ();
@@ -9864,13 +9862,7 @@ static bool areRuntimeChecksProfitable(GeneratedRTChecks &Checks,
9864
9862
// For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9865
9863
// the computations are performed on doubles, not integers and the result
9866
9864
// is rounded up, hence we get an upper estimate of the TC.
9867
- unsigned IntVF = VF.Width .getKnownMinValue ();
9868
- if (VF.Width .isScalable ()) {
9869
- unsigned AssumedMinimumVscale = 1 ;
9870
- if (VScale)
9871
- AssumedMinimumVscale = *VScale;
9872
- IntVF *= AssumedMinimumVscale;
9873
- }
9865
+ unsigned IntVF = getEstimatedRuntimeVF (L, TTI, VF.Width );
9874
9866
uint64_t RtC = *CheckCost.getValue ();
9875
9867
uint64_t Div = ScalarC * IntVF - *VF.Cost .getValue ();
9876
9868
uint64_t MinTC1 = Div == 0 ? 0 : divideCeil (RtC * IntVF, Div);
@@ -10119,8 +10111,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10119
10111
bool ForceVectorization =
10120
10112
Hints.getForce () == LoopVectorizeHints::FK_Enabled;
10121
10113
if (!ForceVectorization &&
10122
- !areRuntimeChecksProfitable (Checks, VF, getVScaleForTuning (L, *TTI), L,
10123
- PSE, SEL)) {
10114
+ !areRuntimeChecksProfitable (Checks, VF, L, *TTI, PSE, SEL)) {
10124
10115
ORE->emit ([&]() {
10125
10116
return OptimizationRemarkAnalysisAliasing (
10126
10117
DEBUG_TYPE, " CantReorderMemOps" , L->getStartLoc (),
0 commit comments