@@ -1516,7 +1516,10 @@ class LoopVectorizationCostModel {
1516
1516
// / Returns true if epilogue vectorization is considered profitable, and
1517
1517
// / false otherwise.
1518
1518
// / \p VF is the vectorization factor chosen for the original loop.
1519
- bool isEpilogueVectorizationProfitable (const ElementCount VF) const ;
1519
+ // / \p Multiplier is an additional scaling factor applied to VF before
1520
+ // / comparing to EpilogueVectorizationMinVF.
1521
+ bool isEpilogueVectorizationProfitable (const ElementCount VF,
1522
+ const unsigned Multiplier) const ;
1520
1523
1521
1524
// / Returns the execution time cost of an instruction for a given vector
1522
1525
// / width. Vector width of one means scalar.
@@ -4289,12 +4292,11 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4289
4292
}
4290
4293
4291
4294
bool LoopVectorizationPlanner::isMoreProfitable (
4292
- const VectorizationFactor &A, const VectorizationFactor &B) const {
4295
+ const VectorizationFactor &A, const VectorizationFactor &B,
4296
+ const unsigned MaxTripCount) const {
4293
4297
InstructionCost CostA = A.Cost ;
4294
4298
InstructionCost CostB = B.Cost ;
4295
4299
4296
- unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount ();
4297
-
4298
4300
// Improve estimate for the vector width if it is scalable.
4299
4301
unsigned EstimatedWidthA = A.Width .getKnownMinValue ();
4300
4302
unsigned EstimatedWidthB = B.Width .getKnownMinValue ();
@@ -4343,6 +4345,12 @@ bool LoopVectorizationPlanner::isMoreProfitable(
4343
4345
return CmpFn (RTCostA, RTCostB);
4344
4346
}
4345
4347
4348
+ bool LoopVectorizationPlanner::isMoreProfitable (
4349
+ const VectorizationFactor &A, const VectorizationFactor &B) const {
4350
+ const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount ();
4351
+ return LoopVectorizationPlanner::isMoreProfitable (A, B, MaxTripCount);
4352
+ }
4353
+
4346
4354
void LoopVectorizationPlanner::emitInvalidCostRemarks (
4347
4355
OptimizationRemarkEmitter *ORE) {
4348
4356
using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
@@ -4661,7 +4669,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4661
4669
}
4662
4670
4663
4671
bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable (
4664
- const ElementCount VF) const {
4672
+ const ElementCount VF, const unsigned Multiplier ) const {
4665
4673
// FIXME: We need a much better cost-model to take different parameters such
4666
4674
// as register pressure, code size increase and cost of extra branches into
4667
4675
// account. For now we apply a very crude heuristic and only consider loops
@@ -4676,9 +4684,6 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4676
4684
if (TTI.getMaxInterleaveFactor (VF) <= 1 )
4677
4685
return false ;
4678
4686
4679
- unsigned Multiplier = 1 ;
4680
- if (VF.isScalable ())
4681
- Multiplier = getVScaleForTuning (TheLoop, TTI).value_or (1 );
4682
4687
if ((Multiplier * VF.getKnownMinValue ()) >= EpilogueVectorizationMinVF)
4683
4688
return true ;
4684
4689
return false ;
@@ -4724,7 +4729,11 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4724
4729
return Result;
4725
4730
}
4726
4731
4727
- if (!CM.isEpilogueVectorizationProfitable (MainLoopVF)) {
4732
+ unsigned Multiplier = IC;
4733
+ if (MainLoopVF.isScalable ())
4734
+ Multiplier = getVScaleForTuning (OrigLoop, TTI).value_or (1 );
4735
+
4736
+ if (!CM.isEpilogueVectorizationProfitable (MainLoopVF, Multiplier)) {
4728
4737
LLVM_DEBUG (dbgs () << " LEV: Epilogue vectorization is not profitable for "
4729
4738
" this loop\n " );
4730
4739
return Result;
@@ -4743,16 +4752,20 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4743
4752
ScalarEvolution &SE = *PSE.getSE ();
4744
4753
Type *TCType = Legal->getWidestInductionType ();
4745
4754
const SCEV *RemainingIterations = nullptr ;
4755
+ unsigned MaxTripCount = 0 ;
4746
4756
for (auto &NextVF : ProfitableVFs) {
4747
4757
// Skip candidate VFs without a corresponding VPlan.
4748
4758
if (!hasPlanWithVF (NextVF.Width ))
4749
4759
continue ;
4750
4760
4751
- // Skip candidate VFs with widths >= the estimate runtime VF (scalable
4752
- // vectors) or the VF of the main loop (fixed vectors).
4761
+ // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4762
+ // vectors) or > the VF of the main loop (fixed vectors).
4753
4763
if ((!NextVF.Width .isScalable () && MainLoopVF.isScalable () &&
4754
4764
ElementCount::isKnownGE (NextVF.Width , EstimatedRuntimeVF)) ||
4755
- ElementCount::isKnownGE (NextVF.Width , MainLoopVF))
4765
+ (NextVF.Width .isScalable () &&
4766
+ ElementCount::isKnownGE (NextVF.Width , MainLoopVF)) ||
4767
+ (!NextVF.Width .isScalable () && !MainLoopVF.isScalable () &&
4768
+ ElementCount::isKnownGT (NextVF.Width , MainLoopVF)))
4756
4769
continue ;
4757
4770
4758
4771
// If NextVF is greater than the number of remaining iterations, the
@@ -4766,6 +4779,14 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4766
4779
" Trip count SCEV must be computable" );
4767
4780
RemainingIterations = SE.getURemExpr (
4768
4781
TC, SE.getConstant (TCType, MainLoopVF.getKnownMinValue () * IC));
4782
+ MaxTripCount = MainLoopVF.getKnownMinValue () * IC - 1 ;
4783
+ if (SE.isKnownPredicate (CmpInst::ICMP_ULT, RemainingIterations,
4784
+ SE.getConstant (TCType, MaxTripCount))) {
4785
+ MaxTripCount =
4786
+ SE.getUnsignedRangeMax (RemainingIterations).getZExtValue ();
4787
+ }
4788
+ LLVM_DEBUG (dbgs () << " LEV: Maximum Trip Count for Epilogue: "
4789
+ << MaxTripCount << " \n " );
4769
4790
}
4770
4791
if (SE.isKnownPredicate (
4771
4792
CmpInst::ICMP_UGT,
@@ -4774,7 +4795,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4774
4795
continue ;
4775
4796
}
4776
4797
4777
- if (Result.Width .isScalar () || isMoreProfitable (NextVF, Result))
4798
+ if (Result.Width .isScalar () ||
4799
+ isMoreProfitable (NextVF, Result, MaxTripCount))
4778
4800
Result = NextVF;
4779
4801
}
4780
4802
0 commit comments