@@ -1509,7 +1509,10 @@ class LoopVectorizationCostModel {
1509
1509
// / Returns true if epilogue vectorization is considered profitable, and
1510
1510
// / false otherwise.
1511
1511
// / \p VF is the vectorization factor chosen for the original loop.
1512
- bool isEpilogueVectorizationProfitable (const ElementCount VF) const ;
1512
+ // / \p Multiplier is an additional scaling factor applied to VF before
1513
+ // / comparing to EpilogueVectorizationMinVF.
1514
+ bool isEpilogueVectorizationProfitable (const ElementCount VF,
1515
+ const unsigned Multiplier) const ;
1513
1516
1514
1517
// / Returns the execution time cost of an instruction for a given vector
1515
1518
// / width. Vector width of one means scalar.
@@ -4257,12 +4260,11 @@ getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
4257
4260
}
4258
4261
4259
4262
bool LoopVectorizationPlanner::isMoreProfitable (
4260
- const VectorizationFactor &A, const VectorizationFactor &B) const {
4263
+ const VectorizationFactor &A, const VectorizationFactor &B,
4264
+ const unsigned MaxTripCount) const {
4261
4265
InstructionCost CostA = A.Cost ;
4262
4266
InstructionCost CostB = B.Cost ;
4263
4267
4264
- unsigned MaxTripCount = PSE.getSE ()->getSmallConstantMaxTripCount (OrigLoop);
4265
-
4266
4268
// Improve estimate for the vector width if it is scalable.
4267
4269
unsigned EstimatedWidthA = A.Width .getKnownMinValue ();
4268
4270
unsigned EstimatedWidthB = B.Width .getKnownMinValue ();
@@ -4311,6 +4313,13 @@ bool LoopVectorizationPlanner::isMoreProfitable(
4311
4313
return CmpFn (RTCostA, RTCostB);
4312
4314
}
4313
4315
4316
// Two-argument overload: compares candidate factors A and B using the small
// constant maximum trip count that ScalarEvolution reports for OrigLoop, and
// returns whatever the three-argument overload returns for that bound.
+ bool LoopVectorizationPlanner::isMoreProfitable (
4317
+ const VectorizationFactor &A, const VectorizationFactor &B) const {
4318
// Query the trip-count bound here so callers without their own bound can
// use this overload unchanged.
+ const unsigned MaxTripCount =
4319
+ PSE.getSE ()->getSmallConstantMaxTripCount (OrigLoop);
4320
// Delegate the actual cost comparison to the overload taking an explicit
// MaxTripCount.
+ return LoopVectorizationPlanner::isMoreProfitable (A, B, MaxTripCount);
4321
+ }
4322
+
4314
4323
void LoopVectorizationPlanner::emitInvalidCostRemarks (
4315
4324
OptimizationRemarkEmitter *ORE) {
4316
4325
using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
@@ -4620,7 +4629,7 @@ bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4620
4629
}
4621
4630
4622
4631
bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable (
4623
- const ElementCount VF) const {
4632
+ const ElementCount VF, const unsigned Multiplier ) const {
4624
4633
// FIXME: We need a much better cost-model to take different parameters such
4625
4634
// as register pressure, code size increase and cost of extra branches into
4626
4635
// account. For now we apply a very crude heuristic and only consider loops
@@ -4635,9 +4644,6 @@ bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4635
4644
if (TTI.getMaxInterleaveFactor (VF) <= 1 )
4636
4645
return false ;
4637
4646
4638
- unsigned Multiplier = 1 ;
4639
- if (VF.isScalable ())
4640
- Multiplier = getVScaleForTuning (TheLoop, TTI).value_or (1 );
4641
4647
if ((Multiplier * VF.getKnownMinValue ()) >= EpilogueVectorizationMinVF)
4642
4648
return true ;
4643
4649
return false ;
@@ -4683,7 +4689,11 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4683
4689
return Result;
4684
4690
}
4685
4691
4686
- if (!CM.isEpilogueVectorizationProfitable (MainLoopVF)) {
4692
+ unsigned Multiplier = IC;
4693
+ if (MainLoopVF.isScalable ())
4694
+ Multiplier = getVScaleForTuning (OrigLoop, TTI).value_or (1 );
4695
+
4696
+ if (!CM.isEpilogueVectorizationProfitable (MainLoopVF, Multiplier)) {
4687
4697
LLVM_DEBUG (dbgs () << " LEV: Epilogue vectorization is not profitable for "
4688
4698
" this loop\n " );
4689
4699
return Result;
@@ -4702,16 +4712,20 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4702
4712
ScalarEvolution &SE = *PSE.getSE ();
4703
4713
Type *TCType = Legal->getWidestInductionType ();
4704
4714
const SCEV *RemainingIterations = nullptr ;
4715
+ unsigned MaxTripCount = 0 ;
4705
4716
for (auto &NextVF : ProfitableVFs) {
4706
4717
// Skip candidate VFs without a corresponding VPlan.
4707
4718
if (!hasPlanWithVF (NextVF.Width ))
4708
4719
continue ;
4709
4720
4710
- // Skip candidate VFs with widths >= the estimate runtime VF (scalable
4711
- // vectors) or the VF of the main loop (fixed vectors).
4721
+ // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
4722
+ // vectors) or > the VF of the main loop (fixed vectors).
4712
4723
if ((!NextVF.Width .isScalable () && MainLoopVF.isScalable () &&
4713
4724
ElementCount::isKnownGE (NextVF.Width , EstimatedRuntimeVF)) ||
4714
- ElementCount::isKnownGE (NextVF.Width , MainLoopVF))
4725
+ (NextVF.Width .isScalable () &&
4726
+ ElementCount::isKnownGE (NextVF.Width , MainLoopVF)) ||
4727
+ (!NextVF.Width .isScalable () && !MainLoopVF.isScalable () &&
4728
+ ElementCount::isKnownGT (NextVF.Width , MainLoopVF)))
4715
4729
continue ;
4716
4730
4717
4731
// If NextVF is greater than the number of remaining iterations, the
@@ -4725,6 +4739,14 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4725
4739
" Trip count SCEV must be computable" );
4726
4740
RemainingIterations = SE.getURemExpr (
4727
4741
TC, SE.getConstant (TCType, MainLoopVF.getKnownMinValue () * IC));
4742
+ const APInt MaxRemainingIterations =
4743
+ SE.getUnsignedRangeMax (RemainingIterations);
4744
+ // Guard against huge trip counts.
4745
+ if (MaxRemainingIterations.getActiveBits () <= 32 ) {
4746
+ MaxTripCount = MaxRemainingIterations.getZExtValue ();
4747
+ LLVM_DEBUG (dbgs () << " LEV: Maximum Trip Count for Epilogue: "
4748
+ << MaxTripCount << " \n " );
4749
+ }
4728
4750
}
4729
4751
if (SE.isKnownPredicate (
4730
4752
CmpInst::ICMP_UGT,
@@ -4733,7 +4755,8 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
4733
4755
continue ;
4734
4756
}
4735
4757
4736
- if (Result.Width .isScalar () || isMoreProfitable (NextVF, Result))
4758
+ if (Result.Width .isScalar () ||
4759
+ isMoreProfitable (NextVF, Result, MaxTripCount))
4737
4760
Result = NextVF;
4738
4761
}
4739
4762
0 commit comments