@@ -836,6 +836,14 @@ void InnerLoopVectorizer::addMetadata(ArrayRef<Value *> To,
836
836
837
837
namespace llvm {
838
838
839
+ // Loop vectorization cost-model hint describing how the scalar epilogue loop should be
840
+ // lowered.
841
+ enum ScalarEpilogueLowering {
842
+ CM_ScalarEpilogueAllowed,
843
+ CM_ScalarEpilogueNotAllowedOptSize,
844
+ CM_ScalarEpilogueNotAllowedLowTripLoop
845
+ };
846
+
839
847
// / LoopVectorizationCostModel - estimates the expected speedups due to
840
848
// / vectorization.
841
849
// / In many cases vectorization is not profitable. This can happen because of
@@ -845,20 +853,22 @@ namespace llvm {
845
853
// / different operations.
846
854
class LoopVectorizationCostModel {
847
855
public:
848
- LoopVectorizationCostModel (Loop *L, PredicatedScalarEvolution &PSE,
856
+ LoopVectorizationCostModel (ScalarEpilogueLowering SEL, Loop *L,
857
+ PredicatedScalarEvolution &PSE,
849
858
LoopInfo *LI, LoopVectorizationLegality *Legal,
850
859
const TargetTransformInfo &TTI,
851
860
const TargetLibraryInfo *TLI, DemandedBits *DB,
852
861
AssumptionCache *AC,
853
862
OptimizationRemarkEmitter *ORE, const Function *F,
854
863
const LoopVectorizeHints *Hints,
855
864
InterleavedAccessInfo &IAI)
856
- : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
857
- AC (AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
865
+ : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE),
866
+ LI (LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE),
867
+ TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}
858
868
859
869
// / \return An upper bound for the vectorization factor, or None if
860
870
// / vectorization and interleaving should be avoided up front.
861
- Optional<unsigned > computeMaxVF (bool OptForSize );
871
+ Optional<unsigned > computeMaxVF ();
862
872
863
873
// / \return The most profitable vectorization factor and the cost of that VF.
864
874
// / This method checks every power of two up to MaxVF. If UserVF is not ZERO
@@ -881,8 +891,7 @@ class LoopVectorizationCostModel {
881
891
// / If interleave count has been specified by metadata it will be returned.
882
892
// / Otherwise, the interleave count is computed and returned. VF and LoopCost
883
893
// / are the selected vectorization factor and the cost of the selected VF.
884
- unsigned selectInterleaveCount (bool OptForSize, unsigned VF,
885
- unsigned LoopCost);
894
+ unsigned selectInterleaveCount (unsigned VF, unsigned LoopCost);
886
895
887
896
// / Memory access instruction may be vectorized in more than one way.
888
897
// / Form of instruction after vectorization depends on cost.
@@ -1157,11 +1166,14 @@ class LoopVectorizationCostModel {
1157
1166
// / to handle accesses with gaps, and there is nothing preventing us from
1158
1167
// / creating a scalar epilogue.
1159
1168
bool requiresScalarEpilogue () const {
1160
- return IsScalarEpilogueAllowed && InterleaveInfo.requiresScalarEpilogue ();
1169
+ return isScalarEpilogueAllowed () && InterleaveInfo.requiresScalarEpilogue ();
1161
1170
}
1162
1171
1163
- // / Returns true if a scalar epilogue is not allowed due to optsize.
1164
- bool isScalarEpilogueAllowed () const { return IsScalarEpilogueAllowed; }
1172
+ // / Returns true if a scalar epilogue is allowed, i.e. it has not been
1173
+ // / disallowed because of optsize or a low trip count.
1174
+ bool isScalarEpilogueAllowed () const {
1175
+ return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
1176
+ }
1165
1177
1166
1178
// / Returns true if all loop blocks should be masked to fold tail loop.
1167
1179
bool foldTailByMasking () const { return FoldTailByMasking; }
@@ -1187,7 +1199,7 @@ class LoopVectorizationCostModel {
1187
1199
1188
1200
// / \return An upper bound for the vectorization factor, larger than zero.
1189
1201
// / One is returned if vectorization should best be avoided due to cost.
1190
- unsigned computeFeasibleMaxVF (bool OptForSize, unsigned ConstTripCount);
1202
+ unsigned computeFeasibleMaxVF (unsigned ConstTripCount);
1191
1203
1192
1204
// / The vectorization cost is a combination of the cost itself and a boolean
1193
1205
// / indicating whether any of the contributing operations will actually
@@ -1270,13 +1282,13 @@ class LoopVectorizationCostModel {
1270
1282
SmallPtrSet<BasicBlock *, 4 > PredicatedBBsAfterVectorization;
1271
1283
1272
1284
// / Records whether it is allowed to have the original scalar loop execute at
1273
- // / least once. This may be needed as a fallback loop in case runtime
1285
+ // / least once. This may be needed as a fallback loop in case runtime
1274
1286
// / aliasing/dependence checks fail, or to handle the tail/remainder
1275
1287
// / iterations when the trip count is unknown or doesn't divide by the VF,
1276
1288
// / or as a peel-loop to handle gaps in interleave-groups.
1277
1289
// / Under optsize and when the trip count is very small we don't allow any
1278
1290
// / iterations to execute in the scalar loop.
1279
- bool IsScalarEpilogueAllowed = true ;
1291
+ ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed ;
1280
1292
1281
1293
// / All blocks of loop are to be masked to fold tail of scalar iterations.
1282
1294
bool FoldTailByMasking = false ;
@@ -4452,10 +4464,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
4452
4464
// Check if masking is required.
4453
4465
// A Group may need masking for one of two reasons: it resides in a block that
4454
4466
// needs predication, or it was decided to use masking to deal with gaps.
4455
- bool PredicatedAccessRequiresMasking =
4467
+ bool PredicatedAccessRequiresMasking =
4456
4468
Legal->blockNeedsPredication (I->getParent ()) && Legal->isMaskRequired (I);
4457
- bool AccessWithGapsRequiresMasking =
4458
- Group->requiresScalarEpilogue () && !IsScalarEpilogueAllowed ;
4469
+ bool AccessWithGapsRequiresMasking =
4470
+ Group->requiresScalarEpilogue () && !isScalarEpilogueAllowed () ;
4459
4471
if (!PredicatedAccessRequiresMasking && !AccessWithGapsRequiresMasking)
4460
4472
return true ;
4461
4473
@@ -4675,7 +4687,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
4675
4687
Uniforms[VF].insert (Worklist.begin (), Worklist.end ());
4676
4688
}
4677
4689
4678
- Optional<unsigned > LoopVectorizationCostModel::computeMaxVF (bool OptForSize ) {
4690
+ Optional<unsigned > LoopVectorizationCostModel::computeMaxVF () {
4679
4691
if (Legal->getRuntimePointerChecking ()->Need && TTI.hasBranchDivergence ()) {
4680
4692
// TODO: It may be useful to do since it's still likely to be dynamically
4681
4693
// uniform if the target can skip.
@@ -4690,8 +4702,11 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
4690
4702
}
4691
4703
4692
4704
unsigned TC = PSE.getSE ()->getSmallConstantTripCount (TheLoop);
4693
- if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
4694
- return computeFeasibleMaxVF (OptForSize, TC);
4705
+ if (isScalarEpilogueAllowed ())
4706
+ return computeFeasibleMaxVF (TC);
4707
+
4708
+ LLVM_DEBUG (dbgs () << " LV: Not allowing scalar epilogue.\n " <<
4709
+ " LV: Performing code size checks.\n " );
4695
4710
4696
4711
if (Legal->getRuntimePointerChecking ()->Need ) {
4697
4712
ORE->emit (createMissedAnalysis (" CantVersionLoopWithOptForSize" )
@@ -4740,15 +4755,13 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
4740
4755
// Record that scalar epilogue is not allowed.
4741
4756
LLVM_DEBUG (dbgs () << " LV: Not allowing scalar epilogue due to -Os/-Oz.\n " );
4742
4757
4743
- IsScalarEpilogueAllowed = !OptForSize;
4744
-
4745
4758
// We don't create an epilogue when optimizing for size.
4746
4759
// Invalidate interleave groups that require an epilogue if we can't mask
4747
4760
// the interleave-group.
4748
- if (!useMaskedInterleavedAccesses (TTI))
4761
+ if (!useMaskedInterleavedAccesses (TTI))
4749
4762
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue ();
4750
4763
4751
- unsigned MaxVF = computeFeasibleMaxVF (OptForSize, TC);
4764
+ unsigned MaxVF = computeFeasibleMaxVF (TC);
4752
4765
4753
4766
if (TC > 0 && TC % MaxVF == 0 ) {
4754
4767
LLVM_DEBUG (dbgs () << " LV: No tail will remain for any chosen VF.\n " );
@@ -4779,8 +4792,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
4779
4792
}
4780
4793
4781
4794
unsigned
4782
- LoopVectorizationCostModel::computeFeasibleMaxVF (bool OptForSize,
4783
- unsigned ConstTripCount) {
4795
+ LoopVectorizationCostModel::computeFeasibleMaxVF (unsigned ConstTripCount) {
4784
4796
MinBWs = computeMinimumValueSizes (TheLoop->getBlocks (), *DB, &TTI);
4785
4797
unsigned SmallestType, WidestType;
4786
4798
std::tie (SmallestType, WidestType) = getSmallestAndWidestTypes ();
@@ -4818,8 +4830,8 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize,
4818
4830
}
4819
4831
4820
4832
unsigned MaxVF = MaxVectorSize;
4821
- if (TTI.shouldMaximizeVectorBandwidth (OptForSize ) ||
4822
- (MaximizeBandwidth && !OptForSize )) {
4833
+ if (TTI.shouldMaximizeVectorBandwidth (! isScalarEpilogueAllowed () ) ||
4834
+ (MaximizeBandwidth && isScalarEpilogueAllowed () )) {
4823
4835
// Collect all viable vectorization factors larger than the default MaxVF
4824
4836
// (i.e. MaxVectorSize).
4825
4837
SmallVector<unsigned , 8 > VFs;
@@ -4958,8 +4970,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4958
4970
return {MinWidth, MaxWidth};
4959
4971
}
4960
4972
4961
- unsigned LoopVectorizationCostModel::selectInterleaveCount (bool OptForSize,
4962
- unsigned VF,
4973
+ unsigned LoopVectorizationCostModel::selectInterleaveCount (unsigned VF,
4963
4974
unsigned LoopCost) {
4964
4975
// -- The interleave heuristics --
4965
4976
// We interleave the loop in order to expose ILP and reduce the loop overhead.
@@ -4975,8 +4986,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(bool OptForSize,
4975
4986
// 3. We don't interleave if we think that we will spill registers to memory
4976
4987
// due to the increased register pressure.
4977
4988
4978
- // When we optimize for size, we don't interleave.
4979
- if (OptForSize)
4989
+ if (!isScalarEpilogueAllowed ())
4980
4990
return 1 ;
4981
4991
4982
4992
// We used the distance for the interleave count.
@@ -5626,8 +5636,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5626
5636
}
5627
5637
5628
5638
// Calculate the cost of the whole interleaved group.
5629
- bool UseMaskForGaps =
5630
- Group->requiresScalarEpilogue () && !IsScalarEpilogueAllowed ;
5639
+ bool UseMaskForGaps =
5640
+ Group->requiresScalarEpilogue () && !isScalarEpilogueAllowed () ;
5631
5641
unsigned Cost = TTI.getInterleavedMemoryOpCost (
5632
5642
I->getOpcode (), WideVecTy, Group->getFactor (), Indices,
5633
5643
Group->getAlignment (), AS, Legal->isMaskRequired (I), UseMaskForGaps);
@@ -6167,8 +6177,7 @@ static unsigned determineVPlanVF(const unsigned WidestVectorRegBits,
6167
6177
}
6168
6178
6169
6179
VectorizationFactor
6170
- LoopVectorizationPlanner::planInVPlanNativePath (bool OptForSize,
6171
- unsigned UserVF) {
6180
+ LoopVectorizationPlanner::planInVPlanNativePath (unsigned UserVF) {
6172
6181
unsigned VF = UserVF;
6173
6182
// Outer loop handling: They may require CFG and instruction level
6174
6183
// transformations before even evaluating whether vectorization is profitable.
@@ -6207,10 +6216,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(bool OptForSize,
6207
6216
return VectorizationFactor::Disabled ();
6208
6217
}
6209
6218
6210
- Optional<VectorizationFactor> LoopVectorizationPlanner::plan (bool OptForSize,
6211
- unsigned UserVF) {
6219
+ Optional<VectorizationFactor> LoopVectorizationPlanner::plan (unsigned UserVF) {
6212
6220
assert (OrigLoop->empty () && " Inner loop expected." );
6213
- Optional<unsigned > MaybeMaxVF = CM.computeMaxVF (OptForSize );
6221
+ Optional<unsigned > MaybeMaxVF = CM.computeMaxVF ();
6214
6222
if (!MaybeMaxVF) // Cases that should be neither vectorized nor interleaved.
6215
6223
return None;
6216
6224
@@ -7213,8 +7221,15 @@ static bool processLoopInVPlanNativePath(
7213
7221
assert (EnableVPlanNativePath && " VPlan-native path is disabled." );
7214
7222
Function *F = L->getHeader ()->getParent ();
7215
7223
InterleavedAccessInfo IAI (PSE, L, DT, LI, LVL->getLAI ());
7216
- LoopVectorizationCostModel CM (L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
7217
- &Hints, IAI);
7224
+
7225
+ ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7226
+ if (Hints.getForce () != LoopVectorizeHints::FK_Enabled &&
7227
+ (F->hasOptSize () ||
7228
+ llvm::shouldOptimizeForSize (L->getHeader (), PSI, BFI)))
7229
+ SEL = CM_ScalarEpilogueNotAllowedOptSize;
7230
+
7231
+ LoopVectorizationCostModel CM (SEL, L, PSE, LI, LVL, *TTI, TLI,
7232
+ DB, AC, ORE, F, &Hints, IAI);
7218
7233
// Use the planner for outer loop vectorization.
7219
7234
// TODO: CM is not used at this point inside the planner. Turn CM into an
7220
7235
// optional argument if we don't need it in the future.
@@ -7223,15 +7238,8 @@ static bool processLoopInVPlanNativePath(
7223
7238
// Get user vectorization factor.
7224
7239
const unsigned UserVF = Hints.getWidth ();
7225
7240
7226
- // Check the function attributes and profiles to find out if this function
7227
- // should be optimized for size.
7228
- bool OptForSize =
7229
- Hints.getForce () != LoopVectorizeHints::FK_Enabled &&
7230
- (F->hasOptSize () ||
7231
- llvm::shouldOptimizeForSize (L->getHeader (), PSI, BFI));
7232
-
7233
7241
// Plan how to best vectorize, return the best VF and its cost.
7234
- const VectorizationFactor VF = LVP.planInVPlanNativePath (OptForSize, UserVF);
7242
+ const VectorizationFactor VF = LVP.planInVPlanNativePath (UserVF);
7235
7243
7236
7244
// If we are stress testing VPlan builds, do not attempt to generate vector
7237
7245
// code. Masked vector code generation support will follow soon.
@@ -7310,10 +7318,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
7310
7318
7311
7319
// Check the function attributes and profiles to find out if this function
7312
7320
// should be optimized for size.
7313
- bool OptForSize =
7314
- Hints.getForce () != LoopVectorizeHints::FK_Enabled &&
7321
+ ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed;
7322
+ if ( Hints.getForce () != LoopVectorizeHints::FK_Enabled &&
7315
7323
(F->hasOptSize () ||
7316
- llvm::shouldOptimizeForSize (L->getHeader (), PSI, BFI));
7324
+ llvm::shouldOptimizeForSize (L->getHeader (), PSI, BFI)))
7325
+ SEL = CM_ScalarEpilogueNotAllowedOptSize;
7317
7326
7318
7327
// Entrance to the VPlan-native vectorization path. Outer loops are processed
7319
7328
// here. They may require CFG and instruction level transformations before
@@ -7365,7 +7374,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
7365
7374
// Loops with a very small trip count are considered for vectorization
7366
7375
// under OptForSize, thereby making sure the cost of their loop body is
7367
7376
// dominant, free of runtime guards and scalar iteration overheads.
7368
- OptForSize = true ;
7377
+ SEL = CM_ScalarEpilogueNotAllowedLowTripLoop ;
7369
7378
}
7370
7379
}
7371
7380
@@ -7411,8 +7420,8 @@ bool LoopVectorizePass::processLoop(Loop *L) {
7411
7420
}
7412
7421
7413
7422
// Use the cost model.
7414
- LoopVectorizationCostModel CM (L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F ,
7415
- &Hints, IAI);
7423
+ LoopVectorizationCostModel CM (SEL, L, PSE, LI, &LVL, *TTI, TLI,
7424
+ DB, AC, ORE, F, &Hints, IAI);
7416
7425
CM.collectValuesToIgnore ();
7417
7426
7418
7427
// Use the planner for vectorization.
@@ -7422,7 +7431,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
7422
7431
unsigned UserVF = Hints.getWidth ();
7423
7432
7424
7433
// Plan how to best vectorize, return the best VF and its cost.
7425
- Optional<VectorizationFactor> MaybeVF = LVP.plan (OptForSize, UserVF);
7434
+ Optional<VectorizationFactor> MaybeVF = LVP.plan (UserVF);
7426
7435
7427
7436
VectorizationFactor VF = VectorizationFactor::Disabled ();
7428
7437
unsigned IC = 1 ;
@@ -7431,7 +7440,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
7431
7440
if (MaybeVF) {
7432
7441
VF = *MaybeVF;
7433
7442
// Select the interleave count.
7434
- IC = CM.selectInterleaveCount (OptForSize, VF.Width , VF.Cost );
7443
+ IC = CM.selectInterleaveCount (VF.Width , VF.Cost );
7435
7444
}
7436
7445
7437
7446
// Identify the diagnostic messages that should be produced.
0 commit comments