@@ -1446,24 +1446,42 @@ class LoopVectorizationCostModel {
1446
1446
1447
1447
// / Returns true if we're required to use a scalar epilogue for at least
1448
1448
// / the final iteration of the original loop.
1449
- bool requiresScalarEpilogue (bool IsVectorizing ) const {
1449
+ bool requiresScalarEpilogue (ElementCount VF ) const {
1450
1450
if (!isScalarEpilogueAllowed ()) {
1451
- LLVM_DEBUG (dbgs () << " LV: Loop does not require scalar epilogue\n " );
1451
+ LLVM_DEBUG (dbgs () << " LV: Loop with VF = " << VF
1452
+ << " does not require scalar epilogue\n " );
1452
1453
return false ;
1453
1454
}
1454
1455
// If we might exit from anywhere but the latch, must run the exiting
1455
1456
// iteration in scalar form.
1456
1457
if (TheLoop->getExitingBlock () != TheLoop->getLoopLatch ()) {
1457
- LLVM_DEBUG (
1458
- dbgs () << " LV: Loop requires scalar epilogue: multiple exits \n " );
1458
+ LLVM_DEBUG (dbgs () << " LV: Loop with VF = " << VF
1459
+ << " requires scalar epilogue: multiple exists \n " );
1459
1460
return true ;
1460
1461
}
1461
- if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue ()) {
1462
- LLVM_DEBUG (dbgs () << " LV: Loop requires scalar epilogue: "
1463
- " interleaved group requires scalar epilogue\n " );
1464
- return true ;
1462
+ if (VF.isVector ()) {
1463
+ if (InterleaveInfo.requiresScalarEpilogue ()) {
1464
+ // Make sure interleaved groups that require scalar epilogue will be
1465
+ // widened.
1466
+ for (auto *G : InterleaveInfo.getInterleaveGroups ()) {
1467
+ if (!G->requiresScalarEpilogue ())
1468
+ continue ;
1469
+
1470
+ Instruction *I = G->getMember (0 );
1471
+ InstWidening Decision = getWideningDecision (I, VF);
1472
+ if (Decision == CM_Interleave ||
1473
+ (Decision == CM_Unknown &&
1474
+ interleavedAccessCanBeWidened (G->getMember (0 ), VF))) {
1475
+ LLVM_DEBUG (dbgs () << " LV: Loop with VF = " << VF
1476
+ << " requires scalar epilogue: interleaved group "
1477
+ " requires scalar epilogue\n " );
1478
+ return true ;
1479
+ }
1480
+ }
1481
+ }
1465
1482
}
1466
- LLVM_DEBUG (dbgs () << " LV: Loop does not require scalar epilogue\n " );
1483
+ LLVM_DEBUG (dbgs () << " LV: Loop with VF = " << VF
1484
+ << " does not require scalar epilogue\n " );
1467
1485
return false ;
1468
1486
}
1469
1487
@@ -1473,7 +1491,7 @@ class LoopVectorizationCostModel {
1473
1491
/// none.
1474
1492
bool requiresScalarEpilogue (VFRange Range) const {
1475
1493
auto RequiresScalarEpilogue = [this ](ElementCount VF) {
1476
- return requiresScalarEpilogue (VF. isVector () );
1494
+ return requiresScalarEpilogue (VF);
1477
1495
};
1478
1496
bool IsRequired = all_of (Range, RequiresScalarEpilogue);
1479
1497
assert (
@@ -2770,7 +2788,7 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
2770
2788
// the step does not evenly divide the trip count, no adjustment is necessary
2771
2789
// since there will already be scalar iterations. Note that the minimum
2772
2790
// iterations check ensures that N >= Step.
2773
- if (Cost->requiresScalarEpilogue (VF. isVector () )) {
2791
+ if (Cost->requiresScalarEpilogue (VF)) {
2774
2792
auto *IsZero = Builder.CreateICmpEQ (R, ConstantInt::get (R->getType (), 0 ));
2775
2793
R = Builder.CreateSelect (IsZero, Step, R);
2776
2794
}
@@ -2823,8 +2841,8 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2823
2841
// vector trip count is zero. This check also covers the case where adding one
2824
2842
// to the backedge-taken count overflowed leading to an incorrect trip count
2825
2843
// of zero. In this case we will also jump to the scalar loop.
2826
- auto P = Cost->requiresScalarEpilogue (VF. isVector () ) ? ICmpInst::ICMP_ULE
2827
- : ICmpInst::ICMP_ULT;
2844
+ auto P = Cost->requiresScalarEpilogue (VF) ? ICmpInst::ICMP_ULE
2845
+ : ICmpInst::ICMP_ULT;
2828
2846
2829
2847
// If tail is to be folded, vector loop takes care of all iterations.
2830
2848
Type *CountTy = Count->getType ();
@@ -2873,7 +2891,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
2873
2891
2874
2892
// Update dominator for Bypass & LoopExit (if needed).
2875
2893
DT->changeImmediateDominator (Bypass, TCCheckBlock);
2876
- if (!Cost->requiresScalarEpilogue (VF. isVector () ))
2894
+ if (!Cost->requiresScalarEpilogue (VF))
2877
2895
// If there is an epilogue which must run, there's no edge from the
2878
2896
// middle block to exit blocks and thus no need to update the immediate
2879
2897
// dominator of the exit blocks.
@@ -2902,7 +2920,7 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
2902
2920
// Update dominator only if this is first RT check.
2903
2921
if (LoopBypassBlocks.empty ()) {
2904
2922
DT->changeImmediateDominator (Bypass, SCEVCheckBlock);
2905
- if (!Cost->requiresScalarEpilogue (VF. isVector () ))
2923
+ if (!Cost->requiresScalarEpilogue (VF))
2906
2924
// If there is an epilogue which must run, there's no edge from the
2907
2925
// middle block to exit blocks and thus no need to update the immediate
2908
2926
// dominator of the exit blocks.
@@ -2955,7 +2973,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2955
2973
LoopVectorPreHeader = OrigLoop->getLoopPreheader ();
2956
2974
assert (LoopVectorPreHeader && " Invalid loop structure" );
2957
2975
LoopExitBlock = OrigLoop->getUniqueExitBlock (); // may be nullptr
2958
- assert ((LoopExitBlock || Cost->requiresScalarEpilogue (VF. isVector () )) &&
2976
+ assert ((LoopExitBlock || Cost->requiresScalarEpilogue (VF)) &&
2959
2977
" multiple exit loop without required epilogue?" );
2960
2978
2961
2979
LoopMiddleBlock =
@@ -2970,7 +2988,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2970
2988
// unconditional branch from the middle block to the scalar preheader. In that
2971
2989
// case, there's no edge from the middle block to exit blocks and thus no
2972
2990
// need to update the immediate dominator of the exit blocks.
2973
- if (Cost->requiresScalarEpilogue (VF. isVector () )) {
2991
+ if (Cost->requiresScalarEpilogue (VF)) {
2974
2992
assert (
2975
2993
LoopMiddleBlock->getSingleSuccessor () == LoopScalarPreHeader &&
2976
2994
" middle block should have the scalar preheader as single successor" );
@@ -3103,7 +3121,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
3103
3121
// Thus if tail is to be folded, we know we don't need to run the
3104
3122
// remainder and we can use the previous value for the condition (true).
3105
3123
// 3) Otherwise, construct a runtime check.
3106
- if (!Cost->requiresScalarEpilogue (VF. isVector () ) &&
3124
+ if (!Cost->requiresScalarEpilogue (VF) &&
3107
3125
!Cost->foldTailByMasking ()) {
3108
3126
// Here we use the same DebugLoc as the scalar loop latch terminator instead
3109
3127
// of the corresponding compare because they may have ended up with
@@ -3413,7 +3431,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
3413
3431
VPRegionBlock *VectorRegion = State.Plan ->getVectorLoopRegion ();
3414
3432
VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock ();
3415
3433
Loop *VectorLoop = LI->getLoopFor (State.CFG .VPBB2IRBB [LatchVPBB]);
3416
- if (Cost->requiresScalarEpilogue (VF. isVector () )) {
3434
+ if (Cost->requiresScalarEpilogue (VF)) {
3417
3435
// No edge from the middle block to the unique exit block has been inserted
3418
3436
// and there is nothing to fix from vector loop; phis should have incoming
3419
3437
// from scalar loop only.
@@ -4664,7 +4682,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
4664
4682
// When a scalar epilogue is required, at least one iteration of the scalar
4665
4683
// loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
4666
4684
// max VF that results in a dead vector loop.
4667
- if (MaxTripCount > 0 && requiresScalarEpilogue (true ))
4685
+ if (MaxTripCount > 0 && requiresScalarEpilogue (MaxVectorElementCount ))
4668
4686
MaxTripCount -= 1 ;
4669
4687
4670
4688
if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
@@ -5304,7 +5322,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5304
5322
// At least one iteration must be scalar when this constraint holds. So the
5305
5323
// maximum available iterations for interleaving is one less.
5306
5324
unsigned AvailableTC =
5307
- requiresScalarEpilogue (VF. isVector () ) ? KnownTC - 1 : KnownTC;
5325
+ requiresScalarEpilogue (VF) ? KnownTC - 1 : KnownTC;
5308
5326
5309
5327
// If trip count is known we select between two prospective ICs, where
5310
5328
// 1) the aggressive IC is capped by the trip count divided by VF
@@ -5333,7 +5351,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5333
5351
} else if (BestKnownTC && *BestKnownTC > 0 ) {
5334
5352
// At least one iteration must be scalar when this constraint holds. So the
5335
5353
// maximum available iterations for interleaving is one less.
5336
- unsigned AvailableTC = requiresScalarEpilogue (VF. isVector () )
5354
+ unsigned AvailableTC = requiresScalarEpilogue (VF)
5337
5355
? (*BestKnownTC) - 1
5338
5356
: *BestKnownTC;
5339
5357
@@ -7640,8 +7658,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7640
7658
7641
7659
// Generate code to check if the loop's trip count is less than VF * UF of the
7642
7660
// main vector loop.
7643
- auto P = Cost->requiresScalarEpilogue (ForEpilogue ? EPI.EpilogueVF .isVector ()
7644
- : VF.isVector ())
7661
+ auto P = Cost->requiresScalarEpilogue (ForEpilogue ? EPI.EpilogueVF : VF)
7645
7662
? ICmpInst::ICMP_ULE
7646
7663
: ICmpInst::ICMP_ULT;
7647
7664
@@ -7663,7 +7680,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
7663
7680
7664
7681
// Update dominator for Bypass & LoopExit.
7665
7682
DT->changeImmediateDominator (Bypass, TCCheckBlock);
7666
- if (!Cost->requiresScalarEpilogue (EPI.EpilogueVF . isVector () ))
7683
+ if (!Cost->requiresScalarEpilogue (EPI.EpilogueVF ))
7667
7684
// For loops with multiple exits, there's no edge from the middle block
7668
7685
// to exit blocks (as the epilogue must run) and thus no need to update
7669
7686
// the immediate dominator of the exit blocks.
@@ -7732,7 +7749,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
7732
7749
7733
7750
DT->changeImmediateDominator (LoopScalarPreHeader,
7734
7751
EPI.EpilogueIterationCountCheck );
7735
- if (!Cost->requiresScalarEpilogue (EPI.EpilogueVF . isVector () ))
7752
+ if (!Cost->requiresScalarEpilogue (EPI.EpilogueVF ))
7736
7753
// If there is an epilogue which must run, there's no edge from the
7737
7754
// middle block to exit blocks and thus no need to update the immediate
7738
7755
// dominator of the exit blocks.
@@ -7814,9 +7831,8 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
7814
7831
7815
7832
// Generate code to check if the loop's trip count is less than VF * UF of the
7816
7833
// vector epilogue loop.
7817
- auto P = Cost->requiresScalarEpilogue (EPI.EpilogueVF .isVector ())
7818
- ? ICmpInst::ICMP_ULE
7819
- : ICmpInst::ICMP_ULT;
7834
+ auto P = Cost->requiresScalarEpilogue (EPI.EpilogueVF ) ? ICmpInst::ICMP_ULE
7835
+ : ICmpInst::ICMP_ULT;
7820
7836
7821
7837
Value *CheckMinIters =
7822
7838
Builder.CreateICmp (P, Count,
0 commit comments