Skip to content

Commit c340589

Browse files
[LV] Don't require scalar epilogue for unsupported IAG with tail
LV should check that every interleave group that requires a scalar epilogue can actually be widened. Otherwise, if an InterleavedAccessGroup cannot be widened but does have a tail element, the current logic in LV still requires a scalar epilogue to be emitted, which leads to inefficient vector code.
1 parent 02b57de commit c340589

File tree

8 files changed

+507
-324
lines changed

8 files changed

+507
-324
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 45 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1446,24 +1446,42 @@ class LoopVectorizationCostModel {
14461446

14471447
/// Returns true if we're required to use a scalar epilogue for at least
14481448
/// the final iteration of the original loop.
1449-
bool requiresScalarEpilogue(bool IsVectorizing) const {
1449+
bool requiresScalarEpilogue(ElementCount VF) const {
14501450
if (!isScalarEpilogueAllowed()) {
1451-
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1451+
LLVM_DEBUG(dbgs() << "LV: Loop with VF = " << VF
1452+
<< " does not require scalar epilogue\n");
14521453
return false;
14531454
}
14541455
// If we might exit from anywhere but the latch, must run the exiting
14551456
// iteration in scalar form.
14561457
if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) {
1457-
LLVM_DEBUG(
1458-
dbgs() << "LV: Loop requires scalar epilogue: multiple exits\n");
1458+
LLVM_DEBUG(dbgs() << "LV: Loop with VF = " << VF
1459+
<< " requires scalar epilogue: multiple exists\n");
14591460
return true;
14601461
}
1461-
if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
1462-
LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
1463-
"interleaved group requires scalar epilogue\n");
1464-
return true;
1462+
if (VF.isVector()) {
1463+
if (InterleaveInfo.requiresScalarEpilogue()) {
1464+
// Make sure interleaved groups that require scalar epilogue will be
1465+
// widened.
1466+
for (auto *G : InterleaveInfo.getInterleaveGroups()) {
1467+
if (!G->requiresScalarEpilogue())
1468+
continue;
1469+
1470+
Instruction *I = G->getMember(0);
1471+
InstWidening Decision = getWideningDecision(I, VF);
1472+
if (Decision == CM_Interleave ||
1473+
(Decision == CM_Unknown &&
1474+
interleavedAccessCanBeWidened(G->getMember(0), VF))) {
1475+
LLVM_DEBUG(dbgs() << "LV: Loop with VF = " << VF
1476+
<< " requires scalar epilogue: interleaved group "
1477+
"requires scalar epilogue\n");
1478+
return true;
1479+
}
1480+
}
1481+
}
14651482
}
1466-
LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
1483+
LLVM_DEBUG(dbgs() << "LV: Loop with VF = " << VF
1484+
<< " does not require scalar epilogue\n");
14671485
return false;
14681486
}
14691487

@@ -1473,7 +1491,7 @@ class LoopVectorizationCostModel {
14731491
/// none.
14741492
bool requiresScalarEpilogue(VFRange Range) const {
14751493
auto RequiresScalarEpilogue = [this](ElementCount VF) {
1476-
return requiresScalarEpilogue(VF.isVector());
1494+
return requiresScalarEpilogue(VF);
14771495
};
14781496
bool IsRequired = all_of(Range, RequiresScalarEpilogue);
14791497
assert(
@@ -2770,7 +2788,7 @@ InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
27702788
// the step does not evenly divide the trip count, no adjustment is necessary
27712789
// since there will already be scalar iterations. Note that the minimum
27722790
// iterations check ensures that N >= Step.
2773-
if (Cost->requiresScalarEpilogue(VF.isVector())) {
2791+
if (Cost->requiresScalarEpilogue(VF)) {
27742792
auto *IsZero = Builder.CreateICmpEQ(R, ConstantInt::get(R->getType(), 0));
27752793
R = Builder.CreateSelect(IsZero, Step, R);
27762794
}
@@ -2823,8 +2841,8 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
28232841
// vector trip count is zero. This check also covers the case where adding one
28242842
// to the backedge-taken count overflowed leading to an incorrect trip count
28252843
// of zero. In this case we will also jump to the scalar loop.
2826-
auto P = Cost->requiresScalarEpilogue(VF.isVector()) ? ICmpInst::ICMP_ULE
2827-
: ICmpInst::ICMP_ULT;
2844+
auto P = Cost->requiresScalarEpilogue(VF) ? ICmpInst::ICMP_ULE
2845+
: ICmpInst::ICMP_ULT;
28282846

28292847
// If tail is to be folded, vector loop takes care of all iterations.
28302848
Type *CountTy = Count->getType();
@@ -2873,7 +2891,7 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
28732891

28742892
// Update dominator for Bypass & LoopExit (if needed).
28752893
DT->changeImmediateDominator(Bypass, TCCheckBlock);
2876-
if (!Cost->requiresScalarEpilogue(VF.isVector()))
2894+
if (!Cost->requiresScalarEpilogue(VF))
28772895
// If there is an epilogue which must run, there's no edge from the
28782896
// middle block to exit blocks and thus no need to update the immediate
28792897
// dominator of the exit blocks.
@@ -2902,7 +2920,7 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) {
29022920
// Update dominator only if this is first RT check.
29032921
if (LoopBypassBlocks.empty()) {
29042922
DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
2905-
if (!Cost->requiresScalarEpilogue(VF.isVector()))
2923+
if (!Cost->requiresScalarEpilogue(VF))
29062924
// If there is an epilogue which must run, there's no edge from the
29072925
// middle block to exit blocks and thus no need to update the immediate
29082926
// dominator of the exit blocks.
@@ -2955,7 +2973,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
29552973
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
29562974
assert(LoopVectorPreHeader && "Invalid loop structure");
29572975
LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
2958-
assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF.isVector())) &&
2976+
assert((LoopExitBlock || Cost->requiresScalarEpilogue(VF)) &&
29592977
"multiple exit loop without required epilogue?");
29602978

29612979
LoopMiddleBlock =
@@ -2970,7 +2988,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
29702988
// unconditional branch from the middle block to the scalar preheader. In that
29712989
// case, there's no edge from the middle block to exit blocks and thus no
29722990
// need to update the immediate dominator of the exit blocks.
2973-
if (Cost->requiresScalarEpilogue(VF.isVector())) {
2991+
if (Cost->requiresScalarEpilogue(VF)) {
29742992
assert(
29752993
LoopMiddleBlock->getSingleSuccessor() == LoopScalarPreHeader &&
29762994
" middle block should have the scalar preheader as single successor");
@@ -3103,7 +3121,7 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
31033121
// Thus if tail is to be folded, we know we don't need to run the
31043122
// remainder and we can use the previous value for the condition (true).
31053123
// 3) Otherwise, construct a runtime check.
3106-
if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
3124+
if (!Cost->requiresScalarEpilogue(VF) &&
31073125
!Cost->foldTailByMasking()) {
31083126
// Here we use the same DebugLoc as the scalar loop latch terminator instead
31093127
// of the corresponding compare because they may have ended up with
@@ -3413,7 +3431,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
34133431
VPRegionBlock *VectorRegion = State.Plan->getVectorLoopRegion();
34143432
VPBasicBlock *LatchVPBB = VectorRegion->getExitingBasicBlock();
34153433
Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[LatchVPBB]);
3416-
if (Cost->requiresScalarEpilogue(VF.isVector())) {
3434+
if (Cost->requiresScalarEpilogue(VF)) {
34173435
// No edge from the middle block to the unique exit block has been inserted
34183436
// and there is nothing to fix from vector loop; phis should have incoming
34193437
// from scalar loop only.
@@ -4664,7 +4682,7 @@ ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
46644682
// When a scalar epilogue is required, at least one iteration of the scalar
46654683
// loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
46664684
// max VF that results in a dead vector loop.
4667-
if (MaxTripCount > 0 && requiresScalarEpilogue(true))
4685+
if (MaxTripCount > 0 && requiresScalarEpilogue(MaxVectorElementCount))
46684686
MaxTripCount -= 1;
46694687

46704688
if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
@@ -5304,7 +5322,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
53045322
// At least one iteration must be scalar when this constraint holds. So the
53055323
// maximum available iterations for interleaving is one less.
53065324
unsigned AvailableTC =
5307-
requiresScalarEpilogue(VF.isVector()) ? KnownTC - 1 : KnownTC;
5325+
requiresScalarEpilogue(VF) ? KnownTC - 1 : KnownTC;
53085326

53095327
// If trip count is known we select between two prospective ICs, where
53105328
// 1) the aggressive IC is capped by the trip count divided by VF
@@ -5333,7 +5351,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
53335351
} else if (BestKnownTC && *BestKnownTC > 0) {
53345352
// At least one iteration must be scalar when this constraint holds. So the
53355353
// maximum available iterations for interleaving is one less.
5336-
unsigned AvailableTC = requiresScalarEpilogue(VF.isVector())
5354+
unsigned AvailableTC = requiresScalarEpilogue(VF)
53375355
? (*BestKnownTC) - 1
53385356
: *BestKnownTC;
53395357

@@ -7640,8 +7658,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
76407658

76417659
// Generate code to check if the loop's trip count is less than VF * UF of the
76427660
// main vector loop.
7643-
auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF.isVector()
7644-
: VF.isVector())
7661+
auto P = Cost->requiresScalarEpilogue(ForEpilogue ? EPI.EpilogueVF : VF)
76457662
? ICmpInst::ICMP_ULE
76467663
: ICmpInst::ICMP_ULT;
76477664

@@ -7663,7 +7680,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
76637680

76647681
// Update dominator for Bypass & LoopExit.
76657682
DT->changeImmediateDominator(Bypass, TCCheckBlock);
7666-
if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7683+
if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
76677684
// For loops with multiple exits, there's no edge from the middle block
76687685
// to exit blocks (as the epilogue must run) and thus no need to update
76697686
// the immediate dominator of the exit blocks.
@@ -7732,7 +7749,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
77327749

77337750
DT->changeImmediateDominator(LoopScalarPreHeader,
77347751
EPI.EpilogueIterationCountCheck);
7735-
if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
7752+
if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF))
77367753
// If there is an epilogue which must run, there's no edge from the
77377754
// middle block to exit blocks and thus no need to update the immediate
77387755
// dominator of the exit blocks.
@@ -7814,9 +7831,8 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
78147831

78157832
// Generate code to check if the loop's trip count is less than VF * UF of the
78167833
// vector epilogue loop.
7817-
auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector())
7818-
? ICmpInst::ICMP_ULE
7819-
: ICmpInst::ICMP_ULT;
7834+
auto P = Cost->requiresScalarEpilogue(EPI.EpilogueVF) ? ICmpInst::ICMP_ULE
7835+
: ICmpInst::ICMP_ULT;
78207836

78217837
Value *CheckMinIters =
78227838
Builder.CreateICmp(P, Count,

0 commit comments

Comments
 (0)