Skip to content

Commit 3e5ce49

Browse files
committed
[LV] Unconditionally branch from middle to scalar preheader if the scalar loop must execute
If we know that the scalar epilogue is required to run, modify the CFG to end the middle block with an unconditional branch to scalar preheader. This is instead of a conditional branch to either the preheader or the exit block. The motivation to do this is to support multiple exit blocks. Specifically, the current structure forces us to identify immediate dominators and *which* exit block to branch from in the middle terminator. For the multiple exit case - where we know that a scalar epilogue is required - these questions are ill-formed. This is the last change needed to support multiple exit loops, but since the diffs are already large enough, I'm going to land this, and then enable separately. You can think of this as being NFCI-ish prep work, but the changes are a bit too involved for me to feel comfortable tagging the change that way. Differential Revision: https://reviews.llvm.org/D94892
1 parent b68a6b0 commit 3e5ce49

File tree

5 files changed

+123
-90
lines changed

5 files changed

+123
-90
lines changed

llvm/lib/Transforms/Utils/LoopVersioning.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,11 @@ LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI,
4444
AliasChecks(Checks.begin(), Checks.end()),
4545
Preds(LAI.getPSE().getUnionPredicate()), LAI(LAI), LI(LI), DT(DT),
4646
SE(SE) {
47-
assert(L->getUniqueExitBlock() && "No single exit block");
4847
}
4948

5049
void LoopVersioning::versionLoop(
5150
const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
51+
assert(VersionedLoop->getUniqueExitBlock() && "No single exit block");
5252
assert(VersionedLoop->isLoopSimplifyForm() &&
5353
"Loop is not in loop-simplify form");
5454

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 87 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -852,7 +852,7 @@ class InnerLoopVectorizer {
852852
/// Middle Block between the vector and the scalar.
853853
BasicBlock *LoopMiddleBlock;
854854

855-
/// The (unique) ExitBlock of the scalar loop. Note that
855+
/// The unique ExitBlock of the scalar loop if one exists. Note that
856856
/// there can be multiple exiting edges reaching this block.
857857
BasicBlock *LoopExitBlock;
858858

@@ -3147,9 +3147,13 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
31473147
DT->getNode(Bypass)->getIDom()) &&
31483148
"TC check is expected to dominate Bypass");
31493149

3150-
// Update dominator for Bypass & LoopExit.
3150+
// Update dominator for Bypass & LoopExit (if needed).
31513151
DT->changeImmediateDominator(Bypass, TCCheckBlock);
3152-
DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3152+
if (!Cost->requiresScalarEpilogue())
3153+
// If there is an epilogue which must run, there's no edge from the
3154+
// middle block to exit blocks and thus no need to update the immediate
3155+
// dominator of the exit blocks.
3156+
DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
31533157

31543158
ReplaceInstWithInst(
31553159
TCCheckBlock->getTerminator(),
@@ -3188,7 +3192,11 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
31883192
// Update dominator only if this is first RT check.
31893193
if (LoopBypassBlocks.empty()) {
31903194
DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3191-
DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3195+
if (!Cost->requiresScalarEpilogue())
3196+
// If there is an epilogue which must run, there's no edge from the
3197+
// middle block to exit blocks and thus no need to update the immediate
3198+
// dominator of the exit blocks.
3199+
DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
31923200
}
31933201

31943202
ReplaceInstWithInst(
@@ -3244,7 +3252,11 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
32443252
// Update dominator only if this is first RT check.
32453253
if (LoopBypassBlocks.empty()) {
32463254
DT->changeImmediateDominator(Bypass, MemCheckBlock);
3247-
DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
3255+
if (!Cost->requiresScalarEpilogue())
3256+
// If there is an epilogue which must run, there's no edge from the
3257+
// middle block to exit blocks and thus no need to update the immediate
3258+
// dominator of the exit blocks.
3259+
DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
32483260
}
32493261

32503262
Instruction *FirstCheckInst;
@@ -3369,9 +3381,10 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
33693381
Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
33703382
LoopScalarBody = OrigLoop->getHeader();
33713383
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3372-
LoopExitBlock = OrigLoop->getUniqueExitBlock();
3373-
assert(LoopExitBlock && "Must have an exit block");
33743384
assert(LoopVectorPreHeader && "Invalid loop structure");
3385+
LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3386+
assert((LoopExitBlock || Cost->requiresScalarEpilogue()) &&
3387+
"multiple exit loop without required epilogue?");
33753388

33763389
LoopMiddleBlock =
33773390
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
@@ -3380,12 +3393,20 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
33803393
SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
33813394
nullptr, Twine(Prefix) + "scalar.ph");
33823395

3383-
// Set up branch from middle block to the exit and scalar preheader blocks.
3384-
// completeLoopSkeleton will update the condition to use an iteration check,
3385-
// if required to decide whether to execute the remainder.
3386-
BranchInst *BrInst =
3387-
BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
33883396
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3397+
3398+
// Set up the middle block terminator. Two cases:
3399+
// 1) If we know that we must execute the scalar epilogue, emit an
3400+
// unconditional branch.
3401+
// 2) Otherwise, we must have a single unique exit block (due to how we
3402+
// implement the multiple exit case). In this case, set up a conditional
3403+
// branch from the middle block to the loop scalar preheader, and the
3404+
// exit block. completeLoopSkeleton will update the condition to use an
3405+
// iteration check, if required to decide whether to execute the remainder.
3406+
BranchInst *BrInst = Cost->requiresScalarEpilogue() ?
3407+
BranchInst::Create(LoopScalarPreHeader) :
3408+
BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3409+
Builder.getTrue());
33893410
BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
33903411
ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
33913412

@@ -3397,7 +3418,11 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
33973418
nullptr, nullptr, Twine(Prefix) + "vector.body");
33983419

33993420
// Update dominator for loop exit.
3400-
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3421+
if (!Cost->requiresScalarEpilogue())
3422+
// If there is an epilogue which must run, there's no edge from the
3423+
// middle block to exit blocks and thus no need to update the immediate
3424+
// dominator of the exit blocks.
3425+
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
34013426

34023427
// Create and register the new vector loop.
34033428
Loop *Lp = LI->AllocateLoop();
@@ -3494,10 +3519,14 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
34943519
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
34953520

34963521
// Add a check in the middle block to see if we have completed
3497-
// all of the iterations in the first vector loop.
3498-
// If (N - N%VF) == N, then we *don't* need to run the remainder.
3499-
// If tail is to be folded, we know we don't need to run the remainder.
3500-
if (!Cost->foldTailByMasking()) {
3522+
// all of the iterations in the first vector loop. Three cases:
3523+
// 1) If we require a scalar epilogue, there is no conditional branch as
3524+
// we unconditionally branch to the scalar preheader. Do nothing.
3525+
// 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3526+
// Thus if tail is to be folded, we know we don't need to run the
3527+
// remainder and we can use the previous value for the condition (true).
3528+
// 3) Otherwise, construct a runtime check.
3529+
if (!Cost->requiresScalarEpilogue() && !Cost->foldTailByMasking()) {
35013530
Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
35023531
Count, VectorTripCount, "cmp.n",
35033532
LoopMiddleBlock->getTerminator());
@@ -3561,17 +3590,17 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
35613590
| [ ]_| <-- vector loop.
35623591
| |
35633592
| v
3564-
| -[ ] <--- middle-block.
3565-
| / |
3566-
| / v
3567-
-|- >[ ] <--- new preheader.
3593+
\ -[ ] <--- middle-block.
3594+
\/ |
3595+
/\ v
3596+
| ->[ ] <--- new preheader.
35683597
| |
3569-
| v
3598+
(opt) v <-- edge from middle to exit iff epilogue is not required.
35703599
| [ ] \
3571-
| [ ]_| <-- old scalar loop to handle remainder.
3600+
| [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
35723601
\ |
35733602
\ v
3574-
>[ ] <-- exit block.
3603+
>[ ] <-- exit block(s).
35753604
...
35763605
*/
35773606

@@ -3975,13 +4004,18 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
39754004
// Forget the original basic block.
39764005
PSE.getSE()->forgetLoop(OrigLoop);
39774006

3978-
// Fix-up external users of the induction variables.
3979-
for (auto &Entry : Legal->getInductionVars())
3980-
fixupIVUsers(Entry.first, Entry.second,
3981-
getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3982-
IVEndValues[Entry.first], LoopMiddleBlock);
4007+
// If we inserted an edge from the middle block to the unique exit block,
4008+
// update uses outside the loop (phis) to account for the newly inserted
4009+
// edge.
4010+
if (!Cost->requiresScalarEpilogue()) {
4011+
// Fix-up external users of the induction variables.
4012+
for (auto &Entry : Legal->getInductionVars())
4013+
fixupIVUsers(Entry.first, Entry.second,
4014+
getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4015+
IVEndValues[Entry.first], LoopMiddleBlock);
39834016

3984-
fixLCSSAPHIs();
4017+
fixLCSSAPHIs();
4018+
}
39854019
for (Instruction *PI : PredicatedInstructions)
39864020
sinkScalarOperands(&*PI);
39874021

@@ -4199,12 +4233,13 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
41994233
// recurrence in the exit block, and then add an edge for the middle block.
42004234
// Note that LCSSA does not imply single entry when the original scalar loop
42014235
// had multiple exiting edges (as we always run the last iteration in the
4202-
// scalar epilogue); in that case, the exiting path through middle will be
4203-
// dynamically dead and the value picked for the phi doesn't matter.
4204-
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4205-
if (any_of(LCSSAPhi.incoming_values(),
4206-
[Phi](Value *V) { return V == Phi; }))
4207-
LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4236+
// scalar epilogue); in that case, there is no edge from middle to exit and
4237+
// thus no phis which need to be updated.
4238+
if (!Cost->requiresScalarEpilogue())
4239+
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4240+
if (any_of(LCSSAPhi.incoming_values(),
4241+
[Phi](Value *V) { return V == Phi; }))
4242+
LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
42084243
}
42094244

42104245
void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
@@ -4369,10 +4404,11 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
43694404
// We know that the loop is in LCSSA form. We need to update the PHI nodes
43704405
// in the exit blocks. See comment on analogous loop in
43714406
// fixFirstOrderRecurrence for a more complete explanation of the logic.
4372-
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4373-
if (any_of(LCSSAPhi.incoming_values(),
4374-
[LoopExitInst](Value *V) { return V == LoopExitInst; }))
4375-
LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4407+
if (!Cost->requiresScalarEpilogue())
4408+
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4409+
if (any_of(LCSSAPhi.incoming_values(),
4410+
[LoopExitInst](Value *V) { return V == LoopExitInst; }))
4411+
LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
43764412

43774413
// Fix the scalar loop reduction variable with the incoming reduction sum
43784414
// from the vector body and from the backedge value.
@@ -8021,7 +8057,11 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
80218057

80228058
// Update dominator for Bypass & LoopExit.
80238059
DT->changeImmediateDominator(Bypass, TCCheckBlock);
8024-
DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8060+
if (!Cost->requiresScalarEpilogue())
8061+
// For loops with multiple exits, there's no edge from the middle block
8062+
// to exit blocks (as the epilogue must run) and thus no need to update
8063+
// the immediate dominator of the exit blocks.
8064+
DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
80258065

80268066
LoopBypassBlocks.push_back(TCCheckBlock);
80278067

@@ -8085,7 +8125,12 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
80858125

80868126
DT->changeImmediateDominator(LoopScalarPreHeader,
80878127
EPI.EpilogueIterationCountCheck);
8088-
DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
8128+
if (!Cost->requiresScalarEpilogue())
8129+
// If there is an epilogue which must run, there's no edge from the
8130+
// middle block to exit blocks and thus no need to update the immediate
8131+
// dominator of the exit blocks.
8132+
DT->changeImmediateDominator(LoopExitBlock,
8133+
EPI.EpilogueIterationCountCheck);
80898134

80908135
// Keep track of bypass blocks, as they feed start values to the induction
80918136
// phis in the scalar loop preheader.

llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -471,10 +471,9 @@ define i16 @multiple_exit(i16* %p, i32 %n) {
471471
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
472472
; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
473473
; CHECK: middle.block:
474-
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
475474
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
476475
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
477-
; CHECK-NEXT: br i1 [[CMP_N]], label [[IF_END:%.*]], label [[SCALAR_PH]]
476+
; CHECK-NEXT: br label [[SCALAR_PH]]
478477
; CHECK: scalar.ph:
479478
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
480479
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -486,14 +485,14 @@ define i16 @multiple_exit(i16* %p, i32 %n) {
486485
; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
487486
; CHECK-NEXT: [[REC_NEXT]] = load i16, i16* [[B]], align 2
488487
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
489-
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END]]
488+
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
490489
; CHECK: for.body:
491490
; CHECK-NEXT: store i16 [[SCALAR_RECUR]], i16* [[B]], align 4
492491
; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1
493492
; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
494493
; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], [[LOOP7:!llvm.loop !.*]]
495494
; CHECK: if.end:
496-
; CHECK-NEXT: [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[FOR_BODY]] ], [ [[SCALAR_RECUR]], [[FOR_COND]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
495+
; CHECK-NEXT: [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[FOR_BODY]] ], [ [[SCALAR_RECUR]], [[FOR_COND]] ]
497496
; CHECK-NEXT: ret i16 [[REC_LCSSA]]
498497
;
499498
entry:
@@ -558,10 +557,9 @@ define i16 @multiple_exit2(i16* %p, i32 %n) {
558557
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
559558
; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
560559
; CHECK: middle.block:
561-
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
562560
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
563561
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
564-
; CHECK-NEXT: br i1 [[CMP_N]], label [[IF_END:%.*]], label [[SCALAR_PH]]
562+
; CHECK-NEXT: br label [[SCALAR_PH]]
565563
; CHECK: scalar.ph:
566564
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
567565
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -573,14 +571,14 @@ define i16 @multiple_exit2(i16* %p, i32 %n) {
573571
; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
574572
; CHECK-NEXT: [[REC_NEXT]] = load i16, i16* [[B]], align 2
575573
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
576-
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END]]
574+
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
577575
; CHECK: for.body:
578576
; CHECK-NEXT: store i16 [[SCALAR_RECUR]], i16* [[B]], align 4
579577
; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1
580578
; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
581579
; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], [[LOOP9:!llvm.loop !.*]]
582580
; CHECK: if.end:
583-
; CHECK-NEXT: [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[FOR_COND]] ], [ 10, [[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
581+
; CHECK-NEXT: [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[FOR_COND]] ], [ 10, [[FOR_BODY]] ]
584582
; CHECK-NEXT: ret i16 [[REC_LCSSA]]
585583
;
586584
entry:

0 commit comments

Comments
 (0)