Skip to content

Commit 7fe41ac

Browse files
committed
Revert "[LV] Unconditionally branch from middle to scalar preheader if the scalar loop must execute"
This reverts commit 3e5ce49. Tests started failing on PPC, for example: http://lab.llvm.org:8011/#/builders/105/builds/5569
1 parent 78935ea commit 7fe41ac

File tree

5 files changed

+90
-123
lines changed

5 files changed

+90
-123
lines changed

llvm/lib/Transforms/Utils/LoopVersioning.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,11 @@ LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI,
4444
AliasChecks(Checks.begin(), Checks.end()),
4545
Preds(LAI.getPSE().getUnionPredicate()), LAI(LAI), LI(LI), DT(DT),
4646
SE(SE) {
47+
assert(L->getUniqueExitBlock() && "No single exit block");
4748
}
4849

4950
void LoopVersioning::versionLoop(
5051
const SmallVectorImpl<Instruction *> &DefsUsedOutside) {
51-
assert(VersionedLoop->getUniqueExitBlock() && "No single exit block");
5252
assert(VersionedLoop->isLoopSimplifyForm() &&
5353
"Loop is not in loop-simplify form");
5454

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 42 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -852,7 +852,7 @@ class InnerLoopVectorizer {
852852
/// Middle Block between the vector and the scalar.
853853
BasicBlock *LoopMiddleBlock;
854854

855-
/// The unique ExitBlock of the scalar loop if one exists. Note that
855+
/// The (unique) ExitBlock of the scalar loop. Note that
856856
/// there can be multiple exiting edges reaching this block.
857857
BasicBlock *LoopExitBlock;
858858

@@ -3147,13 +3147,9 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
31473147
DT->getNode(Bypass)->getIDom()) &&
31483148
"TC check is expected to dominate Bypass");
31493149

3150-
// Update dominator for Bypass & LoopExit (if needed).
3150+
// Update dominator for Bypass & LoopExit.
31513151
DT->changeImmediateDominator(Bypass, TCCheckBlock);
3152-
if (!Cost->requiresScalarEpilogue())
3153-
// If there is an epilogue which must run, there's no edge from the
3154-
// middle block to exit blocks and thus no need to update the immediate
3155-
// dominator of the exit blocks.
3156-
DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
3152+
DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
31573153

31583154
ReplaceInstWithInst(
31593155
TCCheckBlock->getTerminator(),
@@ -3192,11 +3188,7 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
31923188
// Update dominator only if this is first RT check.
31933189
if (LoopBypassBlocks.empty()) {
31943190
DT->changeImmediateDominator(Bypass, SCEVCheckBlock);
3195-
if (!Cost->requiresScalarEpilogue())
3196-
// If there is an epilogue which must run, there's no edge from the
3197-
// middle block to exit blocks and thus no need to update the immediate
3198-
// dominator of the exit blocks.
3199-
DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
3191+
DT->changeImmediateDominator(LoopExitBlock, SCEVCheckBlock);
32003192
}
32013193

32023194
ReplaceInstWithInst(
@@ -3252,11 +3244,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
32523244
// Update dominator only if this is first RT check.
32533245
if (LoopBypassBlocks.empty()) {
32543246
DT->changeImmediateDominator(Bypass, MemCheckBlock);
3255-
if (!Cost->requiresScalarEpilogue())
3256-
// If there is an epilogue which must run, there's no edge from the
3257-
// middle block to exit blocks and thus no need to update the immediate
3258-
// dominator of the exit blocks.
3259-
DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
3247+
DT->changeImmediateDominator(LoopExitBlock, MemCheckBlock);
32603248
}
32613249

32623250
Instruction *FirstCheckInst;
@@ -3381,10 +3369,9 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
33813369
Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
33823370
LoopScalarBody = OrigLoop->getHeader();
33833371
LoopVectorPreHeader = OrigLoop->getLoopPreheader();
3372+
LoopExitBlock = OrigLoop->getUniqueExitBlock();
3373+
assert(LoopExitBlock && "Must have an exit block");
33843374
assert(LoopVectorPreHeader && "Invalid loop structure");
3385-
LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
3386-
assert((LoopExitBlock || Cost->requiresScalarEpilogue()) &&
3387-
"multiple exit loop without required epilogue?");
33883375

33893376
LoopMiddleBlock =
33903377
SplitBlock(LoopVectorPreHeader, LoopVectorPreHeader->getTerminator(), DT,
@@ -3393,20 +3380,12 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
33933380
SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
33943381
nullptr, Twine(Prefix) + "scalar.ph");
33953382

3383+
// Set up branch from middle block to the exit and scalar preheader blocks.
3384+
// completeLoopSkeleton will update the condition to use an iteration check,
3385+
// if required to decide whether to execute the remainder.
3386+
BranchInst *BrInst =
3387+
BranchInst::Create(LoopExitBlock, LoopScalarPreHeader, Builder.getTrue());
33963388
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
3397-
3398-
// Set up the middle block terminator. Two cases:
3399-
// 1) If we know that we must execute the scalar epilogue, emit an
3400-
// unconditional branch.
3401-
// 2) Otherwise, we must have a single unique exit block (due to how we
3402-
// implement the multiple exit case). In this case, set up a conditonal
3403-
// branch from the middle block to the loop scalar preheader, and the
3404-
// exit block. completeLoopSkeleton will update the condition to use an
3405-
// iteration check, if required to decide whether to execute the remainder.
3406-
BranchInst *BrInst = Cost->requiresScalarEpilogue() ?
3407-
BranchInst::Create(LoopScalarPreHeader) :
3408-
BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
3409-
Builder.getTrue());
34103389
BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
34113390
ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
34123391

@@ -3418,11 +3397,7 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
34183397
nullptr, nullptr, Twine(Prefix) + "vector.body");
34193398

34203399
// Update dominator for loop exit.
3421-
if (!Cost->requiresScalarEpilogue())
3422-
// If there is an epilogue which must run, there's no edge from the
3423-
// middle block to exit blocks and thus no need to update the immediate
3424-
// dominator of the exit blocks.
3425-
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
3400+
DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
34263401

34273402
// Create and register the new vector loop.
34283403
Loop *Lp = LI->AllocateLoop();
@@ -3519,14 +3494,10 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
35193494
auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
35203495

35213496
// Add a check in the middle block to see if we have completed
3522-
// all of the iterations in the first vector loop. Three cases:
3523-
// 1) If we require a scalar epilogue, there is no conditional branch as
3524-
// we unconditionally branch to the scalar preheader. Do nothing.
3525-
// 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3526-
// Thus if tail is to be folded, we know we don't need to run the
3527-
// remainder and we can use the previous value for the condition (true).
3528-
// 3) Otherwise, construct a runtime check.
3529-
if (!Cost->requiresScalarEpilogue() && !Cost->foldTailByMasking()) {
3497+
// all of the iterations in the first vector loop.
3498+
// If (N - N%VF) == N, then we *don't* need to run the remainder.
3499+
// If tail is to be folded, we know we don't need to run the remainder.
3500+
if (!Cost->foldTailByMasking()) {
35303501
Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
35313502
Count, VectorTripCount, "cmp.n",
35323503
LoopMiddleBlock->getTerminator());
@@ -3590,17 +3561,17 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
35903561
| [ ]_| <-- vector loop.
35913562
| |
35923563
| v
3593-
\ -[ ] <--- middle-block.
3594-
\/ |
3595-
/\ v
3596-
| ->[ ] <--- new preheader.
3564+
| -[ ] <--- middle-block.
3565+
| / |
3566+
| / v
3567+
-|- >[ ] <--- new preheader.
35973568
| |
3598-
(opt) v <-- edge from middle to exit iff epilogue is not required.
3569+
| v
35993570
| [ ] \
3600-
| [ ]_| <-- old scalar loop to handle remainder (scalar epilogue).
3571+
| [ ]_| <-- old scalar loop to handle remainder.
36013572
\ |
36023573
\ v
3603-
>[ ] <-- exit block(s).
3574+
>[ ] <-- exit block.
36043575
...
36053576
*/
36063577

@@ -4021,18 +3992,13 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
40213992
// Forget the original basic block.
40223993
PSE.getSE()->forgetLoop(OrigLoop);
40233994

4024-
// If we inserted an edge from the middle block to the unique exit block,
4025-
// update uses outside the loop (phis) to account for the newly inserted
4026-
// edge.
4027-
if (!Cost->requiresScalarEpilogue()) {
4028-
// Fix-up external users of the induction variables.
4029-
for (auto &Entry : Legal->getInductionVars())
4030-
fixupIVUsers(Entry.first, Entry.second,
4031-
getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
4032-
IVEndValues[Entry.first], LoopMiddleBlock);
3995+
// Fix-up external users of the induction variables.
3996+
for (auto &Entry : Legal->getInductionVars())
3997+
fixupIVUsers(Entry.first, Entry.second,
3998+
getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
3999+
IVEndValues[Entry.first], LoopMiddleBlock);
40334000

4034-
fixLCSSAPHIs();
4035-
}
4001+
fixLCSSAPHIs();
40364002
for (Instruction *PI : PredicatedInstructions)
40374003
sinkScalarOperands(&*PI);
40384004

@@ -4250,13 +4216,12 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
42504216
// recurrence in the exit block, and then add an edge for the middle block.
42514217
// Note that LCSSA does not imply single entry when the original scalar loop
42524218
// had multiple exiting edges (as we always run the last iteration in the
4253-
// scalar epilogue); in that case, there is no edge from middle to exit and
4254-
// and thus no phis which needed updated.
4255-
if (!Cost->requiresScalarEpilogue())
4256-
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4257-
if (any_of(LCSSAPhi.incoming_values(),
4258-
[Phi](Value *V) { return V == Phi; }))
4259-
LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4219+
// scalar epilogue); in that case, the exiting path through middle will be
4220+
// dynamically dead and the value picked for the phi doesn't matter.
4221+
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4222+
if (any_of(LCSSAPhi.incoming_values(),
4223+
[Phi](Value *V) { return V == Phi; }))
4224+
LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
42604225
}
42614226

42624227
void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
@@ -4421,11 +4386,10 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
44214386
// We know that the loop is in LCSSA form. We need to update the PHI nodes
44224387
// in the exit blocks. See comment on analogous loop in
44234388
// fixFirstOrderRecurrence for a more complete explaination of the logic.
4424-
if (!Cost->requiresScalarEpilogue())
4425-
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4426-
if (any_of(LCSSAPhi.incoming_values(),
4427-
[LoopExitInst](Value *V) { return V == LoopExitInst; }))
4428-
LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
4389+
for (PHINode &LCSSAPhi : LoopExitBlock->phis())
4390+
if (any_of(LCSSAPhi.incoming_values(),
4391+
[LoopExitInst](Value *V) { return V == LoopExitInst; }))
4392+
LCSSAPhi.addIncoming(ReducedPartRdx, LoopMiddleBlock);
44294393

44304394
// Fix the scalar loop reduction variable with the incoming reduction sum
44314395
// from the vector body and from the backedge value.
@@ -8074,11 +8038,7 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
80748038

80758039
// Update dominator for Bypass & LoopExit.
80768040
DT->changeImmediateDominator(Bypass, TCCheckBlock);
8077-
if (!Cost->requiresScalarEpilogue())
8078-
// For loops with multiple exits, there's no edge from the middle block
8079-
// to exit blocks (as the epilogue must run) and thus no need to update
8080-
// the immediate dominator of the exit blocks.
8081-
DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
8041+
DT->changeImmediateDominator(LoopExitBlock, TCCheckBlock);
80828042

80838043
LoopBypassBlocks.push_back(TCCheckBlock);
80848044

@@ -8142,12 +8102,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
81428102

81438103
DT->changeImmediateDominator(LoopScalarPreHeader,
81448104
EPI.EpilogueIterationCountCheck);
8145-
if (!Cost->requiresScalarEpilogue())
8146-
// If there is an epilogue which must run, there's no edge from the
8147-
// middle block to exit blocks and thus no need to update the immediate
8148-
// dominator of the exit blocks.
8149-
DT->changeImmediateDominator(LoopExitBlock,
8150-
EPI.EpilogueIterationCountCheck);
8105+
DT->changeImmediateDominator(LoopExitBlock, EPI.EpilogueIterationCountCheck);
81518106

81528107
// Keep track of bypass blocks, as they feed start values to the induction
81538108
// phis in the scalar loop preheader.

llvm/test/Transforms/LoopVectorize/first-order-recurrence-complex.ll

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -471,9 +471,10 @@ define i16 @multiple_exit(i16* %p, i32 %n) {
471471
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
472472
; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
473473
; CHECK: middle.block:
474+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
474475
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
475476
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
476-
; CHECK-NEXT: br label [[SCALAR_PH]]
477+
; CHECK-NEXT: br i1 [[CMP_N]], label [[IF_END:%.*]], label [[SCALAR_PH]]
477478
; CHECK: scalar.ph:
478479
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
479480
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -485,14 +486,14 @@ define i16 @multiple_exit(i16* %p, i32 %n) {
485486
; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
486487
; CHECK-NEXT: [[REC_NEXT]] = load i16, i16* [[B]], align 2
487488
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
488-
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
489+
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END]]
489490
; CHECK: for.body:
490491
; CHECK-NEXT: store i16 [[SCALAR_RECUR]], i16* [[B]], align 4
491492
; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1
492493
; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
493494
; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], [[LOOP7:!llvm.loop !.*]]
494495
; CHECK: if.end:
495-
; CHECK-NEXT: [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[FOR_BODY]] ], [ [[SCALAR_RECUR]], [[FOR_COND]] ]
496+
; CHECK-NEXT: [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[FOR_BODY]] ], [ [[SCALAR_RECUR]], [[FOR_COND]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
496497
; CHECK-NEXT: ret i16 [[REC_LCSSA]]
497498
;
498499
entry:
@@ -557,9 +558,10 @@ define i16 @multiple_exit2(i16* %p, i32 %n) {
557558
; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
558559
; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]]
559560
; CHECK: middle.block:
561+
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
560562
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 3
561563
; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT_FOR_PHI:%.*]] = extractelement <4 x i16> [[WIDE_LOAD]], i32 2
562-
; CHECK-NEXT: br label [[SCALAR_PH]]
564+
; CHECK-NEXT: br i1 [[CMP_N]], label [[IF_END:%.*]], label [[SCALAR_PH]]
563565
; CHECK: scalar.ph:
564566
; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[VECTOR_RECUR_EXTRACT]], [[MIDDLE_BLOCK]] ]
565567
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
@@ -571,14 +573,14 @@ define i16 @multiple_exit2(i16* %p, i32 %n) {
571573
; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds i16, i16* [[P]], i64 [[IPROM]]
572574
; CHECK-NEXT: [[REC_NEXT]] = load i16, i16* [[B]], align 2
573575
; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[I]], [[N]]
574-
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END:%.*]]
576+
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[IF_END]]
575577
; CHECK: for.body:
576578
; CHECK-NEXT: store i16 [[SCALAR_RECUR]], i16* [[B]], align 4
577579
; CHECK-NEXT: [[INC]] = add nsw i32 [[I]], 1
578580
; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[I]], 2096
579581
; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_COND]], label [[IF_END]], [[LOOP9:!llvm.loop !.*]]
580582
; CHECK: if.end:
581-
; CHECK-NEXT: [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[FOR_COND]] ], [ 10, [[FOR_BODY]] ]
583+
; CHECK-NEXT: [[REC_LCSSA:%.*]] = phi i16 [ [[SCALAR_RECUR]], [[FOR_COND]] ], [ 10, [[FOR_BODY]] ], [ [[VECTOR_RECUR_EXTRACT_FOR_PHI]], [[MIDDLE_BLOCK]] ]
582584
; CHECK-NEXT: ret i16 [[REC_LCSSA]]
583585
;
584586
entry:

0 commit comments

Comments
 (0)