@@ -852,7 +852,7 @@ class InnerLoopVectorizer {
852
852
// / Middle Block between the vector and the scalar.
853
853
BasicBlock *LoopMiddleBlock;
854
854
855
- // / The ( unique) ExitBlock of the scalar loop. Note that
855
+ // / The unique ExitBlock of the scalar loop if one exists . Note that
856
856
// / there can be multiple exiting edges reaching this block.
857
857
BasicBlock *LoopExitBlock;
858
858
@@ -3147,9 +3147,13 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3147
3147
DT->getNode (Bypass)->getIDom ()) &&
3148
3148
" TC check is expected to dominate Bypass" );
3149
3149
3150
- // Update dominator for Bypass & LoopExit.
3150
+ // Update dominator for Bypass & LoopExit (if needed) .
3151
3151
DT->changeImmediateDominator (Bypass, TCCheckBlock);
3152
- DT->changeImmediateDominator (LoopExitBlock, TCCheckBlock);
3152
+ if (!Cost->requiresScalarEpilogue ())
3153
+ // If there is an epilogue which must run, there's no edge from the
3154
+ // middle block to exit blocks and thus no need to update the immediate
3155
+ // dominator of the exit blocks.
3156
+ DT->changeImmediateDominator (LoopExitBlock, TCCheckBlock);
3153
3157
3154
3158
ReplaceInstWithInst (
3155
3159
TCCheckBlock->getTerminator (),
@@ -3188,7 +3192,11 @@ void InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3188
3192
// Update dominator only if this is first RT check.
3189
3193
if (LoopBypassBlocks.empty ()) {
3190
3194
DT->changeImmediateDominator (Bypass, SCEVCheckBlock);
3191
- DT->changeImmediateDominator (LoopExitBlock, SCEVCheckBlock);
3195
+ if (!Cost->requiresScalarEpilogue ())
3196
+ // If there is an epilogue which must run, there's no edge from the
3197
+ // middle block to exit blocks and thus no need to update the immediate
3198
+ // dominator of the exit blocks.
3199
+ DT->changeImmediateDominator (LoopExitBlock, SCEVCheckBlock);
3192
3200
}
3193
3201
3194
3202
ReplaceInstWithInst (
@@ -3244,7 +3252,11 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
3244
3252
// Update dominator only if this is first RT check.
3245
3253
if (LoopBypassBlocks.empty ()) {
3246
3254
DT->changeImmediateDominator (Bypass, MemCheckBlock);
3247
- DT->changeImmediateDominator (LoopExitBlock, MemCheckBlock);
3255
+ if (!Cost->requiresScalarEpilogue ())
3256
+ // If there is an epilogue which must run, there's no edge from the
3257
+ // middle block to exit blocks and thus no need to update the immediate
3258
+ // dominator of the exit blocks.
3259
+ DT->changeImmediateDominator (LoopExitBlock, MemCheckBlock);
3248
3260
}
3249
3261
3250
3262
Instruction *FirstCheckInst;
@@ -3369,9 +3381,10 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
3369
3381
Loop *InnerLoopVectorizer::createVectorLoopSkeleton (StringRef Prefix) {
3370
3382
LoopScalarBody = OrigLoop->getHeader ();
3371
3383
LoopVectorPreHeader = OrigLoop->getLoopPreheader ();
3372
- LoopExitBlock = OrigLoop->getUniqueExitBlock ();
3373
- assert (LoopExitBlock && " Must have an exit block" );
3374
3384
assert (LoopVectorPreHeader && " Invalid loop structure" );
3385
+ LoopExitBlock = OrigLoop->getUniqueExitBlock (); // may be nullptr
3386
+ assert ((LoopExitBlock || Cost->requiresScalarEpilogue ()) &&
3387
+ " multiple exit loop without required epilogue?" );
3375
3388
3376
3389
LoopMiddleBlock =
3377
3390
SplitBlock (LoopVectorPreHeader, LoopVectorPreHeader->getTerminator (), DT,
@@ -3380,12 +3393,20 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3380
3393
SplitBlock (LoopMiddleBlock, LoopMiddleBlock->getTerminator (), DT, LI,
3381
3394
nullptr , Twine (Prefix) + " scalar.ph" );
3382
3395
3383
- // Set up branch from middle block to the exit and scalar preheader blocks.
3384
- // completeLoopSkeleton will update the condition to use an iteration check,
3385
- // if required to decide whether to execute the remainder.
3386
- BranchInst *BrInst =
3387
- BranchInst::Create (LoopExitBlock, LoopScalarPreHeader, Builder.getTrue ());
3388
3396
auto *ScalarLatchTerm = OrigLoop->getLoopLatch ()->getTerminator ();
3397
+
3398
+ // Set up the middle block terminator. Two cases:
3399
+ // 1) If we know that we must execute the scalar epilogue, emit an
3400
+ // unconditional branch.
3401
+ // 2) Otherwise, we must have a single unique exit block (due to how we
3402
+ // implement the multiple exit case). In this case, set up a conditional
3403
+ // branch from the middle block to the loop scalar preheader, and the
3404
+ // exit block. completeLoopSkeleton will update the condition to use an
3405
+ // iteration check, if required to decide whether to execute the remainder.
3406
+ BranchInst *BrInst = Cost->requiresScalarEpilogue () ?
3407
+ BranchInst::Create (LoopScalarPreHeader) :
3408
+ BranchInst::Create (LoopExitBlock, LoopScalarPreHeader,
3409
+ Builder.getTrue ());
3389
3410
BrInst->setDebugLoc (ScalarLatchTerm->getDebugLoc ());
3390
3411
ReplaceInstWithInst (LoopMiddleBlock->getTerminator (), BrInst);
3391
3412
@@ -3397,7 +3418,11 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3397
3418
nullptr , nullptr , Twine (Prefix) + " vector.body" );
3398
3419
3399
3420
// Update dominator for loop exit.
3400
- DT->changeImmediateDominator (LoopExitBlock, LoopMiddleBlock);
3421
+ if (!Cost->requiresScalarEpilogue ())
3422
+ // If there is an epilogue which must run, there's no edge from the
3423
+ // middle block to exit blocks and thus no need to update the immediate
3424
+ // dominator of the exit blocks.
3425
+ DT->changeImmediateDominator (LoopExitBlock, LoopMiddleBlock);
3401
3426
3402
3427
// Create and register the new vector loop.
3403
3428
Loop *Lp = LI->AllocateLoop ();
@@ -3494,10 +3519,14 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3494
3519
auto *ScalarLatchTerm = OrigLoop->getLoopLatch ()->getTerminator ();
3495
3520
3496
3521
// Add a check in the middle block to see if we have completed
3497
- // all of the iterations in the first vector loop.
3498
- // If (N - N%VF) == N, then we *don't* need to run the remainder.
3499
- // If tail is to be folded, we know we don't need to run the remainder.
3500
- if (!Cost->foldTailByMasking ()) {
3522
+ // all of the iterations in the first vector loop. Three cases:
3523
+ // 1) If we require a scalar epilogue, there is no conditional branch as
3524
+ // we unconditionally branch to the scalar preheader. Do nothing.
3525
+ // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3526
+ // Thus if tail is to be folded, we know we don't need to run the
3527
+ // remainder and we can use the previous value for the condition (true).
3528
+ // 3) Otherwise, construct a runtime check.
3529
+ if (!Cost->requiresScalarEpilogue () && !Cost->foldTailByMasking ()) {
3501
3530
Instruction *CmpN = CmpInst::Create (Instruction::ICmp, CmpInst::ICMP_EQ,
3502
3531
Count, VectorTripCount, " cmp.n" ,
3503
3532
LoopMiddleBlock->getTerminator ());
@@ -3561,17 +3590,17 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3561
3590
| [ ]_| <-- vector loop.
3562
3591
| |
3563
3592
| v
3564
- | -[ ] <--- middle-block.
3565
- | / |
3566
- | / v
3567
- -|- >[ ] <--- new preheader.
3593
+ \ -[ ] <--- middle-block.
3594
+ \/ |
3595
+ /\ v
3596
+ | - >[ ] <--- new preheader.
3568
3597
| |
3569
- | v
3598
+ (opt) v <-- edge from middle to exit iff epilogue is not required.
3570
3599
| [ ] \
3571
- | [ ]_| <-- old scalar loop to handle remainder.
3600
+ | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue) .
3572
3601
\ |
3573
3602
\ v
3574
- >[ ] <-- exit block.
3603
+ >[ ] <-- exit block(s) .
3575
3604
...
3576
3605
*/
3577
3606
@@ -3975,13 +4004,18 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
3975
4004
// Forget the original basic block.
3976
4005
PSE.getSE ()->forgetLoop (OrigLoop);
3977
4006
3978
- // Fix-up external users of the induction variables.
3979
- for (auto &Entry : Legal->getInductionVars ())
3980
- fixupIVUsers (Entry.first , Entry.second ,
3981
- getOrCreateVectorTripCount (LI->getLoopFor (LoopVectorBody)),
3982
- IVEndValues[Entry.first ], LoopMiddleBlock);
4007
+ // If we inserted an edge from the middle block to the unique exit block,
4008
+ // update uses outside the loop (phis) to account for the newly inserted
4009
+ // edge.
4010
+ if (!Cost->requiresScalarEpilogue ()) {
4011
+ // Fix-up external users of the induction variables.
4012
+ for (auto &Entry : Legal->getInductionVars ())
4013
+ fixupIVUsers (Entry.first , Entry.second ,
4014
+ getOrCreateVectorTripCount (LI->getLoopFor (LoopVectorBody)),
4015
+ IVEndValues[Entry.first ], LoopMiddleBlock);
3983
4016
3984
- fixLCSSAPHIs ();
4017
+ fixLCSSAPHIs ();
4018
+ }
3985
4019
for (Instruction *PI : PredicatedInstructions)
3986
4020
sinkScalarOperands (&*PI);
3987
4021
@@ -4199,12 +4233,13 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
4199
4233
// recurrence in the exit block, and then add an edge for the middle block.
4200
4234
// Note that LCSSA does not imply single entry when the original scalar loop
4201
4235
// had multiple exiting edges (as we always run the last iteration in the
4202
- // scalar epilogue); in that case, the exiting path through middle will be
4203
- // dynamically dead and the value picked for the phi doesn't matter.
4204
- for (PHINode &LCSSAPhi : LoopExitBlock->phis ())
4205
- if (any_of (LCSSAPhi.incoming_values (),
4206
- [Phi](Value *V) { return V == Phi; }))
4207
- LCSSAPhi.addIncoming (ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4236
+ // scalar epilogue); in that case, there is no edge from middle to exit and
4237
+ // thus no phis which need to be updated.
4238
+ if (!Cost->requiresScalarEpilogue ())
4239
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis ())
4240
+ if (any_of (LCSSAPhi.incoming_values (),
4241
+ [Phi](Value *V) { return V == Phi; }))
4242
+ LCSSAPhi.addIncoming (ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4208
4243
}
4209
4244
4210
4245
void InnerLoopVectorizer::fixReduction (PHINode *Phi) {
@@ -4369,10 +4404,11 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
4369
4404
// We know that the loop is in LCSSA form. We need to update the PHI nodes
4370
4405
// in the exit blocks. See comment on analogous loop in
4371
4406
// fixFirstOrderRecurrence for a more complete explanation of the logic.
4372
- for (PHINode &LCSSAPhi : LoopExitBlock->phis ())
4373
- if (any_of (LCSSAPhi.incoming_values (),
4374
- [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4375
- LCSSAPhi.addIncoming (ReducedPartRdx, LoopMiddleBlock);
4407
+ if (!Cost->requiresScalarEpilogue ())
4408
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis ())
4409
+ if (any_of (LCSSAPhi.incoming_values (),
4410
+ [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4411
+ LCSSAPhi.addIncoming (ReducedPartRdx, LoopMiddleBlock);
4376
4412
4377
4413
// Fix the scalar loop reduction variable with the incoming reduction sum
4378
4414
// from the vector body and from the backedge value.
@@ -8021,7 +8057,11 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8021
8057
8022
8058
// Update dominator for Bypass & LoopExit.
8023
8059
DT->changeImmediateDominator (Bypass, TCCheckBlock);
8024
- DT->changeImmediateDominator (LoopExitBlock, TCCheckBlock);
8060
+ if (!Cost->requiresScalarEpilogue ())
8061
+ // For loops with multiple exits, there's no edge from the middle block
8062
+ // to exit blocks (as the epilogue must run) and thus no need to update
8063
+ // the immediate dominator of the exit blocks.
8064
+ DT->changeImmediateDominator (LoopExitBlock, TCCheckBlock);
8025
8065
8026
8066
LoopBypassBlocks.push_back (TCCheckBlock);
8027
8067
@@ -8085,7 +8125,12 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
8085
8125
8086
8126
DT->changeImmediateDominator (LoopScalarPreHeader,
8087
8127
EPI.EpilogueIterationCountCheck );
8088
- DT->changeImmediateDominator (LoopExitBlock, EPI.EpilogueIterationCountCheck );
8128
+ if (!Cost->requiresScalarEpilogue ())
8129
+ // If there is an epilogue which must run, there's no edge from the
8130
+ // middle block to exit blocks and thus no need to update the immediate
8131
+ // dominator of the exit blocks.
8132
+ DT->changeImmediateDominator (LoopExitBlock,
8133
+ EPI.EpilogueIterationCountCheck );
8089
8134
8090
8135
// Keep track of bypass blocks, as they feed start values to the induction
8091
8136
// phis in the scalar loop preheader.
0 commit comments