@@ -816,7 +816,7 @@ class InnerLoopVectorizer {
816
816
// / Middle Block between the vector and the scalar.
817
817
BasicBlock *LoopMiddleBlock;
818
818
819
- // / The ( unique) ExitBlock of the scalar loop. Note that
819
+ // / The unique ExitBlock of the scalar loop if one exists . Note that
820
820
// / there can be multiple exiting edges reaching this block.
821
821
BasicBlock *LoopExitBlock;
822
822
@@ -3268,9 +3268,13 @@ void InnerLoopVectorizer::emitMinimumIterationCountCheck(Loop *L,
3268
3268
DT->getNode (Bypass)->getIDom ()) &&
3269
3269
" TC check is expected to dominate Bypass" );
3270
3270
3271
- // Update dominator for Bypass & LoopExit.
3271
+ // Update dominator for Bypass & LoopExit (if needed) .
3272
3272
DT->changeImmediateDominator (Bypass, TCCheckBlock);
3273
- DT->changeImmediateDominator (LoopExitBlock, TCCheckBlock);
3273
+ if (!Cost->requiresScalarEpilogue (VF))
3274
+ // If there is an epilogue which must run, there's no edge from the
3275
+ // middle block to exit blocks and thus no need to update the immediate
3276
+ // dominator of the exit blocks.
3277
+ DT->changeImmediateDominator (LoopExitBlock, TCCheckBlock);
3274
3278
3275
3279
ReplaceInstWithInst (
3276
3280
TCCheckBlock->getTerminator (),
@@ -3294,7 +3298,11 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(Loop *L, BasicBlock *Bypass) {
3294
3298
// Update dominator only if this is first RT check.
3295
3299
if (LoopBypassBlocks.empty ()) {
3296
3300
DT->changeImmediateDominator (Bypass, SCEVCheckBlock);
3297
- DT->changeImmediateDominator (LoopExitBlock, SCEVCheckBlock);
3301
+ if (!Cost->requiresScalarEpilogue (VF))
3302
+ // If there is an epilogue which must run, there's no edge from the
3303
+ // middle block to exit blocks and thus no need to update the immediate
3304
+ // dominator of the exit blocks.
3305
+ DT->changeImmediateDominator (LoopExitBlock, SCEVCheckBlock);
3298
3306
}
3299
3307
3300
3308
LoopBypassBlocks.push_back (SCEVCheckBlock);
@@ -3447,9 +3455,10 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
3447
3455
Loop *InnerLoopVectorizer::createVectorLoopSkeleton (StringRef Prefix) {
3448
3456
LoopScalarBody = OrigLoop->getHeader ();
3449
3457
LoopVectorPreHeader = OrigLoop->getLoopPreheader ();
3450
- LoopExitBlock = OrigLoop->getUniqueExitBlock ();
3451
- assert (LoopExitBlock && " Must have an exit block" );
3452
3458
assert (LoopVectorPreHeader && " Invalid loop structure" );
3459
+ LoopExitBlock = OrigLoop->getUniqueExitBlock (); // may be nullptr
3460
+ assert ((LoopExitBlock || Cost->requiresScalarEpilogue (VF)) &&
3461
+ " multiple exit loop without required epilogue?" );
3453
3462
3454
3463
LoopMiddleBlock =
3455
3464
SplitBlock (LoopVectorPreHeader, LoopVectorPreHeader->getTerminator (), DT,
@@ -3458,12 +3467,20 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3458
3467
SplitBlock (LoopMiddleBlock, LoopMiddleBlock->getTerminator (), DT, LI,
3459
3468
nullptr , Twine (Prefix) + " scalar.ph" );
3460
3469
3461
- // Set up branch from middle block to the exit and scalar preheader blocks.
3462
- // completeLoopSkeleton will update the condition to use an iteration check,
3463
- // if required to decide whether to execute the remainder.
3464
- BranchInst *BrInst =
3465
- BranchInst::Create (LoopExitBlock, LoopScalarPreHeader, Builder.getTrue ());
3466
3470
auto *ScalarLatchTerm = OrigLoop->getLoopLatch ()->getTerminator ();
3471
+
3472
+ // Set up the middle block terminator. Two cases:
3473
+ // 1) If we know that we must execute the scalar epilogue, emit an
3474
+ // unconditional branch.
3475
+ // 2) Otherwise, we must have a single unique exit block (due to how we
3476
+ // implement the multiple exit case). In this case, set up a conditional
3477
+ // branch from the middle block to the loop scalar preheader, and the
3478
+ // exit block. completeLoopSkeleton will update the condition to use an
3479
+ // iteration check, if required to decide whether to execute the remainder.
3480
+ BranchInst *BrInst = Cost->requiresScalarEpilogue (VF) ?
3481
+ BranchInst::Create (LoopScalarPreHeader) :
3482
+ BranchInst::Create (LoopExitBlock, LoopScalarPreHeader,
3483
+ Builder.getTrue ());
3467
3484
BrInst->setDebugLoc (ScalarLatchTerm->getDebugLoc ());
3468
3485
ReplaceInstWithInst (LoopMiddleBlock->getTerminator (), BrInst);
3469
3486
@@ -3475,7 +3492,11 @@ Loop *InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
3475
3492
nullptr , nullptr , Twine (Prefix) + " vector.body" );
3476
3493
3477
3494
// Update dominator for loop exit.
3478
- DT->changeImmediateDominator (LoopExitBlock, LoopMiddleBlock);
3495
+ if (!Cost->requiresScalarEpilogue (VF))
3496
+ // If there is an epilogue which must run, there's no edge from the
3497
+ // middle block to exit blocks and thus no need to update the immediate
3498
+ // dominator of the exit blocks.
3499
+ DT->changeImmediateDominator (LoopExitBlock, LoopMiddleBlock);
3479
3500
3480
3501
// Create and register the new vector loop.
3481
3502
Loop *Lp = LI->AllocateLoop ();
@@ -3577,10 +3598,14 @@ BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(Loop *L,
3577
3598
auto *ScalarLatchTerm = OrigLoop->getLoopLatch ()->getTerminator ();
3578
3599
3579
3600
// Add a check in the middle block to see if we have completed
3580
- // all of the iterations in the first vector loop.
3581
- // If (N - N%VF) == N, then we *don't* need to run the remainder.
3582
- // If tail is to be folded, we know we don't need to run the remainder.
3583
- if (!Cost->foldTailByMasking ()) {
3601
+ // all of the iterations in the first vector loop. Three cases:
3602
+ // 1) If we require a scalar epilogue, there is no conditional branch as
3603
+ // we unconditionally branch to the scalar preheader. Do nothing.
3604
+ // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
3605
+ // Thus if tail is to be folded, we know we don't need to run the
3606
+ // remainder and we can use the previous value for the condition (true).
3607
+ // 3) Otherwise, construct a runtime check.
3608
+ if (!Cost->requiresScalarEpilogue (VF) && !Cost->foldTailByMasking ()) {
3584
3609
Instruction *CmpN = CmpInst::Create (Instruction::ICmp, CmpInst::ICMP_EQ,
3585
3610
Count, VectorTripCount, " cmp.n" ,
3586
3611
LoopMiddleBlock->getTerminator ());
@@ -3644,17 +3669,17 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
3644
3669
| [ ]_| <-- vector loop.
3645
3670
| |
3646
3671
| v
3647
- | -[ ] <--- middle-block.
3648
- | / |
3649
- | / v
3650
- -|- >[ ] <--- new preheader.
3672
+ \ -[ ] <--- middle-block.
3673
+ \/ |
3674
+ /\ v
3675
+ | - >[ ] <--- new preheader.
3651
3676
| |
3652
- | v
3677
+ (opt) v <-- edge from middle to exit iff epilogue is not required.
3653
3678
| [ ] \
3654
- | [ ]_| <-- old scalar loop to handle remainder.
3679
+ | [ ]_| <-- old scalar loop to handle remainder (scalar epilogue) .
3655
3680
\ |
3656
3681
\ v
3657
- >[ ] <-- exit block.
3682
+ >[ ] <-- exit block(s) .
3658
3683
...
3659
3684
*/
3660
3685
@@ -4088,13 +4113,19 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
4088
4113
// Forget the original basic block.
4089
4114
PSE.getSE ()->forgetLoop (OrigLoop);
4090
4115
4091
- // Fix-up external users of the induction variables.
4092
- for (auto &Entry : Legal->getInductionVars ())
4093
- fixupIVUsers (Entry.first , Entry.second ,
4094
- getOrCreateVectorTripCount (LI->getLoopFor (LoopVectorBody)),
4095
- IVEndValues[Entry.first ], LoopMiddleBlock);
4116
+ // If we inserted an edge from the middle block to the unique exit block,
4117
+ // update uses outside the loop (phis) to account for the newly inserted
4118
+ // edge.
4119
+ if (!Cost->requiresScalarEpilogue (VF)) {
4120
+ // Fix-up external users of the induction variables.
4121
+ for (auto &Entry : Legal->getInductionVars ())
4122
+ fixupIVUsers (Entry.first , Entry.second ,
4123
+ getOrCreateVectorTripCount (LI->getLoopFor (LoopVectorBody)),
4124
+ IVEndValues[Entry.first ], LoopMiddleBlock);
4125
+
4126
+ fixLCSSAPHIs (State);
4127
+ }
4096
4128
4097
- fixLCSSAPHIs (State);
4098
4129
for (Instruction *PI : PredicatedInstructions)
4099
4130
sinkScalarOperands (&*PI);
4100
4131
@@ -4309,12 +4340,13 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(VPWidenPHIRecipe *PhiR,
4309
4340
// recurrence in the exit block, and then add an edge for the middle block.
4310
4341
// Note that LCSSA does not imply single entry when the original scalar loop
4311
4342
// had multiple exiting edges (as we always run the last iteration in the
4312
- // scalar epilogue); in that case, the exiting path through middle will be
4313
- // dynamically dead and the value picked for the phi doesn't matter.
4314
- for (PHINode &LCSSAPhi : LoopExitBlock->phis ())
4315
- if (any_of (LCSSAPhi.incoming_values (),
4316
- [Phi](Value *V) { return V == Phi; }))
4317
- LCSSAPhi.addIncoming (ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4343
+ // scalar epilogue); in that case, there is no edge from middle to exit and
4344
+ // thus no phis which need updating.
4345
+ if (!Cost->requiresScalarEpilogue (VF))
4346
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis ())
4347
+ if (any_of (LCSSAPhi.incoming_values (),
4348
+ [Phi](Value *V) { return V == Phi; }))
4349
+ LCSSAPhi.addIncoming (ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
4318
4350
}
4319
4351
4320
4352
void InnerLoopVectorizer::fixReduction (VPReductionPHIRecipe *PhiR,
@@ -4483,10 +4515,11 @@ void InnerLoopVectorizer::fixReduction(VPReductionPHIRecipe *PhiR,
4483
4515
// We know that the loop is in LCSSA form. We need to update the PHI nodes
4484
4516
// in the exit blocks. See comment on analogous loop in
4485
4517
// fixFirstOrderRecurrence for a more complete explanation of the logic.
4486
- for (PHINode &LCSSAPhi : LoopExitBlock->phis ())
4487
- if (any_of (LCSSAPhi.incoming_values (),
4488
- [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4489
- LCSSAPhi.addIncoming (ReducedPartRdx, LoopMiddleBlock);
4518
+ if (!Cost->requiresScalarEpilogue (VF))
4519
+ for (PHINode &LCSSAPhi : LoopExitBlock->phis ())
4520
+ if (any_of (LCSSAPhi.incoming_values (),
4521
+ [LoopExitInst](Value *V) { return V == LoopExitInst; }))
4522
+ LCSSAPhi.addIncoming (ReducedPartRdx, LoopMiddleBlock);
4490
4523
4491
4524
// Fix the scalar loop reduction variable with the incoming reduction sum
4492
4525
// from the vector body and from the backedge value.
@@ -8316,7 +8349,11 @@ BasicBlock *EpilogueVectorizerMainLoop::emitMinimumIterationCountCheck(
8316
8349
8317
8350
// Update dominator for Bypass & LoopExit.
8318
8351
DT->changeImmediateDominator (Bypass, TCCheckBlock);
8319
- DT->changeImmediateDominator (LoopExitBlock, TCCheckBlock);
8352
+ if (!Cost->requiresScalarEpilogue (EPI.EpilogueVF ))
8353
+ // For loops with multiple exits, there's no edge from the middle block
8354
+ // to exit blocks (as the epilogue must run) and thus no need to update
8355
+ // the immediate dominator of the exit blocks.
8356
+ DT->changeImmediateDominator (LoopExitBlock, TCCheckBlock);
8320
8357
8321
8358
LoopBypassBlocks.push_back (TCCheckBlock);
8322
8359
@@ -8380,7 +8417,12 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
8380
8417
8381
8418
DT->changeImmediateDominator (LoopScalarPreHeader,
8382
8419
EPI.EpilogueIterationCountCheck );
8383
- DT->changeImmediateDominator (LoopExitBlock, EPI.EpilogueIterationCountCheck );
8420
+ if (!Cost->requiresScalarEpilogue (EPI.EpilogueVF ))
8421
+ // If there is an epilogue which must run, there's no edge from the
8422
+ // middle block to exit blocks and thus no need to update the immediate
8423
+ // dominator of the exit blocks.
8424
+ DT->changeImmediateDominator (LoopExitBlock,
8425
+ EPI.EpilogueIterationCountCheck );
8384
8426
8385
8427
// Keep track of bypass blocks, as they feed start values to the induction
8386
8428
// phis in the scalar loop preheader.
0 commit comments