@@ -467,11 +467,12 @@ class InnerLoopVectorizer {
467
467
ElementCount MinProfitableTripCount,
468
468
unsigned UnrollFactor, LoopVectorizationLegality *LVL,
469
469
LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
470
- ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks)
470
+ ProfileSummaryInfo *PSI, GeneratedRTChecks &RTChecks,
471
+ VPlan &Plan)
471
472
: OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
472
473
AC (AC), ORE(ORE), VF(VecWidth), UF(UnrollFactor),
473
474
Builder(PSE.getSE()->getContext()), Legal(LVL), Cost(CM), BFI(BFI),
474
- PSI(PSI), RTChecks(RTChecks) {
475
+ PSI(PSI), RTChecks(RTChecks), Plan(Plan) {
475
476
// Query this against the original loop and save it here because the profile
476
477
// of the original loop header may change as the transformation happens.
477
478
OptForSizeBasedOnProfile = llvm::shouldOptimizeForSize (
@@ -522,7 +523,7 @@ class InnerLoopVectorizer {
522
523
// / and the resume values can come from an additional bypass block, the \p
523
524
// / AdditionalBypass pair provides information about the bypass block and the
524
525
// / end value on the edge from bypass to this loop.
525
- PHINode * createInductionResumeValue (
526
+ void createInductionResumeValue (
526
527
PHINode *OrigPhi, const InductionDescriptor &ID, Value *Step,
527
528
ArrayRef<BasicBlock *> BypassBlocks,
528
529
std::pair<BasicBlock *, Value *> AdditionalBypass = {nullptr , nullptr });
@@ -535,6 +536,11 @@ class InnerLoopVectorizer {
535
536
// / count of the original loop for both main loop and epilogue vectorization.
536
537
void setTripCount(Value *TC) {
  // Plain store into the TripCount member; nothing is validated or recomputed.
  TripCount = TC;
}
537
538
539
+ std::pair<BasicBlock *, Value *>
540
+ getInductionBypassValue (PHINode *OrigPhi) const {
541
+ return InductionBypassValues.find (OrigPhi)->second ;
542
+ }
543
+
538
544
protected:
539
545
friend class LoopVectorizationPlanner ;
540
546
@@ -677,6 +683,11 @@ class InnerLoopVectorizer {
677
683
// / Structure to hold information about generated runtime checks, responsible
678
684
// / for cleaning the checks, if vectorization turns out unprofitable.
679
685
GeneratedRTChecks &RTChecks;
686
+
687
+ // / Mapping of induction phis to their bypass values and bypass blocks.
688
+ DenseMap<PHINode *, std::pair<BasicBlock *, Value *>> InductionBypassValues;
689
+
690
+ VPlan &Plan;
680
691
};
681
692
682
693
// / Encapsulate information regarding vectorization of a loop and its epilogue.
@@ -718,10 +729,10 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
718
729
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
719
730
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
720
731
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
721
- GeneratedRTChecks &Checks)
732
+ GeneratedRTChecks &Checks, VPlan &Plan )
722
733
: InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
723
734
EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, LVL,
724
- CM, BFI, PSI, Checks),
735
+ CM, BFI, PSI, Checks, Plan ),
725
736
EPI (EPI) {}
726
737
727
738
// Override this function to handle the more complex control flow around the
@@ -758,9 +769,9 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
758
769
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
759
770
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
760
771
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
761
- GeneratedRTChecks &Check)
772
+ GeneratedRTChecks &Check, VPlan &Plan )
762
773
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
763
- EPI, LVL, CM, BFI, PSI, Check) {}
774
+ EPI, LVL, CM, BFI, PSI, Check, Plan ) {}
764
775
// / Implements the interface for creating a vectorized skeleton using the
765
776
// / *main loop* strategy (ie the first pass of vplan execution).
766
777
std::pair<BasicBlock *, Value *>
@@ -787,9 +798,9 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
787
798
OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
788
799
LoopVectorizationLegality *LVL, llvm::LoopVectorizationCostModel *CM,
789
800
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
790
- GeneratedRTChecks &Checks)
801
+ GeneratedRTChecks &Checks, VPlan &Plan )
791
802
: InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
792
- EPI, LVL, CM, BFI, PSI, Checks) {
803
+ EPI, LVL, CM, BFI, PSI, Checks, Plan ) {
793
804
TripCount = EPI.TripCount ;
794
805
}
795
806
// / Implements the interface for creating a vectorized skeleton using the
@@ -2546,7 +2557,18 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
2546
2557
nullptr , Twine (Prefix) + " scalar.ph" );
2547
2558
}
2548
2559
2549
- PHINode *InnerLoopVectorizer::createInductionResumeValue (
2560
+ static void addOperandToPhiInVPIRBasicBlock (VPIRBasicBlock *VPBB, PHINode *P,
2561
+ VPValue *Op) {
2562
+ for (VPRecipeBase &R : *VPBB) {
2563
+ auto *IRI = cast<VPIRInstruction>(&R);
2564
+ if (&IRI->getInstruction () == P) {
2565
+ IRI->addOperand (Op);
2566
+ break ;
2567
+ }
2568
+ }
2569
+ }
2570
+
2571
+ void InnerLoopVectorizer::createInductionResumeValue (
2550
2572
PHINode *OrigPhi, const InductionDescriptor &II, Value *Step,
2551
2573
ArrayRef<BasicBlock *> BypassBlocks,
2552
2574
std::pair<BasicBlock *, Value *> AdditionalBypass) {
@@ -2581,27 +2603,28 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
2581
2603
}
2582
2604
}
2583
2605
2584
- // Create phi nodes to merge from the backedge-taken check block.
2585
- PHINode *BCResumeVal =
2586
- PHINode::Create (OrigPhi->getType (), 3 , " bc.resume.val" ,
2587
- LoopScalarPreHeader->getFirstNonPHIIt ());
2588
- // Copy original phi DL over to the new one.
2589
- BCResumeVal->setDebugLoc (OrigPhi->getDebugLoc ());
2606
+ VPBasicBlock *MiddleVPBB =
2607
+ cast<VPBasicBlock>(Plan.getVectorLoopRegion ()->getSingleSuccessor ());
2590
2608
2591
- // The new PHI merges the original incoming value, in case of a bypass,
2592
- // or the value at the end of the vectorized loop.
2593
- BCResumeVal->addIncoming (EndValue, LoopMiddleBlock);
2609
+ VPBasicBlock *ScalarPHVPBB = nullptr ;
2610
+ if (MiddleVPBB->getNumSuccessors () == 2 ) {
2611
+ // Order is strict: first is the exit block, second is the scalar preheader.
2612
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSuccessors ()[1 ]);
2613
+ } else {
2614
+ ScalarPHVPBB = cast<VPBasicBlock>(MiddleVPBB->getSingleSuccessor ());
2615
+ }
2594
2616
2595
- // Fix the scalar body counter (PHI node).
2596
- // The old induction's phi node in the scalar body needs the truncated
2597
- // value.
2598
- for (BasicBlock *BB : BypassBlocks)
2599
- BCResumeVal-> addIncoming (II. getStartValue ( ), BB );
2617
+ VPBuilder ScalarPHBuilder (ScalarPHVPBB);
2618
+ auto *ResumePhiRecipe = ScalarPHBuilder. createNaryOp (
2619
+ VPInstruction::ResumePhi,
2620
+ {Plan. getOrAddLiveIn (EndValue), Plan. getOrAddLiveIn (II. getStartValue ())},
2621
+ OrigPhi-> getDebugLoc ( ), " bc.resume.val " );
2600
2622
2601
- if (AdditionalBypass.first )
2602
- BCResumeVal->setIncomingValueForBlock (AdditionalBypass.first ,
2603
- EndValueFromAdditionalBypass);
2604
- return BCResumeVal;
2623
+ auto *ScalarLoopHeader =
2624
+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor ());
2625
+ addOperandToPhiInVPIRBasicBlock (ScalarLoopHeader, OrigPhi, ResumePhiRecipe);
2626
+ InductionBypassValues[OrigPhi] = {AdditionalBypass.first ,
2627
+ EndValueFromAdditionalBypass};
2605
2628
}
2606
2629
2607
2630
// / Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
@@ -2634,10 +2657,8 @@ void InnerLoopVectorizer::createInductionResumeValues(
2634
2657
for (const auto &InductionEntry : Legal->getInductionVars ()) {
2635
2658
PHINode *OrigPhi = InductionEntry.first ;
2636
2659
const InductionDescriptor &II = InductionEntry.second ;
2637
- PHINode *BCResumeVal = createInductionResumeValue (
2638
- OrigPhi, II, getExpandedStep (II, ExpandedSCEVs), LoopBypassBlocks,
2639
- AdditionalBypass);
2640
- OrigPhi->setIncomingValueForBlock (LoopScalarPreHeader, BCResumeVal);
2660
+ createInductionResumeValue (OrigPhi, II, getExpandedStep (II, ExpandedSCEVs),
2661
+ LoopBypassBlocks, AdditionalBypass);
2641
2662
}
2642
2663
}
2643
2664
@@ -7738,6 +7759,25 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7738
7759
// the second pass for the scalar loop. The induction resume values for the
7739
7760
// inductions in the epilogue loop are created before executing the plan for
7740
7761
// the epilogue loop.
7762
+ for (VPRecipeBase &R :
7763
+ Plan.getVectorLoopRegion ()->getEntryBasicBlock ()->phis ()) {
7764
+ // Create induction resume values for both widened pointer and
7765
+ // integer/fp inductions and update the start value of the induction
7766
+ // recipes to use the resume value.
7767
+ PHINode *IndPhi = nullptr ;
7768
+ const InductionDescriptor *ID;
7769
+ if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
7770
+ IndPhi = cast<PHINode>(Ind->getUnderlyingValue ());
7771
+ ID = &Ind->getInductionDescriptor ();
7772
+ } else if (auto *WidenInd = dyn_cast<VPWidenIntOrFpInductionRecipe>(&R)) {
7773
+ IndPhi = WidenInd->getPHINode ();
7774
+ ID = &WidenInd->getInductionDescriptor ();
7775
+ } else
7776
+ continue ;
7777
+
7778
+ createInductionResumeValue (IndPhi, *ID, getExpandedStep (*ID, ExpandedSCEVs),
7779
+ LoopBypassBlocks);
7780
+ }
7741
7781
7742
7782
return {LoopVectorPreHeader, nullptr };
7743
7783
}
@@ -8911,14 +8951,9 @@ static void addLiveOutsForFirstOrderRecurrences(
8911
8951
VPInstruction::ResumePhi, {Resume, FOR->getStartValue ()}, {},
8912
8952
" scalar.recur.init" );
8913
8953
auto *FORPhi = cast<PHINode>(FOR->getUnderlyingInstr ());
8914
- for (VPRecipeBase &R :
8915
- *cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor ())) {
8916
- auto *IRI = cast<VPIRInstruction>(&R);
8917
- if (&IRI->getInstruction () == FORPhi) {
8918
- IRI->addOperand (ResumePhiRecipe);
8919
- break ;
8920
- }
8921
- }
8954
+ addOperandToPhiInVPIRBasicBlock (
8955
+ cast<VPIRBasicBlock>(ScalarPHVPBB->getSingleSuccessor ()), FORPhi,
8956
+ ResumePhiRecipe);
8922
8957
8923
8958
// Now update VPIRInstructions modeling LCSSA phis in the exit block.
8924
8959
// Extract the penultimate value of the recurrence and use it as operand for
@@ -9645,7 +9680,7 @@ static bool processLoopInVPlanNativePath(
9645
9680
GeneratedRTChecks Checks (PSE, DT, LI, TTI, F->getDataLayout (),
9646
9681
AddBranchWeights);
9647
9682
InnerLoopVectorizer LB (L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width ,
9648
- VF.Width , 1 , LVL, &CM, BFI, PSI, Checks);
9683
+ VF.Width , 1 , LVL, &CM, BFI, PSI, Checks, BestPlan );
9649
9684
LLVM_DEBUG (dbgs () << " Vectorizing outer loop in \" "
9650
9685
<< L->getHeader ()->getParent ()->getName () << " \"\n " );
9651
9686
LVP.executePlan (VF.Width , 1 , BestPlan, LB, DT, false );
@@ -10133,11 +10168,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10133
10168
assert (IC > 1 && " interleave count should not be 1 or 0" );
10134
10169
// If we decided that it is not legal to vectorize the loop, then
10135
10170
// interleave it.
10171
+ VPlan &BestPlan = LVP.getPlanFor (VF.Width );
10136
10172
InnerLoopVectorizer Unroller (
10137
10173
L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed (1 ),
10138
- ElementCount::getFixed (1 ), IC, &LVL, &CM, BFI, PSI, Checks);
10174
+ ElementCount::getFixed (1 ), IC, &LVL, &CM, BFI, PSI, Checks, BestPlan );
10139
10175
10140
- VPlan &BestPlan = LVP.getPlanFor (VF.Width );
10141
10176
LVP.executePlan (VF.Width , IC, BestPlan, Unroller, DT, false );
10142
10177
10143
10178
ORE->emit ([&]() {
@@ -10159,10 +10194,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10159
10194
// to be vectorized by executing the plan (potentially with a different
10160
10195
// factor) again shortly afterwards.
10161
10196
EpilogueLoopVectorizationInfo EPI (VF.Width , IC, EpilogueVF.Width , 1 );
10197
+ std::unique_ptr<VPlan> BestMainPlan (BestPlan.duplicate ());
10162
10198
EpilogueVectorizerMainLoop MainILV (L, PSE, LI, DT, TLI, TTI, AC, ORE,
10163
- EPI, &LVL, &CM, BFI, PSI, Checks);
10199
+ EPI, &LVL, &CM, BFI, PSI, Checks,
10200
+ *BestMainPlan);
10164
10201
10165
- std::unique_ptr<VPlan> BestMainPlan (BestPlan.duplicate ());
10166
10202
auto ExpandedSCEVs = LVP.executePlan (EPI.MainLoopVF , EPI.MainLoopUF ,
10167
10203
*BestMainPlan, MainILV, DT, true );
10168
10204
++LoopsVectorized;
@@ -10171,11 +10207,11 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10171
10207
// edges from the first pass.
10172
10208
EPI.MainLoopVF = EPI.EpilogueVF ;
10173
10209
EPI.MainLoopUF = EPI.EpilogueUF ;
10210
+ VPlan &BestEpiPlan = LVP.getPlanFor (EPI.EpilogueVF );
10174
10211
EpilogueVectorizerEpilogueLoop EpilogILV (L, PSE, LI, DT, TLI, TTI, AC,
10175
10212
ORE, EPI, &LVL, &CM, BFI, PSI,
10176
- Checks);
10213
+ Checks, BestEpiPlan );
10177
10214
10178
- VPlan &BestEpiPlan = LVP.getPlanFor (EPI.EpilogueVF );
10179
10215
VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion ();
10180
10216
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock ();
10181
10217
Header->setName (" vec.epilog.vector.body" );
@@ -10224,23 +10260,16 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10224
10260
RdxDesc.getRecurrenceStartValue ());
10225
10261
}
10226
10262
} else {
10227
- // Create induction resume values for both widened pointer and
10228
- // integer/fp inductions and update the start value of the induction
10229
- // recipes to use the resume value.
10263
+ // Retrieve the induction resume values for wide inductions from
10264
+ // their original phi nodes in the scalar loop
10230
10265
PHINode *IndPhi = nullptr ;
10231
- const InductionDescriptor *ID;
10232
10266
if (auto *Ind = dyn_cast<VPWidenPointerInductionRecipe>(&R)) {
10233
10267
IndPhi = cast<PHINode>(Ind->getUnderlyingValue ());
10234
- ID = &Ind->getInductionDescriptor ();
10235
10268
} else {
10236
10269
auto *WidenInd = cast<VPWidenIntOrFpInductionRecipe>(&R);
10237
10270
IndPhi = WidenInd->getPHINode ();
10238
- ID = &WidenInd->getInductionDescriptor ();
10239
10271
}
10240
-
10241
- ResumeV = MainILV.createInductionResumeValue (
10242
- IndPhi, *ID, getExpandedStep (*ID, ExpandedSCEVs),
10243
- {EPI.MainLoopIterationCountCheck });
10272
+ ResumeV = IndPhi->getIncomingValueForBlock (L->getLoopPreheader ());
10244
10273
}
10245
10274
assert (ResumeV && " Must have a resume value" );
10246
10275
VPValue *StartVal = BestEpiPlan.getOrAddLiveIn (ResumeV);
@@ -10252,13 +10281,19 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10252
10281
LVP.executePlan (EPI.EpilogueVF , EPI.EpilogueUF , BestEpiPlan, EpilogILV,
10253
10282
DT, true , &ExpandedSCEVs);
10254
10283
++LoopsEpilogueVectorized;
10284
+ BasicBlock *PH = L->getLoopPreheader ();
10255
10285
10286
+ for (const auto &[IVPhi, _] : LVL.getInductionVars ()) {
10287
+ auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock (PH));
10288
+ const auto &[BB, V] = EpilogILV.getInductionBypassValue (IVPhi);
10289
+ Inc->setIncomingValueForBlock (BB, V);
10290
+ }
10256
10291
if (!MainILV.areSafetyChecksAdded ())
10257
10292
DisableRuntimeUnroll = true ;
10258
10293
} else {
10259
10294
InnerLoopVectorizer LB (L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width ,
10260
10295
VF.MinProfitableTripCount , IC, &LVL, &CM, BFI,
10261
- PSI, Checks);
10296
+ PSI, Checks, BestPlan );
10262
10297
LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
10263
10298
++LoopsVectorized;
10264
10299
0 commit comments