@@ -59,6 +59,7 @@
 #include "VPlan.h"
 #include "VPlanAnalysis.h"
 #include "VPlanHCFGBuilder.h"
+#include "VPlanPatternMatch.h"
 #include "VPlanTransforms.h"
 #include "VPlanVerifier.h"
 #include "llvm/ADT/APInt.h"
@@ -2972,22 +2973,7 @@ void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
   SplitBlock(LoopMiddleBlock, LoopMiddleBlock->getTerminator(), DT, LI,
              nullptr, Twine(Prefix) + "scalar.ph");
 
-  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
-
-  // Set up the middle block terminator. Two cases:
-  // 1) If we know that we must execute the scalar epilogue, emit an
-  //    unconditional branch.
-  // 2) Otherwise, we must have a single unique exit block (due to how we
-  //    implement the multiple exit case). In this case, set up a conditional
-  //    branch from the middle block to the loop scalar preheader, and the
-  //    exit block. completeLoopSkeleton will update the condition to use an
-  //    iteration check, if required to decide whether to execute the remainder.
-  BranchInst *BrInst =
-      Cost->requiresScalarEpilogue(VF.isVector())
-          ? BranchInst::Create(LoopScalarPreHeader)
-          : BranchInst::Create(LoopExitBlock, LoopScalarPreHeader,
-                               Builder.getTrue());
-  BrInst->setDebugLoc(ScalarLatchTerm->getDebugLoc());
+  auto *BrInst = new UnreachableInst(LoopMiddleBlock->getContext());
   ReplaceInstWithInst(LoopMiddleBlock->getTerminator(), BrInst);
 
   // Update dominator for loop exit. During skeleton creation, only the vector
@@ -3094,50 +3080,7 @@ void InnerLoopVectorizer::createInductionResumeValues(
   }
 }
 
-BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() {
-  // The trip counts should be cached by now.
-  Value *Count = getTripCount();
-  Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader);
-
-  auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator();
-
-  // Add a check in the middle block to see if we have completed
-  // all of the iterations in the first vector loop. Three cases:
-  // 1) If we require a scalar epilogue, there is no conditional branch as
-  //    we unconditionally branch to the scalar preheader. Do nothing.
-  // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
-  //    Thus if tail is to be folded, we know we don't need to run the
-  //    remainder and we can use the previous value for the condition (true).
-  // 3) Otherwise, construct a runtime check.
-  if (!Cost->requiresScalarEpilogue(VF.isVector()) &&
-      !Cost->foldTailByMasking()) {
-    // Here we use the same DebugLoc as the scalar loop latch terminator instead
-    // of the corresponding compare because they may have ended up with
-    // different line numbers and we want to avoid awkward line stepping while
-    // debugging. Eg. if the compare has got a line number inside the loop.
-    // TODO: At the moment, CreateICmpEQ will simplify conditions with constant
-    // operands. Perform simplification directly on VPlan once the branch is
-    // modeled there.
-    IRBuilder<> B(LoopMiddleBlock->getTerminator());
-    B.SetCurrentDebugLocation(ScalarLatchTerm->getDebugLoc());
-    Value *CmpN = B.CreateICmpEQ(Count, VectorTripCount, "cmp.n");
-    BranchInst &BI = *cast<BranchInst>(LoopMiddleBlock->getTerminator());
-    BI.setCondition(CmpN);
-    if (hasBranchWeightMD(*ScalarLatchTerm)) {
-      // Assume that `Count % VectorTripCount` is equally distributed.
-      unsigned TripCount = UF * VF.getKnownMinValue();
-      assert(TripCount > 0 && "trip count should not be zero");
-      const uint32_t Weights[] = {1, TripCount - 1};
-      setBranchWeights(BI, Weights);
-    }
-  }
-
-#ifdef EXPENSIVE_CHECKS
-  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
-#endif
 
-  return LoopVectorPreHeader;
-}
 
 std::pair<BasicBlock *, Value *>
 InnerLoopVectorizer::createVectorizedLoopSkeleton(
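For reference, the check that completeLoopSkeleton used to materialize compares the original trip count against the vector trip count, and its branch weights assume the remainder length is uniformly distributed. A minimal standalone sketch of that arithmetic (plain C++, no LLVM dependencies; the VF, UF, and N values are made up for illustration):

```cpp
#include <cassert>
#include <cstdint>
#include <iostream>

int main() {
  // Hypothetical configuration: VF = 4 lanes, unrolled UF = 2 times.
  uint64_t VF = 4, UF = 2;
  uint64_t N = 1003; // original scalar trip count ("Count" above)

  // Vector trip count: largest multiple of VF * UF not exceeding N.
  uint64_t Step = VF * UF;
  uint64_t VectorTripCount = N - (N % Step);

  // cmp.n from the deleted code: if Count == VectorTripCount, the scalar
  // remainder loop can be skipped entirely.
  bool SkipRemainder = (N == VectorTripCount);

  // Branch weights {1, Step - 1}: assuming N % Step is uniformly
  // distributed, the remainder is empty in roughly 1 out of Step cases.
  assert(Step > 0 && "step should not be zero");
  uint32_t Weights[] = {1, static_cast<uint32_t>(Step - 1)};

  std::cout << "VectorTripCount = " << VectorTripCount     // 1000
            << ", skip remainder = " << SkipRemainder      // false
            << ", weights = {" << Weights[0] << ", " << Weights[1] << "}\n";
}
```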
@@ -3198,7 +3141,7 @@ InnerLoopVectorizer::createVectorizedLoopSkeleton(
   // Emit phis for the new starting index of the scalar loop.
   createInductionResumeValues(ExpandedSCEVs);
 
-  return {completeLoopSkeleton(), nullptr};
+  return {LoopVectorPreHeader, nullptr};
 }
 
 // Fix up external users of the induction variable. At this point, we are
@@ -3481,6 +3424,18 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
                            VF.getKnownMinValue() * UF);
 }
 
+// Helper to reorder blocks so they match the original order even after the
+// order of the predecessors changes. This is only used to avoid a number of
+// test changes due to reordering of incoming blocks in phi nodes and should be
+// removed soon, with the tests being updated.
+static void reorderIncomingBlocks(SmallVectorImpl<BasicBlock *> &Blocks,
+                                  BasicBlock *LoopMiddleBlock) {
+  if (Blocks.front() == LoopMiddleBlock)
+    std::swap(Blocks.front(), Blocks.back());
+  if (Blocks.size() == 3)
+    std::swap(Blocks[0], Blocks[1]);
+}
+
 void InnerLoopVectorizer::fixFixedOrderRecurrence(
     VPFirstOrderRecurrencePHIRecipe *PhiR, VPTransformState &State) {
   // This is the second phase of vectorizing first-order recurrences. An
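To see what the helper does, here is a minimal standalone model (plain C++; the block names are hypothetical) applying the same two swaps to a predecessor list:

```cpp
#include <cassert>
#include <string>
#include <utility>
#include <vector>

// Same swap logic as reorderIncomingBlocks above, on plain strings.
static void reorder(std::vector<std::string> &Blocks,
                    const std::string &MiddleBlock) {
  if (Blocks.front() == MiddleBlock)
    std::swap(Blocks.front(), Blocks.back());
  if (Blocks.size() == 3)
    std::swap(Blocks[0], Blocks[1]);
}

int main() {
  // Two predecessors: the middle block is moved to the back.
  std::vector<std::string> Two = {"middle.block", "iter.check"};
  reorder(Two, "middle.block");
  assert(Two.back() == "middle.block");

  // Three predecessors (e.g. with epilogue vectorization): the first two
  // entries are also swapped to recover the pre-change incoming order.
  std::vector<std::string> Three = {"middle.block", "iter.check", "vec.check"};
  reorder(Three, "middle.block");
  assert(Three[0] == "iter.check" && Three[1] == "vec.check" &&
         Three[2] == "middle.block");
}
```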
@@ -3591,7 +3546,9 @@ void InnerLoopVectorizer::fixFixedOrderRecurrence(
   PHINode *Phi = cast<PHINode>(PhiR->getUnderlyingValue());
   auto *Start = Builder.CreatePHI(Phi->getType(), 2, "scalar.recur.init");
   auto *ScalarInit = PhiR->getStartValue()->getLiveInIRValue();
-  for (auto *BB : predecessors(LoopScalarPreHeader)) {
+  SmallVector<BasicBlock *> Blocks(predecessors(LoopScalarPreHeader));
+  reorderIncomingBlocks(Blocks, LoopMiddleBlock);
+  for (auto *BB : Blocks) {
     auto *Incoming = BB == LoopMiddleBlock ? ExtractForScalar : ScalarInit;
     Start->addIncoming(Incoming, BB);
   }
@@ -7480,7 +7437,9 @@ static void createAndCollectMergePhiForReduction(
   // If we are fixing reductions in the epilogue loop then we should already
   // have created a bc.merge.rdx Phi after the main vector body. Ensure that
   // we carry over the incoming values correctly.
-  for (auto *Incoming : predecessors(LoopScalarPreHeader)) {
+  SmallVector<BasicBlock *> Blocks(predecessors(LoopScalarPreHeader));
+  reorderIncomingBlocks(Blocks, LoopMiddleBlock);
+  for (auto *Incoming : Blocks) {
     if (Incoming == LoopMiddleBlock)
       BCBlockPhi->addIncoming(FinalValue, Incoming);
     else if (ResumePhi && is_contained(ResumePhi->blocks(), Incoming))
@@ -7551,6 +7510,21 @@ LoopVectorizationPlanner::executePlan(
   std::tie(State.CFG.PrevBB, CanonicalIVStartValue) =
       ILV.createVectorizedLoopSkeleton(ExpandedSCEVs ? *ExpandedSCEVs
                                                      : State.ExpandedSCEVs);
+#ifdef EXPENSIVE_CHECKS
+  assert(DT->verify(DominatorTree::VerificationLevel::Fast));
+#endif
+
+  VPBasicBlock *MiddleVPBB =
+      cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
+
+  using namespace llvm::VPlanPatternMatch;
+  if (MiddleVPBB->begin() != MiddleVPBB->end() &&
+      match(&MiddleVPBB->back(), m_BranchOnCond(m_VPValue()))) {
+    cast<VPIRWrapperBlock>(MiddleVPBB->getSuccessors()[1])
+        ->resetBlock(OrigLoop->getLoopPreheader());
+  } else
+    cast<VPIRWrapperBlock>(MiddleVPBB->getSuccessors()[0])
+        ->resetBlock(OrigLoop->getLoopPreheader());
 
   // Only use noalias metadata when using memory checks guaranteeing no overlap
   // across all iterations.
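The successor selection above assumes, per this patch, that a middle block ending in BranchOnCond lists the exit block as successor 0 and the scalar-preheader wrapper as successor 1, while an unconditional middle block falls through to the scalar preheader as successor 0. A simplified standalone model of that decision (the WrapperBlock type and block names are stand-ins, not VPlan's actual classes):

```cpp
#include <cassert>
#include <string>
#include <vector>

// Simplified stand-in for the VPlan wrapper block used above (hypothetical).
struct WrapperBlock {
  std::string IRBlock; // the IR basic block this VPlan block wraps
  void resetBlock(const std::string &NewBB) { IRBlock = NewBB; }
};

// Pick which successor of the middle block wraps the scalar preheader:
// successor 1 if the middle block ends in a conditional BranchOnCond
// (successor 0 is then the exit block), successor 0 otherwise.
static void retargetScalarPreheader(std::vector<WrapperBlock *> &Succs,
                                    bool EndsWithBranchOnCond,
                                    const std::string &ScalarPreheader) {
  if (EndsWithBranchOnCond)
    Succs[1]->resetBlock(ScalarPreheader);
  else
    Succs[0]->resetBlock(ScalarPreheader);
}

int main() {
  WrapperBlock Exit{"exit.placeholder"}, ScalarPH{"scalar.placeholder"};
  std::vector<WrapperBlock *> Succs = {&Exit, &ScalarPH};
  retargetScalarPreheader(Succs, /*EndsWithBranchOnCond=*/true, "scalar.ph");
  assert(ScalarPH.IRBlock == "scalar.ph" && Exit.IRBlock == "exit.placeholder");
}
```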
@@ -7687,7 +7661,7 @@ EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
   // inductions in the epilogue loop are created before executing the plan for
   // the epilogue loop.
 
-  return {completeLoopSkeleton(), nullptr};
+  return {LoopVectorPreHeader, nullptr};
 }
 
 void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
@@ -7811,8 +7785,11 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
                                  VecEpilogueIterationCountCheck,
                                  VecEpilogueIterationCountCheck->getSinglePredecessor());
 
-  DT->changeImmediateDominator(LoopScalarPreHeader,
-                               EPI.EpilogueIterationCountCheck);
+  if (auto *N = DT->getNode(LoopScalarPreHeader))
+    DT->changeImmediateDominator(LoopScalarPreHeader,
+                                 EPI.EpilogueIterationCountCheck);
+  else
+    DT->addNewBlock(LoopScalarPreHeader, EPI.EpilogueIterationCountCheck);
   if (!Cost->requiresScalarEpilogue(EPI.EpilogueVF.isVector()))
     // If there is an epilogue which must run, there's no edge from the
     // middle block to exit blocks and thus no need to update the immediate
@@ -7876,7 +7853,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
       {VecEpilogueIterationCountCheck,
        EPI.VectorTripCount} /* AdditionalBypass */);
 
-  return {completeLoopSkeleton(), EPResumeVal};
+  return {LoopVectorPreHeader, EPResumeVal};
 }
 
 BasicBlock *
@@ -8625,9 +8602,25 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
   // modified; a basic block for the vector pre-header, followed by a region for
   // the vector loop, followed by the middle basic block. The skeleton vector
   // loop region contains a header and latch basic blocks.
+
+  // Add a check in the middle block to see if we have completed
+  // all of the iterations in the first vector loop. Three cases:
+  // 1) If we require a scalar epilogue, there is no conditional branch as
+  //    we unconditionally branch to the scalar preheader. Do nothing.
+  // 2) If (N - N%VF) == N, then we *don't* need to run the remainder.
+  //    Thus if tail is to be folded, we know we don't need to run the
+  //    remainder and we can use the previous value for the condition (true).
+  // 3) Otherwise, construct a runtime check.
+  bool RequiresScalarEpilogueCheck =
+      LoopVectorizationPlanner::getDecisionAndClampRange(
+          [this](ElementCount VF) {
+            return !CM.requiresScalarEpilogue(VF.isVector());
+          },
+          Range);
   VPlanPtr Plan = VPlan::createInitialVPlan(
       createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
-      *PSE.getSE());
+      *PSE.getSE(), RequiresScalarEpilogueCheck, CM.foldTailByMasking(),
+      OrigLoop);
   VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body");
   VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch");
   VPBlockUtils::insertBlockAfter(LatchVPBB, HeaderVPBB);
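getDecisionAndClampRange evaluates the predicate at the start of the VF range and clamps the range's end so the decision is uniform across it. A rough standalone model of that contract (simplified, not LLVM's actual API; VFs are modeled as plain power-of-two integers):

```cpp
#include <cassert>
#include <functional>

struct VFRange {
  unsigned Start;
  unsigned End; // exclusive
};

// Rough model: evaluate the predicate at Range.Start, then shrink Range.End
// so the same decision holds for every VF in [Start, End).
static bool getDecisionAndClampRange(const std::function<bool(unsigned)> &Pred,
                                     VFRange &Range) {
  bool Decision = Pred(Range.Start);
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Pred(VF) != Decision) {
      Range.End = VF; // clamp: [Start, VF) all agree with Decision
      break;
    }
  return Decision;
}

int main() {
  // Hypothetical predicate: "no scalar epilogue required" only for VF >= 4.
  VFRange Range{1, 16};
  bool NoEpilogueCheck =
      getDecisionAndClampRange([](unsigned VF) { return VF >= 4; }, Range);
  assert(!NoEpilogueCheck && Range.End == 4); // decision at VF=1, clamped at 4
}
```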
@@ -8875,7 +8868,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
   // Create new empty VPlan
   auto Plan = VPlan::createInitialVPlan(
       createTripCountSCEV(Legal->getWidestInductionType(), PSE, OrigLoop),
-      *PSE.getSE());
+      *PSE.getSE(), true, false, OrigLoop);
 
   // Build hierarchical CFG
   VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
@@ -9084,6 +9077,9 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
     }
   }
   Builder.setInsertPoint(&*LatchVPBB->begin());
+  VPBasicBlock *MiddleVPBB =
+      cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor());
+  VPBasicBlock::iterator IP = MiddleVPBB->begin();
  for (VPRecipeBase &R :
       Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(&R);
@@ -9192,8 +9188,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions(
   // also modeled in VPlan.
   auto *FinalReductionResult = new VPInstruction(
       VPInstruction::ComputeReductionResult, {PhiR, NewExitingVPV}, ExitDL);
-  cast<VPBasicBlock>(VectorLoopRegion->getSingleSuccessor())
-      ->appendRecipe(FinalReductionResult);
+  FinalReductionResult->insertBefore(*MiddleVPBB, IP);
+  IP = std::next(FinalReductionResult->getIterator());
   OrigExitingVPV->replaceUsesWithIf(
       FinalReductionResult,
       [](VPUser &User, unsigned) { return isa<VPLiveOut>(&User); });