Skip to content

Commit b039d2e

Browse files
committed
[VPlan] Build initial VPlan 0 using HCFGBuilder for inner loops. (NFC)
Use HCFGBuilder to build an initial VPlan 0, which wraps all input instructions in VPInstructions, and update tryToBuildVPlanWithVPRecipes to replace the VPInstructions with widened recipes. At the moment, widened recipes are created based on the underlying instruction of the VPInstruction. Masks are also still created based on the input IR basic blocks and the loop CFG is flattened in the main loop processing the VPInstructions. This patch also includes support for Switch instructions in HCFGBuilder using just a VPInstruction with Instruction::Switch opcode. There are multiple follow-ups planned: * Use VPIRInstructions instead of VPInstructions in HCFGBuilder, * Perform predication on the VPlan directly, * Unify code constructing VPlan 0 to be shared by both inner and outer loop code paths.
1 parent 04d5608 commit b039d2e

File tree

5 files changed

+108
-32
lines changed

5 files changed

+108
-32
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 65 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -8299,7 +8299,7 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
82998299
: GEPNoWrapFlags::none(),
83008300
I->getDebugLoc());
83018301
}
8302-
Builder.getInsertBlock()->appendRecipe(VectorPtr);
8302+
VectorPtr->insertBefore(&*Builder.getInsertPoint());
83038303
Ptr = VectorPtr;
83048304
}
83058305
if (LoadInst *Load = dyn_cast<LoadInst>(I))
@@ -9206,6 +9206,7 @@ static void addExitUsersForFirstOrderRecurrences(
92069206
VPlanPtr
92079207
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
92089208

9209+
using namespace llvm::VPlanPatternMatch;
92099210
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
92109211

92119212
// ---------------------------------------------------------------------------
@@ -9229,6 +9230,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
92299230
PSE, RequiresScalarEpilogueCheck,
92309231
CM.foldTailByMasking(), OrigLoop);
92319232

9233+
// Build hierarchical CFG.
9234+
VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan);
9235+
HCFGBuilder.buildHierarchicalCFG();
9236+
92329237
// Don't use getDecisionAndClampRange here, because we don't know the UF
92339238
// so this function is better to be conservative, rather than to split
92349239
// it up into different VPlans.
@@ -9297,23 +9302,45 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
92979302
RecipeBuilder.collectScaledReductions(Range);
92989303

92999304
auto *MiddleVPBB = Plan->getMiddleBlock();
9305+
ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
9306+
Plan->getVectorLoopRegion()->getEntry());
9307+
93009308
VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
9301-
for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) {
9302-
// Relevant instructions from basic block BB will be grouped into VPRecipe
9303-
// ingredients and fill a new VPBasicBlock.
9304-
if (VPBB != HeaderVPBB)
9305-
VPBB->setName(BB->getName());
9306-
Builder.setInsertPoint(VPBB);
9309+
VPBlockBase *PrevVPBB = nullptr;
9310+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
9311+
// Skip VPBBs not corresponding to any input IR basic blocks.
9312+
if (!HCFGBuilder.getIRBBForVPB(VPBB))
9313+
continue;
93079314

9308-
if (VPBB == HeaderVPBB)
9315+
// Create mask based on the IR BB corresponding to VPBB.
9316+
// TODO: Predicate directly based on VPlan.
9317+
if (VPBB == HeaderVPBB) {
9318+
Builder.setInsertPoint(VPBB, VPBB->getFirstNonPhi());
93099319
RecipeBuilder.createHeaderMask();
9310-
else if (NeedsMasks)
9311-
RecipeBuilder.createBlockInMask(BB);
9320+
} else if (NeedsMasks) {
9321+
Builder.setInsertPoint(VPBB, VPBB->begin());
9322+
RecipeBuilder.createBlockInMask(HCFGBuilder.getIRBBForVPB(VPBB));
9323+
}
93129324

9313-
// Introduce each ingredient into VPlan.
9325+
// Convert input VPInstructions to widened recipes.
93149326
// TODO: Model and preserve debug intrinsics in VPlan.
9315-
for (Instruction &I : drop_end(BB->instructionsWithoutDebug(false))) {
9316-
Instruction *Instr = &I;
9327+
for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
9328+
auto *SingleDef = dyn_cast<VPSingleDefRecipe>(&R);
9329+
if (!isa<VPWidenPHIRecipe>(&R) &&
9330+
(!isa<VPInstruction>(SingleDef) || !SingleDef->getUnderlyingValue()))
9331+
continue;
9332+
9333+
if (match(&R, m_BranchOnCond(m_VPValue())) ||
9334+
(isa<VPInstruction>(&R) &&
9335+
cast<VPInstruction>(&R)->getOpcode() == Instruction::Switch)) {
9336+
R.eraseFromParent();
9337+
break;
9338+
}
9339+
9340+
// TODO: Gradually replace uses of underlying instruction by analyses on
9341+
// VPlan.
9342+
Instruction *Instr = SingleDef->getUnderlyingInstr();
9343+
Builder.setInsertPoint(SingleDef);
93179344
SmallVector<VPValue *, 4> Operands;
93189345
auto *Phi = dyn_cast<PHINode>(Instr);
93199346
if (Phi && Phi->getParent() == HeaderBB) {
@@ -9328,15 +9355,18 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
93289355
// in the exit block, a uniform store recipe will be created for the final
93299356
// invariant store of the reduction.
93309357
StoreInst *SI;
9331-
if ((SI = dyn_cast<StoreInst>(&I)) &&
9358+
if ((SI = dyn_cast<StoreInst>(Instr)) &&
93329359
Legal->isInvariantAddressOfReduction(SI->getPointerOperand())) {
93339360
// Only create recipe for the final invariant store of the reduction.
9334-
if (!Legal->isInvariantStoreOfReduction(SI))
9361+
if (!Legal->isInvariantStoreOfReduction(SI)) {
9362+
R.eraseFromParent();
93359363
continue;
9364+
}
93369365
auto *Recipe = new VPReplicateRecipe(
93379366
SI, RecipeBuilder.mapToVPValues(Instr->operands()),
93389367
true /* IsUniform */);
93399368
Recipe->insertBefore(*MiddleVPBB, MBIP);
9369+
R.eraseFromParent();
93409370
continue;
93419371
}
93429372

@@ -9355,16 +9385,30 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
93559385
// after them)
93569386
// * Optimizing truncates to VPWidenIntOrFpInductionRecipe.
93579387

9358-
assert((HeaderVPBB->getFirstNonPhi() == VPBB->end() ||
9359-
CM.foldTailByMasking() || isa<TruncInst>(Instr)) &&
9360-
"unexpected recipe needs moving");
93619388
Recipe->insertBefore(*HeaderVPBB, HeaderVPBB->getFirstNonPhi());
93629389
} else
9363-
VPBB->appendRecipe(Recipe);
9390+
Recipe->insertBefore(&R);
9391+
if (Recipe->getNumDefinedValues() == 1)
9392+
SingleDef->replaceAllUsesWith(Recipe->getVPSingleValue());
9393+
else
9394+
assert(Recipe->getNumDefinedValues() == 0);
9395+
R.eraseFromParent();
93649396
}
93659397

9366-
VPBlockUtils::insertBlockAfter(Plan->createVPBasicBlock(""), VPBB);
9367-
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
9398+
// Flatten the CFG in the loop. Masks for blocks have already been generated
9399+
// and added to recipes as needed. To do so, first disconnect VPBB from its
9400+
// predecessors and successors, except the exiting block. Then connect VPBB
9401+
// to the previously visited VPBB.
9402+
for (auto *Succ : to_vector(VPBB->getSuccessors())) {
9403+
if (Succ == Plan->getVectorLoopRegion()->getExiting())
9404+
continue;
9405+
VPBlockUtils::disconnectBlocks(VPBB, Succ);
9406+
}
9407+
for (auto *Pred : to_vector(VPBB->getPredecessors()))
9408+
VPBlockUtils::disconnectBlocks(Pred, VPBB);
9409+
if (PrevVPBB)
9410+
VPBlockUtils::connectBlocks(PrevVPBB, VPBB);
9411+
PrevVPBB = VPBB;
93689412
}
93699413

93709414
// After here, VPBB should not be used.

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -579,9 +579,11 @@ static bool hasConditionalTerminator(const VPBasicBlock *VPBB) {
579579
}
580580

581581
const VPRecipeBase *R = &VPBB->back();
582-
bool IsCondBranch = isa<VPBranchOnMaskRecipe>(R) ||
583-
match(R, m_BranchOnCond(m_VPValue())) ||
584-
match(R, m_BranchOnCount(m_VPValue(), m_VPValue()));
582+
bool IsCondBranch =
583+
isa<VPBranchOnMaskRecipe>(R) || match(R, m_BranchOnCond(m_VPValue())) ||
584+
match(R, m_BranchOnCount(m_VPValue(), m_VPValue())) ||
585+
(isa<VPInstruction>(R) &&
586+
cast<VPInstruction>(R)->getOpcode() == Instruction::Switch);
585587
(void)IsCondBranch;
586588

587589
if (VPBB->getNumSuccessors() >= 2 ||

llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.cpp

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ class PlainCFGBuilder {
7575
: TheLoop(Lp), LI(LI), Plan(P) {}
7676

7777
/// Build plain CFG for TheLoop and connects it to Plan's entry.
78-
void buildPlainCFG();
78+
void buildPlainCFG(DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB);
7979
};
8080
} // anonymous namespace
8181

@@ -238,9 +238,9 @@ bool PlainCFGBuilder::isExternalDef(Value *Val) {
238238
return false;
239239

240240
// Check whether Instruction definition is in the loop exit.
241-
BasicBlock *Exit = TheLoop->getUniqueExitBlock();
242-
assert(Exit && "Expected loop with single exit.");
243-
if (InstParent == Exit) {
241+
SmallVector<BasicBlock *> ExitBlocks;
242+
TheLoop->getExitBlocks(ExitBlocks);
243+
if (is_contained(ExitBlocks, InstParent)) {
244244
// Instruction definition is in outermost loop exit.
245245
return false;
246246
}
@@ -308,6 +308,14 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
308308
continue;
309309
}
310310

311+
if (auto *SI = dyn_cast<SwitchInst>(Inst)) {
312+
SmallVector<VPValue *> Ops = {getOrCreateVPOperand(SI->getCondition())};
313+
for (auto Case : SI->cases())
314+
Ops.push_back(getOrCreateVPOperand(Case.getCaseValue()));
315+
VPIRBuilder.createNaryOp(Instruction::Switch, Ops, Inst);
316+
continue;
317+
}
318+
311319
VPValue *NewVPV;
312320
if (auto *Phi = dyn_cast<PHINode>(Inst)) {
313321
// Phi node's operands may have not been visited at this point. We create
@@ -334,7 +342,8 @@ void PlainCFGBuilder::createVPInstructionsForVPBB(VPBasicBlock *VPBB,
334342
}
335343

336344
// Main interface to build the plain CFG.
337-
void PlainCFGBuilder::buildPlainCFG() {
345+
void PlainCFGBuilder::buildPlainCFG(
346+
DenseMap<VPBlockBase *, BasicBlock *> &VPB2IRBB) {
338347
// 0. Reuse the top-level region, vector-preheader and exit VPBBs from the
339348
// skeleton. These were created directly rather than via getOrCreateVPBB(),
340349
// revisit them now to update BB2VPBB. Note that header/entry and
@@ -423,6 +432,14 @@ void PlainCFGBuilder::buildPlainCFG() {
423432
// Set VPBB successors. We create empty VPBBs for successors if they don't
424433
// exist already. Recipes will be created when the successor is visited
425434
// during the RPO traversal.
435+
if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
436+
SmallVector<VPBlockBase *> Succs = {
437+
getOrCreateVPBB(SI->getDefaultDest())};
438+
for (auto Case : SI->cases())
439+
Succs.push_back(getOrCreateVPBB(Case.getCaseSuccessor()));
440+
VPBB->setSuccessors(Succs);
441+
continue;
442+
}
426443
auto *BI = cast<BranchInst>(BB->getTerminator());
427444
unsigned NumSuccs = succ_size(BB);
428445
if (NumSuccs == 1) {
@@ -476,11 +493,14 @@ void PlainCFGBuilder::buildPlainCFG() {
476493
// have a VPlan counterpart. Fix VPlan phi nodes by adding their corresponding
477494
// VPlan operands.
478495
fixPhiNodes();
496+
497+
for (const auto &[IRBB, VPB] : BB2VPBB)
498+
VPB2IRBB[VPB] = IRBB;
479499
}
480500

481501
void VPlanHCFGBuilder::buildPlainCFG() {
482502
PlainCFGBuilder PCFGBuilder(TheLoop, LI, Plan);
483-
PCFGBuilder.buildPlainCFG();
503+
PCFGBuilder.buildPlainCFG(VPB2IRBB);
484504
}
485505

486506
// Public interface to build a H-CFG.

llvm/lib/Transforms/Vectorize/VPlanHCFGBuilder.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ class VPlanHCFGBuilder {
5353
// are introduced.
5454
VPDominatorTree VPDomTree;
5555

56+
/// Map of created VP blocks to their input IR basic blocks, if they have been
57+
/// created for an input IR basic block.
58+
DenseMap<VPBlockBase *, BasicBlock *> VPB2IRBB;
59+
5660
/// Build plain CFG for TheLoop and connects it to Plan's entry.
5761
void buildPlainCFG();
5862

@@ -62,6 +66,12 @@ class VPlanHCFGBuilder {
6266

6367
/// Build H-CFG for TheLoop and update Plan accordingly.
6468
void buildHierarchicalCFG();
69+
70+
/// Return the input IR BasicBlock corresponding to \p VPB. Returns nullptr if
71+
/// there is no such corresponding block.
72+
BasicBlock *getIRBBForVPB(const VPBlockBase *VPB) const {
73+
return VPB2IRBB.lookup(VPB);
74+
}
6575
};
6676
} // namespace llvm
6777

llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,7 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur
4646
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
4747
; CHECK-NEXT: LV: Using user VF vscale x 4.
4848
; CHECK-NEXT: LV: Loop does not require scalar epilogue
49-
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
49+
; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
5050
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
5151
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
5252
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
@@ -295,7 +295,7 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur
295295
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
296296
; CHECK-NEXT: LV: Using user VF vscale x 4.
297297
; CHECK-NEXT: LV: Loop does not require scalar epilogue
298-
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
298+
; CHECK: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
299299
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
300300
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
301301
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom

0 commit comments

Comments
 (0)