Skip to content

Commit d26e746

Browse files
[LV] Decompose WidenIntOrFPInduction into phi and update recipes
The Loop Vectorizer still has two recipes, `VPWidenIntOrFpInductionRecipe` and `VPWidenPointerInductionRecipe`, that behave as phi-like in a VPlan because they are derived from `VPHeaderPHIRecipe`, yet their generate functions construct both the vector phi and the vector self-update in the vectorized loop. This not only hurts the readability of a VPlan, but also requires more code to maintain such behavior. For instance, there is already ad-hoc code motion to move the generated updates of these recipes closer to the loop latch.

The changeset:
* Adds `WidenVFxUF` to represent the `broadcast({1...UF} x VFxUF)` value
* Decomposes the existing `VPWidenIntOrFpInductionRecipe` into
```
WIDEN-INDUCTION vp<%iv> = phi ir<0>, vp<%be-value>
...
EMIT vp<%widen-step> = mul ir<%step>, vp<WidenVFxUF>
EMIT vp<%be-value> = add vp<%iv>, vp<%widen-step>
```
* Moves the trunc optimization of the widened IV into a VPlan transform
* Adds trivial cyclic dependency removal and marks some binops as non-side-effecting
* Adds an element type to `VPValue` so that it can be queried for artificially added `VPValue`s that have no underlying instruction
1 parent aa68e28 commit d26e746

File tree

170 files changed

+16018
-10025
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

170 files changed

+16018
-10025
lines changed

llvm/include/llvm/Analysis/IVDescriptors.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,11 @@ class InductionDescriptor {
363363
return nullptr;
364364
}
365365

366+
const Instruction *getExactFPMathInst() const {
367+
return const_cast<const Instruction *>(
368+
const_cast<InductionDescriptor *>(this)->getExactFPMathInst());
369+
}
370+
366371
/// Returns binary opcode of the induction operator.
367372
Instruction::BinaryOps getInductionOpcode() const {
368373
return InductionBinOp ? InductionBinOp->getOpcode()

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 88 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -8130,34 +8130,6 @@ VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
81308130
return nullptr;
81318131
}
81328132

8133-
VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
8134-
TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range, VPlan &Plan) {
8135-
// Optimize the special case where the source is a constant integer
8136-
// induction variable. Notice that we can only optimize the 'trunc' case
8137-
// because (a) FP conversions lose precision, (b) sext/zext may wrap, and
8138-
// (c) other casts depend on pointer size.
8139-
8140-
// Determine whether \p K is a truncation based on an induction variable that
8141-
// can be optimized.
8142-
auto isOptimizableIVTruncate =
8143-
[&](Instruction *K) -> std::function<bool(ElementCount)> {
8144-
return [=](ElementCount VF) -> bool {
8145-
return CM.isOptimizableIVTruncate(K, VF);
8146-
};
8147-
};
8148-
8149-
if (LoopVectorizationPlanner::getDecisionAndClampRange(
8150-
isOptimizableIVTruncate(I), Range)) {
8151-
8152-
auto *Phi = cast<PHINode>(I->getOperand(0));
8153-
const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
8154-
VPValue *Start = Plan.getVPValueOrAddLiveIn(II.getStartValue());
8155-
return createWidenInductionRecipes(Phi, I, Start, II, Plan, *PSE.getSE(),
8156-
*OrigLoop, Range);
8157-
}
8158-
return nullptr;
8159-
}
8160-
81618133
VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi,
81628134
ArrayRef<VPValue *> Operands,
81638135
VPlanPtr &Plan) {
@@ -8291,6 +8263,70 @@ bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
82918263
Range);
82928264
}
82938265

8266+
VPWidenCastRecipe *VPRecipeBuilder::createCast(VPValue *V, Type *From,
8267+
Type *To) {
8268+
if (From == To)
8269+
return nullptr;
8270+
Instruction::CastOps CastOpcode;
8271+
if (To->isIntegerTy() && From->isIntegerTy())
8272+
CastOpcode = To->getPrimitiveSizeInBits() < From->getPrimitiveSizeInBits()
8273+
? Instruction::Trunc
8274+
: Instruction::ZExt;
8275+
else if (To->isIntegerTy())
8276+
CastOpcode = Instruction::FPToUI;
8277+
else
8278+
CastOpcode = Instruction::UIToFP;
8279+
8280+
return new VPWidenCastRecipe(CastOpcode, V, To);
8281+
}
8282+
8283+
VPRecipeBase *
8284+
VPRecipeBuilder::createWidenStep(VPWidenIntOrFpInductionRecipe &WIV,
8285+
ScalarEvolution &SE, VPlan &Plan,
8286+
DenseSet<VPRecipeBase *> *CreatedRecipes) {
8287+
PHINode *PN = WIV.getPHINode();
8288+
const InductionDescriptor &IndDesc = WIV.getInductionDescriptor();
8289+
VPValue *ScalarStep =
8290+
vputils::getOrCreateVPValueForSCEVExpr(Plan, IndDesc.getStep(), SE);
8291+
Type *VFxUFTy = Plan.getVFxUF().getElementType();
8292+
Type *StepTy = IndDesc.getStep()->getType();
8293+
VPValue *WidenVFxUF = &Plan.getWidenVFxUF();
8294+
VPBasicBlock *LatchVPBB = Plan.getVectorLoopRegion()->getExitingBasicBlock();
8295+
if (VPWidenCastRecipe *WidenVFxUFCast =
8296+
createCast(&Plan.getWidenVFxUF(), VFxUFTy, StepTy)) {
8297+
WidenVFxUFCast->insertBefore(LatchVPBB->getTerminator());
8298+
if (CreatedRecipes)
8299+
CreatedRecipes->insert(WidenVFxUFCast);
8300+
WidenVFxUF = WidenVFxUFCast->getVPSingleValue();
8301+
}
8302+
const Instruction::BinaryOps UpdateOp =
8303+
IndDesc.getInductionOpcode() != Instruction::BinaryOpsEnd
8304+
? IndDesc.getInductionOpcode()
8305+
: Instruction::Add;
8306+
VPInstruction *Update;
8307+
if (StepTy->isIntegerTy()) {
8308+
VPInstruction *Mul = new VPInstruction(
8309+
Instruction::Mul, {WidenVFxUF, ScalarStep}, PN->getDebugLoc());
8310+
Mul->insertBefore(LatchVPBB->getTerminator());
8311+
if (CreatedRecipes)
8312+
CreatedRecipes->insert(Mul);
8313+
Update = new VPInstruction(UpdateOp, {&WIV, Mul}, PN->getDebugLoc());
8314+
Update->insertBefore(LatchVPBB->getTerminator());
8315+
} else {
8316+
FastMathFlags FMF = IndDesc.getExactFPMathInst()
8317+
? IndDesc.getExactFPMathInst()->getFastMathFlags()
8318+
: FastMathFlags();
8319+
VPInstruction *Mul = new VPInstruction(
8320+
Instruction::FMul, {WidenVFxUF, ScalarStep}, FMF, PN->getDebugLoc());
8321+
Mul->insertBefore(LatchVPBB->getTerminator());
8322+
Update = new VPInstruction(UpdateOp, {&WIV, Mul}, FMF, PN->getDebugLoc());
8323+
Update->insertBefore(LatchVPBB->getTerminator());
8324+
}
8325+
if (CreatedRecipes)
8326+
CreatedRecipes->insert(Update);
8327+
return Update;
8328+
}
8329+
82948330
VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
82958331
ArrayRef<VPValue *> Operands,
82968332
VPBasicBlock *VPBB, VPlanPtr &Plan) {
@@ -8340,10 +8376,15 @@ VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
83408376
};
83418377
}
83428378

8343-
void VPRecipeBuilder::fixHeaderPhis() {
8379+
void VPRecipeBuilder::fixHeaderPhis(VPlan &Plan) {
83448380
BasicBlock *OrigLatch = OrigLoop->getLoopLatch();
83458381
for (VPHeaderPHIRecipe *R : PhisToFix) {
8346-
auto *PN = cast<PHINode>(R->getUnderlyingValue());
8382+
if (auto *VPWIFR = dyn_cast<VPWidenIntOrFpInductionRecipe>(R)) {
8383+
VPWIFR->addOperand(
8384+
createWidenStep(*VPWIFR, *PSE.getSE(), Plan)->getVPSingleValue());
8385+
continue;
8386+
}
8387+
PHINode *PN = cast<PHINode>(R->getUnderlyingValue());
83478388
VPRecipeBase *IncR =
83488389
getRecipe(cast<Instruction>(PN->getIncomingValueForBlock(OrigLatch)));
83498390
R->addOperand(IncR->getVPSingleValue());
@@ -8421,8 +8462,12 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
84218462
// can have earlier phis as incoming values.
84228463
recordRecipeOf(Phi);
84238464

8424-
if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range)))
8465+
if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, *Plan, Range))) {
8466+
if (isa<VPWidenPointerInductionRecipe>(Recipe))
8467+
return Recipe;
8468+
PhisToFix.push_back(cast<VPWidenIntOrFpInductionRecipe>(Recipe));
84258469
return Recipe;
8470+
}
84268471

84278472
VPHeaderPHIRecipe *PhiRecipe = nullptr;
84288473
assert((Legal->isReductionVariable(Phi) ||
@@ -8457,10 +8502,17 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(
84578502
return PhiRecipe;
84588503
}
84598504

8460-
if (isa<TruncInst>(Instr) &&
8461-
(Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Operands,
8462-
Range, *Plan)))
8463-
return Recipe;
8505+
if (isa<TruncInst>(Instr)) {
8506+
auto IsOptimizableIVTruncate =
8507+
[&](Instruction *K) -> std::function<bool(ElementCount)> {
8508+
return [=](ElementCount VF) -> bool {
8509+
return CM.isOptimizableIVTruncate(K, VF);
8510+
};
8511+
};
8512+
8513+
LoopVectorizationPlanner::getDecisionAndClampRange(
8514+
IsOptimizableIVTruncate(Instr), Range);
8515+
}
84648516

84658517
// All widen recipes below deal only with VF > 1.
84668518
if (LoopVectorizationPlanner::getDecisionAndClampRange(
@@ -8718,7 +8770,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
87188770
!Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
87198771
"entry block must be set to a VPRegionBlock having a non-empty entry "
87208772
"VPBasicBlock");
8721-
RecipeBuilder.fixHeaderPhis();
8773+
RecipeBuilder.fixHeaderPhis(*Plan);
87228774

87238775
// ---------------------------------------------------------------------------
87248776
// Transform initial VPlan: Apply previously taken decisions, in order, to

llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,18 @@ class VPRecipeBuilder {
146146
/// between SRC and DST.
147147
VPValue *getEdgeMask(BasicBlock *Src, BasicBlock *Dst) const;
148148

149+
/// A helper function to create VPWidenCastRecipe of a \p V VPValue to a \p To
150+
/// type.
151+
/// FIXME: Remove \p From argument and take it from a \p V value
152+
static VPWidenCastRecipe *createCast(VPValue *V, Type *From, Type *To);
153+
154+
/// A helper function which widens \p WIV step, multiplies it by WidenVFxUF
155+
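/// and applies the induction update to \p WIV; the new recipes are inserted before the terminator of the loop latch of \p Plan. Returns the update recipe (not the multiplication).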
156+
static VPRecipeBase *
157+
createWidenStep(VPWidenIntOrFpInductionRecipe &WIV, ScalarEvolution &SE,
158+
VPlan &Plan,
159+
DenseSet<VPRecipeBase *> *CreatedRecipes = nullptr);
160+
149161
/// Mark given ingredient for recording its recipe once one is created for
150162
/// it.
151163
void recordRecipeOf(Instruction *I) {
@@ -171,7 +183,7 @@ class VPRecipeBuilder {
171183

172184
/// Add the incoming values from the backedge to reduction & first-order
173185
/// recurrence cross-iteration phis, and the widened step update to widened int/FP induction phis.
174-
void fixHeaderPhis();
186+
void fixHeaderPhis(VPlan &Plan);
175187
};
176188
} // end namespace llvm
177189

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 47 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -78,12 +78,25 @@ Value *VPLane::getAsRuntimeExpr(IRBuilderBase &Builder,
7878
llvm_unreachable("Unknown lane kind");
7979
}
8080

81-
VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def)
82-
: SubclassID(SC), UnderlyingVal(UV), Def(Def) {
81+
VPValue::VPValue(const unsigned char SC, Value *UV, VPDef *Def, Type *Ty)
82+
: SubclassID(SC), UnderlyingVal(UV), UnderlyingTy(Ty), Def(Def) {
83+
if (UnderlyingTy)
84+
assert((!UnderlyingVal || UnderlyingVal->getType() == UnderlyingTy) &&
85+
"VPValue with set type should either be created without underlying "
86+
"value or type should match the given type");
8387
if (Def)
8488
Def->addDefinedValue(this);
8589
}
8690

91+
Type *VPValue::getElementType() {
92+
return const_cast<Type *>(
93+
const_cast<const VPValue *>(this)->getElementType());
94+
}
95+
96+
const Type *VPValue::getElementType() const {
97+
return UnderlyingVal ? UnderlyingVal->getType() : UnderlyingTy;
98+
}
99+
87100
VPValue::~VPValue() {
88101
assert(Users.empty() && "trying to delete a VPValue with remaining users");
89102
if (Def)
@@ -781,6 +794,7 @@ VPlanPtr VPlan::createInitialVPlan(const SCEV *TripCount, ScalarEvolution &SE) {
781794
auto Plan = std::make_unique<VPlan>(Preheader, VecPreheader);
782795
Plan->TripCount =
783796
vputils::getOrCreateVPValueForSCEVExpr(*Plan, TripCount, SE);
797+
Type *TCType = TripCount->getType();
784798
// Create empty VPRegionBlock, to be filled during processing later.
785799
auto *TopRegion = new VPRegionBlock("vector loop", false /*isReplicator*/);
786800
VPBlockUtils::insertBlockAfter(TopRegion, VecPreheader);
@@ -808,6 +822,18 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
808822
VFxUF.setUnderlyingValue(
809823
createStepForVF(Builder, TripCountV->getType(), State.VF, State.UF));
810824

825+
if (WidenVFxUF.getNumUsers() > 0)
826+
for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
827+
Value *Step =
828+
createStepForVF(Builder, TripCountV->getType(), State.VF, Part+1);
829+
if (State.VF.isScalar())
830+
State.set(&WidenVFxUF, Step, Part);
831+
else
832+
State.set(&WidenVFxUF,
833+
Builder.CreateVectorSplat(State.VF, Step, "widen.vfxuf"),
834+
Part);
835+
}
836+
811837
// When vectorizing the epilogue loop, the canonical induction start value
812838
// needs to be changed from zero to the value after the main vector loop.
813839
// FIXME: Improve modeling for canonical IV start values in the epilogue loop.
@@ -853,21 +879,16 @@ void VPlan::execute(VPTransformState *State) {
853879
if (isa<VPWidenPHIRecipe>(&R))
854880
continue;
855881

856-
if (isa<VPWidenPointerInductionRecipe>(&R) ||
857-
isa<VPWidenIntOrFpInductionRecipe>(&R)) {
882+
if (isa<VPWidenPointerInductionRecipe>(&R)) {
858883
PHINode *Phi = nullptr;
859-
if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
860-
Phi = cast<PHINode>(State->get(R.getVPSingleValue(), 0));
861-
} else {
862-
auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
863-
// TODO: Split off the case that all users of a pointer phi are scalar
864-
// from the VPWidenPointerInductionRecipe.
865-
if (WidenPhi->onlyScalarsGenerated(State->VF.isScalable()))
866-
continue;
867-
868-
auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0));
869-
Phi = cast<PHINode>(GEP->getPointerOperand());
870-
}
884+
auto *WidenPhi = cast<VPWidenPointerInductionRecipe>(&R);
885+
// TODO: Split off the case that all users of a pointer phi are scalar
886+
// from the VPWidenPointerInductionRecipe.
887+
if (WidenPhi->onlyScalarsGenerated(State->VF.isScalable()))
888+
continue;
889+
890+
auto *GEP = cast<GetElementPtrInst>(State->get(WidenPhi, 0));
891+
Phi = cast<PHINode>(GEP->getPointerOperand());
871892

872893
Phi->setIncomingBlock(1, VectorLatchBB);
873894

@@ -885,6 +906,7 @@ void VPlan::execute(VPTransformState *State) {
885906
// generated.
886907
bool SinglePartNeeded = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
887908
isa<VPFirstOrderRecurrencePHIRecipe>(PhiR) ||
909+
isa<VPWidenIntOrFpInductionRecipe>(PhiR) ||
888910
(isa<VPReductionPHIRecipe>(PhiR) &&
889911
cast<VPReductionPHIRecipe>(PhiR)->isOrdered());
890912
bool NeedsScalar = isa<VPCanonicalIVPHIRecipe>(PhiR) ||
@@ -920,6 +942,12 @@ void VPlan::printLiveIns(raw_ostream &O) const {
920942
O << " = VF * UF";
921943
}
922944

945+
if (WidenVFxUF.getNumUsers() > 0) {
946+
O << "\nLive-in ";
947+
WidenVFxUF.printAsOperand(O, SlotTracker);
948+
O << " = WIDEN VF * UF";
949+
}
950+
923951
if (VectorTripCount.getNumUsers() > 0) {
924952
O << "\nLive-in ";
925953
VectorTripCount.printAsOperand(O, SlotTracker);
@@ -1095,6 +1123,7 @@ VPlan *VPlan::duplicate() {
10951123
}
10961124
Old2NewVPValues[&VectorTripCount] = &NewPlan->VectorTripCount;
10971125
Old2NewVPValues[&VFxUF] = &NewPlan->VFxUF;
1126+
Old2NewVPValues[&WidenVFxUF] = &NewPlan->WidenVFxUF;
10981127
if (BackedgeTakenCount) {
10991128
NewPlan->BackedgeTakenCount = new VPValue();
11001129
Old2NewVPValues[BackedgeTakenCount] = NewPlan->BackedgeTakenCount;
@@ -1391,6 +1420,8 @@ void VPSlotTracker::assignSlot(const VPValue *V) {
13911420
void VPSlotTracker::assignSlots(const VPlan &Plan) {
13921421
if (Plan.VFxUF.getNumUsers() > 0)
13931422
assignSlot(&Plan.VFxUF);
1423+
if (Plan.WidenVFxUF.getNumUsers() > 0)
1424+
assignSlot(&Plan.WidenVFxUF);
13941425
assignSlot(&Plan.VectorTripCount);
13951426
if (Plan.BackedgeTakenCount)
13961427
assignSlot(Plan.BackedgeTakenCount);

0 commit comments

Comments
 (0)