Skip to content

Commit 1441aea

Browse files
[LV] Build VPlan for CSA
1 parent ce9f1c3 commit 1441aea

File tree

8 files changed

+2779
-383
lines changed

8 files changed

+2779
-383
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 182 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
174174
STATISTIC(LoopsVectorized, "Number of loops vectorized");
175175
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
176176
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
177+
STATISTIC(CSAsVectorized,
178+
"Number of conditional scalar assignments vectorized");
177179

178180
static cl::opt<bool> EnableEpilogueVectorization(
179181
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -498,6 +500,10 @@ class InnerLoopVectorizer {
498500
/// Fix the vectorized code, taking care of header phi's, and more.
499501
void fixVectorizedLoop(VPTransformState &State);
500502

503+
/// For all vectorized CSAs, give each exit-block phi that uses the live-out
504+
/// scalar of the original loop an extra incoming value from the middle block:
/// the scalar extracted from the vector loop.
505+
void fixCSALiveOuts(VPTransformState &State, VPlan &Plan);
506+
501507
// Return true if any runtime check is added.
502508
bool areSafetyChecksAdded() { return AddedSafetyChecks; }
503509

@@ -2937,6 +2943,25 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
29372943
TargetTransformInfo::TCK_RecipThroughput);
29382944
}
29392945

2946+
/// Patch the exit-block phis for every CSA state recorded in \p Plan.
///
/// For each CSA state, the IR value underlying its data-update recipe is
/// looked up, and every PHI user of that value that lives in LoopExitBlock
/// gets one additional incoming value: the scalar produced for the CSA's
/// extract-scalar recipe, arriving from LoopMiddleBlock.
///
/// \param State transform state queried for the extracted scalar value.
/// \param Plan  plan whose CSA states are visited.
void InnerLoopVectorizer::fixCSALiveOuts(VPTransformState &State, VPlan &Plan) {
2947+
for (const auto &CSA : Plan.getCSAStates()) {
2948+
VPCSADataUpdateRecipe *VPDataUpdate = CSA.second->getDataUpdate();
2949+
assert(VPDataUpdate &&
2950+
"VPDataUpdate must have been introduced prior to fixing live outs");
2951+
// V is the IR value the data-update recipe was built from (the recipe is
// constructed from the CSA select in tryToCreateWidenRecipe, so this is
// presumably that select -- confirm).
Value *V = VPDataUpdate->getUnderlyingValue();
2952+
// NOTE(review): the extract-scalar recipe is used without a null check,
// while addCSAPostprocessRecipes returns early for scalar VFs and never sets
// it in that case -- confirm this function is not reached for such plans.
Value *ExtractedScalar = State.get(CSA.second->getExtractScalarRecipe(), 0,
2953+
/*NeedsScalar=*/true);
2954+
// Gather the PHI users of V located in LoopExitBlock. A set is used, so
// each phi is recorded at most once before it is extended below.
2955+
llvm::SmallPtrSet<PHINode *, 2> ToFix;
2956+
for (User *U : V->users())
2957+
if (auto *Phi = dyn_cast<PHINode>(U);
2958+
Phi && Phi->getParent() == LoopExitBlock)
2959+
ToFix.insert(Phi);
2960+
// Feed the extracted scalar into each collected phi on the edge coming
// from LoopMiddleBlock.
for (PHINode *Phi : ToFix)
2961+
Phi->addIncoming(ExtractedScalar, LoopMiddleBlock);
2962+
}
2963+
}
2964+
29402965
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
29412966
// Fix widened non-induction PHIs by setting up the PHI operands.
29422967
if (EnableVPlanNativePath)
@@ -2971,6 +2996,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
29712996
for (const auto &Entry : Legal->getInductionVars())
29722997
fixupIVUsers(Entry.first, Entry.second,
29732998
getOrCreateVectorTripCount(nullptr), LoopMiddleBlock, State);
2999+
fixCSALiveOuts(State, Plan);
29743000
}
29753001

29763002
for (Instruction *PI : PredicatedInstructions)
@@ -4516,6 +4542,9 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
45164542
case VPDef::VPEVLBasedIVPHISC:
45174543
case VPDef::VPPredInstPHISC:
45184544
case VPDef::VPBranchOnMaskSC:
4545+
case VPRecipeBase::VPCSADataUpdateSC:
4546+
case VPRecipeBase::VPCSAExtractScalarSC:
4547+
case VPRecipeBase::VPCSAHeaderPHISC:
45194548
continue;
45204549
case VPDef::VPReductionSC:
45214550
case VPDef::VPActiveLaneMaskPHISC:
@@ -8701,9 +8730,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
87018730
return Recipe;
87028731

87038732
VPHeaderPHIRecipe *PhiRecipe = nullptr;
8704-
assert((Legal->isReductionVariable(Phi) ||
8705-
Legal->isFixedOrderRecurrence(Phi)) &&
8706-
"can only widen reductions and fixed-order recurrences here");
87078733
VPValue *StartV = Operands[0];
87088734
if (Legal->isReductionVariable(Phi)) {
87098735
const RecurrenceDescriptor &RdxDesc =
@@ -8713,12 +8739,23 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
87138739
PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
87148740
CM.isInLoopReduction(Phi),
87158741
CM.useOrderedReductions(RdxDesc));
8716-
} else {
8742+
} else if (Legal->isFixedOrderRecurrence(Phi)) {
87178743
// TODO: Currently fixed-order recurrences are modeled as chains of
87188744
// first-order recurrences. If there are no users of the intermediate
87198745
// recurrences in the chain, the fixed order recurrence should be modeled
87208746
// directly, enabling more efficient codegen.
87218747
PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8748+
} else if (Legal->isCSAPhi(Phi)) {
8749+
VPCSAState *State = Plan.getCSAStates().find(Phi)->second;
8750+
VPValue *InitData = State->getVPInitData();
8751+
// When the VF=getFixed(1), InitData is just InitScalar.
8752+
if (!InitData)
8753+
InitData = State->getVPInitScalar();
8754+
PhiRecipe = new VPCSAHeaderPHIRecipe(Phi, InitData);
8755+
State->setPhiRecipe(cast<VPCSAHeaderPHIRecipe>(PhiRecipe));
8756+
} else {
8757+
llvm_unreachable(
8758+
"can only widen reductions, fixed-order recurrences, and CSAs here");
87228759
}
87238760

87248761
PhisToFix.push_back(PhiRecipe);
@@ -8752,6 +8789,19 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
87528789
make_range(Operands.begin(), Operands.end()));
87538790

87548791
if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8792+
auto *CSADescIt = find_if(Legal->getCSAs(), [&](auto CSA) {
8793+
return CSADescriptor::isCSASelect(CSA.second, SI);
8794+
});
8795+
if (CSADescIt != Legal->getCSAs().end()) {
8796+
PHINode *CSAPhi = CSADescIt->first;
8797+
VPCSAState *State = Plan.getCSAStates().find(CSAPhi)->second;
8798+
VPValue *VPDataPhi = State->getPhiRecipe();
8799+
auto *R = new VPCSADataUpdateRecipe(
8800+
SI, {VPDataPhi, Operands[0], Operands[1], Operands[2]});
8801+
State->setDataUpdate(R);
8802+
return R;
8803+
}
8804+
87558805
return new VPWidenSelectRecipe(
87568806
*SI, make_range(Operands.begin(), Operands.end()));
87578807
}
@@ -8764,6 +8814,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
87648814
return tryToWiden(Instr, Operands, VPBB);
87658815
}
87668816

8817+
/// Add the CSA recipes that can be created before the instructions of the
8818+
/// input IR are processed and introduced into VPlan.
///
/// For every entry of \p CSAs a VPCSAState is registered with \p Plan, keyed
/// by the CSA's phi. Its initial scalar is the live-in for the phi's incoming
/// value from the preheader of \p OrigLoop. Unless the scalar-VF decision
/// holds for \p Range, CSAInitMask and CSAInitData instructions are appended
/// to \p PreheaderVPBB and a CSAMaskPhi instruction is appended to
/// \p HeaderVPBB.
///
/// \param CSAs          CSA phis and their descriptors.
/// \param OrigLoop      loop whose preheader supplies each initial scalar.
/// \param PreheaderVPBB block receiving the mask/data initialization.
/// \param HeaderVPBB    block receiving the mask phi.
/// \param DL            debug location given to the created VPInstructions.
/// \param Range         VF range passed to getDecisionAndClampRange.
/// \param Plan          plan that receives the live-ins and the CSA states.
8819+
static void
8820+
addCSAPreprocessRecipes(const LoopVectorizationLegality::CSAList &CSAs,
8821+
Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
8822+
VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
8823+
VPlan &Plan) {
8824+
8825+
// Don't build the full CSA for VF=ElementCount::getFixed(1).
8826+
bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange(
8827+
[&](ElementCount VF) { return VF.isScalar(); }, Range);
8828+
8829+
for (const auto &CSA : CSAs) {
8830+
// The value the CSA phi receives from the original loop's preheader
// becomes a live-in of the plan.
VPValue *VPInitScalar = Plan.getOrAddLiveIn(
8831+
CSA.first->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8832+
8833+
// Scalar VF builds the scalar version of the loop. In that case,
8834+
// neither mask maintenance nor extraction in the middle block is needed.
8835+
if (IsScalarVF) {
8836+
// The state is heap-allocated here; the VPlan destructor deletes every
// registered CSA state.
VPCSAState *S = new VPCSAState(VPInitScalar);
8837+
Plan.addCSAState(CSA.first, S);
8838+
continue;
8839+
}
8840+
8841+
// Mask and data initialization go into the preheader block.
auto *VPInitMask =
8842+
new VPInstruction(VPInstruction::CSAInitMask, {}, DL, "csa.init.mask");
8843+
auto *VPInitData = new VPInstruction(VPInstruction::CSAInitData,
8844+
{VPInitScalar}, DL, "csa.init.data");
8845+
PreheaderVPBB->appendRecipe(VPInitMask);
8846+
PreheaderVPBB->appendRecipe(VPInitData);
8847+
8848+
// The mask phi starts from the initial mask and lives in the header block.
auto *VPMaskPhi = new VPInstruction(VPInstruction::CSAMaskPhi, {VPInitMask},
8849+
DL, "csa.mask.phi");
8850+
HeaderVPBB->appendRecipe(VPMaskPhi);
8851+
8852+
auto *S = new VPCSAState(VPInitScalar, VPInitData, VPMaskPhi);
8853+
Plan.addCSAState(CSA.first, S);
8854+
}
8855+
}
8856+
8857+
/// Add the CSA recipes that must be created after the instructions of the
8858+
/// input IR have been processed and introduced into VPlan.
///
/// Does nothing when the scalar-VF decision holds for \p Range. Otherwise,
/// for every entry of \p CSAs it creates a CSAAnyActive and a CSAMaskSel
/// instruction (preceded by a Not of the condition when the select's true
/// operand is the CSA phi), hands them to the CSA's data-update recipe,
/// inserts a VPCSAExtractScalarRecipe into \p MiddleVPBB, and records the new
/// recipes in the CSA's VPCSAState.
///
/// \param RecipeBuilder maps IR instructions to the recipes created for them.
/// \param CSAs          CSA phis and their descriptors.
/// \param MiddleVPBB    block receiving the extract-scalar recipe.
/// \param DL            debug location given to the created VPInstructions.
/// \param Range         VF range passed to getDecisionAndClampRange.
/// \param Plan          plan holding the CSA states to update.
8859+
static void
8860+
addCSAPostprocessRecipes(VPRecipeBuilder &RecipeBuilder,
8861+
const LoopVectorizationLegality::CSAList &CSAs,
8862+
VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
8863+
VPlan &Plan) {
8864+
// Don't build CSA for VF=ElementCount::getFixed(1)
8865+
if (LoopVectorizationPlanner::getDecisionAndClampRange(
8866+
[&](ElementCount VF) { return VF.isScalar(); }, Range))
8867+
return;
8868+
8869+
for (const auto &CSA : CSAs) {
8870+
// The lookup result is dereferenced without comparing against end(); this
// relies on addCSAPreprocessRecipes having registered a state for each CSA.
VPCSAState *CSAState = Plan.getCSAStates().find(CSA.first)->second;
8871+
VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate();
8872+
8873+
assert(VPDataUpdate &&
8874+
"VPDataUpdate must have been introduced prior to postprocess");
8875+
assert(CSA.second.getCond() &&
8876+
"CSADescriptor must know how to describe the condition");
8877+
// Maps an IR value to the single VPValue of the recipe built for it. The
// value is cast to Instruction, so it must be one.
auto GetVPValue = [&](Value *I) {
8878+
return RecipeBuilder.getRecipe(cast<Instruction>(I))->getVPSingleValue();
8879+
};
8880+
VPValue *WidenedCond = GetVPValue(CSA.second.getCond());
8881+
VPValue *VPInitScalar = CSAState->getVPInitScalar();
8882+
8883+
// The CSA optimization wants to use a condition such that when it is
8884+
// true, a new value is assigned. However, it is possible that a true lane
8885+
// in WidenedCond corresponds to selection of the initial value instead.
8886+
// In that case, we must use the negation of WidenedCond.
8887+
// i.e. select cond new_val old_val versus select cond.not old_val new_val
8888+
VPValue *CondToUse = WidenedCond;
8889+
if (cast<SelectInst>(CSA.second.getAssignment())->getTrueValue() ==
8890+
CSA.first) {
8891+
// The negated condition is placed ahead of the recipe that defines the
// assignment's VPValue.
auto *VPNotCond = new VPInstruction(VPInstruction::Not, WidenedCond, DL);
8892+
VPNotCond->insertBefore(
8893+
GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
8894+
CondToUse = VPNotCond;
8895+
}
8896+
8897+
// The any-active instruction is also placed ahead of the assignment's
// defining recipe.
auto *VPAnyActive = new VPInstruction(
8898+
VPInstruction::CSAAnyActive, {CondToUse}, DL, "csa.cond.anyactive");
8899+
VPAnyActive->insertBefore(
8900+
GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
8901+
8902+
// The mask select takes the condition, the mask phi created during
// preprocessing and the any-active value; it goes right after VPAnyActive.
auto *VPMaskSel = new VPInstruction(
8903+
VPInstruction::CSAMaskSel,
8904+
{CondToUse, CSAState->getVPMaskPhi(), VPAnyActive}, DL, "csa.mask.sel");
8905+
VPMaskSel->insertAfter(VPAnyActive);
8906+
VPDataUpdate->setVPNewMaskAndVPAnyActive(VPMaskSel, VPAnyActive);
8907+
VPCSAExtractScalarRecipe *ExtractScalarRecipe =
8908+
new VPCSAExtractScalarRecipe({VPInitScalar, VPMaskSel, VPDataUpdate});
8909+
8910+
// Insert the extraction at the first non-phi position of the middle block.
MiddleVPBB->insert(ExtractScalarRecipe, MiddleVPBB->getFirstNonPhi());
8911+
8912+
// Update CSAState with new recipes
8913+
CSAState->setExtractScalarRecipe(ExtractScalarRecipe);
8914+
CSAState->setVPAnyActive(VPAnyActive);
8915+
}
8916+
}
8917+
87678918
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
87688919
ElementCount MaxVF) {
87698920
assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -8856,7 +9007,8 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
88569007
// increments.
88579008
static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
88589009
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8859-
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
9010+
const MapVector<PHINode *, InductionDescriptor> &Inductions,
9011+
const MapVector<PHINode *, CSADescriptor> &CSAs) {
88609012
SetVector<VPIRInstruction *> ExitUsersToFix;
88619013
for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {
88629014
BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock();
@@ -8887,6 +9039,17 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
88879039
return P && Inductions.contains(P);
88889040
})))
88899041
continue;
9042+
// Exit values for CSAs are computed and updated outside of VPlan and
9043+
// independent of induction recipes.
9044+
// TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
9045+
// live-outs.
9046+
if (isa<VPCSADataUpdateRecipe>(V) &&
9047+
(isa<Instruction>(IncomingValue) &&
9048+
any_of(IncomingValue->users(), [&CSAs](User *U) {
9049+
auto *P = dyn_cast<PHINode>(U);
9050+
return P && CSAs.contains(P);
9051+
})))
9052+
continue;
88909053
ExitUsersToFix.insert(ExitIRI);
88919054
ExitIRI->addOperand(V);
88929055
}
@@ -9068,6 +9231,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
90689231
bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
90699232
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
90709233

9234+
addCSAPreprocessRecipes(Legal->getCSAs(), OrigLoop, Plan->getPreheader(),
9235+
Plan->getVectorLoopRegion()->getEntryBasicBlock(), DL,
9236+
Range, *Plan);
9237+
90719238
VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
90729239

90739240
// ---------------------------------------------------------------------------
@@ -9185,6 +9352,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
91859352
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
91869353
}
91879354

9355+
VPBasicBlock *MiddleVPBB =
9356+
cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getSingleSuccessor());
9357+
addCSAPostprocessRecipes(RecipeBuilder, Legal->getCSAs(), MiddleVPBB, DL,
9358+
Range, *Plan);
9359+
91889360
// After here, VPBB should not be used.
91899361
VPBB = nullptr;
91909362

@@ -9195,8 +9367,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
91959367
RecipeBuilder.fixHeaderPhis();
91969368

91979369
addScalarResumePhis(RecipeBuilder, *Plan);
9198-
SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks(
9199-
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
9370+
SetVector<VPIRInstruction *> ExitUsersToFix =
9371+
collectUsersInExitBlocks(OrigLoop, RecipeBuilder, *Plan,
9372+
Legal->getInductionVars(), Legal->getCSAs());
92009373
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
92019374
addUsersInExitBlocks(*Plan, ExitUsersToFix);
92029375
// ---------------------------------------------------------------------------
@@ -10256,6 +10429,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1025610429
auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
1025710430
*BestMainPlan, MainILV, DT, false);
1025810431
++LoopsVectorized;
10432+
CSAsVectorized += LVL.getCSAs().size();
1025910433

1026010434
// Second pass vectorizes the epilogue and adjusts the control flow
1026110435
// edges from the first pass.
@@ -10351,6 +10525,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1035110525
PSI, Checks, BestPlan);
1035210526
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
1035310527
++LoopsVectorized;
10528+
CSAsVectorized += LVL.getCSAs().size();
1035410529

1035510530
// Add metadata to disable runtime unrolling a scalar loop when there
1035610531
// are no runtime checks about strides and memory. A scalar loop that is

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
214214

215215
VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
216216
iterator It = begin();
217-
while (It != end() && It->isPhi())
217+
while (It != end() && vputils::isPhi(*It))
218218
It++;
219219
return It;
220220
}
@@ -838,6 +838,9 @@ VPlan::~VPlan() {
838838
delete VPV;
839839
if (BackedgeTakenCount)
840840
delete BackedgeTakenCount;
841+
842+
for (std::pair<PHINode *, VPCSAState *> &S : CSAStates)
843+
delete S.second;
841844
}
842845

843846
VPIRBasicBlock *VPIRBasicBlock::fromBasicBlock(BasicBlock *IRBB) {
@@ -1040,7 +1043,7 @@ void VPlan::execute(VPTransformState *State) {
10401043
VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
10411044
for (VPRecipeBase &R : Header->phis()) {
10421045
// Skip phi-like recipes that generate their backedge values themselves.
1043-
if (isa<VPWidenPHIRecipe>(&R))
1046+
if (vputils::isPhiThatGeneratesBackedge(R))
10441047
continue;
10451048

10461049
if (isa<VPWidenPointerInductionRecipe>(&R) ||

0 commit comments

Comments
 (0)