Skip to content

Commit 5149e22

Browse files
[LV] Build VPlan for CSA
1 parent e1db193 commit 5149e22

File tree

8 files changed

+2779
-383
lines changed

8 files changed

+2779
-383
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 182 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
173173
STATISTIC(LoopsVectorized, "Number of loops vectorized");
174174
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
175175
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176+
STATISTIC(CSAsVectorized,
177+
"Number of conditional scalar assignments vectorized");
176178

177179
static cl::opt<bool> EnableEpilogueVectorization(
178180
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -497,6 +499,10 @@ class InnerLoopVectorizer {
497499
/// Fix the vectorized code, taking care of header phi's, and more.
498500
void fixVectorizedLoop(VPTransformState &State);
499501

502+
/// For all vectorized CSAs, replace uses of live-out scalar from the original
503+
/// loop with the extracted scalar from the vector loop.
504+
void fixCSALiveOuts(VPTransformState &State, VPlan &Plan);
505+
500506
// Return true if any runtime check is added.
501507
bool areSafetyChecksAdded() { return AddedSafetyChecks; }
502508

@@ -2937,6 +2943,25 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
29372943
TargetTransformInfo::TCK_RecipThroughput);
29382944
}
29392945

2946+
/// Patch exit-block PHIs for every CSA state recorded in \p Plan.
///
/// For each entry of Plan.getCSAStates(), the IR value underlying the CSA's
/// data-update recipe is looked up, and every PHI user of that value that
/// lives in LoopExitBlock receives an additional incoming value: the scalar
/// produced by the CSA's extract-scalar recipe, arriving from LoopMiddleBlock.
///
/// \param State transform state used to materialize the extracted scalar.
/// \param Plan  the VPlan whose CSA states are walked.
void InnerLoopVectorizer::fixCSALiveOuts(VPTransformState &State, VPlan &Plan) {
2947+
for (const auto &CSA : Plan.getCSAStates()) {
2948+
// CSA.second is the per-CSA state object; its data-update recipe must
// already exist at this point (enforced by the assert below).
VPCSADataUpdateRecipe *VPDataUpdate = CSA.second->getDataUpdate();
2949+
assert(VPDataUpdate &&
2950+
"VPDataUpdate must have been introduced prior to fixing live outs");
2951+
// V is the original IR value that the data-update recipe was built from.
Value *V = VPDataUpdate->getUnderlyingValue();
2952+
// Request the scalar form of the extract-scalar recipe's result.
// NOTE(review): the literal 0 is presumably the unroll part - confirm
// against the State.get() overload in use.
Value *ExtractedScalar = State.get(CSA.second->getExtractScalarRecipe(), 0,
2953+
/*NeedsScalar=*/true);
2954+
// Fix LCSSAPhis: gather the PHI users of V located in LoopExitBlock first.
// A set is used so a PHI that uses V more than once is only patched once.
2955+
llvm::SmallPtrSet<PHINode *, 2> ToFix;
2956+
for (User *U : V->users())
2957+
if (auto *Phi = dyn_cast<PHINode>(U);
2958+
Phi && Phi->getParent() == LoopExitBlock)
2959+
ToFix.insert(Phi);
2960+
// Add the extracted scalar as the value flowing in from the middle block.
for (PHINode *Phi : ToFix)
2961+
Phi->addIncoming(ExtractedScalar, LoopMiddleBlock);
2962+
}
2963+
}
2964+
29402965
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
29412966
// Fix widened non-induction PHIs by setting up the PHI operands.
29422967
if (EnableVPlanNativePath)
@@ -2972,6 +2997,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
29722997
fixupIVUsers(Entry.first, Entry.second,
29732998
getOrCreateVectorTripCount(nullptr),
29742999
IVEndValues[Entry.first], LoopMiddleBlock, State);
3000+
fixCSALiveOuts(State, Plan);
29753001
}
29763002

29773003
for (Instruction *PI : PredicatedInstructions)
@@ -4497,6 +4523,9 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
44974523
case VPDef::VPEVLBasedIVPHISC:
44984524
case VPDef::VPPredInstPHISC:
44994525
case VPDef::VPBranchOnMaskSC:
4526+
case VPRecipeBase::VPCSADataUpdateSC:
4527+
case VPRecipeBase::VPCSAExtractScalarSC:
4528+
case VPRecipeBase::VPCSAHeaderPHISC:
45004529
continue;
45014530
case VPDef::VPReductionSC:
45024531
case VPDef::VPActiveLaneMaskPHISC:
@@ -8680,9 +8709,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
86808709
return Recipe;
86818710

86828711
VPHeaderPHIRecipe *PhiRecipe = nullptr;
8683-
assert((Legal->isReductionVariable(Phi) ||
8684-
Legal->isFixedOrderRecurrence(Phi)) &&
8685-
"can only widen reductions and fixed-order recurrences here");
86868712
VPValue *StartV = Operands[0];
86878713
if (Legal->isReductionVariable(Phi)) {
86888714
const RecurrenceDescriptor &RdxDesc =
@@ -8692,12 +8718,23 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
86928718
PhiRecipe = new VPReductionPHIRecipe(Phi, RdxDesc, *StartV,
86938719
CM.isInLoopReduction(Phi),
86948720
CM.useOrderedReductions(RdxDesc));
8695-
} else {
8721+
} else if (Legal->isFixedOrderRecurrence(Phi)) {
86968722
// TODO: Currently fixed-order recurrences are modeled as chains of
86978723
// first-order recurrences. If there are no users of the intermediate
86988724
// recurrences in the chain, the fixed order recurrence should be modeled
86998725
// directly, enabling more efficient codegen.
87008726
PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
8727+
} else if (Legal->isCSAPhi(Phi)) {
8728+
VPCSAState *State = Plan.getCSAStates().find(Phi)->second;
8729+
VPValue *InitData = State->getVPInitData();
8730+
// When the VF=getFixed(1), InitData is just InitScalar.
8731+
if (!InitData)
8732+
InitData = State->getVPInitScalar();
8733+
PhiRecipe = new VPCSAHeaderPHIRecipe(Phi, InitData);
8734+
State->setPhiRecipe(cast<VPCSAHeaderPHIRecipe>(PhiRecipe));
8735+
} else {
8736+
llvm_unreachable(
8737+
"can only widen reductions, fixed-order recurrences, and CSAs here");
87018738
}
87028739

87038740
PhisToFix.push_back(PhiRecipe);
@@ -8731,6 +8768,19 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
87318768
make_range(Operands.begin(), Operands.end()));
87328769

87338770
if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8771+
auto *CSADescIt = find_if(Legal->getCSAs(), [&](auto CSA) {
8772+
return CSADescriptor::isCSASelect(CSA.second, SI);
8773+
});
8774+
if (CSADescIt != Legal->getCSAs().end()) {
8775+
PHINode *CSAPhi = CSADescIt->first;
8776+
VPCSAState *State = Plan.getCSAStates().find(CSAPhi)->second;
8777+
VPValue *VPDataPhi = State->getPhiRecipe();
8778+
auto *R = new VPCSADataUpdateRecipe(
8779+
SI, {VPDataPhi, Operands[0], Operands[1], Operands[2]});
8780+
State->setDataUpdate(R);
8781+
return R;
8782+
}
8783+
87348784
return new VPWidenSelectRecipe(
87358785
*SI, make_range(Operands.begin(), Operands.end()));
87368786
}
@@ -8743,6 +8793,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
87438793
return tryToWiden(Instr, Operands, VPBB);
87448794
}
87458795

8796+
/// Add CSA Recipes that can occur before each instruction in the input IR
8797+
/// is processed and introduced into VPlan.
///
/// For every (phi, descriptor) entry in \p CSAs a VPCSAState is created and
/// registered with \p Plan via addCSAState(), keyed by the phi.
///
/// \param CSAs          the CSAs to set up, keyed by their header phi.
/// \param OrigLoop      original loop; its preheader supplies each CSA's
///                      initial scalar value.
/// \param PreheaderVPBB block that receives the CSAInitMask / CSAInitData
///                      instructions.
/// \param HeaderVPBB    block that receives the CSAMaskPhi instruction.
/// \param DL            debug location attached to the new VPInstructions.
/// \param Range         VF range passed to getDecisionAndClampRange().
///                      NOTE(review): presumably clamped so that all VFs in it
///                      agree on the scalar/vector decision - confirm.
/// \param Plan          plan that receives the live-ins and the CSA states.
8798+
static void
8799+
addCSAPreprocessRecipes(const LoopVectorizationLegality::CSAList &CSAs,
8800+
Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
8801+
VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
8802+
VPlan &Plan) {
8803+
8804+
// Don't build full CSA for VF=ElementCount::getFixed(1)
8805+
bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange(
8806+
[&](ElementCount VF) { return VF.isScalar(); }, Range);
8807+
8808+
for (const auto &CSA : CSAs) {
8809+
// The value the CSA phi receives from the original loop's preheader is
// the initial scalar; model it as a live-in of the plan.
VPValue *VPInitScalar = Plan.getOrAddLiveIn(
8810+
CSA.first->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));
8811+
8812+
// Scalar VF builds the scalar version of the loop. In that case,
8813+
// no maintenance of mask nor extraction in middle block is needed.
8814+
if (IsScalarVF) {
8815+
// Only the initial scalar is recorded; no init-data or mask-phi recipes
// are created for this state.
VPCSAState *S = new VPCSAState(VPInitScalar);
8816+
Plan.addCSAState(CSA.first, S);
8817+
continue;
8818+
}
8819+
8820+
// Vector VF: create the initial mask and initial data in the preheader.
// The mask takes no operands; the data is built from the initial scalar.
auto *VPInitMask =
8821+
new VPInstruction(VPInstruction::CSAInitMask, {}, DL, "csa.init.mask");
8822+
auto *VPInitData = new VPInstruction(VPInstruction::CSAInitData,
8823+
{VPInitScalar}, DL, "csa.init.data");
8824+
PreheaderVPBB->appendRecipe(VPInitMask);
8825+
PreheaderVPBB->appendRecipe(VPInitData);
8826+
8827+
// The mask phi lives in the header and starts from the initial mask.
auto *VPMaskPhi = new VPInstruction(VPInstruction::CSAMaskPhi, {VPInitMask},
8828+
DL, "csa.mask.phi");
8829+
HeaderVPBB->appendRecipe(VPMaskPhi);
8830+
8831+
// Record everything later stages need: init scalar, init data, mask phi.
auto *S = new VPCSAState(VPInitScalar, VPInitData, VPMaskPhi);
8832+
Plan.addCSAState(CSA.first, S);
8833+
}
8834+
}
8835+
8836+
/// Add CSA Recipes that must occur after each instruction in the input IR
8837+
/// is processed and introduced into VPlan.
///
/// Does nothing when the (clamped) \p Range is scalar-VF. Otherwise, for each
/// CSA it creates an any-active instruction and a mask-select instruction next
/// to the recipe defining the CSA's assignment, wires both into the CSA's
/// data-update recipe, and inserts an extract-scalar recipe into
/// \p MiddleVPBB. The new recipes are also stored in the CSA's VPCSAState.
///
/// \param RecipeBuilder maps original IR instructions to their recipes.
/// \param CSAs          the CSAs to finish, keyed by their header phi.
/// \param MiddleVPBB    block that receives the extract-scalar recipes.
/// \param DL            debug location attached to the new VPInstructions.
/// \param Range         VF range passed to getDecisionAndClampRange().
/// \param Plan          plan holding the per-CSA states.
8838+
static void
8839+
addCSAPostprocessRecipes(VPRecipeBuilder &RecipeBuilder,
8840+
const LoopVectorizationLegality::CSAList &CSAs,
8841+
VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
8842+
VPlan &Plan) {
8843+
// Don't build CSA for VF=ElementCount::getFixed(1)
8844+
if (LoopVectorizationPlanner::getDecisionAndClampRange(
8845+
[&](ElementCount VF) { return VF.isScalar(); }, Range))
8846+
return;
8847+
8848+
for (const auto &CSA : CSAs) {
8849+
// NOTE(review): the find() result is dereferenced unchecked; this relies on
// a state having been registered for every CSA phi beforehand.
VPCSAState *CSAState = Plan.getCSAStates().find(CSA.first)->second;
8850+
VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate();
8851+
8852+
assert(VPDataUpdate &&
8853+
"VPDataUpdate must have been introduced prior to postprocess");
8854+
assert(CSA.second.getCond() &&
8855+
"CSADescriptor must know how to describe the condition");
8856+
// Helper: look up the single VPValue defined by the recipe that was built
// for IR instruction I.
auto GetVPValue = [&](Value *I) {
8857+
return RecipeBuilder.getRecipe(cast<Instruction>(I))->getVPSingleValue();
8858+
};
8859+
VPValue *WidenedCond = GetVPValue(CSA.second.getCond());
8860+
VPValue *VPInitScalar = CSAState->getVPInitScalar();
8861+
8862+
// The CSA optimization wants to use a condition such that when it is
8863+
// true, a new value is assigned. However, it is possible that a true lane
8864+
// in WidenedCond corresponds to selection of the initial value instead.
8865+
// In that case, we must use the negation of WidenedCond.
8866+
// i.e. select cond new_val old_val versus select cond.not old_val new_val
8867+
VPValue *CondToUse = WidenedCond;
8868+
// The select's true operand being the CSA phi itself means "true" keeps the
// old value, so the condition has to be inverted.
if (cast<SelectInst>(CSA.second.getAssignment())->getTrueValue() ==
8869+
CSA.first) {
8870+
auto *VPNotCond = new VPInstruction(VPInstruction::Not, WidenedCond, DL);
8871+
// Place the negation ahead of the recipe that defines the assignment.
VPNotCond->insertBefore(
8872+
GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
8873+
CondToUse = VPNotCond;
8874+
}
8875+
8876+
// any-active is computed from the (possibly negated) condition and is
// placed before the assignment's defining recipe.
auto *VPAnyActive = new VPInstruction(
8877+
VPInstruction::CSAAnyActive, {CondToUse}, DL, "csa.cond.anyactive");
8878+
VPAnyActive->insertBefore(
8879+
GetVPValue(CSA.second.getAssignment())->getDefiningRecipe());
8880+
8881+
// The mask select takes the condition, the mask phi and any-active as
// operands and is placed immediately after any-active.
auto *VPMaskSel = new VPInstruction(
8882+
VPInstruction::CSAMaskSel,
8883+
{CondToUse, CSAState->getVPMaskPhi(), VPAnyActive}, DL, "csa.mask.sel");
8884+
VPMaskSel->insertAfter(VPAnyActive);
8885+
// Hand the new mask and any-active values to the data-update recipe.
VPDataUpdate->setVPNewMaskAndVPAnyActive(VPMaskSel, VPAnyActive);
8886+
// The extract-scalar recipe consumes the initial scalar, the selected mask
// and the data-update recipe.
VPCSAExtractScalarRecipe *ExtractScalarRecipe =
8887+
new VPCSAExtractScalarRecipe({VPInitScalar, VPMaskSel, VPDataUpdate});
8888+
8889+
// Insert at the first non-phi position of the middle block.
MiddleVPBB->insert(ExtractScalarRecipe, MiddleVPBB->getFirstNonPhi());
8890+
8891+
// Update CSAState with new recipes
8892+
CSAState->setExtractScalarRecipe(ExtractScalarRecipe);
8893+
CSAState->setVPAnyActive(VPAnyActive);
8894+
}
8895+
}
8896+
87468897
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
87478898
ElementCount MaxVF) {
87488899
assert(OrigLoop->isInnermost() && "Inner loop expected.");
@@ -8835,7 +8986,8 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
88358986
// increments.
88368987
static SetVector<VPIRInstruction *> collectUsersInExitBlock(
88378988
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8838-
const MapVector<PHINode *, InductionDescriptor> &Inductions) {
8989+
const MapVector<PHINode *, InductionDescriptor> &Inductions,
8990+
const MapVector<PHINode *, CSADescriptor> &CSAs) {
88398991
auto *MiddleVPBB = Plan.getMiddleBlock();
88408992
// No edge from the middle block to the unique exit block has been inserted
88418993
// and there is nothing to fix from vector loop; phis should have incoming
@@ -8867,6 +9019,17 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlock(
88679019
return P && Inductions.contains(P);
88689020
})))
88699021
continue;
9022+
// Exit values for CSAs are computed and updated outside of VPlan and
9023+
// independent of induction recipes.
9024+
// TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
9025+
// live-outs.
9026+
if (isa<VPCSADataUpdateRecipe>(V) &&
9027+
(isa<Instruction>(IncomingValue) &&
9028+
any_of(IncomingValue->users(), [&CSAs](User *U) {
9029+
auto *P = dyn_cast<PHINode>(U);
9030+
return P && CSAs.contains(P);
9031+
})))
9032+
continue;
88709033
ExitUsersToFix.insert(ExitIRI);
88719034
ExitIRI->addOperand(V);
88729035
}
@@ -9043,6 +9206,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
90439206
bool HasNUW = Style == TailFoldingStyle::None;
90449207
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
90459208

9209+
addCSAPreprocessRecipes(Legal->getCSAs(), OrigLoop, Plan->getPreheader(),
9210+
Plan->getVectorLoopRegion()->getEntryBasicBlock(), DL,
9211+
Range, *Plan);
9212+
90469213
VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
90479214

90489215
// ---------------------------------------------------------------------------
@@ -9160,6 +9327,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
91609327
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor());
91619328
}
91629329

9330+
VPBasicBlock *MiddleVPBB =
9331+
cast<VPBasicBlock>(Plan->getVectorLoopRegion()->getSingleSuccessor());
9332+
addCSAPostprocessRecipes(RecipeBuilder, Legal->getCSAs(), MiddleVPBB, DL,
9333+
Range, *Plan);
9334+
91639335
// After here, VPBB should not be used.
91649336
VPBB = nullptr;
91659337

@@ -9170,8 +9342,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
91709342
RecipeBuilder.fixHeaderPhis();
91719343

91729344
addScalarResumePhis(RecipeBuilder, *Plan);
9173-
SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock(
9174-
OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars());
9345+
SetVector<VPIRInstruction *> ExitUsersToFix =
9346+
collectUsersInExitBlock(OrigLoop, RecipeBuilder, *Plan,
9347+
Legal->getInductionVars(), Legal->getCSAs());
91759348
addExitUsersForFirstOrderRecurrences(*Plan, ExitUsersToFix);
91769349
addUsersInExitBlock(*Plan, ExitUsersToFix);
91779350
// ---------------------------------------------------------------------------
@@ -10238,6 +10411,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1023810411
auto ExpandedSCEVs = LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF,
1023910412
*BestMainPlan, MainILV, DT, false);
1024010413
++LoopsVectorized;
10414+
CSAsVectorized += LVL.getCSAs().size();
1024110415

1024210416
// Second pass vectorizes the epilogue and adjusts the control flow
1024310417
// edges from the first pass.
@@ -10333,6 +10507,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1033310507
PSI, Checks, BestPlan);
1033410508
LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
1033510509
++LoopsVectorized;
10510+
CSAsVectorized += LVL.getCSAs().size();
1033610511

1033710512
// Add metadata to disable runtime unrolling a scalar loop when there
1033810513
// are no runtime checks about strides and memory. A scalar loop that is

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,7 @@ void VPBlockBase::deleteCFG(VPBlockBase *Entry) {
214214

215215
VPBasicBlock::iterator VPBasicBlock::getFirstNonPhi() {
216216
iterator It = begin();
217-
while (It != end() && It->isPhi())
217+
while (It != end() && vputils::isPhi(*It))
218218
It++;
219219
return It;
220220
}
@@ -848,6 +848,9 @@ VPlan::~VPlan() {
848848
delete VPV;
849849
if (BackedgeTakenCount)
850850
delete BackedgeTakenCount;
851+
852+
for (std::pair<PHINode *, VPCSAState *> &S : CSAStates)
853+
delete S.second;
851854
}
852855

853856
VPIRBasicBlock *VPIRBasicBlock::fromBasicBlock(BasicBlock *IRBB) {
@@ -1050,7 +1053,7 @@ void VPlan::execute(VPTransformState *State) {
10501053
VPBasicBlock *Header = getVectorLoopRegion()->getEntryBasicBlock();
10511054
for (VPRecipeBase &R : Header->phis()) {
10521055
// Skip phi-like recipes that generate their backedge values themselves.
1053-
if (isa<VPWidenPHIRecipe>(&R))
1056+
if (vputils::isPhiThatGeneratesBackedge(R))
10541057
continue;
10551058

10561059
if (isa<VPWidenPointerInductionRecipe>(&R) ||

0 commit comments

Comments
 (0)