@@ -174,6 +174,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
174
174
STATISTIC (LoopsVectorized, " Number of loops vectorized" );
175
175
STATISTIC (LoopsAnalyzed, " Number of loops analyzed for vectorization" );
176
176
STATISTIC (LoopsEpilogueVectorized, " Number of epilogues vectorized" );
177
+ STATISTIC (CSAsVectorized,
178
+ " Number of conditional scalar assignments vectorized" );
177
179
178
180
static cl::opt<bool > EnableEpilogueVectorization (
179
181
" enable-epilogue-vectorization" , cl::init(true ), cl::Hidden,
@@ -498,6 +500,10 @@ class InnerLoopVectorizer {
498
500
// / Fix the vectorized code, taking care of header phi's, and more.
499
501
void fixVectorizedLoop (VPTransformState &State);
500
502
503
+ // / For all vectorized CSAs, replace uses of live-out scalar from the original
504
+ // / loop with the extracted scalar from the vector loop.
505
+ void fixCSALiveOuts (VPTransformState &State, VPlan &Plan);
506
+
501
507
// Return true if any runtime check is added.
502
508
bool areSafetyChecksAdded () { return AddedSafetyChecks; }
503
509
@@ -2937,6 +2943,25 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2937
2943
TargetTransformInfo::TCK_RecipThroughput);
2938
2944
}
2939
2945
2946
// For every CSA state recorded in Plan, add the scalar produced by that CSA's
// extract-scalar recipe as an extra incoming value (from LoopMiddleBlock) to
// each PHI in LoopExitBlock that uses the data-update recipe's underlying IR
// value.
//   State - transform state used to look up the IR value generated for the
//           extract-scalar recipe.
//   Plan  - the VPlan whose CSA states are walked.
+ void InnerLoopVectorizer::fixCSALiveOuts (VPTransformState &State, VPlan &Plan) {
2947
+ for (const auto &CSA : Plan.getCSAStates ()) {
2948
+ VPCSADataUpdateRecipe *VPDataUpdate = CSA.second ->getDataUpdate ();
2949
+ assert (VPDataUpdate &&
2950
+ " VPDataUpdate must have been introduced prior to fixing live outs" );
2951
+ Value *V = VPDataUpdate->getUnderlyingValue (); // IR value behind the recipe (presumably the original select - confirm)
2952
+ Value *ExtractedScalar = State.get (CSA.second ->getExtractScalarRecipe (), 0 , // NOTE(review): the 0 is presumably the unroll part - confirm
2953
+ /* NeedsScalar=*/ true );
2954
+ // Fix LCSSAPhis: gather the PHI users of V that live in LoopExitBlock.
2955
+ llvm::SmallPtrSet<PHINode *, 2 > ToFix; // a set, so a PHI listed several times among V's users is recorded once
2956
+ for (User *U : V->users ())
2957
+ if (auto *Phi = dyn_cast<PHINode>(U);
2958
+ Phi && Phi->getParent () == LoopExitBlock)
2959
+ ToFix.insert (Phi);
2960
+ for (PHINode *Phi : ToFix) // give each collected PHI one new incoming edge from the middle block
2961
+ Phi->addIncoming (ExtractedScalar, LoopMiddleBlock);
2962
+ }
2963
+ }
2964
+
2940
2965
void InnerLoopVectorizer::fixVectorizedLoop (VPTransformState &State) {
2941
2966
// Fix widened non-induction PHIs by setting up the PHI operands.
2942
2967
if (EnableVPlanNativePath)
@@ -2971,6 +2996,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
2971
2996
for (const auto &Entry : Legal->getInductionVars ())
2972
2997
fixupIVUsers (Entry.first , Entry.second ,
2973
2998
getOrCreateVectorTripCount (nullptr ), LoopMiddleBlock, State);
2999
+ fixCSALiveOuts (State, Plan);
2974
3000
}
2975
3001
2976
3002
for (Instruction *PI : PredicatedInstructions)
@@ -4516,6 +4542,9 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4516
4542
case VPDef::VPEVLBasedIVPHISC:
4517
4543
case VPDef::VPPredInstPHISC:
4518
4544
case VPDef::VPBranchOnMaskSC:
4545
+ case VPRecipeBase::VPCSADataUpdateSC:
4546
+ case VPRecipeBase::VPCSAExtractScalarSC:
4547
+ case VPRecipeBase::VPCSAHeaderPHISC:
4519
4548
continue ;
4520
4549
case VPDef::VPReductionSC:
4521
4550
case VPDef::VPActiveLaneMaskPHISC:
@@ -8701,9 +8730,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8701
8730
return Recipe;
8702
8731
8703
8732
VPHeaderPHIRecipe *PhiRecipe = nullptr ;
8704
- assert ((Legal->isReductionVariable (Phi) ||
8705
- Legal->isFixedOrderRecurrence (Phi)) &&
8706
- " can only widen reductions and fixed-order recurrences here" );
8707
8733
VPValue *StartV = Operands[0 ];
8708
8734
if (Legal->isReductionVariable (Phi)) {
8709
8735
const RecurrenceDescriptor &RdxDesc =
@@ -8713,12 +8739,23 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8713
8739
PhiRecipe = new VPReductionPHIRecipe (Phi, RdxDesc, *StartV,
8714
8740
CM.isInLoopReduction (Phi),
8715
8741
CM.useOrderedReductions (RdxDesc));
8716
- } else {
8742
+ } else if (Legal-> isFixedOrderRecurrence (Phi)) {
8717
8743
// TODO: Currently fixed-order recurrences are modeled as chains of
8718
8744
// first-order recurrences. If there are no users of the intermediate
8719
8745
// recurrences in the chain, the fixed order recurrence should be modeled
8720
8746
// directly, enabling more efficient codegen.
8721
8747
PhiRecipe = new VPFirstOrderRecurrencePHIRecipe (Phi, *StartV);
8748
+ } else if (Legal->isCSAPhi (Phi)) {
8749
+ VPCSAState *State = Plan.getCSAStates ().find (Phi)->second ;
8750
+ VPValue *InitData = State->getVPInitData ();
8751
+ // When the VF=getFixed(1), InitData is just InitScalar.
8752
+ if (!InitData)
8753
+ InitData = State->getVPInitScalar ();
8754
+ PhiRecipe = new VPCSAHeaderPHIRecipe (Phi, InitData);
8755
+ State->setPhiRecipe (cast<VPCSAHeaderPHIRecipe>(PhiRecipe));
8756
+ } else {
8757
+ llvm_unreachable (
8758
+ " can only widen reductions, fixed-order recurrences, and CSAs here" );
8722
8759
}
8723
8760
8724
8761
PhisToFix.push_back (PhiRecipe);
@@ -8752,6 +8789,19 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8752
8789
make_range (Operands.begin (), Operands.end ()));
8753
8790
8754
8791
if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8792
+ auto *CSADescIt = find_if (Legal->getCSAs (), [&](auto CSA) {
8793
+ return CSADescriptor::isCSASelect (CSA.second , SI);
8794
+ });
8795
+ if (CSADescIt != Legal->getCSAs ().end ()) {
8796
+ PHINode *CSAPhi = CSADescIt->first ;
8797
+ VPCSAState *State = Plan.getCSAStates ().find (CSAPhi)->second ;
8798
+ VPValue *VPDataPhi = State->getPhiRecipe ();
8799
+ auto *R = new VPCSADataUpdateRecipe (
8800
+ SI, {VPDataPhi, Operands[0 ], Operands[1 ], Operands[2 ]});
8801
+ State->setDataUpdate (R);
8802
+ return R;
8803
+ }
8804
+
8755
8805
return new VPWidenSelectRecipe (
8756
8806
*SI, make_range (Operands.begin (), Operands.end ()));
8757
8807
}
@@ -8764,6 +8814,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8764
8814
return tryToWiden (Instr, Operands, VPBB);
8765
8815
}
8766
8816
8817
+ // / Add CSA Recipes that can occur before each instruction in the input IR
8818
+ // / is processed and introduced into VPlan.
// / For every entry of \p CSAs a VPCSAState is registered in \p Plan, keyed by
// / the CSA's PHI. When the range is decided to be scalar only the initial
// / scalar is recorded; otherwise init-mask / init-data instructions are
// / appended to \p PreheaderVPBB and a mask phi is appended to \p HeaderVPBB.
// / \p DL is attached to every VPInstruction created here. \p Range is passed
// / to getDecisionAndClampRange, which (as its name indicates) may clamp it.
8819
+ static void
8820
+ addCSAPreprocessRecipes (const LoopVectorizationLegality::CSAList &CSAs,
8821
+ Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
8822
+ VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
8823
+ VPlan &Plan) {
8824
+
8825
+ // Don't build full CSA for VF=ElementCount::getFixed(1)
8826
+ bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange (
8827
+ [&](ElementCount VF) { return VF.isScalar (); }, Range);
8828
+
8829
+ for (const auto &CSA : CSAs) {
8830
+ VPValue *VPInitScalar = Plan.getOrAddLiveIn ( // live-in for the value the CSA phi receives from the original preheader
8831
+ CSA.first ->getIncomingValueForBlock (OrigLoop->getLoopPreheader ()));
8832
+
8833
+ // Scalar VF builds the scalar version of the loop. In that case,
8834
+ // no maintenance of mask nor extraction in middle block is needed.
8835
+ if (IsScalarVF) {
8836
+ VPCSAState *S = new VPCSAState (VPInitScalar); // NOTE(review): raw new; presumably Plan takes ownership via addCSAState - confirm
8837
+ Plan.addCSAState (CSA.first , S);
8838
+ continue ;
8839
+ }
8840
+
8841
+ auto *VPInitMask =
8842
+ new VPInstruction (VPInstruction::CSAInitMask, {}, DL, " csa.init.mask" );
8843
+ auto *VPInitData = new VPInstruction (VPInstruction::CSAInitData,
8844
+ {VPInitScalar}, DL, " csa.init.data" );
8845
+ PreheaderVPBB->appendRecipe (VPInitMask); // both initial values are placed in the preheader block
8846
+ PreheaderVPBB->appendRecipe (VPInitData);
8847
+
8848
+ auto *VPMaskPhi = new VPInstruction (VPInstruction::CSAMaskPhi, {VPInitMask},
8849
+ DL, " csa.mask.phi" );
8850
+ HeaderVPBB->appendRecipe (VPMaskPhi); // the mask phi goes in the header block, with the init mask as its only operand so far
8851
+
8852
+ auto *S = new VPCSAState (VPInitScalar, VPInitData, VPMaskPhi); // vector case: state also records init data and mask phi
8853
+ Plan.addCSAState (CSA.first , S);
8854
+ }
8855
+ }
8856
+
8857
+ // / Add CSA Recipes that must occur after each instruction in the input IR
8858
+ // / is processed and introduced into VPlan.
// / For every entry of \p CSAs this creates an any-active instruction and a
// / mask-select instruction next to the recipe of the CSA assignment, hands
// / both to the CSA's data-update recipe, and inserts an extract-scalar recipe
// / into \p MiddleVPBB. Returns without doing anything when the range is
// / decided to be scalar. \p RecipeBuilder is used to map IR instructions to
// / their recipes; \p DL is attached to every VPInstruction created here.
8859
+ static void
8860
+ addCSAPostprocessRecipes (VPRecipeBuilder &RecipeBuilder,
8861
+ const LoopVectorizationLegality::CSAList &CSAs,
8862
+ VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
8863
+ VPlan &Plan) {
8864
+ // Don't build CSA for VF=ElementCount::getFixed(1)
8865
+ if (LoopVectorizationPlanner::getDecisionAndClampRange (
8866
+ [&](ElementCount VF) { return VF.isScalar (); }, Range))
8867
+ return ;
8868
+
8869
+ for (const auto &CSA : CSAs) {
8870
+ VPCSAState *CSAState = Plan.getCSAStates ().find (CSA.first )->second ; // find() result is dereferenced unchecked: assumes a state was registered for every CSA
8871
+ VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate ();
8872
+
8873
+ assert (VPDataUpdate &&
8874
+ " VPDataUpdate must have been introduced prior to postprocess" );
8875
+ assert (CSA.second .getCond () &&
8876
+ " CSADescriptor must know how to describe the condition" );
+ // Helper: map an IR value (must be an Instruction, see the cast<>) to the
+ // single VPValue defined by the recipe RecipeBuilder holds for it.
8877
+ auto GetVPValue = [&](Value *I) {
8878
+ return RecipeBuilder.getRecipe (cast<Instruction>(I))->getVPSingleValue ();
8879
+ };
8880
+ VPValue *WidenedCond = GetVPValue (CSA.second .getCond ());
8881
+ VPValue *VPInitScalar = CSAState->getVPInitScalar ();
8882
+
8883
+ // The CSA optimization wants to use a condition such that when it is
8884
+ // true, a new value is assigned. However, it is possible that a true lane
8885
+ // in WidenedCond corresponds to selection of the initial value instead.
8886
+ // In that case, we must use the negation of WidenedCond.
8887
+ // i.e. select cond new_val old_val versus select cond.not old_val new_val
8888
+ VPValue *CondToUse = WidenedCond;
8889
+ if (cast<SelectInst>(CSA.second .getAssignment ())->getTrueValue () == // true operand is the CSA phi itself => condition is inverted
8890
+ CSA.first ) {
8891
+ auto *VPNotCond = new VPInstruction (VPInstruction::Not, WidenedCond, DL);
8892
+ VPNotCond->insertBefore (
8893
+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8894
+ CondToUse = VPNotCond;
8895
+ }
8896
+
8897
+ auto *VPAnyActive = new VPInstruction (
8898
+ VPInstruction::CSAAnyActive, {CondToUse}, DL, " csa.cond.anyactive" );
8899
+ VPAnyActive->insertBefore ( // placed immediately before the recipe of the CSA assignment
8900
+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8901
+
8902
+ auto *VPMaskSel = new VPInstruction (
8903
+ VPInstruction::CSAMaskSel,
8904
+ {CondToUse, CSAState->getVPMaskPhi (), VPAnyActive}, DL, " csa.mask.sel" );
8905
+ VPMaskSel->insertAfter (VPAnyActive);
8906
+ VPDataUpdate->setVPNewMaskAndVPAnyActive (VPMaskSel, VPAnyActive);
8907
+ VPCSAExtractScalarRecipe *ExtractScalarRecipe =
8908
+ new VPCSAExtractScalarRecipe ({VPInitScalar, VPMaskSel, VPDataUpdate});
8909
+
8910
+ MiddleVPBB->insert (ExtractScalarRecipe, MiddleVPBB->getFirstNonPhi ()); // inserted at the first non-phi position of the middle block
8911
+
8912
+ // Update CSAState with new recipes
8913
+ CSAState->setExtractScalarRecipe (ExtractScalarRecipe);
8914
+ CSAState->setVPAnyActive (VPAnyActive);
8915
+ }
8916
+ }
8917
+
8767
8918
void LoopVectorizationPlanner::buildVPlansWithVPRecipes (ElementCount MinVF,
8768
8919
ElementCount MaxVF) {
8769
8920
assert (OrigLoop->isInnermost () && " Inner loop expected." );
@@ -8856,7 +9007,8 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
8856
9007
// increments.
8857
9008
static SetVector<VPIRInstruction *> collectUsersInExitBlocks (
8858
9009
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8859
- const MapVector<PHINode *, InductionDescriptor> &Inductions) {
9010
+ const MapVector<PHINode *, InductionDescriptor> &Inductions,
9011
+ const MapVector<PHINode *, CSADescriptor> &CSAs) {
8860
9012
SetVector<VPIRInstruction *> ExitUsersToFix;
8861
9013
for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks ()) {
8862
9014
BasicBlock *ExitBB = ExitVPBB->getIRBasicBlock ();
@@ -8887,6 +9039,17 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlocks(
8887
9039
return P && Inductions.contains (P);
8888
9040
})))
8889
9041
continue ;
9042
+ // Exit values for CSAs are computed and updated outside of VPlan and
9043
+ // independent of induction recipes.
9044
+ // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
9045
+ // live-outs.
9046
+ if (isa<VPCSADataUpdateRecipe>(V) &&
9047
+ (isa<Instruction>(IncomingValue) &&
9048
+ any_of (IncomingValue->users (), [&CSAs](User *U) {
9049
+ auto *P = dyn_cast<PHINode>(U);
9050
+ return P && CSAs.contains (P);
9051
+ })))
9052
+ continue ;
8890
9053
ExitUsersToFix.insert (ExitIRI);
8891
9054
ExitIRI->addOperand (V);
8892
9055
}
@@ -9068,6 +9231,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9068
9231
bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
9069
9232
addCanonicalIVRecipes (*Plan, Legal->getWidestInductionType (), HasNUW, DL);
9070
9233
9234
+ addCSAPreprocessRecipes (Legal->getCSAs (), OrigLoop, Plan->getPreheader (),
9235
+ Plan->getVectorLoopRegion ()->getEntryBasicBlock (), DL,
9236
+ Range, *Plan);
9237
+
9071
9238
VPRecipeBuilder RecipeBuilder (*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
9072
9239
9073
9240
// ---------------------------------------------------------------------------
@@ -9185,6 +9352,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9185
9352
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor ());
9186
9353
}
9187
9354
9355
+ VPBasicBlock *MiddleVPBB =
9356
+ cast<VPBasicBlock>(Plan->getVectorLoopRegion ()->getSingleSuccessor ());
9357
+ addCSAPostprocessRecipes (RecipeBuilder, Legal->getCSAs (), MiddleVPBB, DL,
9358
+ Range, *Plan);
9359
+
9188
9360
// After here, VPBB should not be used.
9189
9361
VPBB = nullptr ;
9190
9362
@@ -9195,8 +9367,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9195
9367
RecipeBuilder.fixHeaderPhis ();
9196
9368
9197
9369
addScalarResumePhis (RecipeBuilder, *Plan);
9198
- SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlocks (
9199
- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars ());
9370
+ SetVector<VPIRInstruction *> ExitUsersToFix =
9371
+ collectUsersInExitBlocks (OrigLoop, RecipeBuilder, *Plan,
9372
+ Legal->getInductionVars (), Legal->getCSAs ());
9200
9373
addExitUsersForFirstOrderRecurrences (*Plan, ExitUsersToFix);
9201
9374
addUsersInExitBlocks (*Plan, ExitUsersToFix);
9202
9375
// ---------------------------------------------------------------------------
@@ -10256,6 +10429,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10256
10429
auto ExpandedSCEVs = LVP.executePlan (EPI.MainLoopVF , EPI.MainLoopUF ,
10257
10430
*BestMainPlan, MainILV, DT, false );
10258
10431
++LoopsVectorized;
10432
+ CSAsVectorized += LVL.getCSAs ().size ();
10259
10433
10260
10434
// Second pass vectorizes the epilogue and adjusts the control flow
10261
10435
// edges from the first pass.
@@ -10351,6 +10525,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10351
10525
PSI, Checks, BestPlan);
10352
10526
LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
10353
10527
++LoopsVectorized;
10528
+ CSAsVectorized += LVL.getCSAs ().size ();
10354
10529
10355
10530
// Add metadata to disable runtime unrolling a scalar loop when there
10356
10531
// are no runtime checks about strides and memory. A scalar loop that is
0 commit comments