@@ -173,6 +173,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
173
173
// Counters declared through the STATISTIC macro. CSAsVectorized is advanced in
// LoopVectorizePass::processLoop by LVL.getCSAs().size() right after each
// ++LoopsVectorized that follows an executePlan call.
STATISTIC (LoopsVectorized, " Number of loops vectorized" );
174
174
STATISTIC (LoopsAnalyzed, " Number of loops analyzed for vectorization" );
175
175
STATISTIC (LoopsEpilogueVectorized, " Number of epilogues vectorized" );
176
+ STATISTIC (CSAsVectorized,
177
+ " Number of conditional scalar assignments vectorized" );
176
178
177
179
static cl::opt<bool > EnableEpilogueVectorization (
178
180
" enable-epilogue-vectorization" , cl::init(true ), cl::Hidden,
@@ -501,6 +503,10 @@ class InnerLoopVectorizer {
501
503
// / Fix the vectorized code, taking care of header phi's, and more.
502
504
void fixVectorizedLoop (VPTransformState &State);
503
505
506
+ // / For all vectorized CSAs, replace uses of live-out scalar from the original
507
+ // / loop with the extracted scalar from the vector loop.
508
+ void fixCSALiveOuts (VPTransformState &State, VPlan &Plan);
509
+
504
510
// Return true if any runtime check is added.
505
511
bool areSafetyChecksAdded () { return AddedSafetyChecks; }
506
512
@@ -2934,6 +2940,25 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2934
2940
TargetTransformInfo::TCK_RecipThroughput);
2935
2941
}
2936
2942
2943
+ void InnerLoopVectorizer::fixCSALiveOuts (VPTransformState &State, VPlan &Plan) {
2944
+ for (const auto &CSA : Plan.getCSAStates ()) {
2945
+ VPCSADataUpdateRecipe *VPDataUpdate = CSA.second ->getDataUpdate ();
2946
+ assert (VPDataUpdate &&
2947
+ " VPDataUpdate must have been introduced prior to fixing live outs" );
2948
+ Value *V = VPDataUpdate->getUnderlyingValue ();
2949
+ Value *ExtractedScalar = State.get (CSA.second ->getExtractScalarRecipe (), 0 ,
2950
+ /* NeedsScalar=*/ true );
2951
+ // Fix LCSSAPhis
2952
+ llvm::SmallPtrSet<PHINode *, 2 > ToFix;
2953
+ for (User *U : V->users ())
2954
+ if (auto *Phi = dyn_cast<PHINode>(U);
2955
+ Phi && Phi->getParent () == LoopExitBlock)
2956
+ ToFix.insert (Phi);
2957
+ for (PHINode *Phi : ToFix)
2958
+ Phi->addIncoming (ExtractedScalar, LoopMiddleBlock);
2959
+ }
2960
+ }
2961
+
2937
2962
void InnerLoopVectorizer::fixVectorizedLoop (VPTransformState &State) {
2938
2963
// Fix widened non-induction PHIs by setting up the PHI operands.
2939
2964
if (EnableVPlanNativePath)
@@ -2969,6 +2994,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
2969
2994
fixupIVUsers (Entry.first , Entry.second ,
2970
2995
getOrCreateVectorTripCount (nullptr ),
2971
2996
IVEndValues[Entry.first ], LoopMiddleBlock, State);
2997
+ fixCSALiveOuts (State, Plan);
2972
2998
}
2973
2999
2974
3000
for (Instruction *PI : PredicatedInstructions)
@@ -4494,6 +4520,9 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4494
4520
case VPDef::VPEVLBasedIVPHISC:
4495
4521
case VPDef::VPPredInstPHISC:
4496
4522
case VPDef::VPBranchOnMaskSC:
4523
+ case VPRecipeBase::VPCSADataUpdateSC:
4524
+ case VPRecipeBase::VPCSAExtractScalarSC:
4525
+ case VPRecipeBase::VPCSAHeaderPHISC:
4497
4526
continue ;
4498
4527
case VPDef::VPReductionSC:
4499
4528
case VPDef::VPActiveLaneMaskPHISC:
@@ -8675,9 +8704,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8675
8704
return Recipe;
8676
8705
8677
8706
VPHeaderPHIRecipe *PhiRecipe = nullptr ;
8678
- assert ((Legal->isReductionVariable (Phi) ||
8679
- Legal->isFixedOrderRecurrence (Phi)) &&
8680
- " can only widen reductions and fixed-order recurrences here" );
8681
8707
VPValue *StartV = Operands[0 ];
8682
8708
if (Legal->isReductionVariable (Phi)) {
8683
8709
const RecurrenceDescriptor &RdxDesc =
@@ -8687,12 +8713,23 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8687
8713
PhiRecipe = new VPReductionPHIRecipe (Phi, RdxDesc, *StartV,
8688
8714
CM.isInLoopReduction (Phi),
8689
8715
CM.useOrderedReductions (RdxDesc));
8690
- } else {
8716
+ } else if (Legal-> isFixedOrderRecurrence (Phi)) {
8691
8717
// TODO: Currently fixed-order recurrences are modeled as chains of
8692
8718
// first-order recurrences. If there are no users of the intermediate
8693
8719
// recurrences in the chain, the fixed order recurrence should be modeled
8694
8720
// directly, enabling more efficient codegen.
8695
8721
PhiRecipe = new VPFirstOrderRecurrencePHIRecipe (Phi, *StartV);
8722
+ } else if (Legal->isCSAPhi (Phi)) {
8723
+ VPCSAState *State = Plan.getCSAStates ().find (Phi)->second ;
8724
+ VPValue *InitData = State->getVPInitData ();
8725
+ // When the VF=getFixed(1), InitData is just InitScalar.
8726
+ if (!InitData)
8727
+ InitData = State->getVPInitScalar ();
8728
+ PhiRecipe = new VPCSAHeaderPHIRecipe (Phi, InitData);
8729
+ State->setPhiRecipe (cast<VPCSAHeaderPHIRecipe>(PhiRecipe));
8730
+ } else {
8731
+ llvm_unreachable (
8732
+ " can only widen reductions, fixed-order recurrences, and CSAs here" );
8696
8733
}
8697
8734
8698
8735
PhisToFix.push_back (PhiRecipe);
@@ -8726,6 +8763,19 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8726
8763
make_range (Operands.begin (), Operands.end ()));
8727
8764
8728
8765
if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8766
+ auto *CSADescIt = find_if (Legal->getCSAs (), [&](auto CSA) {
8767
+ return CSADescriptor::isCSASelect (CSA.second , SI);
8768
+ });
8769
+ if (CSADescIt != Legal->getCSAs ().end ()) {
8770
+ PHINode *CSAPhi = CSADescIt->first ;
8771
+ VPCSAState *State = Plan.getCSAStates ().find (CSAPhi)->second ;
8772
+ VPValue *VPDataPhi = State->getPhiRecipe ();
8773
+ auto *R = new VPCSADataUpdateRecipe (
8774
+ SI, {VPDataPhi, Operands[0 ], Operands[1 ], Operands[2 ]});
8775
+ State->setDataUpdate (R);
8776
+ return R;
8777
+ }
8778
+
8729
8779
return new VPWidenSelectRecipe (
8730
8780
*SI, make_range (Operands.begin (), Operands.end ()));
8731
8781
}
@@ -8738,6 +8788,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8738
8788
return tryToWiden (Instr, Operands, VPBB);
8739
8789
}
8740
8790
8791
+ // / Add CSA Recipes that can occur before each instruction in the input IR
8792
+ // / is processed and introduced into VPlan.
8793
+ static void
8794
+ addCSAPreprocessRecipes (const LoopVectorizationLegality::CSAList &CSAs,
8795
+ Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
8796
+ VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
8797
+ VPlan &Plan) {
8798
+
8799
+ // Don't build full CSA for VF=ElementCount::getFixed(1)
8800
+ bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange (
8801
+ [&](ElementCount VF) { return VF.isScalar (); }, Range);
8802
+
8803
+ for (const auto &CSA : CSAs) {
8804
+ VPValue *VPInitScalar = Plan.getOrAddLiveIn (
8805
+ CSA.first ->getIncomingValueForBlock (OrigLoop->getLoopPreheader ()));
8806
+
8807
+ // Scalar VF builds the scalar version of the loop. In that case,
8808
+ // no maintenence of mask nor extraction in middle block is needed.
8809
+ if (IsScalarVF) {
8810
+ VPCSAState *S = new VPCSAState (VPInitScalar);
8811
+ Plan.addCSAState (CSA.first , S);
8812
+ continue ;
8813
+ }
8814
+
8815
+ auto *VPInitMask =
8816
+ new VPInstruction (VPInstruction::CSAInitMask, {}, DL, " csa.init.mask" );
8817
+ auto *VPInitData = new VPInstruction (VPInstruction::CSAInitData,
8818
+ {VPInitScalar}, DL, " csa.init.data" );
8819
+ PreheaderVPBB->appendRecipe (VPInitMask);
8820
+ PreheaderVPBB->appendRecipe (VPInitData);
8821
+
8822
+ auto *VPMaskPhi = new VPInstruction (VPInstruction::CSAMaskPhi, {VPInitMask},
8823
+ DL, " csa.mask.phi" );
8824
+ HeaderVPBB->appendRecipe (VPMaskPhi);
8825
+
8826
+ auto *S = new VPCSAState (VPInitScalar, VPInitData, VPMaskPhi);
8827
+ Plan.addCSAState (CSA.first , S);
8828
+ }
8829
+ }
8830
+
8831
+ // / Add CSA Recipes that must occur after each instruction in the input IR
8832
+ // / is processed and introduced into VPlan.
8833
+ static void
8834
+ addCSAPostprocessRecipes (VPRecipeBuilder &RecipeBuilder,
8835
+ const LoopVectorizationLegality::CSAList &CSAs,
8836
+ VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
8837
+ VPlan &Plan) {
8838
+ // Don't build CSA for VF=ElementCount::getFixed(1)
8839
+ if (LoopVectorizationPlanner::getDecisionAndClampRange (
8840
+ [&](ElementCount VF) { return VF.isScalar (); }, Range))
8841
+ return ;
8842
+
8843
+ for (const auto &CSA : CSAs) {
8844
+ VPCSAState *CSAState = Plan.getCSAStates ().find (CSA.first )->second ;
8845
+ VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate ();
8846
+
8847
+ assert (VPDataUpdate &&
8848
+ " VPDataUpdate must have been introduced prior to postprocess" );
8849
+ assert (CSA.second .getCond () &&
8850
+ " CSADescriptor must know how to describe the condition" );
8851
+ auto GetVPValue = [&](Value *I) {
8852
+ return RecipeBuilder.getRecipe (cast<Instruction>(I))->getVPSingleValue ();
8853
+ };
8854
+ VPValue *WidenedCond = GetVPValue (CSA.second .getCond ());
8855
+ VPValue *VPInitScalar = CSAState->getVPInitScalar ();
8856
+
8857
+ // The CSA optimization wants to use a condition such that when it is
8858
+ // true, a new value is assigned. However, it is possible that a true lane
8859
+ // in WidenedCond corresponds to selection of the initial value instead.
8860
+ // In that case, we must use the negation of WidenedCond.
8861
+ // i.e. select cond new_val old_val versus select cond.not old_val new_val
8862
+ VPValue *CondToUse = WidenedCond;
8863
+ if (cast<SelectInst>(CSA.second .getAssignment ())->getTrueValue () ==
8864
+ CSA.first ) {
8865
+ auto *VPNotCond = new VPInstruction (VPInstruction::Not, WidenedCond, DL);
8866
+ VPNotCond->insertBefore (
8867
+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8868
+ CondToUse = VPNotCond;
8869
+ }
8870
+
8871
+ auto *VPAnyActive = new VPInstruction (
8872
+ VPInstruction::CSAAnyActive, {CondToUse}, DL, " csa.cond.anyactive" );
8873
+ VPAnyActive->insertBefore (
8874
+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8875
+
8876
+ auto *VPMaskSel = new VPInstruction (
8877
+ VPInstruction::CSAMaskSel,
8878
+ {CondToUse, CSAState->getVPMaskPhi (), VPAnyActive}, DL, " csa.mask.sel" );
8879
+ VPMaskSel->insertAfter (VPAnyActive);
8880
+ VPDataUpdate->setVPNewMaskAndVPAnyActive (VPMaskSel, VPAnyActive);
8881
+ VPCSAExtractScalarRecipe *ExtractScalarRecipe =
8882
+ new VPCSAExtractScalarRecipe ({VPInitScalar, VPMaskSel, VPDataUpdate});
8883
+
8884
+ MiddleVPBB->insert (ExtractScalarRecipe, MiddleVPBB->getFirstNonPhi ());
8885
+
8886
+ // Update CSAState with new recipes
8887
+ CSAState->setExtractScalarRecipe (ExtractScalarRecipe);
8888
+ CSAState->setVPAnyActive (VPAnyActive);
8889
+ }
8890
+ }
8891
+
8741
8892
void LoopVectorizationPlanner::buildVPlansWithVPRecipes (ElementCount MinVF,
8742
8893
ElementCount MaxVF) {
8743
8894
assert (OrigLoop->isInnermost () && " Inner loop expected." );
@@ -8830,7 +8981,8 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
8830
8981
// increments.
8831
8982
static SetVector<VPIRInstruction *> collectUsersInExitBlock (
8832
8983
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8833
- const MapVector<PHINode *, InductionDescriptor> &Inductions) {
8984
+ const MapVector<PHINode *, InductionDescriptor> &Inductions,
8985
+ const MapVector<PHINode *, CSADescriptor> &CSAs) {
8834
8986
auto *MiddleVPBB = Plan.getMiddleBlock ();
8835
8987
// No edge from the middle block to the unique exit block has been inserted
8836
8988
// and there is nothing to fix from vector loop; phis should have incoming
@@ -8862,6 +9014,17 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlock(
8862
9014
return P && Inductions.contains (P);
8863
9015
})))
8864
9016
continue ;
9017
+ // Exit values for CSAs are computed and updated outside of VPlan and
9018
+ // independent of induction recipes.
9019
+ // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
9020
+ // live-outs.
9021
+ if (isa<VPCSADataUpdateRecipe>(V) &&
9022
+ (isa<Instruction>(IncomingValue) &&
9023
+ any_of (IncomingValue->users (), [&CSAs](User *U) {
9024
+ auto *P = dyn_cast<PHINode>(U);
9025
+ return P && CSAs.contains (P);
9026
+ })))
9027
+ continue ;
8865
9028
ExitUsersToFix.insert (ExitIRI);
8866
9029
ExitIRI->addOperand (V);
8867
9030
}
@@ -9038,6 +9201,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9038
9201
bool HasNUW = Style == TailFoldingStyle::None;
9039
9202
addCanonicalIVRecipes (*Plan, Legal->getWidestInductionType (), HasNUW, DL);
9040
9203
9204
+ addCSAPreprocessRecipes (Legal->getCSAs (), OrigLoop, Plan->getPreheader (),
9205
+ Plan->getVectorLoopRegion ()->getEntryBasicBlock (), DL,
9206
+ Range, *Plan);
9207
+
9041
9208
VPRecipeBuilder RecipeBuilder (*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
9042
9209
9043
9210
// ---------------------------------------------------------------------------
@@ -9155,6 +9322,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9155
9322
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor ());
9156
9323
}
9157
9324
9325
+ VPBasicBlock *MiddleVPBB =
9326
+ cast<VPBasicBlock>(Plan->getVectorLoopRegion ()->getSingleSuccessor ());
9327
+ addCSAPostprocessRecipes (RecipeBuilder, Legal->getCSAs (), MiddleVPBB, DL,
9328
+ Range, *Plan);
9329
+
9158
9330
// After here, VPBB should not be used.
9159
9331
VPBB = nullptr ;
9160
9332
@@ -9165,8 +9337,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9165
9337
RecipeBuilder.fixHeaderPhis ();
9166
9338
9167
9339
addScalarResumePhis (RecipeBuilder, *Plan);
9168
- SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock (
9169
- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars ());
9340
+ SetVector<VPIRInstruction *> ExitUsersToFix =
9341
+ collectUsersInExitBlock (OrigLoop, RecipeBuilder, *Plan,
9342
+ Legal->getInductionVars (), Legal->getCSAs ());
9170
9343
addExitUsersForFirstOrderRecurrences (*Plan, ExitUsersToFix);
9171
9344
addUsersInExitBlock (*Plan, ExitUsersToFix);
9172
9345
// ---------------------------------------------------------------------------
@@ -10235,6 +10408,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10235
10408
auto ExpandedSCEVs = LVP.executePlan (EPI.MainLoopVF , EPI.MainLoopUF ,
10236
10409
*BestMainPlan, MainILV, DT, false );
10237
10410
++LoopsVectorized;
10411
+ CSAsVectorized += LVL.getCSAs ().size ();
10238
10412
10239
10413
// Second pass vectorizes the epilogue and adjusts the control flow
10240
10414
// edges from the first pass.
@@ -10330,6 +10504,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10330
10504
PSI, Checks, BestPlan);
10331
10505
LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
10332
10506
++LoopsVectorized;
10507
+ CSAsVectorized += LVL.getCSAs ().size ();
10333
10508
10334
10509
// Add metadata to disable runtime unrolling a scalar loop when there
10335
10510
// are no runtime checks about strides and memory. A scalar loop that is
0 commit comments