@@ -173,6 +173,8 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
173
173
STATISTIC (LoopsVectorized, " Number of loops vectorized" );
174
174
STATISTIC (LoopsAnalyzed, " Number of loops analyzed for vectorization" );
175
175
STATISTIC (LoopsEpilogueVectorized, " Number of epilogues vectorized" );
176
+ STATISTIC (CSAsVectorized,
177
+ " Number of conditional scalar assignments vectorized" );
176
178
177
179
static cl::opt<bool > EnableEpilogueVectorization (
178
180
" enable-epilogue-vectorization" , cl::init(true ), cl::Hidden,
@@ -497,6 +499,10 @@ class InnerLoopVectorizer {
497
499
// / Fix the vectorized code, taking care of header phi's, and more.
498
500
void fixVectorizedLoop (VPTransformState &State);
499
501
502
+ /// For all vectorized CSAs, replace uses of the live-out scalar from the original
503
+ /// loop with the scalar extracted from the vector loop.
504
+ void fixCSALiveOuts (VPTransformState &State, VPlan &Plan);
505
+
500
506
// Return true if any runtime check is added.
501
507
bool areSafetyChecksAdded () { return AddedSafetyChecks; }
502
508
@@ -2937,6 +2943,25 @@ LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
2937
2943
TargetTransformInfo::TCK_RecipThroughput);
2938
2944
}
2939
2945
2946
// Patches the exit-block phis for every CSA (conditional scalar assignment)
// state recorded in Plan. For each state, every PHINode in LoopExitBlock that
// uses the IR value underlying the CSA's data-update recipe receives one extra
// incoming value on the edge from LoopMiddleBlock: the scalar obtained from
// State for the CSA's extract-scalar recipe.
//   State - transform state queried for the generated scalar.
//   Plan  - plan whose CSA states are walked.
// NOTE(review): addCSAPostprocessRecipes returns early for a scalar VF and
// never sets the extract-scalar recipe in that case; getExtractScalarRecipe()
// is not null-checked below. Confirm this function is never reached with a
// scalar-VF plan.
+ void InnerLoopVectorizer::fixCSALiveOuts (VPTransformState &State, VPlan &Plan) {
2947
+ for (const auto &CSA : Plan.getCSAStates ()) {
2948
+ VPCSADataUpdateRecipe *VPDataUpdate = CSA.second ->getDataUpdate ();
2949
+ assert (VPDataUpdate &&
2950
+ " VPDataUpdate must have been introduced prior to fixing live outs" );
2951
// V is the original-loop IR value whose out-of-loop uses need patching.
+ Value *V = VPDataUpdate->getUnderlyingValue ();
2952
+ Value *ExtractedScalar = State.get (CSA.second ->getExtractScalarRecipe (), 0 ,
2953
+ /* NeedsScalar=*/ true );
2954
+ // Fix LCSSA phis: first gather the phis in LoopExitBlock that use V (the set ensures each phi is patched only once), then add the new incoming value.
2955
+ llvm::SmallPtrSet<PHINode *, 2 > ToFix;
2956
+ for (User *U : V->users ())
2957
+ if (auto *Phi = dyn_cast<PHINode>(U);
2958
+ Phi && Phi->getParent () == LoopExitBlock)
2959
+ ToFix.insert (Phi);
2960
+ for (PHINode *Phi : ToFix)
2961
+ Phi->addIncoming (ExtractedScalar, LoopMiddleBlock);
2962
+ }
2963
+ }
2964
+
2940
2965
void InnerLoopVectorizer::fixVectorizedLoop (VPTransformState &State) {
2941
2966
// Fix widened non-induction PHIs by setting up the PHI operands.
2942
2967
if (EnableVPlanNativePath)
@@ -2972,6 +2997,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
2972
2997
fixupIVUsers (Entry.first , Entry.second ,
2973
2998
getOrCreateVectorTripCount (nullptr ),
2974
2999
IVEndValues[Entry.first ], LoopMiddleBlock, State);
3000
+ fixCSALiveOuts (State, Plan);
2975
3001
}
2976
3002
2977
3003
for (Instruction *PI : PredicatedInstructions)
@@ -4497,6 +4523,9 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4497
4523
case VPDef::VPEVLBasedIVPHISC:
4498
4524
case VPDef::VPPredInstPHISC:
4499
4525
case VPDef::VPBranchOnMaskSC:
4526
+ case VPRecipeBase::VPCSADataUpdateSC:
4527
+ case VPRecipeBase::VPCSAExtractScalarSC:
4528
+ case VPRecipeBase::VPCSAHeaderPHISC:
4500
4529
continue ;
4501
4530
case VPDef::VPReductionSC:
4502
4531
case VPDef::VPActiveLaneMaskPHISC:
@@ -8680,9 +8709,6 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8680
8709
return Recipe;
8681
8710
8682
8711
VPHeaderPHIRecipe *PhiRecipe = nullptr ;
8683
- assert ((Legal->isReductionVariable (Phi) ||
8684
- Legal->isFixedOrderRecurrence (Phi)) &&
8685
- " can only widen reductions and fixed-order recurrences here" );
8686
8712
VPValue *StartV = Operands[0 ];
8687
8713
if (Legal->isReductionVariable (Phi)) {
8688
8714
const RecurrenceDescriptor &RdxDesc =
@@ -8692,12 +8718,23 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8692
8718
PhiRecipe = new VPReductionPHIRecipe (Phi, RdxDesc, *StartV,
8693
8719
CM.isInLoopReduction (Phi),
8694
8720
CM.useOrderedReductions (RdxDesc));
8695
- } else {
8721
+ } else if (Legal-> isFixedOrderRecurrence (Phi)) {
8696
8722
// TODO: Currently fixed-order recurrences are modeled as chains of
8697
8723
// first-order recurrences. If there are no users of the intermediate
8698
8724
// recurrences in the chain, the fixed order recurrence should be modeled
8699
8725
// directly, enabling more efficient codegen.
8700
8726
PhiRecipe = new VPFirstOrderRecurrencePHIRecipe (Phi, *StartV);
8727
+ } else if (Legal->isCSAPhi (Phi)) {
8728
+ VPCSAState *State = Plan.getCSAStates ().find (Phi)->second ;
8729
+ VPValue *InitData = State->getVPInitData ();
8730
+ // When the VF=getFixed(1), InitData is just InitScalar.
8731
+ if (!InitData)
8732
+ InitData = State->getVPInitScalar ();
8733
+ PhiRecipe = new VPCSAHeaderPHIRecipe (Phi, InitData);
8734
+ State->setPhiRecipe (cast<VPCSAHeaderPHIRecipe>(PhiRecipe));
8735
+ } else {
8736
+ llvm_unreachable (
8737
+ " can only widen reductions, fixed-order recurrences, and CSAs here" );
8701
8738
}
8702
8739
8703
8740
PhisToFix.push_back (PhiRecipe);
@@ -8731,6 +8768,19 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8731
8768
make_range (Operands.begin (), Operands.end ()));
8732
8769
8733
8770
if (auto *SI = dyn_cast<SelectInst>(Instr)) {
8771
+ auto *CSADescIt = find_if (Legal->getCSAs (), [&](auto CSA) {
8772
+ return CSADescriptor::isCSASelect (CSA.second , SI);
8773
+ });
8774
+ if (CSADescIt != Legal->getCSAs ().end ()) {
8775
+ PHINode *CSAPhi = CSADescIt->first ;
8776
+ VPCSAState *State = Plan.getCSAStates ().find (CSAPhi)->second ;
8777
+ VPValue *VPDataPhi = State->getPhiRecipe ();
8778
+ auto *R = new VPCSADataUpdateRecipe (
8779
+ SI, {VPDataPhi, Operands[0 ], Operands[1 ], Operands[2 ]});
8780
+ State->setDataUpdate (R);
8781
+ return R;
8782
+ }
8783
+
8734
8784
return new VPWidenSelectRecipe (
8735
8785
*SI, make_range (Operands.begin (), Operands.end ()));
8736
8786
}
@@ -8743,6 +8793,107 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
8743
8793
return tryToWiden (Instr, Operands, VPBB);
8744
8794
}
8745
8795
8796
+ /// Add the CSA recipes that can be created before each instruction in the input IR
8797
+ /// is processed and introduced into VPlan, and register one VPCSAState per CSA.
///
/// \param CSAs          CSA list; each entry's `first` is the CSA header phi.
/// \param OrigLoop      Original loop; its preheader supplies each CSA's
///                      initial scalar value.
/// \param PreheaderVPBB Block that receives the init-mask / init-data recipes.
/// \param HeaderVPBB    Block that receives the mask phi recipe.
/// \param DL            Debug location attached to the new VPInstructions.
/// \param Range         VF range; passed by reference to
///                      getDecisionAndClampRange, so it may be narrowed.
/// \param Plan          Plan that receives the live-in and the CSA states.
8798
+ static void
8799
+ addCSAPreprocessRecipes (const LoopVectorizationLegality::CSAList &CSAs,
8800
+ Loop *OrigLoop, VPBasicBlock *PreheaderVPBB,
8801
+ VPBasicBlock *HeaderVPBB, DebugLoc DL, VFRange &Range,
8802
+ VPlan &Plan) {
8803
+
8804
+ // Don't build the full CSA recipe set for VF=ElementCount::getFixed(1).
8805
+ bool IsScalarVF = LoopVectorizationPlanner::getDecisionAndClampRange (
8806
+ [&](ElementCount VF) { return VF.isScalar (); }, Range);
8807
+
8808
+ for (const auto &CSA : CSAs) {
8809
// The value flowing into the CSA phi from the preheader is the initial scalar.
+ VPValue *VPInitScalar = Plan.getOrAddLiveIn (
8810
+ CSA.first ->getIncomingValueForBlock (OrigLoop->getLoopPreheader ()));
8811
+
8812
+ // Scalar VF builds the scalar version of the loop. In that case,
8813
+ // no maintenance of a mask nor extraction in the middle block is needed,
// so the state records only the initial scalar.
8814
+ if (IsScalarVF) {
8815
+ VPCSAState *S = new VPCSAState (VPInitScalar);
8816
+ Plan.addCSAState (CSA.first , S);
8817
+ continue ;
8818
+ }
8819
+
8820
// Vector VF: create the initial mask and initial data in the preheader.
+ auto *VPInitMask =
8821
+ new VPInstruction (VPInstruction::CSAInitMask, {}, DL, " csa.init.mask" );
8822
+ auto *VPInitData = new VPInstruction (VPInstruction::CSAInitData,
8823
+ {VPInitScalar}, DL, " csa.init.data" );
8824
+ PreheaderVPBB->appendRecipe (VPInitMask);
8825
+ PreheaderVPBB->appendRecipe (VPInitData);
8826
+
8827
// The mask phi lives in the header block and starts from the initial mask.
+ auto *VPMaskPhi = new VPInstruction (VPInstruction::CSAMaskPhi, {VPInitMask},
8828
+ DL, " csa.mask.phi" );
8829
+ HeaderVPBB->appendRecipe (VPMaskPhi);
8830
+
8831
+ auto *S = new VPCSAState (VPInitScalar, VPInitData, VPMaskPhi);
8832
+ Plan.addCSAState (CSA.first , S);
8833
+ }
8834
+ }
8835
+
8836
+ /// Add the CSA recipes that must be created after each instruction in the input IR
8837
+ /// has been processed and introduced into VPlan.
///
/// For every CSA this builds an any-active recipe and a mask-select recipe
/// next to the CSA's assignment recipe, hands both to the data-update recipe,
/// and inserts an extract-scalar recipe into \p MiddleVPBB. Nothing is built
/// when the range's decision is a scalar VF.
///
/// \param RecipeBuilder Used to map IR instructions to their recipes.
/// \param CSAs          CSA list; `first` is the header phi, `second` the
///                      descriptor providing getCond() / getAssignment().
/// \param MiddleVPBB    Block that receives the extract-scalar recipe.
/// \param DL            Debug location attached to the new VPInstructions.
/// \param Range         VF range; passed by reference to
///                      getDecisionAndClampRange, so it may be narrowed.
/// \param Plan          Plan holding the CSA states; a state is looked up for
///                      every CSA without checking for a missing entry.
8838
+ static void
8839
+ addCSAPostprocessRecipes (VPRecipeBuilder &RecipeBuilder,
8840
+ const LoopVectorizationLegality::CSAList &CSAs,
8841
+ VPBasicBlock *MiddleVPBB, DebugLoc DL, VFRange &Range,
8842
+ VPlan &Plan) {
8843
+ // Don't build CSA recipes for VF=ElementCount::getFixed(1).
8844
+ if (LoopVectorizationPlanner::getDecisionAndClampRange (
8845
+ [&](ElementCount VF) { return VF.isScalar (); }, Range))
8846
+ return ;
8847
+
8848
+ for (const auto &CSA : CSAs) {
8849
+ VPCSAState *CSAState = Plan.getCSAStates ().find (CSA.first )->second ;
8850
+ VPCSADataUpdateRecipe *VPDataUpdate = CSAState->getDataUpdate ();
8851
+
8852
+ assert (VPDataUpdate &&
8853
+ " VPDataUpdate must have been introduced prior to postprocess" );
8854
+ assert (CSA.second .getCond () &&
8855
+ " CSADescriptor must know how to describe the condition" );
8856
// Maps an IR instruction to the single VPValue defined by its recipe.
+ auto GetVPValue = [&](Value *I) {
8857
+ return RecipeBuilder.getRecipe (cast<Instruction>(I))->getVPSingleValue ();
8858
+ };
8859
+ VPValue *WidenedCond = GetVPValue (CSA.second .getCond ());
8860
+ VPValue *VPInitScalar = CSAState->getVPInitScalar ();
8861
+
8862
+ // The CSA optimization wants to use a condition such that when it is
8863
+ // true, a new value is assigned. However, it is possible that a true lane
8864
+ // in WidenedCond corresponds to selection of the old value (the CSA phi)
8865
+ // instead. In that case, we must use the negation of WidenedCond:
8866
+ // `select cond, new_val, old_val` uses cond as-is, whereas `select cond, old_val, new_val` needs `not cond`.
8867
+ VPValue *CondToUse = WidenedCond;
8868
+ if (cast<SelectInst>(CSA.second .getAssignment ())->getTrueValue () ==
8869
+ CSA.first ) {
8870
+ auto *VPNotCond = new VPInstruction (VPInstruction::Not, WidenedCond, DL);
8871
+ VPNotCond->insertBefore (
8872
+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8873
+ CondToUse = VPNotCond;
8874
+ }
8875
+
8876
// Inserted before the assignment's recipe, i.e. after VPNotCond when present.
+ auto *VPAnyActive = new VPInstruction (
8877
+ VPInstruction::CSAAnyActive, {CondToUse}, DL, " csa.cond.anyactive" );
8878
+ VPAnyActive->insertBefore (
8879
+ GetVPValue (CSA.second .getAssignment ())->getDefiningRecipe ());
8880
+
8881
// The mask select sits directly after the any-active recipe.
+ auto *VPMaskSel = new VPInstruction (
8882
+ VPInstruction::CSAMaskSel,
8883
+ {CondToUse, CSAState->getVPMaskPhi (), VPAnyActive}, DL, " csa.mask.sel" );
8884
+ VPMaskSel->insertAfter (VPAnyActive);
8885
+ VPDataUpdate->setVPNewMaskAndVPAnyActive (VPMaskSel, VPAnyActive);
8886
+ VPCSAExtractScalarRecipe *ExtractScalarRecipe =
8887
+ new VPCSAExtractScalarRecipe ({VPInitScalar, VPMaskSel, VPDataUpdate});
8888
+
8889
// Place the extraction at the first non-phi position of the middle block.
+ MiddleVPBB->insert (ExtractScalarRecipe, MiddleVPBB->getFirstNonPhi ());
8890
+
8891
+ // Record the newly created recipes in the CSA state.
8892
+ CSAState->setExtractScalarRecipe (ExtractScalarRecipe);
8893
+ CSAState->setVPAnyActive (VPAnyActive);
8894
+ }
8895
+ }
8896
+
8746
8897
void LoopVectorizationPlanner::buildVPlansWithVPRecipes (ElementCount MinVF,
8747
8898
ElementCount MaxVF) {
8748
8899
assert (OrigLoop->isInnermost () && " Inner loop expected." );
@@ -8835,7 +8986,8 @@ static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan) {
8835
8986
// increments.
8836
8987
static SetVector<VPIRInstruction *> collectUsersInExitBlock (
8837
8988
Loop *OrigLoop, VPRecipeBuilder &Builder, VPlan &Plan,
8838
- const MapVector<PHINode *, InductionDescriptor> &Inductions) {
8989
+ const MapVector<PHINode *, InductionDescriptor> &Inductions,
8990
+ const MapVector<PHINode *, CSADescriptor> &CSAs) {
8839
8991
auto *MiddleVPBB = Plan.getMiddleBlock ();
8840
8992
// No edge from the middle block to the unique exit block has been inserted
8841
8993
// and there is nothing to fix from vector loop; phis should have incoming
@@ -8867,6 +9019,17 @@ static SetVector<VPIRInstruction *> collectUsersInExitBlock(
8867
9019
return P && Inductions.contains (P);
8868
9020
})))
8869
9021
continue ;
9022
+ // Exit values for CSAs are computed and updated outside of VPlan and
9023
+ // independent of induction recipes.
9024
+ // TODO: Compute induction exit values in VPlan, use VPLiveOuts to update
9025
+ // live-outs.
9026
+ if (isa<VPCSADataUpdateRecipe>(V) &&
9027
+ (isa<Instruction>(IncomingValue) &&
9028
+ any_of (IncomingValue->users (), [&CSAs](User *U) {
9029
+ auto *P = dyn_cast<PHINode>(U);
9030
+ return P && CSAs.contains (P);
9031
+ })))
9032
+ continue ;
8870
9033
ExitUsersToFix.insert (ExitIRI);
8871
9034
ExitIRI->addOperand (V);
8872
9035
}
@@ -9043,6 +9206,10 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9043
9206
bool HasNUW = Style == TailFoldingStyle::None;
9044
9207
addCanonicalIVRecipes (*Plan, Legal->getWidestInductionType (), HasNUW, DL);
9045
9208
9209
+ addCSAPreprocessRecipes (Legal->getCSAs (), OrigLoop, Plan->getPreheader (),
9210
+ Plan->getVectorLoopRegion ()->getEntryBasicBlock (), DL,
9211
+ Range, *Plan);
9212
+
9046
9213
VPRecipeBuilder RecipeBuilder (*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
9047
9214
9048
9215
// ---------------------------------------------------------------------------
@@ -9160,6 +9327,11 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9160
9327
VPBB = cast<VPBasicBlock>(VPBB->getSingleSuccessor ());
9161
9328
}
9162
9329
9330
+ VPBasicBlock *MiddleVPBB =
9331
+ cast<VPBasicBlock>(Plan->getVectorLoopRegion ()->getSingleSuccessor ());
9332
+ addCSAPostprocessRecipes (RecipeBuilder, Legal->getCSAs (), MiddleVPBB, DL,
9333
+ Range, *Plan);
9334
+
9163
9335
// After here, VPBB should not be used.
9164
9336
VPBB = nullptr ;
9165
9337
@@ -9170,8 +9342,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9170
9342
RecipeBuilder.fixHeaderPhis ();
9171
9343
9172
9344
addScalarResumePhis (RecipeBuilder, *Plan);
9173
- SetVector<VPIRInstruction *> ExitUsersToFix = collectUsersInExitBlock (
9174
- OrigLoop, RecipeBuilder, *Plan, Legal->getInductionVars ());
9345
+ SetVector<VPIRInstruction *> ExitUsersToFix =
9346
+ collectUsersInExitBlock (OrigLoop, RecipeBuilder, *Plan,
9347
+ Legal->getInductionVars (), Legal->getCSAs ());
9175
9348
addExitUsersForFirstOrderRecurrences (*Plan, ExitUsersToFix);
9176
9349
addUsersInExitBlock (*Plan, ExitUsersToFix);
9177
9350
// ---------------------------------------------------------------------------
@@ -10238,6 +10411,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10238
10411
auto ExpandedSCEVs = LVP.executePlan (EPI.MainLoopVF , EPI.MainLoopUF ,
10239
10412
*BestMainPlan, MainILV, DT, false );
10240
10413
++LoopsVectorized;
10414
+ CSAsVectorized += LVL.getCSAs ().size ();
10241
10415
10242
10416
// Second pass vectorizes the epilogue and adjusts the control flow
10243
10417
// edges from the first pass.
@@ -10333,6 +10507,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
10333
10507
PSI, Checks, BestPlan);
10334
10508
LVP.executePlan (VF.Width , IC, BestPlan, LB, DT, false );
10335
10509
++LoopsVectorized;
10510
+ CSAsVectorized += LVL.getCSAs ().size ();
10336
10511
10337
10512
// Add metadata to disable runtime unrolling a scalar loop when there
10338
10513
// are no runtime checks about strides and memory. A scalar loop that is
0 commit comments