@@ -1090,7 +1090,7 @@ class LoopVectorizationCostModel {
1090
1090
bool selectUserVectorizationFactor (ElementCount UserVF) {
1091
1091
collectUniformsAndScalars (UserVF);
1092
1092
collectInstsToScalarize (UserVF);
1093
- return expectedCost (UserVF).first . isValid ();
1093
+ return expectedCost (UserVF).isValid ();
1094
1094
}
1095
1095
1096
1096
// / \return The size (in bits) of the smallest and widest types in the code
@@ -1591,20 +1591,13 @@ class LoopVectorizationCostModel {
1591
1591
Scalars.clear ();
1592
1592
}
1593
1593
1594
- // / The vectorization cost is a combination of the cost itself and a boolean
1595
- // / indicating whether any of the contributing operations will actually
1596
- // / operate on vector values after type legalization in the backend. If this
1597
- // / latter value is false, then all operations will be scalarized (i.e. no
1598
- // / vectorization has actually taken place).
1599
- using VectorizationCostTy = std::pair<InstructionCost, bool >;
1600
-
1601
1594
// / Returns the expected execution cost. The unit of the cost does
1602
1595
// / not matter because we use the 'cost' units to compare different
1603
1596
// / vector widths. The cost that is returned is *not* normalized by
1604
1597
// / the factor width. If \p Invalid is not nullptr, this function
1605
1598
// / will add a pair(Instruction*, ElementCount) to \p Invalid for
1606
1599
// / each instruction that has an Invalid cost for the given VF.
1607
- VectorizationCostTy
1600
+ InstructionCost
1608
1601
expectedCost (ElementCount VF,
1609
1602
SmallVectorImpl<InstructionVFPair> *Invalid = nullptr );
1610
1603
@@ -1642,12 +1635,7 @@ class LoopVectorizationCostModel {
1642
1635
1643
1636
// / Returns the execution time cost of an instruction for a given vector
1644
1637
// / width. Vector width of one means scalar.
1645
- VectorizationCostTy getInstructionCost (Instruction *I, ElementCount VF);
1646
-
1647
- // / The cost-computation logic from getInstructionCost which provides
1648
- // / the vector type as an output parameter.
1649
- InstructionCost getInstructionCost (Instruction *I, ElementCount VF,
1650
- Type *&VectorTy);
1638
+ InstructionCost getInstructionCost (Instruction *I, ElementCount VF);
1651
1639
1652
1640
// / Return the cost of instructions in an inloop reduction pattern, if I is
1653
1641
// / part of that pattern.
@@ -4795,9 +4783,101 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
4795
4783
} while (!Tail.empty ());
4796
4784
}
4797
4785
4786
+ // / Check if any recipe of \p Plan will generate a vector value, which will be
4787
+ // / assigned a vector register.
4788
+ static bool willGenerateVectors (VPlan &Plan, ElementCount VF,
4789
+ const TargetTransformInfo &TTI) {
4790
+ assert (VF.isVector () && " Checking a scalar VF?" );
4791
+ VPTypeAnalysis TypeInfo (Plan.getCanonicalIV ()->getScalarType (),
4792
+ Plan.getCanonicalIV ()->getScalarType ()->getContext ());
4793
+ // Set of already visited types.
4794
+ DenseSet<Type *> Visited;
4795
+ for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4796
+ vp_depth_first_shallow (Plan.getVectorLoopRegion ()->getEntry ()))) {
4797
+ for (VPRecipeBase &R : *VPBB) {
4798
+ // Continue early if the recipe is considered to not produce a vector
4799
+ // result. Note that this includes VPInstruction where some opcodes may
4800
+ // produce a vector, to preserve existing behavior as VPInstructions model
4801
+ // aspects not directly mapped to existing IR instructions.
4802
+ switch (R.getVPDefID ()) {
4803
+ case VPDef::VPDerivedIVSC:
4804
+ case VPDef::VPScalarIVStepsSC:
4805
+ case VPDef::VPScalarCastSC:
4806
+ case VPDef::VPReplicateSC:
4807
+ case VPDef::VPInstructionSC:
4808
+ case VPDef::VPCanonicalIVPHISC:
4809
+ case VPDef::VPVectorPointerSC:
4810
+ case VPDef::VPExpandSCEVSC:
4811
+ case VPDef::VPEVLBasedIVPHISC:
4812
+ case VPDef::VPPredInstPHISC:
4813
+ case VPDef::VPBranchOnMaskSC:
4814
+ continue ;
4815
+ case VPDef::VPReductionSC:
4816
+ case VPDef::VPActiveLaneMaskPHISC:
4817
+ case VPDef::VPWidenCallSC:
4818
+ case VPDef::VPWidenCanonicalIVSC:
4819
+ case VPDef::VPWidenCastSC:
4820
+ case VPDef::VPWidenGEPSC:
4821
+ case VPDef::VPWidenSC:
4822
+ case VPDef::VPWidenSelectSC:
4823
+ case VPDef::VPBlendSC:
4824
+ case VPDef::VPFirstOrderRecurrencePHISC:
4825
+ case VPDef::VPWidenPHISC:
4826
+ case VPDef::VPWidenIntOrFpInductionSC:
4827
+ case VPDef::VPWidenPointerInductionSC:
4828
+ case VPDef::VPReductionPHISC:
4829
+ case VPDef::VPInterleaveSC:
4830
+ case VPDef::VPWidenLoadEVLSC:
4831
+ case VPDef::VPWidenLoadSC:
4832
+ case VPDef::VPWidenStoreEVLSC:
4833
+ case VPDef::VPWidenStoreSC:
4834
+ break ;
4835
+ default :
4836
+ llvm_unreachable (" unhandled recipe" );
4837
+ }
4838
+
4839
+ auto WillWiden = [&TTI, VF](Type *ScalarTy) {
4840
+ Type *VectorTy = ToVectorTy (ScalarTy, VF);
4841
+ unsigned NumLegalParts = TTI.getNumberOfParts (VectorTy);
4842
+ if (!NumLegalParts)
4843
+ return false ;
4844
+ if (VF.isScalable ()) {
4845
+ // <vscale x 1 x iN> is assumed to be profitable over iN because
4846
+ // scalable registers are a distinct register class from scalar
4847
+ // ones. If we ever find a target which wants to lower scalable
4848
+ // vectors back to scalars, we'll need to update this code to
4849
+ // explicitly ask TTI about the register class uses for each part.
4850
+ return NumLegalParts <= VF.getKnownMinValue ();
4851
+ }
4852
+ // Fewer legal parts than VF means two or more elements share a register, i.e. the type is vectorized.
4853
+ return NumLegalParts < VF.getKnownMinValue ();
4854
+ };
4855
+
4856
+ // If the recipe defines no value and is not a store or interleave recipe (e.g., a branch), continue - there is no value to check.
4857
+ if (R.getNumDefinedValues () == 0 &&
4858
+ !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4859
+ &R))
4860
+ continue ;
4861
+ // For multi-def recipes (currently only interleaved loads), it suffices to
4862
+ // check the first def only.
4863
+ // For stores, check the stored value; for interleaved stores, it suffices
4864
+ // to check the first stored value only. In both store cases the value to
4865
+ // check is the second operand.
4866
+ VPValue *ToCheck =
4867
+ R.getNumDefinedValues () >= 1 ? R.getVPValue (0 ) : R.getOperand (1 );
4868
+ Type *ScalarTy = TypeInfo.inferScalarType (ToCheck);
4869
+ if (!Visited.insert ({ScalarTy}).second )
4870
+ continue ;
4871
+ if (WillWiden (ScalarTy))
4872
+ return true ;
4873
+ }
4874
+ }
4875
+
4876
+ return false ;
4877
+ }
4878
+
4798
4879
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor () {
4799
- InstructionCost ExpectedCost =
4800
- CM.expectedCost (ElementCount::getFixed (1 )).first ;
4880
+ InstructionCost ExpectedCost = CM.expectedCost (ElementCount::getFixed (1 ));
4801
4881
LLVM_DEBUG (dbgs () << " LV: Scalar loop costs: " << ExpectedCost << " .\n " );
4802
4882
assert (ExpectedCost.isValid () && " Unexpected invalid cost for scalar loop" );
4803
4883
assert (any_of (VPlans,
@@ -4826,9 +4906,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4826
4906
if (VF.isScalar ())
4827
4907
continue ;
4828
4908
4829
- LoopVectorizationCostModel::VectorizationCostTy C =
4830
- CM.expectedCost (VF, &InvalidCosts);
4831
- VectorizationFactor Candidate (VF, C.first , ScalarCost.ScalarCost );
4909
+ InstructionCost C = CM.expectedCost (VF, &InvalidCosts);
4910
+ VectorizationFactor Candidate (VF, C, ScalarCost.ScalarCost );
4832
4911
4833
4912
#ifndef NDEBUG
4834
4913
unsigned AssumedMinimumVscale =
@@ -4845,7 +4924,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4845
4924
LLVM_DEBUG (dbgs () << " .\n " );
4846
4925
#endif
4847
4926
4848
- if (!C. second && !ForceVectorization ) {
4927
+ if (!ForceVectorization && !willGenerateVectors (*P, VF, TTI) ) {
4849
4928
LLVM_DEBUG (
4850
4929
dbgs ()
4851
4930
<< " LV: Not considering vector loop of width " << VF
@@ -5146,7 +5225,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5146
5225
// If we did not calculate the cost for VF (because the user selected the VF)
5147
5226
// then we calculate the cost of VF here.
5148
5227
if (LoopCost == 0 ) {
5149
- LoopCost = expectedCost (VF). first ;
5228
+ LoopCost = expectedCost (VF);
5150
5229
assert (LoopCost.isValid () && " Expected to have chosen a VF with valid cost" );
5151
5230
5152
5231
// Loop body is free and there is no need for interleaving.
@@ -5717,15 +5796,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5717
5796
5718
5797
// Compute the cost of the vector instruction. Note that this cost already
5719
5798
// includes the scalarization overhead of the predicated instruction.
5720
- InstructionCost VectorCost = getInstructionCost (I, VF). first ;
5799
+ InstructionCost VectorCost = getInstructionCost (I, VF);
5721
5800
5722
5801
// Compute the cost of the scalarized instruction. This cost is the cost of
5723
5802
// the instruction as if it wasn't if-converted and instead remained in the
5724
5803
// predicated block. We will scale this cost by block probability after
5725
5804
// computing the scalarization overhead.
5726
5805
InstructionCost ScalarCost =
5727
- VF.getFixedValue () *
5728
- getInstructionCost (I, ElementCount::getFixed (1 )).first ;
5806
+ VF.getFixedValue () * getInstructionCost (I, ElementCount::getFixed (1 ));
5729
5807
5730
5808
// Compute the scalarization overhead of needed insertelement instructions
5731
5809
// and phi nodes.
@@ -5769,14 +5847,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
5769
5847
return Discount;
5770
5848
}
5771
5849
5772
- LoopVectorizationCostModel::VectorizationCostTy
5773
- LoopVectorizationCostModel::expectedCost (
5850
+ InstructionCost LoopVectorizationCostModel::expectedCost (
5774
5851
ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
5775
- VectorizationCostTy Cost;
5852
+ InstructionCost Cost;
5776
5853
5777
5854
// For each block.
5778
5855
for (BasicBlock *BB : TheLoop->blocks ()) {
5779
- VectorizationCostTy BlockCost;
5856
+ InstructionCost BlockCost;
5780
5857
5781
5858
// For each instruction in the old loop.
5782
5859
for (Instruction &I : BB->instructionsWithoutDebug ()) {
@@ -5785,22 +5862,19 @@ LoopVectorizationCostModel::expectedCost(
5785
5862
(VF.isVector () && VecValuesToIgnore.count (&I)))
5786
5863
continue ;
5787
5864
5788
- VectorizationCostTy C = getInstructionCost (&I, VF);
5865
+ InstructionCost C = getInstructionCost (&I, VF);
5789
5866
5790
5867
// Check if we should override the cost.
5791
- if (C.first .isValid () &&
5792
- ForceTargetInstructionCost.getNumOccurrences () > 0 )
5793
- C.first = InstructionCost (ForceTargetInstructionCost);
5868
+ if (C.isValid () && ForceTargetInstructionCost.getNumOccurrences () > 0 )
5869
+ C = InstructionCost (ForceTargetInstructionCost);
5794
5870
5795
5871
// Keep a list of instructions with invalid costs.
5796
- if (Invalid && !C.first . isValid ())
5872
+ if (Invalid && !C.isValid ())
5797
5873
Invalid->emplace_back (&I, VF);
5798
5874
5799
- BlockCost.first += C.first ;
5800
- BlockCost.second |= C.second ;
5801
- LLVM_DEBUG (dbgs () << " LV: Found an estimated cost of " << C.first
5802
- << " for VF " << VF << " For instruction: " << I
5803
- << ' \n ' );
5875
+ BlockCost += C;
5876
+ LLVM_DEBUG (dbgs () << " LV: Found an estimated cost of " << C << " for VF "
5877
+ << VF << " For instruction: " << I << ' \n ' );
5804
5878
}
5805
5879
5806
5880
// If we are vectorizing a predicated block, it will have been
@@ -5811,10 +5885,9 @@ LoopVectorizationCostModel::expectedCost(
5811
5885
// cost by the probability of executing it. blockNeedsPredication from
5812
5886
// Legal is used so as to not include all blocks in tail folded loops.
5813
5887
if (VF.isScalar () && Legal->blockNeedsPredication (BB))
5814
- BlockCost. first /= getReciprocalPredBlockProb ();
5888
+ BlockCost /= getReciprocalPredBlockProb ();
5815
5889
5816
- Cost.first += BlockCost.first ;
5817
- Cost.second |= BlockCost.second ;
5890
+ Cost += BlockCost;
5818
5891
}
5819
5892
5820
5893
return Cost;
@@ -6213,49 +6286,6 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
6213
6286
return getWideningCost (I, VF);
6214
6287
}
6215
6288
6216
- LoopVectorizationCostModel::VectorizationCostTy
6217
- LoopVectorizationCostModel::getInstructionCost (Instruction *I,
6218
- ElementCount VF) {
6219
- // If we know that this instruction will remain uniform, check the cost of
6220
- // the scalar version.
6221
- if (isUniformAfterVectorization (I, VF))
6222
- VF = ElementCount::getFixed (1 );
6223
-
6224
- if (VF.isVector () && isProfitableToScalarize (I, VF))
6225
- return VectorizationCostTy (InstsToScalarize[VF][I], false );
6226
-
6227
- // Forced scalars do not have any scalarization overhead.
6228
- auto ForcedScalar = ForcedScalars.find (VF);
6229
- if (VF.isVector () && ForcedScalar != ForcedScalars.end ()) {
6230
- auto InstSet = ForcedScalar->second ;
6231
- if (InstSet.count (I))
6232
- return VectorizationCostTy (
6233
- (getInstructionCost (I, ElementCount::getFixed (1 )).first *
6234
- VF.getKnownMinValue ()),
6235
- false );
6236
- }
6237
-
6238
- Type *VectorTy;
6239
- InstructionCost C = getInstructionCost (I, VF, VectorTy);
6240
-
6241
- bool TypeNotScalarized = false ;
6242
- if (VF.isVector () && VectorTy->isVectorTy ()) {
6243
- if (unsigned NumParts = TTI.getNumberOfParts (VectorTy)) {
6244
- if (VF.isScalable ())
6245
- // <vscale x 1 x iN> is assumed to be profitable over iN because
6246
- // scalable registers are a distinct register class from scalar ones.
6247
- // If we ever find a target which wants to lower scalable vectors
6248
- // back to scalars, we'll need to update this code to explicitly
6249
- // ask TTI about the register class uses for each part.
6250
- TypeNotScalarized = NumParts <= VF.getKnownMinValue ();
6251
- else
6252
- TypeNotScalarized = NumParts < VF.getKnownMinValue ();
6253
- } else
6254
- C = InstructionCost::getInvalid ();
6255
- }
6256
- return VectorizationCostTy (C, TypeNotScalarized);
6257
- }
6258
-
6259
6289
InstructionCost LoopVectorizationCostModel::getScalarizationOverhead (
6260
6290
Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
6261
6291
@@ -6646,8 +6676,25 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
6646
6676
}
6647
6677
6648
6678
InstructionCost
6649
- LoopVectorizationCostModel::getInstructionCost (Instruction *I, ElementCount VF,
6650
- Type *&VectorTy) {
6679
+ LoopVectorizationCostModel::getInstructionCost (Instruction *I,
6680
+ ElementCount VF) {
6681
+ // If we know that this instruction will remain uniform, check the cost of
6682
+ // the scalar version.
6683
+ if (isUniformAfterVectorization (I, VF))
6684
+ VF = ElementCount::getFixed (1 );
6685
+
6686
+ if (VF.isVector () && isProfitableToScalarize (I, VF))
6687
+ return InstsToScalarize[VF][I];
6688
+
6689
+ // Forced scalars do not have any scalarization overhead.
6690
+ auto ForcedScalar = ForcedScalars.find (VF);
6691
+ if (VF.isVector () && ForcedScalar != ForcedScalars.end ()) {
6692
+ auto InstSet = ForcedScalar->second ;
6693
+ if (InstSet.count (I))
6694
+ return getInstructionCost (I, ElementCount::getFixed (1 )) *
6695
+ VF.getKnownMinValue ();
6696
+ }
6697
+
6651
6698
Type *RetTy = I->getType ();
6652
6699
if (canTruncateToMinimalBitwidth (I, VF))
6653
6700
RetTy = IntegerType::get (RetTy->getContext (), MinBWs[I]);
@@ -6670,6 +6717,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6670
6717
};
6671
6718
(void ) hasSingleCopyAfterVectorization;
6672
6719
6720
+ Type *VectorTy;
6673
6721
if (isScalarAfterVectorization (I, VF)) {
6674
6722
// With the exception of GEPs and PHIs, after scalarization there should
6675
6723
// only be one copy of the instruction generated in the loop. This is
@@ -6685,6 +6733,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
6685
6733
} else
6686
6734
VectorTy = ToVectorTy (RetTy, VF);
6687
6735
6736
+ if (VF.isVector () && VectorTy->isVectorTy () &&
6737
+ !TTI.getNumberOfParts (VectorTy))
6738
+ return InstructionCost::getInvalid ();
6739
+
6688
6740
// TODO: We need to estimate the cost of intrinsic calls.
6689
6741
switch (I->getOpcode ()) {
6690
6742
case Instruction::GetElementPtr:
0 commit comments