@@ -1648,12 +1648,7 @@ class LoopVectorizationCostModel {
 
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
-  /// The cost-computation logic from getInstructionCost which provides
-  /// the vector type as an output parameter.
-  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
-                                     Type *&VectorTy);
+  InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
 
   /// Return the cost of instructions in an inloop reduction pattern, if I is
   /// part of that pattern.
@@ -4879,6 +4874,52 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
   } while (!Tail.empty());
 }
 
+static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
+                                           const TargetTransformInfo &TTI) {
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
+                          Plan.getCanonicalIV()->getScalarType()->getContext());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+    for (VPRecipeBase &R : *VPBB) {
+      if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe, VPScalarCastRecipe,
+              VPReplicateRecipe, VPInstruction, VPActiveLaneMaskPHIRecipe,
+              VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
+              VPVectorPointerRecipe>(&R))
+        continue;
+
+      auto WillWiden = [&TypeInfo, &TTI, VF](VPValue *VPV) {
+        Type *ScalarTy = TypeInfo.inferScalarType(VPV);
+        Type *VectorTy = ToVectorTy(ScalarTy, VF);
+        unsigned NumParts = TTI.getNumberOfParts(VectorTy);
+        if (!NumParts)
+          return false;
+        if (VF.isScalable())
+          // <vscale x 1 x iN> is assumed to be profitable over iN because
+          // scalable registers are a distinct register class from scalar ones.
+          // If we ever find a target which wants to lower scalable vectors
+          // back to scalars, we'll need to update this code to explicitly
+          // ask TTI about the register class uses for each part.
+          return NumParts <= VF.getKnownMinValue();
+        else
+          return NumParts < VF.getKnownMinValue();
+      };
+      SmallVector<VPValue *> VPValuesToCheck;
+      if (auto *WidenStore = dyn_cast<VPWidenStoreRecipe>(&R)) {
+        VPValuesToCheck.push_back(WidenStore->getOperand(1));
+      } else if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
+        append_range(VPValuesToCheck, IG->getStoredValues());
+      } else {
+        append_range(VPValuesToCheck, R.definedValues());
+      }
+      if (any_of(VPValuesToCheck,
+                 [&WillWiden](VPValue *VPV) { return WillWiden(VPV); }))
+        return true;
+    }
+  }
+
+  return false;
+}
+
 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
   InstructionCost ExpectedCost =
       CM.expectedCost(ElementCount::getFixed(1)).first;
@@ -4929,7 +4970,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
       LLVM_DEBUG(dbgs() << ".\n");
 #endif
 
-      if (!C.second && !ForceVectorization) {
+      if (!willGenerateVectorInstructions(*P, VF, TTI) && !ForceVectorization) {
         LLVM_DEBUG(
             dbgs()
             << "LV: Not considering vector loop of width " << VF
@@ -5801,15 +5842,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
 
     // Compute the cost of the vector instruction. Note that this cost already
     // includes the scalarization overhead of the predicated instruction.
-    InstructionCost VectorCost = getInstructionCost(I, VF).first;
+    InstructionCost VectorCost = getInstructionCost(I, VF);
 
     // Compute the cost of the scalarized instruction. This cost is the cost of
     // the instruction as if it wasn't if-converted and instead remained in the
     // predicated block. We will scale this cost by block probability after
     // computing the scalarization overhead.
     InstructionCost ScalarCost =
-        VF.getFixedValue() *
-        getInstructionCost(I, ElementCount::getFixed(1)).first;
+        VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
 
     // Compute the scalarization overhead of needed insertelement instructions
     // and phi nodes.
@@ -5869,22 +5909,19 @@ LoopVectorizationCostModel::expectedCost(
           (VF.isVector() && VecValuesToIgnore.count(&I)))
         continue;
 
-      VectorizationCostTy C = getInstructionCost(&I, VF);
+      InstructionCost C = getInstructionCost(&I, VF);
 
       // Check if we should override the cost.
-      if (C.first.isValid() &&
-          ForceTargetInstructionCost.getNumOccurrences() > 0)
-        C.first = InstructionCost(ForceTargetInstructionCost);
+      if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
+        C = InstructionCost(ForceTargetInstructionCost);
 
       // Keep a list of instructions with invalid costs.
-      if (Invalid && !C.first.isValid())
+      if (Invalid && !C.isValid())
         Invalid->emplace_back(&I, VF);
 
-      BlockCost.first += C.first;
-      BlockCost.second |= C.second;
-      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
-                        << " for VF " << VF << " For instruction: " << I
-                        << '\n');
+      BlockCost.first += C;
+      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
+                        << VF << " For instruction: " << I << '\n');
     }
 
     // If we are vectorizing a predicated block, it will have been
@@ -6297,49 +6334,6 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
   return getWideningCost(I, VF);
 }
 
-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::getInstructionCost(Instruction *I,
-                                               ElementCount VF) {
-  // If we know that this instruction will remain uniform, check the cost of
-  // the scalar version.
-  if (isUniformAfterVectorization(I, VF))
-    VF = ElementCount::getFixed(1);
-
-  if (VF.isVector() && isProfitableToScalarize(I, VF))
-    return VectorizationCostTy(InstsToScalarize[VF][I], false);
-
-  // Forced scalars do not have any scalarization overhead.
-  auto ForcedScalar = ForcedScalars.find(VF);
-  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
-    auto InstSet = ForcedScalar->second;
-    if (InstSet.count(I))
-      return VectorizationCostTy(
-          (getInstructionCost(I, ElementCount::getFixed(1)).first *
-           VF.getKnownMinValue()),
-          false);
-  }
-
-  Type *VectorTy;
-  InstructionCost C = getInstructionCost(I, VF, VectorTy);
-
-  bool TypeNotScalarized = false;
-  if (VF.isVector() && VectorTy->isVectorTy()) {
-    if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
-      if (VF.isScalable())
-        // <vscale x 1 x iN> is assumed to be profitable over iN because
-        // scalable registers are a distinct register class from scalar ones.
-        // If we ever find a target which wants to lower scalable vectors
-        // back to scalars, we'll need to update this code to explicitly
-        // ask TTI about the register class uses for each part.
-        TypeNotScalarized = NumParts <= VF.getKnownMinValue();
-      else
-        TypeNotScalarized = NumParts < VF.getKnownMinValue();
-    } else
-      C = InstructionCost::getInvalid();
-  }
-  return VectorizationCostTy(C, TypeNotScalarized);
-}
-
 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
 
@@ -6730,8 +6724,25 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
 }
 
 InstructionCost
-LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
-                                               Type *&VectorTy) {
+LoopVectorizationCostModel::getInstructionCost(Instruction *I,
+                                               ElementCount VF) {
+  // If we know that this instruction will remain uniform, check the cost of
+  // the scalar version.
+  if (isUniformAfterVectorization(I, VF))
+    VF = ElementCount::getFixed(1);
+
+  if (VF.isVector() && isProfitableToScalarize(I, VF))
+    return InstsToScalarize[VF][I];
+
+  // Forced scalars do not have any scalarization overhead.
+  auto ForcedScalar = ForcedScalars.find(VF);
+  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
+    auto InstSet = ForcedScalar->second;
+    if (InstSet.count(I))
+      return getInstructionCost(I, ElementCount::getFixed(1)) *
+             VF.getKnownMinValue();
+  }
+
   Type *RetTy = I->getType();
   if (canTruncateToMinimalBitwidth(I, VF))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
@@ -6754,6 +6765,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
   };
   (void)hasSingleCopyAfterVectorization;
 
+  Type *VectorTy;
   if (isScalarAfterVectorization(I, VF)) {
     // With the exception of GEPs and PHIs, after scalarization there should
     // only be one copy of the instruction generated in the loop. This is
@@ -6769,6 +6781,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
   } else
     VectorTy = ToVectorTy(RetTy, VF);
 
+  if (VF.isVector() && VectorTy->isVectorTy() &&
+      !TTI.getNumberOfParts(VectorTy))
+    return InstructionCost::getInvalid();
+
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr:
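
The core of the new willGenerateVectorInstructions helper is the per-value widening test: a value only counts as widened if the target legalizes its vector type into fewer parts than there are lanes (or, for scalable types, into no more parts than the known minimum lane count; zero parts means the type is not legal at all). The following standalone sketch, which is not part of the patch, illustrates just that decision; ElementCountSketch and the NumParts parameter are hypothetical stand-ins for LLVM's ElementCount and TTI.getNumberOfParts().

// Standalone sketch of the widening test, with hypothetical stand-ins for
// LLVM's ElementCount and TargetTransformInfo::getNumberOfParts().
#include <cstdio>

struct ElementCountSketch {
  unsigned KnownMinValue; // minimum number of lanes
  bool Scalable;          // true for <vscale x N x T>
};

// Returns true if a value widened to VF lanes would stay a vector after
// legalization. An unknown part count (0) means the type cannot be costed,
// and a fixed-width vector that legalizes into one part per lane is
// effectively scalarized.
static bool willWiden(ElementCountSketch VF, unsigned NumParts) {
  if (NumParts == 0)
    return false; // the target could not legalize the type
  if (VF.Scalable)
    // Even <vscale x 1 x iN> is kept as a vector: scalable registers are a
    // separate register class from scalars.
    return NumParts <= VF.KnownMinValue;
  return NumParts < VF.KnownMinValue;
}

int main() {
  // A fixed VF of 4 that legalizes into 4 parts is really scalar code.
  std::printf("fixed VF=4, 4 parts   -> %d\n", willWiden({4, false}, 4)); // 0
  std::printf("fixed VF=4, 2 parts   -> %d\n", willWiden({4, false}, 2)); // 1
  std::printf("scalable VF=1, 1 part -> %d\n", willWiden({1, true}, 1));  // 1
  return 0;
}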