@@ -1642,12 +1642,7 @@ class LoopVectorizationCostModel {

   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
-  /// The cost-computation logic from getInstructionCost which provides
-  /// the vector type as an output parameter.
-  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
-                                     Type *&VectorTy);
+  InstructionCost getInstructionCost(Instruction *I, ElementCount VF);

   /// Return the cost of instructions in an inloop reduction pattern, if I is
   /// part of that pattern.
@@ -4873,6 +4868,52 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
   } while (!Tail.empty());
 }

+static bool willGenerateVectorInstructions(VPlan &Plan, ElementCount VF,
+                                           const TargetTransformInfo &TTI) {
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
+                          Plan.getCanonicalIV()->getScalarType()->getContext());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+    for (VPRecipeBase &R : *VPBB) {
+      if (isa<VPDerivedIVRecipe, VPScalarIVStepsRecipe, VPScalarCastRecipe,
+              VPReplicateRecipe, VPInstruction, VPActiveLaneMaskPHIRecipe,
+              VPCanonicalIVPHIRecipe, VPEVLBasedIVPHIRecipe,
+              VPVectorPointerRecipe>(&R))
+        continue;
+
+      auto WillWiden = [&TypeInfo, &TTI, VF](VPValue *VPV) {
+        Type *ScalarTy = TypeInfo.inferScalarType(VPV);
+        Type *VectorTy = ToVectorTy(ScalarTy, VF);
+        unsigned NumParts = TTI.getNumberOfParts(VectorTy);
+        if (!NumParts)
+          return false;
+        if (VF.isScalable())
+          // <vscale x 1 x iN> is assumed to be profitable over iN because
+          // scalable registers are a distinct register class from scalar ones.
+          // If we ever find a target which wants to lower scalable vectors
+          // back to scalars, we'll need to update this code to explicitly
+          // ask TTI about the register class uses for each part.
+          return NumParts <= VF.getKnownMinValue();
+        else
+          return NumParts < VF.getKnownMinValue();
+      };
+      SmallVector<VPValue *> VPValuesToCheck;
+      if (auto *WidenStore = dyn_cast<VPWidenStoreRecipe>(&R)) {
+        VPValuesToCheck.push_back(WidenStore->getOperand(1));
+      } else if (auto *IG = dyn_cast<VPInterleaveRecipe>(&R)) {
+        append_range(VPValuesToCheck, IG->getStoredValues());
+      } else {
+        append_range(VPValuesToCheck, R.definedValues());
+      }
+      if (any_of(VPValuesToCheck,
+                 [&WillWiden](VPValue *VPV) { return WillWiden(VPV); }))
+        return true;
+    }
+  }
+
+  return false;
+}
+
 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
   InstructionCost ExpectedCost =
       CM.expectedCost(ElementCount::getFixed(1)).first;
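Aside, not part of the patch: the WillWiden lambda above carries the whole profitability rule — a value only counts as vectorized if legalization needs fewer registers (parts) than the type has lanes. A minimal standalone sketch of that rule, with made-up names (VFModel, willWiden) standing in for LLVM's ElementCount and TTI.getNumberOfParts():

// Standalone sketch of the widening rule; illustrative names, not LLVM's.
#include <cassert>

struct VFModel {
  unsigned KnownMinLanes; // minimum lane count of the VF
  bool Scalable;          // true for <vscale x N x Ty>
};

// Fixed-width vectors must need strictly fewer registers than lanes,
// otherwise each lane landed in its own scalar-sized register and nothing
// was really vectorized. Scalable types get <= instead of < because
// scalable registers are a distinct register class from scalars even at
// one part per minimum lane.
static bool willWiden(unsigned NumParts, VFModel VF) {
  if (NumParts == 0) // the type could not be legalized at all
    return false;
  return VF.Scalable ? NumParts <= VF.KnownMinLanes
                     : NumParts < VF.KnownMinLanes;
}

int main() {
  assert(willWiden(1, {4, false}));  // v4i32 in one 128-bit register
  assert(!willWiden(4, {4, false})); // v4i64 split into four scalar parts
  assert(willWiden(1, {1, true}));   // <vscale x 1 x i64> stays a vector
  return 0;
}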
@@ -4923,7 +4964,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
     LLVM_DEBUG(dbgs() << ".\n");
 #endif

-    if (!C.second && !ForceVectorization) {
+    if (!willGenerateVectorInstructions(*P, VF, TTI) && !ForceVectorization) {
       LLVM_DEBUG(
           dbgs()
           << "LV: Not considering vector loop of width " << VF
@@ -5795,15 +5836,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(

     // Compute the cost of the vector instruction. Note that this cost already
     // includes the scalarization overhead of the predicated instruction.
-    InstructionCost VectorCost = getInstructionCost(I, VF).first;
+    InstructionCost VectorCost = getInstructionCost(I, VF);

     // Compute the cost of the scalarized instruction. This cost is the cost of
     // the instruction as if it wasn't if-converted and instead remained in the
     // predicated block. We will scale this cost by block probability after
     // computing the scalarization overhead.
     InstructionCost ScalarCost =
-        VF.getFixedValue() *
-        getInstructionCost(I, ElementCount::getFixed(1)).first;
+        VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));

     // Compute the scalarization overhead of needed insertelement instructions
     // and phi nodes.
@@ -5863,22 +5903,19 @@ LoopVectorizationCostModel::expectedCost(
           (VF.isVector() && VecValuesToIgnore.count(&I)))
         continue;

-      VectorizationCostTy C = getInstructionCost(&I, VF);
+      InstructionCost C = getInstructionCost(&I, VF);

       // Check if we should override the cost.
-      if (C.first.isValid() &&
-          ForceTargetInstructionCost.getNumOccurrences() > 0)
-        C.first = InstructionCost(ForceTargetInstructionCost);
+      if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
+        C = InstructionCost(ForceTargetInstructionCost);

       // Keep a list of instructions with invalid costs.
-      if (Invalid && !C.first.isValid())
+      if (Invalid && !C.isValid())
         Invalid->emplace_back(&I, VF);

-      BlockCost.first += C.first;
-      BlockCost.second |= C.second;
-      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
-                        << " for VF " << VF << " For instruction: " << I
-                        << '\n');
+      BlockCost.first += C;
+      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
+                        << VF << " For instruction: " << I << '\n');
     }

     // If we are vectorizing a predicated block, it will have been
@@ -6291,49 +6328,6 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
   return getWideningCost(I, VF);
 }

-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::getInstructionCost(Instruction *I,
-                                               ElementCount VF) {
-  // If we know that this instruction will remain uniform, check the cost of
-  // the scalar version.
-  if (isUniformAfterVectorization(I, VF))
-    VF = ElementCount::getFixed(1);
-
-  if (VF.isVector() && isProfitableToScalarize(I, VF))
-    return VectorizationCostTy(InstsToScalarize[VF][I], false);
-
-  // Forced scalars do not have any scalarization overhead.
-  auto ForcedScalar = ForcedScalars.find(VF);
-  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
-    auto InstSet = ForcedScalar->second;
-    if (InstSet.count(I))
-      return VectorizationCostTy(
-          (getInstructionCost(I, ElementCount::getFixed(1)).first *
-           VF.getKnownMinValue()),
-          false);
-  }
-
-  Type *VectorTy;
-  InstructionCost C = getInstructionCost(I, VF, VectorTy);
-
-  bool TypeNotScalarized = false;
-  if (VF.isVector() && VectorTy->isVectorTy()) {
-    if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
-      if (VF.isScalable())
-        // <vscale x 1 x iN> is assumed to be profitable over iN because
-        // scalable registers are a distinct register class from scalar ones.
-        // If we ever find a target which wants to lower scalable vectors
-        // back to scalars, we'll need to update this code to explicitly
-        // ask TTI about the register class uses for each part.
-        TypeNotScalarized = NumParts <= VF.getKnownMinValue();
-      else
-        TypeNotScalarized = NumParts < VF.getKnownMinValue();
-    } else
-      C = InstructionCost::getInvalid();
-  }
-  return VectorizationCostTy(C, TypeNotScalarized);
-}
-
 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
@@ -6724,8 +6718,25 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
 }

 InstructionCost
-LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
-                                               Type *&VectorTy) {
+LoopVectorizationCostModel::getInstructionCost(Instruction *I,
+                                               ElementCount VF) {
+  // If we know that this instruction will remain uniform, check the cost of
+  // the scalar version.
+  if (isUniformAfterVectorization(I, VF))
+    VF = ElementCount::getFixed(1);
+
+  if (VF.isVector() && isProfitableToScalarize(I, VF))
+    return InstsToScalarize[VF][I];
+
+  // Forced scalars do not have any scalarization overhead.
+  auto ForcedScalar = ForcedScalars.find(VF);
+  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
+    auto InstSet = ForcedScalar->second;
+    if (InstSet.count(I))
+      return getInstructionCost(I, ElementCount::getFixed(1)) *
+             VF.getKnownMinValue();
+  }
+
   Type *RetTy = I->getType();
   if (canTruncateToMinimalBitwidth(I, VF))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
@@ -6748,6 +6759,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
   };
   (void)hasSingleCopyAfterVectorization;

+  Type *VectorTy;
   if (isScalarAfterVectorization(I, VF)) {
     // With the exception of GEPs and PHIs, after scalarization there should
     // only be one copy of the instruction generated in the loop. This is
@@ -6763,6 +6775,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
   } else
     VectorTy = ToVectorTy(RetTy, VF);

+  if (VF.isVector() && VectorTy->isVectorTy() &&
+      !TTI.getNumberOfParts(VectorTy))
+    return InstructionCost::getInvalid();
+
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr:
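Aside, not part of the patch: the net effect is that VectorizationCostTy's (cost, widened?) pair disappears — an unlegalizable type now flows through InstructionCost::getInvalid() inside getInstructionCost, and the "did anything actually widen?" question is answered once per plan by willGenerateVectorInstructions instead of being piggybacked on every per-instruction query. A hypothetical miniature of that post-patch shape; none of these types are LLVM's:

// Hypothetical miniature of the interface change; illustrative types only.
#include <cassert>
#include <optional>

using CostTy = std::optional<unsigned>; // nullopt ~ InstructionCost::getInvalid()

struct Plan { bool AnyRecipeWidens; };

// Post-patch shape: a cost query returns only the cost.
static CostTy getInstructionCost(unsigned Opcode) {
  if (Opcode == 0)       // stand-in for a type TTI cannot legalize
    return std::nullopt; // invalid cost, no companion flag needed
  return Opcode * 2;     // dummy cost
}

// Post-patch shape: widening is a separate, plan-level predicate.
static bool willGenerateVectorInstructions(const Plan &P) {
  return P.AnyRecipeWidens;
}

int main() {
  assert(!getInstructionCost(0).has_value()); // invalidity propagates alone
  assert(*getInstructionCost(3) == 6);
  assert(!willGenerateVectorInstructions({false})); // all-scalar plan rejected
  return 0;
}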