Commit 29b8b72

[LV] Move check if any vector insts will be generated to VPlan. (#96622)
This patch moves the check for whether any vector instructions will be generated from getInstructionCost to a VPlan-based helper. This simplifies getInstructionCost, is more accurate because the check is done on the final result, and allows exiting early once a recipe that generates vector instructions is visited. The helper can then be re-used by the VPlan-based cost model to match the legacy selectVectorizationFactor behavior, thus fixing a crash and paving the way to recommitting #92555.

PR: #96622
1 parent 7f3c40a commit 29b8b72
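
For orientation, the core of the new check is a per-type query against TTI: widen the scalar type by the candidate VF and ask how many legal register parts the result needs. Below is a minimal standalone sketch distilled from the WillWiden lambda added in the diff; the free-function name willWidenType and the exact includes are illustrative assumptions, not part of the patch.

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/VectorUtils.h" // ToVectorTy
#include "llvm/IR/Type.h"
#include "llvm/Support/TypeSize.h"

using namespace llvm;

// Illustrative helper (not part of the patch): does widening ScalarTy by VF
// actually produce vector code after type legalization?
static bool willWidenType(Type *ScalarTy, ElementCount VF,
                          const TargetTransformInfo &TTI) {
  Type *VectorTy = ToVectorTy(ScalarTy, VF);
  unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
  if (!NumLegalParts)
    return false; // The widened type cannot be legalized at all.
  if (VF.isScalable())
    // Scalable vectors live in a distinct register class, so even
    // <vscale x 1 x iN> counts as vector code.
    return NumLegalParts <= VF.getKnownMinValue();
  // Fixed VF: vectorized only if lanes share register parts, i.e. the type
  // was not fully scalarized by legalization.
  return NumLegalParts < VF.getKnownMinValue();
}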

7 files changed: +168 -192 lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 1 addition & 1 deletion
@@ -226,7 +226,7 @@ class VPBuilder {
 
 /// TODO: The following VectorizationFactor was pulled out of
 /// LoopVectorizationCostModel class. LV also deals with
-/// VectorizerParams::VectorizationFactor and VectorizationCostTy.
+/// VectorizerParams::VectorizationFactor.
 /// We need to streamline them.
 
 /// Information about vectorization costs.

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 139 additions & 87 deletions
@@ -1090,7 +1090,7 @@ class LoopVectorizationCostModel {
   bool selectUserVectorizationFactor(ElementCount UserVF) {
     collectUniformsAndScalars(UserVF);
     collectInstsToScalarize(UserVF);
-    return expectedCost(UserVF).first.isValid();
+    return expectedCost(UserVF).isValid();
   }
 
   /// \return The size (in bits) of the smallest and widest types in the code
@@ -1591,20 +1591,13 @@ class LoopVectorizationCostModel {
     Scalars.clear();
   }
 
-  /// The vectorization cost is a combination of the cost itself and a boolean
-  /// indicating whether any of the contributing operations will actually
-  /// operate on vector values after type legalization in the backend. If this
-  /// latter value is false, then all operations will be scalarized (i.e. no
-  /// vectorization has actually taken place).
-  using VectorizationCostTy = std::pair<InstructionCost, bool>;
-
   /// Returns the expected execution cost. The unit of the cost does
   /// not matter because we use the 'cost' units to compare different
   /// vector widths. The cost that is returned is *not* normalized by
   /// the factor width. If \p Invalid is not nullptr, this function
   /// will add a pair(Instruction*, ElementCount) to \p Invalid for
   /// each instruction that has an Invalid cost for the given VF.
-  VectorizationCostTy
+  InstructionCost
   expectedCost(ElementCount VF,
                SmallVectorImpl<InstructionVFPair> *Invalid = nullptr);
 
@@ -1642,12 +1635,7 @@ class LoopVectorizationCostModel {
 
   /// Returns the execution time cost of an instruction for a given vector
   /// width. Vector width of one means scalar.
-  VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF);
-
-  /// The cost-computation logic from getInstructionCost which provides
-  /// the vector type as an output parameter.
-  InstructionCost getInstructionCost(Instruction *I, ElementCount VF,
-                                     Type *&VectorTy);
+  InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
 
   /// Return the cost of instructions in an inloop reduction pattern, if I is
   /// part of that pattern.
@@ -4795,9 +4783,101 @@ static void emitInvalidCostRemarks(SmallVector<InstructionVFPair> InvalidCosts,
   } while (!Tail.empty());
 }
 
+/// Check if any recipe of \p Plan will generate a vector value, which will be
+/// assigned a vector register.
+static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
+                                const TargetTransformInfo &TTI) {
+  assert(VF.isVector() && "Checking a scalar VF?");
+  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType(),
+                          Plan.getCanonicalIV()->getScalarType()->getContext());
+  // Set of already visited types.
+  DenseSet<Type *> Visited;
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
+           vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) {
+    for (VPRecipeBase &R : *VPBB) {
+      // Continue early if the recipe is considered to not produce a vector
+      // result. Note that this includes VPInstruction where some opcodes may
+      // produce a vector, to preserve existing behavior as VPInstructions model
+      // aspects not directly mapped to existing IR instructions.
+      switch (R.getVPDefID()) {
+      case VPDef::VPDerivedIVSC:
+      case VPDef::VPScalarIVStepsSC:
+      case VPDef::VPScalarCastSC:
+      case VPDef::VPReplicateSC:
+      case VPDef::VPInstructionSC:
+      case VPDef::VPCanonicalIVPHISC:
+      case VPDef::VPVectorPointerSC:
+      case VPDef::VPExpandSCEVSC:
+      case VPDef::VPEVLBasedIVPHISC:
+      case VPDef::VPPredInstPHISC:
+      case VPDef::VPBranchOnMaskSC:
+        continue;
+      case VPDef::VPReductionSC:
+      case VPDef::VPActiveLaneMaskPHISC:
+      case VPDef::VPWidenCallSC:
+      case VPDef::VPWidenCanonicalIVSC:
+      case VPDef::VPWidenCastSC:
+      case VPDef::VPWidenGEPSC:
+      case VPDef::VPWidenSC:
+      case VPDef::VPWidenSelectSC:
+      case VPDef::VPBlendSC:
+      case VPDef::VPFirstOrderRecurrencePHISC:
+      case VPDef::VPWidenPHISC:
+      case VPDef::VPWidenIntOrFpInductionSC:
+      case VPDef::VPWidenPointerInductionSC:
+      case VPDef::VPReductionPHISC:
+      case VPDef::VPInterleaveSC:
+      case VPDef::VPWidenLoadEVLSC:
+      case VPDef::VPWidenLoadSC:
+      case VPDef::VPWidenStoreEVLSC:
+      case VPDef::VPWidenStoreSC:
+        break;
+      default:
+        llvm_unreachable("unhandled recipe");
+      }
+
+      auto WillWiden = [&TTI, VF](Type *ScalarTy) {
+        Type *VectorTy = ToVectorTy(ScalarTy, VF);
+        unsigned NumLegalParts = TTI.getNumberOfParts(VectorTy);
+        if (!NumLegalParts)
+          return false;
+        if (VF.isScalable()) {
+          // <vscale x 1 x iN> is assumed to be profitable over iN because
+          // scalable registers are a distinct register class from scalar
+          // ones. If we ever find a target which wants to lower scalable
+          // vectors back to scalars, we'll need to update this code to
+          // explicitly ask TTI about the register class uses for each part.
+          return NumLegalParts <= VF.getKnownMinValue();
+        }
+        // Two or more parts that share a register - are vectorized.
+        return NumLegalParts < VF.getKnownMinValue();
+      };
+
+      // If no def nor is a store, e.g., branches, continue - no value to check.
+      if (R.getNumDefinedValues() == 0 &&
+          !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
+              &R))
+        continue;
+      // For multi-def recipes, currently only interleaved loads, suffice to
+      // check first def only.
+      // For stores check their stored value; for interleaved stores suffice
+      // the check first stored value only. In all cases this is the second
+      // operand.
+      VPValue *ToCheck =
+          R.getNumDefinedValues() >= 1 ? R.getVPValue(0) : R.getOperand(1);
+      Type *ScalarTy = TypeInfo.inferScalarType(ToCheck);
+      if (!Visited.insert({ScalarTy}).second)
+        continue;
+      if (WillWiden(ScalarTy))
+        return true;
+    }
+  }
+
+  return false;
+}
+
 VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
-  InstructionCost ExpectedCost =
-      CM.expectedCost(ElementCount::getFixed(1)).first;
+  InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
   assert(any_of(VPlans,
@@ -4826,9 +4906,8 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
     if (VF.isScalar())
       continue;
 
-    LoopVectorizationCostModel::VectorizationCostTy C =
-        CM.expectedCost(VF, &InvalidCosts);
-    VectorizationFactor Candidate(VF, C.first, ScalarCost.ScalarCost);
+    InstructionCost C = CM.expectedCost(VF, &InvalidCosts);
+    VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
 
 #ifndef NDEBUG
     unsigned AssumedMinimumVscale =
@@ -4845,7 +4924,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
     LLVM_DEBUG(dbgs() << ".\n");
 #endif
 
-    if (!C.second && !ForceVectorization) {
+    if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
       LLVM_DEBUG(
           dbgs()
           << "LV: Not considering vector loop of width " << VF
@@ -5146,7 +5225,7 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
   // If we did not calculate the cost for VF (because the user selected the VF)
   // then we calculate the cost of VF here.
   if (LoopCost == 0) {
-    LoopCost = expectedCost(VF).first;
+    LoopCost = expectedCost(VF);
     assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost");
 
     // Loop body is free and there is no need for interleaving.
@@ -5717,15 +5796,14 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
 
     // Compute the cost of the vector instruction. Note that this cost already
     // includes the scalarization overhead of the predicated instruction.
-    InstructionCost VectorCost = getInstructionCost(I, VF).first;
+    InstructionCost VectorCost = getInstructionCost(I, VF);
 
     // Compute the cost of the scalarized instruction. This cost is the cost of
     // the instruction as if it wasn't if-converted and instead remained in the
     // predicated block. We will scale this cost by block probability after
     // computing the scalarization overhead.
     InstructionCost ScalarCost =
-        VF.getFixedValue() *
-        getInstructionCost(I, ElementCount::getFixed(1)).first;
+        VF.getFixedValue() * getInstructionCost(I, ElementCount::getFixed(1));
 
     // Compute the scalarization overhead of needed insertelement instructions
     // and phi nodes.
@@ -5769,14 +5847,13 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
   return Discount;
 }
 
-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::expectedCost(
+InstructionCost LoopVectorizationCostModel::expectedCost(
     ElementCount VF, SmallVectorImpl<InstructionVFPair> *Invalid) {
-  VectorizationCostTy Cost;
+  InstructionCost Cost;
 
   // For each block.
   for (BasicBlock *BB : TheLoop->blocks()) {
-    VectorizationCostTy BlockCost;
+    InstructionCost BlockCost;
 
     // For each instruction in the old loop.
     for (Instruction &I : BB->instructionsWithoutDebug()) {
@@ -5785,22 +5862,19 @@ LoopVectorizationCostModel::expectedCost(
           (VF.isVector() && VecValuesToIgnore.count(&I)))
         continue;
 
-      VectorizationCostTy C = getInstructionCost(&I, VF);
+      InstructionCost C = getInstructionCost(&I, VF);
 
       // Check if we should override the cost.
-      if (C.first.isValid() &&
-          ForceTargetInstructionCost.getNumOccurrences() > 0)
-        C.first = InstructionCost(ForceTargetInstructionCost);
+      if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
+        C = InstructionCost(ForceTargetInstructionCost);
 
       // Keep a list of instructions with invalid costs.
-      if (Invalid && !C.first.isValid())
+      if (Invalid && !C.isValid())
         Invalid->emplace_back(&I, VF);
 
-      BlockCost.first += C.first;
-      BlockCost.second |= C.second;
-      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first
-                        << " for VF " << VF << " For instruction: " << I
-                        << '\n');
+      BlockCost += C;
+      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
+                        << VF << " For instruction: " << I << '\n');
     }
 
     // If we are vectorizing a predicated block, it will have been
@@ -5811,10 +5885,9 @@ LoopVectorizationCostModel::expectedCost(
     // cost by the probability of executing it. blockNeedsPredication from
     // Legal is used so as to not include all blocks in tail folded loops.
     if (VF.isScalar() && Legal->blockNeedsPredication(BB))
-      BlockCost.first /= getReciprocalPredBlockProb();
+      BlockCost /= getReciprocalPredBlockProb();
 
-    Cost.first += BlockCost.first;
-    Cost.second |= BlockCost.second;
+    Cost += BlockCost;
   }
 
   return Cost;
@@ -6213,49 +6286,6 @@ LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
   return getWideningCost(I, VF);
 }
 
-LoopVectorizationCostModel::VectorizationCostTy
-LoopVectorizationCostModel::getInstructionCost(Instruction *I,
-                                               ElementCount VF) {
-  // If we know that this instruction will remain uniform, check the cost of
-  // the scalar version.
-  if (isUniformAfterVectorization(I, VF))
-    VF = ElementCount::getFixed(1);
-
-  if (VF.isVector() && isProfitableToScalarize(I, VF))
-    return VectorizationCostTy(InstsToScalarize[VF][I], false);
-
-  // Forced scalars do not have any scalarization overhead.
-  auto ForcedScalar = ForcedScalars.find(VF);
-  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
-    auto InstSet = ForcedScalar->second;
-    if (InstSet.count(I))
-      return VectorizationCostTy(
-          (getInstructionCost(I, ElementCount::getFixed(1)).first *
-           VF.getKnownMinValue()),
-          false);
-  }
-
-  Type *VectorTy;
-  InstructionCost C = getInstructionCost(I, VF, VectorTy);
-
-  bool TypeNotScalarized = false;
-  if (VF.isVector() && VectorTy->isVectorTy()) {
-    if (unsigned NumParts = TTI.getNumberOfParts(VectorTy)) {
-      if (VF.isScalable())
-        // <vscale x 1 x iN> is assumed to be profitable over iN because
-        // scalable registers are a distinct register class from scalar ones.
-        // If we ever find a target which wants to lower scalable vectors
-        // back to scalars, we'll need to update this code to explicitly
-        // ask TTI about the register class uses for each part.
-        TypeNotScalarized = NumParts <= VF.getKnownMinValue();
-      else
-        TypeNotScalarized = NumParts < VF.getKnownMinValue();
-    } else
-      C = InstructionCost::getInvalid();
-  }
-  return VectorizationCostTy(C, TypeNotScalarized);
-}
-
 InstructionCost LoopVectorizationCostModel::getScalarizationOverhead(
     Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const {
 
@@ -6646,8 +6676,25 @@ void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
 }
 
 InstructionCost
-LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
-                                               Type *&VectorTy) {
+LoopVectorizationCostModel::getInstructionCost(Instruction *I,
+                                               ElementCount VF) {
+  // If we know that this instruction will remain uniform, check the cost of
+  // the scalar version.
+  if (isUniformAfterVectorization(I, VF))
+    VF = ElementCount::getFixed(1);
+
+  if (VF.isVector() && isProfitableToScalarize(I, VF))
+    return InstsToScalarize[VF][I];
+
+  // Forced scalars do not have any scalarization overhead.
+  auto ForcedScalar = ForcedScalars.find(VF);
+  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
+    auto InstSet = ForcedScalar->second;
+    if (InstSet.count(I))
+      return getInstructionCost(I, ElementCount::getFixed(1)) *
+             VF.getKnownMinValue();
+  }
+
   Type *RetTy = I->getType();
   if (canTruncateToMinimalBitwidth(I, VF))
     RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
@@ -6670,6 +6717,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
   };
   (void) hasSingleCopyAfterVectorization;
 
+  Type *VectorTy;
   if (isScalarAfterVectorization(I, VF)) {
     // With the exception of GEPs and PHIs, after scalarization there should
     // only be one copy of the instruction generated in the loop. This is
@@ -6685,6 +6733,10 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF,
   } else
     VectorTy = ToVectorTy(RetTy, VF);
 
+  if (VF.isVector() && VectorTy->isVectorTy() &&
+      !TTI.getNumberOfParts(VectorTy))
+    return InstructionCost::getInvalid();
+
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
   case Instruction::GetElementPtr: