Skip to content

Commit 11713e8

Browse files
authored
[LV] Move VPlan-based calculateRegisterUsage to VPlanAnalysis (NFC). (#135673)
Move VPlan-based calculateRegisterUsage from LoopVectorize to VPlanAnalysis.cpp. It is a VPlan-based analysis and this helps to reduce the size of LoopVectorize. PR: #135673
1 parent 4ca4f81 commit 11713e8

File tree

14 files changed

+303
-301
lines changed

14 files changed

+303
-301
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 4 additions & 280 deletions
Original file line numberDiff line numberDiff line change
@@ -987,25 +987,6 @@ class LoopVectorizationCostModel {
987987
/// decision in a map for use in planning and plan execution.
988988
void setVectorizedCallDecision(ElementCount VF);
989989

990-
/// A struct that represents some properties of the register usage
991-
/// of a loop.
992-
struct RegisterUsage {
993-
/// Holds the number of loop invariant values that are used in the loop.
994-
/// The key is ClassID of target-provided register class.
995-
SmallMapVector<unsigned, unsigned, 4> LoopInvariantRegs;
996-
/// Holds the maximum number of concurrent live intervals in the loop.
997-
/// The key is ClassID of target-provided register class.
998-
SmallMapVector<unsigned, unsigned, 4> MaxLocalUsers;
999-
1000-
/// Check if any of the tracked live intervals exceeds the number of
1001-
/// available registers for the target.
1002-
bool exceedsMaxNumRegs(const TargetTransformInfo &TTI) const {
1003-
return any_of(MaxLocalUsers, [&TTI](auto &LU) {
1004-
return LU.second > TTI.getNumberOfRegisters(LU.first);
1005-
});
1006-
}
1007-
};
1008-
1009990
/// Collect values we want to ignore in the cost model.
1010991
void collectValuesToIgnore();
1011992

@@ -4343,15 +4324,6 @@ static bool hasReplicatorRegion(VPlan &Plan) {
43434324
}
43444325

43454326
#ifndef NDEBUG
4346-
/// Estimate the register usage for \p Plan and vectorization factors in \p VFs
4347-
/// by calculating the highest number of values that are live at a single
4348-
/// location as a rough estimate. Returns the register usage for each VF in \p
4349-
/// VFs.
4350-
static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
4351-
calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
4352-
const TargetTransformInfo &TTI,
4353-
const SmallPtrSetImpl<const Value *> &ValuesToIgnore);
4354-
43554327
VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
43564328
InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
43574329
LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
@@ -4377,7 +4349,7 @@ VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
43774349
for (auto &P : VPlans) {
43784350
ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
43794351
P->vectorFactors().end());
4380-
auto RUs = ::calculateRegisterUsage(*P, VFs, TTI, CM.ValuesToIgnore);
4352+
auto RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
43814353
for (auto [VF, RU] : zip_equal(VFs, RUs)) {
43824354
// The cost for scalar VF=1 is already calculated, so ignore it.
43834355
if (VF.isScalar())
@@ -4704,254 +4676,6 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
47044676
}
47054677
}
47064678

4707-
/// Get the VF scaling factor applied to the recipe's output, if the recipe has
4708-
/// one.
4709-
static unsigned getVFScaleFactor(VPRecipeBase *R) {
4710-
if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
4711-
return RR->getVFScaleFactor();
4712-
if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
4713-
return RR->getVFScaleFactor();
4714-
return 1;
4715-
}
4716-
4717-
/// Estimate the register usage for \p Plan and vectorization factors in \p VFs
4718-
/// by calculating the highest number of values that are live at a single
4719-
/// location as a rough estimate. Returns the register usage for each VF in \p
4720-
/// VFs.
4721-
static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
4722-
calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
4723-
const TargetTransformInfo &TTI,
4724-
const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
4725-
// Each 'key' in the map opens a new interval. The values
4726-
// of the map are the index of the 'last seen' usage of the
4727-
// recipe that is the key.
4728-
using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned, 16>;
4729-
4730-
// Maps indices to recipes.
4731-
SmallVector<VPRecipeBase *, 64> Idx2Recipe;
4732-
// Marks the end of each interval.
4733-
IntervalMap EndPoint;
4734-
// Saves the set of recipes that are used in the loop.
4735-
SmallPtrSet<VPRecipeBase *, 8> Ends;
4736-
// Saves the list of values that are used in the loop but are defined outside
4737-
// the loop (not including non-recipe values such as arguments and
4738-
// constants).
4739-
SmallSetVector<VPValue *, 8> LoopInvariants;
4740-
LoopInvariants.insert(&Plan.getVectorTripCount());
4741-
4742-
// We scan the loop in a topological order and assign a number to
4743-
// each recipe. We use RPO to ensure that defs are met before their users. We
4744-
// assume that each recipe that has in-loop users starts an interval. We
4745-
// record every time that an in-loop value is used, so we have a list of the
4746-
// first and last occurrences of each recipe.
4747-
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
4748-
Plan.getVectorLoopRegion());
4749-
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
4750-
if (!VPBB->getParent())
4751-
break;
4752-
for (VPRecipeBase &R : *VPBB) {
4753-
Idx2Recipe.push_back(&R);
4754-
4755-
// Save the end location of each USE.
4756-
for (VPValue *U : R.operands()) {
4757-
auto *DefR = U->getDefiningRecipe();
4758-
4759-
// Ignore non-recipe values such as arguments, constants, etc.
4760-
// FIXME: Might need some motivation why these values are ignored. If
4761-
// for example an argument is used inside the loop it will increase the
4762-
// register pressure (so shouldn't we add it to LoopInvariants?).
4763-
if (!DefR && (!U->getLiveInIRValue() ||
4764-
!isa<Instruction>(U->getLiveInIRValue())))
4765-
continue;
4766-
4767-
// If this recipe is outside the loop then record it and continue.
4768-
if (!DefR) {
4769-
LoopInvariants.insert(U);
4770-
continue;
4771-
}
4772-
4773-
// Overwrite previous end points.
4774-
EndPoint[DefR] = Idx2Recipe.size();
4775-
Ends.insert(DefR);
4776-
}
4777-
}
4778-
if (VPBB == Plan.getVectorLoopRegion()->getExiting()) {
4779-
// VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
4780-
// exiting block, where their increment will get materialized eventually.
4781-
for (auto &R : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
4782-
if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
4783-
EndPoint[&R] = Idx2Recipe.size();
4784-
Ends.insert(&R);
4785-
}
4786-
}
4787-
}
4788-
}
4789-
4790-
// Saves the list of intervals that end with the index in 'key'.
4791-
using RecipeList = SmallVector<VPRecipeBase *, 2>;
4792-
SmallDenseMap<unsigned, RecipeList, 16> TransposeEnds;
4793-
4794-
// Next, we transpose the EndPoints into a multi map that holds the list of
4795-
// intervals that *end* at a specific location.
4796-
for (auto &Interval : EndPoint)
4797-
TransposeEnds[Interval.second].push_back(Interval.first);
4798-
4799-
SmallPtrSet<VPRecipeBase *, 8> OpenIntervals;
4800-
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> RUs(VFs.size());
4801-
SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
4802-
4803-
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
4804-
4805-
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4806-
4807-
const auto &TTICapture = TTI;
4808-
auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
4809-
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
4810-
(VF.isScalable() &&
4811-
!TTICapture.isElementTypeLegalForScalableVector(Ty)))
4812-
return 0;
4813-
return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
4814-
};
4815-
4816-
// We scan the instructions linearly and record each time that a new interval
4817-
// starts, by placing it in a set. If we find this value in TransposeEnds then
4818-
// we remove it from the set. The max register usage is the maximum register
4819-
// usage of the recipes of the set.
4820-
for (unsigned int Idx = 0, Sz = Idx2Recipe.size(); Idx < Sz; ++Idx) {
4821-
VPRecipeBase *R = Idx2Recipe[Idx];
4822-
4823-
// Remove all of the recipes that end at this location.
4824-
RecipeList &List = TransposeEnds[Idx];
4825-
for (VPRecipeBase *ToRemove : List)
4826-
OpenIntervals.erase(ToRemove);
4827-
4828-
// Ignore recipes that are never used within the loop and do not have side
4829-
// effects.
4830-
if (!Ends.count(R) && !R->mayHaveSideEffects())
4831-
continue;
4832-
4833-
// Skip recipes for ignored values.
4834-
// TODO: Should mark recipes for ephemeral values that cannot be removed
4835-
// explicitly in VPlan.
4836-
if (isa<VPSingleDefRecipe>(R) &&
4837-
ValuesToIgnore.contains(
4838-
cast<VPSingleDefRecipe>(R)->getUnderlyingValue()))
4839-
continue;
4840-
4841-
// For each VF find the maximum usage of registers.
4842-
for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
4843-
// Count the number of registers used, per register class, given all open
4844-
// intervals.
4845-
// Note that elements in this SmallMapVector will be default constructed
4846-
// as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
4847-
// there is no previous entry for ClassID.
4848-
SmallMapVector<unsigned, unsigned, 4> RegUsage;
4849-
4850-
for (auto *R : OpenIntervals) {
4851-
// Skip recipes that weren't present in the original loop.
4852-
// TODO: Remove after removing the legacy
4853-
// LoopVectorizationCostModel::calculateRegisterUsage
4854-
if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
4855-
VPBranchOnMaskRecipe>(R))
4856-
continue;
4857-
4858-
if (VFs[J].isScalar() ||
4859-
isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
4860-
VPScalarIVStepsRecipe>(R) ||
4861-
(isa<VPInstruction>(R) &&
4862-
all_of(cast<VPSingleDefRecipe>(R)->users(),
4863-
[&](VPUser *U) {
4864-
return cast<VPRecipeBase>(U)->usesScalars(
4865-
R->getVPSingleValue());
4866-
})) ||
4867-
(isa<VPReductionPHIRecipe>(R) &&
4868-
(cast<VPReductionPHIRecipe>(R))->isInLoop())) {
4869-
unsigned ClassID = TTI.getRegisterClassForType(
4870-
false, TypeInfo.inferScalarType(R->getVPSingleValue()));
4871-
// FIXME: The target might use more than one register for the type
4872-
// even in the scalar case.
4873-
RegUsage[ClassID] += 1;
4874-
} else {
4875-
// The output from scaled phis and scaled reductions actually has
4876-
// fewer lanes than the VF.
4877-
unsigned ScaleFactor = getVFScaleFactor(R);
4878-
ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
4879-
LLVM_DEBUG(if (VF != VFs[J]) {
4880-
dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
4881-
<< " for " << *R << "\n";
4882-
});
4883-
4884-
for (VPValue *DefV : R->definedValues()) {
4885-
Type *ScalarTy = TypeInfo.inferScalarType(DefV);
4886-
unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
4887-
RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
4888-
}
4889-
}
4890-
}
4891-
4892-
for (const auto &Pair : RegUsage) {
4893-
auto &Entry = MaxUsages[J][Pair.first];
4894-
Entry = std::max(Entry, Pair.second);
4895-
}
4896-
}
4897-
4898-
LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
4899-
<< OpenIntervals.size() << '\n');
4900-
4901-
// Add the current recipe to the list of open intervals.
4902-
OpenIntervals.insert(R);
4903-
}
4904-
4905-
// We also search for instructions that are defined outside the loop, but are
4906-
// used inside the loop. We need this number separately from the max-interval
4907-
// usage number because when we unroll, loop-invariant values do not take
4908-
// more registers.
4909-
LoopVectorizationCostModel::RegisterUsage RU;
4910-
for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
4911-
// Note that elements in this SmallMapVector will be default constructed
4912-
// as 0. So we can use "Invariant[ClassID] += n" in the code below even if
4913-
// there is no previous entry for ClassID.
4914-
SmallMapVector<unsigned, unsigned, 4> Invariant;
4915-
4916-
for (auto *In : LoopInvariants) {
4917-
// FIXME: The target might use more than one register for the type
4918-
// even in the scalar case.
4919-
bool IsScalar = all_of(In->users(), [&](VPUser *U) {
4920-
return cast<VPRecipeBase>(U)->usesScalars(In);
4921-
});
4922-
4923-
ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
4924-
unsigned ClassID = TTI.getRegisterClassForType(
4925-
VF.isVector(), TypeInfo.inferScalarType(In));
4926-
Invariant[ClassID] += GetRegUsage(TypeInfo.inferScalarType(In), VF);
4927-
}
4928-
4929-
LLVM_DEBUG({
4930-
dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
4931-
dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
4932-
<< " item\n";
4933-
for (const auto &pair : MaxUsages[Idx]) {
4934-
dbgs() << "LV(REG): RegisterClass: "
4935-
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
4936-
<< " registers\n";
4937-
}
4938-
dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
4939-
<< " item\n";
4940-
for (const auto &pair : Invariant) {
4941-
dbgs() << "LV(REG): RegisterClass: "
4942-
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
4943-
<< " registers\n";
4944-
}
4945-
});
4946-
4947-
RU.LoopInvariantRegs = Invariant;
4948-
RU.MaxLocalUsers = MaxUsages[Idx];
4949-
RUs[Idx] = RU;
4950-
}
4951-
4952-
return RUs;
4953-
}
4954-
49554679
unsigned
49564680
LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
49574681
InstructionCost LoopCost) {
@@ -5002,8 +4726,8 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
50024726
return 1;
50034727
}
50044728

5005-
RegisterUsage R =
5006-
::calculateRegisterUsage(Plan, {VF}, TTI, ValuesToIgnore)[0];
4729+
VPRegisterUsage R =
4730+
calculateRegisterUsageForPlan(Plan, {VF}, TTI, ValuesToIgnore)[0];
50074731
// We divide by these constants so assume that we have at least one
50084732
// instruction that uses at least one register.
50094733
for (auto &Pair : R.MaxLocalUsers) {
@@ -7380,7 +7104,7 @@ VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
73807104
for (auto &P : VPlans) {
73817105
ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
73827106
P->vectorFactors().end());
7383-
auto RUs = ::calculateRegisterUsage(*P, VFs, TTI, CM.ValuesToIgnore);
7107+
auto RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
73847108
for (auto [VF, RU] : zip_equal(VFs, RUs)) {
73857109
if (VF.isScalar())
73867110
continue;

0 commit comments

Comments
 (0)