Skip to content

Commit aba5a3f

Browse files
committed
[LV] Move VPlan-based calculateRegisterUsage to VPlanAnalysis (NFC).
1 parent 202cd7b commit aba5a3f

File tree

14 files changed

+289
-265
lines changed

14 files changed

+289
-265
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 2 additions & 246 deletions
Original file line numberDiff line numberDiff line change
@@ -4863,250 +4863,6 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
48634863
}
48644864
}
48654865

4866-
/// Get the VF scaling factor applied to the output of \p R: the factor reported
4867-
/// by a VPReductionPHIRecipe or VPPartialReductionRecipe, or 1 for any other recipe.
4868-
static unsigned getVFScaleFactor(VPRecipeBase *R) {
4869-
if (auto *RR = dyn_cast<VPReductionPHIRecipe>(R))
4870-
return RR->getVFScaleFactor();
4871-
if (auto *RR = dyn_cast<VPPartialReductionRecipe>(R))
4872-
return RR->getVFScaleFactor();
4873-
return 1;
4874-
}
4875-
4876-
/// Estimate the register usage for \p Plan and vectorization factors in \p VFs
4877-
/// by calculating, per register class reported by \p TTI, the highest number of
4878-
/// values that are live at a single location as a rough estimate. Single-def
4879-
/// recipes whose underlying value is in \p ValuesToIgnore are skipped. Returns
/// the register usage for each VF in \p VFs, in the same order as \p VFs.
4880-
static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
4881-
calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
4882-
const TargetTransformInfo &TTI,
4883-
const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
4884-
// Each 'key' in the map opens a new interval. The values
4885-
// of the map are the index of the 'last seen' usage of the
4886-
// recipe that is the key.
4887-
using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned, 16>;
4888-
4889-
// Maps indices to recipes.
4890-
SmallVector<VPRecipeBase *, 64> Idx2Recipe;
4891-
// Marks the end of each interval.
4892-
IntervalMap EndPoint;
4893-
// Saves the set of recipes that are used (as operands) in the loop.
4894-
SmallPtrSet<VPRecipeBase *, 8> Ends;
4895-
// Saves the list of values that are used in the loop but are defined outside
4896-
// the loop (not including non-recipe values such as arguments and
4897-
// constants).
4898-
SmallSetVector<VPValue *, 8> LoopInvariants;
4899-
LoopInvariants.insert(&Plan.getVectorTripCount());
4900-
4901-
// We scan the loop in topological order and assign a number to
4902-
// each recipe. We use RPO to ensure that defs are met before their users. We
4903-
// assume that each recipe that has in-loop users starts an interval. We
4904-
// record every time that an in-loop value is used, so we have a list of the
4905-
// first and last occurrences of each recipe.
4906-
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
4907-
Plan.getVectorLoopRegion());
4908-
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
4909-
if (!VPBB->getParent())
4910-
break;
4911-
for (VPRecipeBase &R : *VPBB) {
4912-
Idx2Recipe.push_back(&R);
4913-
4914-
// Save the end location of each USE.
4915-
for (VPValue *U : R.operands()) {
4916-
auto *DefR = U->getDefiningRecipe();
4917-
4918-
// Ignore non-recipe values such as arguments, constants, etc.
4919-
// FIXME: Might need some motivation why these values are ignored. If
4920-
// for example an argument is used inside the loop it will increase the
4921-
// register pressure, so shouldn't we add it to LoopInvariants?
4922-
if (!DefR && (!U->getLiveInIRValue() ||
4923-
!isa<Instruction>(U->getLiveInIRValue())))
4924-
continue;
4925-
4926-
// A value without a defining recipe that reaches this point is a live-in
// backed by an IR instruction; record it as loop-invariant and continue.
4927-
if (!DefR) {
4928-
LoopInvariants.insert(U);
4929-
continue;
4930-
}
4931-
4932-
// Overwrite previous end points. R was already pushed to Idx2Recipe, so
// the recorded end point is the index of R plus one.
4933-
EndPoint[DefR] = Idx2Recipe.size();
4934-
Ends.insert(DefR);
4935-
}
4936-
}
4937-
if (VPBB == Plan.getVectorLoopRegion()->getExiting()) {
4938-
// VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
4939-
// exiting block, where their increment will get materialized eventually.
4940-
for (auto &R : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
4941-
if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
4942-
EndPoint[&R] = Idx2Recipe.size();
4943-
Ends.insert(&R);
4944-
}
4945-
}
4946-
}
4947-
}
4948-
4949-
// Saves the list of intervals that end with the index in 'key'.
4950-
using RecipeList = SmallVector<VPRecipeBase *, 2>;
4951-
SmallDenseMap<unsigned, RecipeList, 16> TransposeEnds;
4952-
4953-
// Next, we transpose the EndPoints into a multi map that holds the list of
4954-
// intervals that *end* at a specific location.
4955-
for (auto &Interval : EndPoint)
4956-
TransposeEnds[Interval.second].push_back(Interval.first);
4957-
4958-
SmallPtrSet<VPRecipeBase *, 8> OpenIntervals;
4959-
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> RUs(VFs.size());
4960-
SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
4961-
4962-
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
4963-
4964-
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4965-
4966-
const auto &TTICapture = TTI;
4967-
auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
4968-
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
4969-
(VF.isScalable() &&
4970-
!TTICapture.isElementTypeLegalForScalableVector(Ty)))
4971-
return 0;
4972-
return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
4973-
};
4974-
4975-
// We scan the instructions linearly and record each time that a new interval
4976-
// starts, by placing it in a set. If we find this value in TransposeEnds then
4977-
// we remove it from the set. The max register usage is the maximum register
4978-
// usage of the recipes of the set.
4979-
for (unsigned int Idx = 0, Sz = Idx2Recipe.size(); Idx < Sz; ++Idx) {
4980-
VPRecipeBase *R = Idx2Recipe[Idx];
4981-
4982-
// Remove all of the recipes that end at this location.
4983-
RecipeList &List = TransposeEnds[Idx];
4984-
for (VPRecipeBase *ToRemove : List)
4985-
OpenIntervals.erase(ToRemove);
4986-
4987-
// Ignore recipes that are never used within the loop and do not have side
4988-
// effects.
4989-
if (!Ends.count(R) && !R->mayHaveSideEffects())
4990-
continue;
4991-
4992-
// Skip recipes for ignored values.
4993-
// TODO: Should mark recipes for ephemeral values that cannot be removed
4994-
// explicitly in VPlan.
4995-
if (isa<VPSingleDefRecipe>(R) &&
4996-
ValuesToIgnore.contains(
4997-
cast<VPSingleDefRecipe>(R)->getUnderlyingValue()))
4998-
continue;
4999-
5000-
// For each VF find the maximum usage of registers.
5001-
for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
5002-
// Count the number of registers used, per register class, given all open
5003-
// intervals.
5004-
// Note that elements in this SmallMapVector will be default constructed
5005-
// as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
5006-
// there is no previous entry for ClassID.
5007-
SmallMapVector<unsigned, unsigned, 4> RegUsage;
5008-
5009-
for (auto *R : OpenIntervals) {
5010-
// Skip recipes that weren't present in the original loop.
5011-
// TODO: Remove after removing the legacy
5012-
// LoopVectorizationCostModel::calculateRegisterUsage
5013-
if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
5014-
VPBranchOnMaskRecipe>(R))
5015-
continue;
5016-
5017-
if (VFs[J].isScalar() ||
5018-
isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
5019-
VPScalarIVStepsRecipe>(R) ||
5020-
(isa<VPInstruction>(R) &&
5021-
all_of(cast<VPSingleDefRecipe>(R)->users(), [&](VPUser *U) {
5022-
return cast<VPRecipeBase>(U)->usesScalars(R->getVPSingleValue());
5023-
}))) {
5024-
unsigned ClassID = TTI.getRegisterClassForType(
5025-
false, TypeInfo.inferScalarType(R->getVPSingleValue()));
5026-
// FIXME: The target might use more than one register for the type
5027-
// even in the scalar case.
5028-
RegUsage[ClassID] += 1;
5029-
} else {
5030-
// The output from scaled phis and scaled reductions actually has
5031-
// fewer lanes than the VF.
5032-
unsigned ScaleFactor = getVFScaleFactor(R);
5033-
ElementCount VF = VFs[J].divideCoefficientBy(ScaleFactor);
5034-
LLVM_DEBUG(if (VF != VFs[J]) {
5035-
dbgs() << "LV(REG): Scaled down VF from " << VFs[J] << " to " << VF
5036-
<< " for " << *R << "\n";
5037-
});
5038-
5039-
for (VPValue *DefV : R->definedValues()) {
5040-
Type *ScalarTy = TypeInfo.inferScalarType(DefV);
5041-
unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
5042-
RegUsage[ClassID] += GetRegUsage(ScalarTy, VF);
5043-
}
5044-
}
5045-
}
5046-
5047-
for (const auto &Pair : RegUsage) {
5048-
auto &Entry = MaxUsages[J][Pair.first];
5049-
Entry = std::max(Entry, Pair.second);
5050-
}
5051-
}
5052-
5053-
LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5054-
<< OpenIntervals.size() << '\n');
5055-
5056-
// Add the current recipe to the list of open intervals.
5057-
OpenIntervals.insert(R);
5058-
}
5059-
5060-
// We also search for instructions that are defined outside the loop, but are
5061-
// used inside the loop. We need this number separately from the max-interval
5062-
// usage number because when we unroll, loop-invariant values do not take
5063-
// more registers.
5064-
LoopVectorizationCostModel::RegisterUsage RU;
5065-
for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5066-
// Note that elements in this SmallMapVector will be default constructed
5067-
// as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5068-
// there is no previous entry for ClassID.
5069-
SmallMapVector<unsigned, unsigned, 4> Invariant;
5070-
5071-
for (auto *In : LoopInvariants) {
5072-
// FIXME: The target might use more than one register for the type
5073-
// even in the scalar case.
5074-
bool IsScalar = all_of(In->users(), [&](VPUser *U) {
5075-
return cast<VPRecipeBase>(U)->usesScalars(In);
5076-
});
5077-
5078-
ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5079-
unsigned ClassID = TTI.getRegisterClassForType(
5080-
VF.isVector(), TypeInfo.inferScalarType(In));
5081-
Invariant[ClassID] += GetRegUsage(TypeInfo.inferScalarType(In), VF);
5082-
}
5083-
5084-
LLVM_DEBUG({
5085-
dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5086-
dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5087-
<< " item\n";
5088-
for (const auto &pair : MaxUsages[Idx]) {
5089-
dbgs() << "LV(REG): RegisterClass: "
5090-
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
5091-
<< " registers\n";
5092-
}
5093-
dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5094-
<< " item\n";
5095-
for (const auto &pair : Invariant) {
5096-
dbgs() << "LV(REG): RegisterClass: "
5097-
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
5098-
<< " registers\n";
5099-
}
5100-
});
5101-
5102-
RU.LoopInvariantRegs = Invariant;
5103-
RU.MaxLocalUsers = MaxUsages[Idx];
5104-
RUs[Idx] = RU;
5105-
}
5106-
5107-
return RUs;
5108-
}
5109-
51104866
unsigned
51114867
LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
51124868
InstructionCost LoopCost) {
@@ -5158,8 +4914,8 @@ LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
51584914
return 1;
51594915
}
51604916

5161-
RegisterUsage R =
5162-
::calculateRegisterUsage(Plan, {VF}, TTI, ValuesToIgnore)[0];
4917+
VPRegisterUsage R =
4918+
calculateRegisterUsageForVPlan(Plan, {VF}, TTI, ValuesToIgnore)[0];
51634919
// We divide by these constants so assume that we have at least one
51644920
// instruction that uses at least one register.
51654921
for (auto &Pair : R.MaxLocalUsers) {

0 commit comments

Comments
 (0)