Skip to content

Commit fb1d28d

Browse files
fhahnSamTebbs33
authored andcommitted
[LV] Compute register usage for interleaving on VPlan. (llvm#126437)
Add a version of calculateRegisterUsage that estimates register usage for a VPlan. This mostly just ports the existing code, with some updates to figure out which recipes will generate vectors vs. scalars. There are a number of changes in the computed register usages, but they should be more accurate w.r.t. the generated vector code. There are the following changes: * Scalar usage increases in most cases by 1, as we always create a scalar canonical IV, which is alive across the loop and is not considered by the legacy implementation. * Output is ordered by insertion; scalar registers are now added first due to the canonical IV phi. * Using the VPlan, we now also know more precisely whether an induction will be vectorized or scalarized. Depends on llvm#126415 PR: llvm#126437
1 parent 54d066c commit fb1d28d

File tree

15 files changed

+975
-514
lines changed

15 files changed

+975
-514
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 232 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -995,7 +995,8 @@ class LoopVectorizationCostModel {
995995
/// If interleave count has been specified by metadata it will be returned.
996996
/// Otherwise, the interleave count is computed and returned. VF and LoopCost
997997
/// are the selected vectorization factor and the cost of the selected VF.
998-
unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost);
998+
unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
999+
InstructionCost LoopCost);
9991000

10001001
/// Memory access instruction may be vectorized in more than one way.
10011002
/// Form of instruction after vectorization depends on cost.
@@ -4850,8 +4851,233 @@ void LoopVectorizationCostModel::collectElementTypesForWidening() {
48504851
}
48514852
}
48524853

4854+
/// Estimate the register usage for \p Plan and vectorization factors in \p VFs
4855+
/// by calculating the highest number of values that are live at a single
4856+
/// location as a rough estimate. Returns the register usage for each VF in \p
4857+
/// VFs.
4858+
static SmallVector<LoopVectorizationCostModel::RegisterUsage, 8>
4859+
calculateRegisterUsage(VPlan &Plan, ArrayRef<ElementCount> VFs,
4860+
const TargetTransformInfo &TTI,
4861+
const SmallPtrSetImpl<const Value *> &ValuesToIgnore) {
4862+
// Each 'key' in the map opens a new interval. The values
4863+
// of the map are the index of the 'last seen' usage of the
4864+
// recipe that is the key.
4865+
using IntervalMap = SmallDenseMap<VPRecipeBase *, unsigned, 16>;
4866+
4867+
// Maps indices to recipes.
4868+
SmallVector<VPRecipeBase *, 64> Idx2Recipe;
4869+
// Marks the end of each interval.
4870+
IntervalMap EndPoint;
4871+
// Saves the set of recipes that are used in the loop.
4872+
SmallPtrSet<VPRecipeBase *, 8> Ends;
4873+
// Saves the list of values that are used in the loop but are defined outside
4874+
// the loop (not including non-recipe values such as arguments and
4875+
// constants).
4876+
SmallSetVector<VPValue *, 8> LoopInvariants;
4877+
LoopInvariants.insert(&Plan.getVectorTripCount());
4878+
4879+
// We scan the loop in topological order and assign a number to
4880+
// each recipe. We use RPO to ensure that defs are met before their users. We
4881+
// assume that each recipe that has in-loop users starts an interval. We
4882+
// record every time that an in-loop value is used, so we have a list of the
4883+
// first and last occurrences of each recipe.
4884+
ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
4885+
Plan.getVectorLoopRegion());
4886+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
4887+
if (!VPBB->getParent())
4888+
break;
4889+
for (VPRecipeBase &R : *VPBB) {
4890+
Idx2Recipe.push_back(&R);
4891+
4892+
// Save the end location of each USE.
4893+
for (VPValue *U : R.operands()) {
4894+
auto *DefR = U->getDefiningRecipe();
4895+
4896+
// Ignore non-recipe values such as arguments, constants, etc.
4897+
// FIXME: Might need some motivation why these values are ignored. If
4898+
// for example an argument is used inside the loop it will increase the
4899+
// register pressure (so shouldn't we add it to LoopInvariants?).
4900+
if (!DefR && (!U->getLiveInIRValue() ||
4901+
!isa<Instruction>(U->getLiveInIRValue())))
4902+
continue;
4903+
4904+
// If this value has no defining recipe, record it as loop-invariant and continue.
4905+
if (!DefR) {
4906+
LoopInvariants.insert(U);
4907+
continue;
4908+
}
4909+
4910+
// Overwrite previous end points.
4911+
EndPoint[DefR] = Idx2Recipe.size();
4912+
Ends.insert(DefR);
4913+
}
4914+
}
4915+
if (VPBB == Plan.getVectorLoopRegion()->getExiting()) {
4916+
// VPWidenIntOrFpInductionRecipes are used implicitly at the end of the
4917+
// exiting block, where their increment will get materialized eventually.
4918+
for (auto &R : Plan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
4919+
if (isa<VPWidenIntOrFpInductionRecipe>(&R)) {
4920+
EndPoint[&R] = Idx2Recipe.size();
4921+
Ends.insert(&R);
4922+
}
4923+
}
4924+
}
4925+
}
4926+
4927+
// Saves the list of intervals that end with the index in 'key'.
4928+
using RecipeList = SmallVector<VPRecipeBase *, 2>;
4929+
SmallDenseMap<unsigned, RecipeList, 16> TransposeEnds;
4930+
4931+
// Next, we transpose the EndPoints into a multi map that holds the list of
4932+
// intervals that *end* at a specific location.
4933+
for (auto &Interval : EndPoint)
4934+
TransposeEnds[Interval.second].push_back(Interval.first);
4935+
4936+
SmallPtrSet<VPRecipeBase *, 8> OpenIntervals;
4937+
SmallVector<LoopVectorizationCostModel::RegisterUsage, 8> RUs(VFs.size());
4938+
SmallVector<SmallMapVector<unsigned, unsigned, 4>, 8> MaxUsages(VFs.size());
4939+
4940+
LLVM_DEBUG(dbgs() << "LV(REG): Calculating max register usage:\n");
4941+
4942+
VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4943+
4944+
const auto &TTICapture = TTI;
4945+
auto GetRegUsage = [&TTICapture](Type *Ty, ElementCount VF) -> unsigned {
4946+
if (Ty->isTokenTy() || !VectorType::isValidElementType(Ty) ||
4947+
(VF.isScalable() &&
4948+
!TTICapture.isElementTypeLegalForScalableVector(Ty)))
4949+
return 0;
4950+
return TTICapture.getRegUsageForType(VectorType::get(Ty, VF));
4951+
};
4952+
4953+
// We scan the instructions linearly and record each time that a new interval
4954+
// starts, by placing it in a set. If we find this value in TransposeEnds then
4955+
// we remove it from the set. The max register usage is the maximum register
4956+
// usage of the recipes of the set.
4957+
for (unsigned int Idx = 0, Sz = Idx2Recipe.size(); Idx < Sz; ++Idx) {
4958+
VPRecipeBase *R = Idx2Recipe[Idx];
4959+
4960+
// Remove all of the recipes that end at this location.
4961+
RecipeList &List = TransposeEnds[Idx];
4962+
for (VPRecipeBase *ToRemove : List)
4963+
OpenIntervals.erase(ToRemove);
4964+
4965+
// Ignore recipes that are never used within the loop and do not have side
4966+
// effects.
4967+
if (!Ends.count(R) && !R->mayHaveSideEffects())
4968+
continue;
4969+
4970+
// Skip recipes for ignored values.
4971+
// TODO: Should mark recipes for ephemeral values that cannot be removed
4972+
// explicitly in VPlan.
4973+
if (isa<VPSingleDefRecipe>(R) &&
4974+
ValuesToIgnore.contains(
4975+
cast<VPSingleDefRecipe>(R)->getUnderlyingValue()))
4976+
continue;
4977+
4978+
// For each VF find the maximum usage of registers.
4979+
for (unsigned J = 0, E = VFs.size(); J < E; ++J) {
4980+
// Count the number of registers used, per register class, given all open
4981+
// intervals.
4982+
// Note that elements in this SmallMapVector will be default constructed
4983+
// as 0. So we can use "RegUsage[ClassID] += n" in the code below even if
4984+
// there is no previous entry for ClassID.
4985+
SmallMapVector<unsigned, unsigned, 4> RegUsage;
4986+
4987+
for (auto *R : OpenIntervals) {
4988+
// Skip recipes that weren't present in the original loop.
4989+
// TODO: Remove after removing the legacy
4990+
// LoopVectorizationCostModel::calculateRegisterUsage
4991+
if (isa<VPVectorPointerRecipe, VPVectorEndPointerRecipe,
4992+
VPBranchOnMaskRecipe>(R))
4993+
continue;
4994+
4995+
if (VFs[J].isScalar() ||
4996+
isa<VPCanonicalIVPHIRecipe, VPReplicateRecipe, VPDerivedIVRecipe,
4997+
VPScalarIVStepsRecipe>(R) ||
4998+
(isa<VPInstruction>(R) &&
4999+
all_of(cast<VPSingleDefRecipe>(R)->users(), [&](VPUser *U) {
5000+
return cast<VPRecipeBase>(U)->usesScalars(R->getVPSingleValue());
5001+
}))) {
5002+
unsigned ClassID = TTI.getRegisterClassForType(
5003+
false, TypeInfo.inferScalarType(R->getVPSingleValue()));
5004+
// FIXME: The target might use more than one register for the type
5005+
// even in the scalar case.
5006+
RegUsage[ClassID] += 1;
5007+
} else {
5008+
for (VPValue *DefV : R->definedValues()) {
5009+
Type *ScalarTy = TypeInfo.inferScalarType(DefV);
5010+
unsigned ClassID = TTI.getRegisterClassForType(true, ScalarTy);
5011+
RegUsage[ClassID] += GetRegUsage(ScalarTy, VFs[J]);
5012+
}
5013+
}
5014+
}
5015+
5016+
for (const auto &Pair : RegUsage) {
5017+
auto &Entry = MaxUsages[J][Pair.first];
5018+
Entry = std::max(Entry, Pair.second);
5019+
}
5020+
}
5021+
5022+
LLVM_DEBUG(dbgs() << "LV(REG): At #" << Idx << " Interval # "
5023+
<< OpenIntervals.size() << '\n');
5024+
5025+
// Add the current recipe to the list of open intervals.
5026+
OpenIntervals.insert(R);
5027+
}
5028+
5029+
// We also search for instructions that are defined outside the loop, but are
5030+
// used inside the loop. We need this number separately from the max-interval
5031+
// usage number because when we unroll, loop-invariant values do not take
5032+
// more registers.
5033+
LoopVectorizationCostModel::RegisterUsage RU;
5034+
for (unsigned Idx = 0, End = VFs.size(); Idx < End; ++Idx) {
5035+
// Note that elements in this SmallMapVector will be default constructed
5036+
// as 0. So we can use "Invariant[ClassID] += n" in the code below even if
5037+
// there is no previous entry for ClassID.
5038+
SmallMapVector<unsigned, unsigned, 4> Invariant;
5039+
5040+
for (auto *In : LoopInvariants) {
5041+
// FIXME: The target might use more than one register for the type
5042+
// even in the scalar case.
5043+
bool IsScalar = all_of(In->users(), [&](VPUser *U) {
5044+
return cast<VPRecipeBase>(U)->usesScalars(In);
5045+
});
5046+
5047+
ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[Idx];
5048+
unsigned ClassID = TTI.getRegisterClassForType(
5049+
VF.isVector(), TypeInfo.inferScalarType(In));
5050+
Invariant[ClassID] += GetRegUsage(TypeInfo.inferScalarType(In), VF);
5051+
}
5052+
5053+
LLVM_DEBUG({
5054+
dbgs() << "LV(REG): VF = " << VFs[Idx] << '\n';
5055+
dbgs() << "LV(REG): Found max usage: " << MaxUsages[Idx].size()
5056+
<< " item\n";
5057+
for (const auto &pair : MaxUsages[Idx]) {
5058+
dbgs() << "LV(REG): RegisterClass: "
5059+
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
5060+
<< " registers\n";
5061+
}
5062+
dbgs() << "LV(REG): Found invariant usage: " << Invariant.size()
5063+
<< " item\n";
5064+
for (const auto &pair : Invariant) {
5065+
dbgs() << "LV(REG): RegisterClass: "
5066+
<< TTI.getRegisterClassName(pair.first) << ", " << pair.second
5067+
<< " registers\n";
5068+
}
5069+
});
5070+
5071+
RU.LoopInvariantRegs = Invariant;
5072+
RU.MaxLocalUsers = MaxUsages[Idx];
5073+
RUs[Idx] = RU;
5074+
}
5075+
5076+
return RUs;
5077+
}
5078+
48535079
unsigned
4854-
LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
5080+
LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
48555081
InstructionCost LoopCost) {
48565082
// -- The interleave heuristics --
48575083
// We interleave the loop in order to expose ILP and reduce the loop overhead.
@@ -4901,7 +5127,8 @@ LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF,
49015127
return 1;
49025128
}
49035129

4904-
RegisterUsage R = calculateRegisterUsage({VF})[0];
5130+
RegisterUsage R =
5131+
::calculateRegisterUsage(Plan, {VF}, TTI, ValuesToIgnore)[0];
49055132
// We divide by these constants so assume that we have at least one
49065133
// instruction that uses at least one register.
49075134
for (auto &Pair : R.MaxLocalUsers) {
@@ -5152,7 +5379,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<ElementCount> VFs) {
51525379
// We also search for instructions that are defined outside the loop, but are
51535380
// used inside the loop. We need this number separately from the max-interval
51545381
// usage number because when we unroll, loop-invariant values do not take
5155-
// more register.
5382+
// more registers.
51565383
LoopBlocksDFS DFS(TheLoop);
51575384
DFS.perform(LI);
51585385

@@ -10657,7 +10884,7 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1065710884
AddBranchWeights, CM.CostKind);
1065810885
if (LVP.hasPlanWithVF(VF.Width)) {
1065910886
// Select the interleave count.
10660-
IC = CM.selectInterleaveCount(VF.Width, VF.Cost);
10887+
IC = CM.selectInterleaveCount(LVP.getPlanFor(VF.Width), VF.Width, VF.Cost);
1066110888

1066210889
unsigned SelectedIC = std::max(IC, UserIC);
1066310890
// Optimistically generate runtime checks if they are needed. Drop them if

llvm/test/Transforms/LoopVectorize/AArch64/i1-reg-usage.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@ target triple = "aarch64"
88
; CHECK-LABEL: LV: Checking a loop in 'or_reduction_neon' from <stdin>
99
; CHECK: LV(REG): VF = 32
1010
; CHECK-NEXT: LV(REG): Found max usage: 2 item
11+
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
1112
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 72 registers
12-
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
1313

1414
define i1 @or_reduction_neon(i32 %arg, ptr %ptr) {
1515
entry:
@@ -31,8 +31,8 @@ loop:
3131
; CHECK-LABEL: LV: Checking a loop in 'or_reduction_sve'
3232
; CHECK: LV(REG): VF = 64
3333
; CHECK-NEXT: LV(REG): Found max usage: 2 item
34+
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 2 registers
3435
; CHECK-NEXT: LV(REG): RegisterClass: Generic::VectorRC, 136 registers
35-
; CHECK-NEXT: LV(REG): RegisterClass: Generic::ScalarRC, 1 registers
3636

3737
define i1 @or_reduction_sve(i32 %arg, ptr %ptr) vscale_range(2,2) "target-features"="+sve" {
3838
entry:

0 commit comments

Comments
 (0)