Skip to content

Commit 5bfefff

Browse files
committed
Reland [FuncSpec] Split the specialization bonus into CodeSize and Latency.
Currently we use a combined metric, TargetTransformInfo::TCK_SizeAndLatency, when estimating the specialization bonus. This is suboptimal and, in some cases, erroneous. For example, we shouldn't weight the code-size decrease attributed to constant propagation by the block frequency of the dead code; only the latency savings should be weighted by block frequency. The total code-size savings from all the specialization arguments should be deducted from the specialization cost. Differential Revision: https://reviews.llvm.org/D155103
1 parent d1d0e13 commit 5bfefff

File tree

3 files changed

+185
-136
lines changed

3 files changed

+185
-136
lines changed

llvm/include/llvm/Transforms/IPO/FunctionSpecialization.h

Lines changed: 41 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -105,17 +105,49 @@ struct Spec {
105105
SpecSig Sig;
106106

107107
// Profitability of the specialization.
108-
Cost Score;
108+
unsigned Score;
109109

110110
// List of call sites, matching this specialization.
111111
SmallVector<CallBase *> CallSites;
112112

113-
Spec(Function *F, const SpecSig &S, Cost Score)
113+
Spec(Function *F, const SpecSig &S, unsigned Score)
114114
: F(F), Sig(S), Score(Score) {}
115-
Spec(Function *F, const SpecSig &&S, Cost Score)
115+
Spec(Function *F, const SpecSig &&S, unsigned Score)
116116
: F(F), Sig(S), Score(Score) {}
117117
};
118118

119+
struct Bonus {
120+
unsigned CodeSize = 0;
121+
unsigned Latency = 0;
122+
123+
Bonus() = default;
124+
125+
Bonus(Cost CodeSize, Cost Latency) {
126+
int64_t Sz = *CodeSize.getValue();
127+
int64_t Ltc = *Latency.getValue();
128+
129+
assert(Sz >= 0 && Ltc >= 0 && "CodeSize and Latency cannot be negative");
130+
// It is safe to down cast since we know the arguments
131+
// cannot be negative and Cost is of type int64_t.
132+
this->CodeSize = static_cast<unsigned>(Sz);
133+
this->Latency = static_cast<unsigned>(Ltc);
134+
}
135+
136+
Bonus &operator+=(const Bonus RHS) {
137+
CodeSize += RHS.CodeSize;
138+
Latency += RHS.Latency;
139+
return *this;
140+
}
141+
142+
Bonus operator+(const Bonus RHS) const {
143+
return Bonus(CodeSize + RHS.CodeSize, Latency + RHS.Latency);
144+
}
145+
146+
bool operator==(const Bonus RHS) const {
147+
return CodeSize == RHS.CodeSize && Latency == RHS.Latency;
148+
}
149+
};
150+
119151
class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
120152
const DataLayout &DL;
121153
BlockFrequencyInfo &BFI;
@@ -144,10 +176,10 @@ class InstCostVisitor : public InstVisitor<InstCostVisitor, Constant *> {
144176
return Solver.isBlockExecutable(BB) && !DeadBlocks.contains(BB);
145177
}
146178

147-
Cost getUserBonus(Instruction *User, Value *Use = nullptr,
148-
Constant *C = nullptr);
179+
Bonus getUserBonus(Instruction *User, Value *Use = nullptr,
180+
Constant *C = nullptr);
149181

150-
Cost getBonusFromPendingPHIs();
182+
Bonus getBonusFromPendingPHIs();
151183

152184
private:
153185
friend class InstVisitor<InstCostVisitor, Constant *>;
@@ -209,8 +241,8 @@ class FunctionSpecializer {
209241
}
210242

211243
/// Compute a bonus for replacing argument \p A with constant \p C.
212-
Cost getSpecializationBonus(Argument *A, Constant *C,
213-
InstCostVisitor &Visitor);
244+
Bonus getSpecializationBonus(Argument *A, Constant *C,
245+
InstCostVisitor &Visitor);
214246

215247
private:
216248
Constant *getPromotableAlloca(AllocaInst *Alloca, CallInst *Call);
@@ -237,7 +269,7 @@ class FunctionSpecializer {
237269
/// @param AllSpecs A vector to add potential specializations to.
238270
/// @param SM A map for a function's specialisation range
239271
/// @return True, if any potential specializations were found
240-
bool findSpecializations(Function *F, Cost SpecCost,
272+
bool findSpecializations(Function *F, unsigned SpecCost,
241273
SmallVectorImpl<Spec> &AllSpecs, SpecMap &SM);
242274

243275
bool isCandidateFunction(Function *F);

llvm/lib/Transforms/IPO/FunctionSpecialization.cpp

Lines changed: 68 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -101,29 +101,21 @@ static cl::opt<bool> SpecializeLiteralConstant(
101101
"Enable specialization of functions that take a literal constant as an "
102102
"argument"));
103103

104-
// Estimates the instruction cost of all the basic blocks in \p WorkList.
105-
// The successors of such blocks are added to the list as long as they are
106-
// executable and they have a unique predecessor. \p WorkList represents
107-
// the basic blocks of a specialization which become dead once we replace
108-
// instructions that are known to be constants. The aim here is to estimate
109-
// the combination of size and latency savings in comparison to the non
110-
// specialized version of the function.
104+
// Estimates the codesize savings due to dead code after constant propagation.
105+
// \p WorkList represents the basic blocks of a specialization which will
106+
// eventually become dead once we replace instructions that are known to be
107+
// constants. The successors of such blocks are added to the list as long as
108+
// the \p Solver found they were executable prior to specialization, and only
109+
// if they have a unique predecessor.
111110
static Cost estimateBasicBlocks(SmallVectorImpl<BasicBlock *> &WorkList,
112111
DenseSet<BasicBlock *> &DeadBlocks,
113112
ConstMap &KnownConstants, SCCPSolver &Solver,
114-
BlockFrequencyInfo &BFI,
115113
TargetTransformInfo &TTI) {
116-
Cost Bonus = 0;
117-
114+
Cost CodeSize = 0;
118115
// Accumulate the code-size cost of the instructions in each basic block.
119116
while (!WorkList.empty()) {
120117
BasicBlock *BB = WorkList.pop_back_val();
121118

122-
uint64_t Weight = BFI.getBlockFreq(BB).getFrequency() /
123-
BFI.getEntryFreq();
124-
if (!Weight)
125-
continue;
126-
127119
// These blocks are considered dead as far as the InstCostVisitor
128120
// is concerned. They haven't been proven dead yet by the Solver,
129121
// but may become if we propagate the specialization arguments.
@@ -139,11 +131,11 @@ static Cost estimateBasicBlocks(SmallVectorImpl<BasicBlock *> &WorkList,
139131
if (KnownConstants.contains(&I))
140132
continue;
141133

142-
Bonus += Weight *
143-
TTI.getInstructionCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
134+
Cost C = TTI.getInstructionCost(&I, TargetTransformInfo::TCK_CodeSize);
144135

145-
LLVM_DEBUG(dbgs() << "FnSpecialization: Bonus " << Bonus
146-
<< " after user " << I << "\n");
136+
LLVM_DEBUG(dbgs() << "FnSpecialization: CodeSize " << C
137+
<< " for user " << I << "\n");
138+
CodeSize += C;
147139
}
148140

149141
// Keep adding dead successors to the list as long as they are
@@ -153,7 +145,7 @@ static Cost estimateBasicBlocks(SmallVectorImpl<BasicBlock *> &WorkList,
153145
SuccBB->getUniquePredecessor() == BB)
154146
WorkList.push_back(SuccBB);
155147
}
156-
return Bonus;
148+
return CodeSize;
157149
}
158150

159151
static Constant *findConstantFor(Value *V, ConstMap &KnownConstants) {
@@ -164,55 +156,57 @@ static Constant *findConstantFor(Value *V, ConstMap &KnownConstants) {
164156
return nullptr;
165157
}
166158

167-
Cost InstCostVisitor::getBonusFromPendingPHIs() {
168-
Cost Bonus = 0;
159+
Bonus InstCostVisitor::getBonusFromPendingPHIs() {
160+
Bonus B;
169161
while (!PendingPHIs.empty()) {
170162
Instruction *Phi = PendingPHIs.pop_back_val();
171163
// The pending PHIs could have been proven dead by now.
172164
if (isBlockExecutable(Phi->getParent()))
173-
Bonus += getUserBonus(Phi);
165+
B += getUserBonus(Phi);
174166
}
175-
return Bonus;
167+
return B;
176168
}
177169

178-
Cost InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) {
170+
Bonus InstCostVisitor::getUserBonus(Instruction *User, Value *Use, Constant *C) {
179171
// We have already propagated a constant for this user.
180172
if (KnownConstants.contains(User))
181-
return 0;
173+
return {0, 0};
182174

183175
// Cache the iterator before visiting.
184176
LastVisited = Use ? KnownConstants.insert({Use, C}).first
185177
: KnownConstants.end();
186178

187-
if (auto *I = dyn_cast<SwitchInst>(User))
188-
return estimateSwitchInst(*I);
189-
190-
if (auto *I = dyn_cast<BranchInst>(User))
191-
return estimateBranchInst(*I);
192-
193-
C = visit(*User);
194-
if (!C)
195-
return 0;
179+
Cost CodeSize = 0;
180+
if (auto *I = dyn_cast<SwitchInst>(User)) {
181+
CodeSize = estimateSwitchInst(*I);
182+
} else if (auto *I = dyn_cast<BranchInst>(User)) {
183+
CodeSize = estimateBranchInst(*I);
184+
} else {
185+
C = visit(*User);
186+
if (!C)
187+
return {0, 0};
188+
KnownConstants.insert({User, C});
189+
}
196190

197-
KnownConstants.insert({User, C});
191+
CodeSize += TTI.getInstructionCost(User, TargetTransformInfo::TCK_CodeSize);
198192

199193
uint64_t Weight = BFI.getBlockFreq(User->getParent()).getFrequency() /
200194
BFI.getEntryFreq();
201-
if (!Weight)
202-
return 0;
203195

204-
Cost Bonus = Weight *
205-
TTI.getInstructionCost(User, TargetTransformInfo::TCK_SizeAndLatency);
196+
Cost Latency = Weight *
197+
TTI.getInstructionCost(User, TargetTransformInfo::TCK_Latency);
206198

207-
LLVM_DEBUG(dbgs() << "FnSpecialization: Bonus " << Bonus
208-
<< " for user " << *User << "\n");
199+
LLVM_DEBUG(dbgs() << "FnSpecialization: {CodeSize = " << CodeSize
200+
<< ", Latency = " << Latency << "} for user "
201+
<< *User << "\n");
209202

203+
Bonus B(CodeSize, Latency);
210204
for (auto *U : User->users())
211205
if (auto *UI = dyn_cast<Instruction>(U))
212206
if (UI != User && isBlockExecutable(UI->getParent()))
213-
Bonus += getUserBonus(UI, User, C);
207+
B += getUserBonus(UI, User, C);
214208

215-
return Bonus;
209+
return B;
216210
}
217211

218212
Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) {
@@ -238,8 +232,7 @@ Cost InstCostVisitor::estimateSwitchInst(SwitchInst &I) {
238232
WorkList.push_back(BB);
239233
}
240234

241-
return estimateBasicBlocks(WorkList, DeadBlocks, KnownConstants, Solver, BFI,
242-
TTI);
235+
return estimateBasicBlocks(WorkList, DeadBlocks, KnownConstants, Solver, TTI);
243236
}
244237

245238
Cost InstCostVisitor::estimateBranchInst(BranchInst &I) {
@@ -256,8 +249,7 @@ Cost InstCostVisitor::estimateBranchInst(BranchInst &I) {
256249
Succ->getUniquePredecessor() == I.getParent())
257250
WorkList.push_back(Succ);
258251

259-
return estimateBasicBlocks(WorkList, DeadBlocks, KnownConstants, Solver, BFI,
260-
TTI);
252+
return estimateBasicBlocks(WorkList, DeadBlocks, KnownConstants, Solver, TTI);
261253
}
262254

263255
Constant *InstCostVisitor::visitPHINode(PHINode &I) {
@@ -578,13 +570,18 @@ bool FunctionSpecializer::run() {
578570
if (!Inserted && !Metrics.isRecursive && !SpecializeLiteralConstant)
579571
continue;
580572

573+
int64_t Sz = *Metrics.NumInsts.getValue();
574+
assert(Sz > 0 && "CodeSize should be positive");
575+
// It is safe to down cast from int64_t, NumInsts is always positive.
576+
unsigned SpecCost = static_cast<unsigned>(Sz);
577+
581578
LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization cost for "
582-
<< F.getName() << " is " << Metrics.NumInsts << "\n");
579+
<< F.getName() << " is " << SpecCost << "\n");
583580

584581
if (Inserted && Metrics.isRecursive)
585582
promoteConstantStackValues(&F);
586583

587-
if (!findSpecializations(&F, Metrics.NumInsts, AllSpecs, SM)) {
584+
if (!findSpecializations(&F, SpecCost, AllSpecs, SM)) {
588585
LLVM_DEBUG(
589586
dbgs() << "FnSpecialization: No possible specializations found for "
590587
<< F.getName() << "\n");
@@ -719,7 +716,7 @@ static Function *cloneCandidateFunction(Function *F) {
719716
return Clone;
720717
}
721718

722-
bool FunctionSpecializer::findSpecializations(Function *F, Cost SpecCost,
719+
bool FunctionSpecializer::findSpecializations(Function *F, unsigned SpecCost,
723720
SmallVectorImpl<Spec> &AllSpecs,
724721
SpecMap &SM) {
725722
// A mapping from a specialisation signature to the index of the respective
@@ -785,21 +782,22 @@ bool FunctionSpecializer::findSpecializations(Function *F, Cost SpecCost,
785782
AllSpecs[Index].CallSites.push_back(&CS);
786783
} else {
787784
// Calculate the specialisation gain.
788-
Cost Score = 0;
785+
Bonus B;
789786
InstCostVisitor Visitor = getInstCostVisitorFor(F);
790787
for (ArgInfo &A : S.Args)
791-
Score += getSpecializationBonus(A.Formal, A.Actual, Visitor);
792-
Score += Visitor.getBonusFromPendingPHIs();
788+
B += getSpecializationBonus(A.Formal, A.Actual, Visitor);
789+
B += Visitor.getBonusFromPendingPHIs();
793790

794-
LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization score = "
795-
<< Score << "\n");
791+
LLVM_DEBUG(dbgs() << "FnSpecialization: Specialization score {CodeSize = "
792+
<< B.CodeSize << ", Latency = " << B.Latency
793+
<< "}\n");
796794

797795
// Discard unprofitable specialisations.
798-
if (!ForceSpecialization && Score <= SpecCost)
796+
if (!ForceSpecialization && B.Latency <= SpecCost - B.CodeSize)
799797
continue;
800798

801799
// Create a new specialisation entry.
802-
auto &Spec = AllSpecs.emplace_back(F, S, Score);
800+
auto &Spec = AllSpecs.emplace_back(F, S, B.Latency);
803801
if (CS.getFunction() != F)
804802
Spec.CallSites.push_back(&CS);
805803
const unsigned Index = AllSpecs.size() - 1;
@@ -866,19 +864,20 @@ Function *FunctionSpecializer::createSpecialization(Function *F,
866864
}
867865

868866
/// Compute a bonus for replacing argument \p A with constant \p C.
869-
Cost FunctionSpecializer::getSpecializationBonus(Argument *A, Constant *C,
867+
Bonus FunctionSpecializer::getSpecializationBonus(Argument *A, Constant *C,
870868
InstCostVisitor &Visitor) {
871869
LLVM_DEBUG(dbgs() << "FnSpecialization: Analysing bonus for constant: "
872870
<< C->getNameOrAsOperand() << "\n");
873871

874-
Cost TotalCost = 0;
872+
Bonus B;
875873
for (auto *U : A->users())
876874
if (auto *UI = dyn_cast<Instruction>(U))
877875
if (Visitor.isBlockExecutable(UI->getParent()))
878-
TotalCost += Visitor.getUserBonus(UI, A, C);
876+
B += Visitor.getUserBonus(UI, A, C);
879877

880-
LLVM_DEBUG(dbgs() << "FnSpecialization: Accumulated user bonus "
881-
<< TotalCost << " for argument " << *A << "\n");
878+
LLVM_DEBUG(dbgs() << "FnSpecialization: Accumulated bonus {CodeSize = "
879+
<< B.CodeSize << ", Latency = " << B.Latency
880+
<< "} for argument " << *A << "\n");
882881

883882
// The below heuristic is only concerned with exposing inlining
884883
// opportunities via indirect call promotion. If the argument is not a
@@ -888,7 +887,7 @@ Cost FunctionSpecializer::getSpecializationBonus(Argument *A, Constant *C,
888887
// while traversing the users of the specialization arguments ?
889888
Function *CalledFunction = dyn_cast<Function>(C->stripPointerCasts());
890889
if (!CalledFunction)
891-
return TotalCost;
890+
return B;
892891

893892
// Get TTI for the called function (used for the inline cost).
894893
auto &CalleeTTI = (GetTTI)(*CalledFunction);
@@ -898,7 +897,7 @@ Cost FunctionSpecializer::getSpecializationBonus(Argument *A, Constant *C,
898897
// calls to be promoted to direct calls. If the indirect call promotion
899898
// would likely enable the called function to be inlined, specializing is a
900899
// good idea.
901-
int Bonus = 0;
900+
int InliningBonus = 0;
902901
for (User *U : A->users()) {
903902
if (!isa<CallInst>(U) && !isa<InvokeInst>(U))
904903
continue;
@@ -925,15 +924,15 @@ Cost FunctionSpecializer::getSpecializationBonus(Argument *A, Constant *C,
925924
// We clamp the bonus for this call to be between zero and the default
926925
// threshold.
927926
if (IC.isAlways())
928-
Bonus += Params.DefaultThreshold;
927+
InliningBonus += Params.DefaultThreshold;
929928
else if (IC.isVariable() && IC.getCostDelta() > 0)
930-
Bonus += IC.getCostDelta();
929+
InliningBonus += IC.getCostDelta();
931930

932-
LLVM_DEBUG(dbgs() << "FnSpecialization: Inlining bonus " << Bonus
931+
LLVM_DEBUG(dbgs() << "FnSpecialization: Inlining bonus " << InliningBonus
933932
<< " for user " << *U << "\n");
934933
}
935934

936-
return TotalCost + Bonus;
935+
return B += {0, InliningBonus};
937936
}
938937

939938
/// Determine if it is possible to specialise the function for constant values

0 commit comments

Comments
 (0)