Skip to content

Commit 9895c70

Browse files
[InlineCost] Implement cost-benefit-based inliner
This patch adds an alternative cost metric for the inliner that takes both the cost (i.e. size) and the cycle count savings into account. Without this patch, we decide to inline a given call site if the size of inlining the call site is below the threshold that is computed according to the hotness of the call site. This patch adds a new cost metric, turned off by default, to take over the handling of hot call sites. Specifically, with the new cost metric, we decide to inline a given call site if the ratio of cycle savings to size exceeds a threshold. The cycle savings are computed from call site costs, parameter propagation, folded conditional branches, etc., all weighted by their respective profile counts. The size is primarily the callee size, but we subtract call site costs and the size of basic blocks that are never executed. The new cost metric implicitly takes advantage of the machine function splitter recently introduced by Snehasish Kumar, which dramatically reduces the cost of duplicating (e.g. inlining) cold basic blocks by placing cold basic blocks of hot functions in the .text.split section. We evaluated the new cost metric on clang bootstrap and SPECInt 2017. For clang bootstrap, we observe a 0.69% runtime improvement. For SPECInt, we report the change in IntRate for the C/C++ benchmarks. All benchmarks apart from perlbench and omnetpp improve, on average by 0.21% with the max for mcf at 1.96%. Benchmark (% change): 500.perlbench_r -0.45; 502.gcc_r 0.13; 505.mcf_r 1.96; 520.omnetpp_r -0.28; 523.xalancbmk_r 0.49; 525.x264_r 0.00; 531.deepsjeng_r 0.00; 541.leela_r 0.35; 557.xz_r 0.21. Differential Revision: https://reviews.llvm.org/D92780
1 parent f4511ae commit 9895c70

File tree

1 file changed

+178
-0
lines changed

1 file changed

+178
-0
lines changed

llvm/lib/Analysis/InlineCost.cpp

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,20 @@ static cl::opt<int>
7171
cl::init(45), cl::ZeroOrMore,
7272
cl::desc("Threshold for inlining cold callsites"));
7373

74+
// Hidden command-line flag, off by default, that gates the cost-benefit
// analysis. It is the first condition tested by
// isCostBenefitAnalysisEnabled().
static cl::opt<bool> InlineEnableCostBenefitAnalysis(
75+
"inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
76+
cl::desc("Enable the cost-benefit analysis for the inliner"));
77+
78+
// Hidden command-line option, default 8. costBenefitAnalysis() multiplies the
// call site's cycle savings by this value before comparing them against the
// size-scaled hot count threshold, so a larger value makes inlining more
// likely.
static cl::opt<int> InlineSavingsMultiplier(
79+
"inline-savings-multiplier", cl::Hidden, cl::init(8), cl::ZeroOrMore,
80+
cl::desc("Multiplier to multiply cycle savings by during inlining"));
81+
82+
// Hidden command-line option, default 100. costBenefitAnalysis() subtracts
// this allowance from the callee size (clamping the result to 1 when the size
// does not exceed the allowance), which lets small callees pass the
// cost-benefit test with very little cycle savings.
static cl::opt<int>
83+
InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
84+
cl::ZeroOrMore,
85+
cl::desc("The maximum size of a callee that get's "
86+
"inlined without sufficient cycle savings"));
87+
7488
// We introduce this threshold to help performance of instrumentation based
7589
// PGO before we actually hook up inliner with analysis passes such as BPI and
7690
// BFI.
@@ -183,6 +197,9 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
183197
CallBase &CandidateCall;
184198

185199
/// Extension points for handling callsite features.
200+
/// Called before a basic block is analyzed. The default implementation does
/// nothing; subclasses may override it to record per-block state.
201+
virtual void onBlockStart(const BasicBlock *BB) {}
202+
186203
/// Called after a basic block was analyzed.
187204
virtual void onBlockAnalyzed(const BasicBlock *BB) {}
188205

@@ -454,12 +471,24 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
454471
/// Ignore the threshold when finalizing analysis.
455472
const bool IgnoreThreshold;
456473

474+
// True if the cost-benefit-analysis-based inliner is enabled.
475+
const bool CostBenefitAnalysisEnabled;
476+
457477
/// Inlining cost measured in abstract units, accounts for all the
458478
/// instructions expected to be executed for a given function invocation.
459479
/// Instructions that are statically proven to be dead based on call-site
460480
/// arguments are not counted here.
461481
int Cost = 0;
462482

483+
// The cumulative cost at the beginning of the basic block being analyzed. At
484+
// the end of analyzing each basic block, "Cost - CostAtBBStart" represents
485+
// the size of that basic block.
486+
int CostAtBBStart = 0;
487+
488+
// The static size of live but cold basic blocks. This is "static" in the
489+
// sense that it's not weighted by profile counts at all.
490+
int ColdSize = 0;
491+
463492
bool SingleBB = true;
464493

465494
unsigned SROACostSavings = 0;
@@ -597,7 +626,21 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
597626
SROACostSavings += InlineConstants::InstrCost;
598627
}
599628

629+
// Snapshot the cumulative cost on entry to the block so that, once the block
// has been analyzed, "Cost - CostAtBBStart" is the size of that block.
void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }
630+
600631
void onBlockAnalyzed(const BasicBlock *BB) override {
632+
if (CostBenefitAnalysisEnabled) {
633+
// Keep track of the static size of live but cold basic blocks. For now,
634+
// we define a cold basic block to be one that's never executed.
635+
assert(GetBFI && "GetBFI must be available");
636+
BlockFrequencyInfo *BFI = &(GetBFI(F));
637+
assert(BFI && "BFI must be available");
638+
auto ProfileCount = BFI->getBlockProfileCount(BB);
639+
assert(ProfileCount.hasValue());
640+
if (ProfileCount.getValue() == 0)
641+
ColdSize += Cost - CostAtBBStart;
642+
}
643+
601644
auto *TI = BB->getTerminator();
602645
// If we had any successors at this point, then post-inlining is likely to
603646
// have them as well. Note that we assume any basic blocks which existed
@@ -628,6 +671,131 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
628671
InstructionCostDetailMap[I].ThresholdAfter = Threshold;
629672
}
630673

674+
bool isCostBenefitAnalysisEnabled() {
675+
if (!InlineEnableCostBenefitAnalysis)
676+
return false;
677+
678+
if (!PSI || !PSI->hasProfileSummary())
679+
return false;
680+
681+
if (!GetBFI)
682+
return false;
683+
684+
auto *Caller = CandidateCall.getParent()->getParent();
685+
if (!Caller->getEntryCount())
686+
return false;
687+
688+
BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
689+
if (!CallerBFI)
690+
return false;
691+
692+
// For now, limit to hot call site.
693+
if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
694+
return false;
695+
696+
if (!F.getEntryCount())
697+
return false;
698+
699+
BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
700+
if (!CalleeBFI)
701+
return false;
702+
703+
return true;
704+
}
705+
706+
// Determine whether we should inline the given call site, taking into account
707+
// both the size cost and the cycle savings. Return None if we don't have
708+
// suficient profiling information to determine.
709+
Optional<bool> costBenefitAnalysis() {
710+
if (!CostBenefitAnalysisEnabled)
711+
return None;
712+
713+
// buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
714+
// for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
715+
// falling back to the cost-based metric.
716+
// TODO: Improve this hacky condition.
717+
if (Threshold == 0)
718+
return None;
719+
720+
assert(GetBFI);
721+
BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
722+
assert(CalleeBFI);
723+
724+
// The cycle savings expressed as the sum of InlineConstants::InstrCost
725+
// multiplied by the estimated dynamic count of each instruction we can
726+
// avoid. Savings come from the call site cost, such as argument setup and
727+
// the call instruction, as well as the instructions that are folded.
728+
//
729+
// We use 128-bit APInt here to avoid potential overflow. This variable
730+
// should stay well below 10^^24 (or 2^^80) in practice. This "worst" case
731+
// assumes that we can avoid or fold a billion instructions, each with a
732+
// profile count of 10^^15 -- roughly the number of cycles for a 24-hour
733+
// period on a 4GHz machine.
734+
APInt CycleSavings(128, 0);
735+
736+
for (auto &BB : F) {
737+
APInt CurrentSavings(128, 0);
738+
for (auto &I : BB) {
739+
if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
740+
// Count a conditional branch as savings if it becomes unconditional.
741+
if (BI->isConditional() &&
742+
dyn_cast_or_null<ConstantInt>(
743+
SimplifiedValues.lookup(BI->getCondition()))) {
744+
CurrentSavings += InlineConstants::InstrCost;
745+
}
746+
} else if (Value *V = dyn_cast<Value>(&I)) {
747+
// Count an instruction as savings if we can fold it.
748+
if (SimplifiedValues.count(V)) {
749+
CurrentSavings += InlineConstants::InstrCost;
750+
}
751+
}
752+
// TODO: Consider other forms of savings like switch statements,
753+
// indirect calls becoming direct, SROACostSavings, LoadEliminationCost,
754+
// etc.
755+
}
756+
757+
auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
758+
assert(ProfileCount.hasValue());
759+
CurrentSavings *= ProfileCount.getValue();
760+
CycleSavings += CurrentSavings;
761+
}
762+
763+
// Compute the cycle savings per call.
764+
auto EntryProfileCount = F.getEntryCount();
765+
assert(EntryProfileCount.hasValue());
766+
auto EntryCount = EntryProfileCount.getCount();
767+
CycleSavings += EntryCount / 2;
768+
CycleSavings = CycleSavings.udiv(EntryCount);
769+
770+
// Compute the total savings for the call site.
771+
auto *CallerBB = CandidateCall.getParent();
772+
BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
773+
CycleSavings += getCallsiteCost(this->CandidateCall, DL);
774+
CycleSavings *= CallerBFI->getBlockProfileCount(CallerBB).getValue();
775+
776+
// Remove the cost of the cold basic blocks.
777+
int Size = Cost - ColdSize;
778+
779+
// Allow tiny callees to be inlined regardless of whether they meet the
780+
// savings threshold.
781+
Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;
782+
783+
// Return true if the savings justify the cost of inlining. Specifically,
784+
// we evaluate the following inequality:
785+
//
786+
// CycleSavings PSI->getOrCompHotCountThreshold()
787+
// -------------- >= -----------------------------------
788+
// Size InlineSavingsMultiplier
789+
//
790+
// Note that the left hand side is specific to a call site. The right hand
791+
// side is a constant for the entire executable.
792+
APInt LHS = CycleSavings;
793+
LHS *= InlineSavingsMultiplier;
794+
APInt RHS(128, PSI->getOrCompHotCountThreshold());
795+
RHS *= Size;
796+
return LHS.uge(RHS);
797+
}
798+
631799
InlineResult finalizeAnalysis() override {
632800
// Loops generally act a lot like calls in that they act like barriers to
633801
// movement, require a certain amount of setup, etc. So when optimising for
@@ -656,6 +824,13 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
656824
else if (NumVectorInstructions <= NumInstructions / 2)
657825
Threshold -= VectorBonus / 2;
658826

827+
if (auto Result = costBenefitAnalysis()) {
828+
if (Result.getValue())
829+
return InlineResult::success();
830+
else
831+
return InlineResult::failure("Cost over threshold.");
832+
}
833+
659834
if (IgnoreThreshold || Cost < std::max(1, Threshold))
660835
return InlineResult::success();
661836
return InlineResult::failure("Cost over threshold.");
@@ -729,6 +904,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
729904
Params.ComputeFullInlineCost || ORE),
730905
Params(Params), Threshold(Params.DefaultThreshold),
731906
BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
907+
CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
732908
Writer(this) {}
733909

734910
/// Annotation Writer for instruction details
@@ -2146,6 +2322,8 @@ InlineResult CallAnalyzer::analyze() {
21462322
if (BB->empty())
21472323
continue;
21482324

2325+
onBlockStart(BB);
2326+
21492327
// Disallow inlining a blockaddress with uses other than strictly callbr.
21502328
// A blockaddress only has defined behavior for an indirect branch in the
21512329
// same function, and we do not currently support inlining indirect

0 commit comments

Comments
 (0)