@@ -71,6 +71,20 @@ static cl::opt<int>
71
71
cl::init (45 ), cl::ZeroOrMore,
72
72
cl::desc(" Threshold for inlining cold callsites" ));
73
73
74
// Master switch for the cost-benefit inliner. It defaults to false, and
// isCostBenefitAnalysisEnabled() returns false whenever it is unset.
+ static cl::opt<bool > InlineEnableCostBenefitAnalysis (
75
+ " inline-enable-cost-benefit-analysis" , cl::Hidden, cl::init(false ),
76
+ cl::desc(" Enable the cost-benefit analysis for the inliner" ));
77
+
78
// Scale factor (default 8) applied to the cycle savings on the left-hand side
// of the inequality evaluated in costBenefitAnalysis().
+ static cl::opt<int > InlineSavingsMultiplier (
79
+ " inline-savings-multiplier" , cl::Hidden, cl::init(8 ), cl::ZeroOrMore,
80
+ cl::desc(" Multiplier to multiply cycle savings by during inlining" ));
81
+
82
// Size allowance (default 100) subtracted from the callee's non-cold size in
// costBenefitAnalysis(); a callee at or below the allowance is treated as
// having size 1.
+ static cl::opt<int >
83
+ InlineSizeAllowance (" inline-size-allowance" , cl::Hidden, cl::init(100 ),
84
+ cl::ZeroOrMore,
85
+ cl::desc(" The maximum size of a callee that get's "
86
+ " inlined without sufficient cycle savings" ));
87
+
74
88
// We introduce this threshold to help performance of instrumentation based
75
89
// PGO before we actually hook up inliner with analysis passes such as BPI and
76
90
// BFI.
@@ -183,6 +197,9 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
183
197
CallBase &CandidateCall;
184
198
185
199
// / Extension points for handling callsite features.
200
+ /// Called before a basic block is analyzed. The default does nothing.
201
+ virtual void onBlockStart (const BasicBlock *BB) {}
202
+
186
203
// / Called after a basic block was analyzed.
187
204
virtual void onBlockAnalyzed (const BasicBlock *BB) {}
188
205
@@ -454,12 +471,24 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
454
471
// / Ignore the threshold when finalizing analysis.
455
472
const bool IgnoreThreshold;
456
473
474
+ // True if the cost-benefit-analysis-based inliner is enabled.
475
+ const bool CostBenefitAnalysisEnabled;
476
+
457
477
// / Inlining cost measured in abstract units, accounts for all the
458
478
// / instructions expected to be executed for a given function invocation.
459
479
// / Instructions that are statically proven to be dead based on call-site
460
480
// / arguments are not counted here.
461
481
int Cost = 0 ;
462
482
483
+ // The cumulative cost at the beginning of the basic block being analyzed. At
484
+ // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
485
+ // the size of that basic block.
486
+ int CostAtBBStart = 0 ;
487
+
488
+ // The static size of live but cold basic blocks. This is "static" in the
489
+ // sense that it's not weighted by profile counts at all.
490
+ int ColdSize = 0 ;
491
+
463
492
bool SingleBB = true ;
464
493
465
494
unsigned SROACostSavings = 0 ;
@@ -597,7 +626,21 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
597
626
SROACostSavings += InlineConstants::InstrCost;
598
627
}
599
628
629
+ void onBlockStart (const BasicBlock *BB) override { CostAtBBStart = Cost; }
630
+
600
631
void onBlockAnalyzed (const BasicBlock *BB) override {
632
+ if (CostBenefitAnalysisEnabled) {
633
+ // Keep track of the static size of live but cold basic blocks. For now,
634
+ // we define a cold basic block to be one that's never executed.
635
+ assert (GetBFI && " GetBFI must be available" );
636
+ BlockFrequencyInfo *BFI = &(GetBFI (F));
637
+ assert (BFI && " BFI must be available" );
638
+ auto ProfileCount = BFI->getBlockProfileCount (BB);
639
+ assert (ProfileCount.hasValue ());
640
+ if (ProfileCount.getValue () == 0 )
641
+ ColdSize += Cost - CostAtBBStart;
642
+ }
643
+
601
644
auto *TI = BB->getTerminator ();
602
645
// If we had any successors at this point, then post-inlining is likely to
603
646
// have them as well. Note that we assume any basic blocks which existed
@@ -628,6 +671,131 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
628
671
InstructionCostDetailMap[I].ThresholdAfter = Threshold;
629
672
}
630
673
674
+ bool isCostBenefitAnalysisEnabled () {
675
+ if (!InlineEnableCostBenefitAnalysis)
676
+ return false ;
677
+
678
+ if (!PSI || !PSI->hasProfileSummary ())
679
+ return false ;
680
+
681
+ if (!GetBFI)
682
+ return false ;
683
+
684
+ auto *Caller = CandidateCall.getParent ()->getParent ();
685
+ if (!Caller->getEntryCount ())
686
+ return false ;
687
+
688
+ BlockFrequencyInfo *CallerBFI = &(GetBFI (*Caller));
689
+ if (!CallerBFI)
690
+ return false ;
691
+
692
+ // For now, limit to hot call site.
693
+ if (!PSI->isHotCallSite (CandidateCall, CallerBFI))
694
+ return false ;
695
+
696
+ if (!F.getEntryCount ())
697
+ return false ;
698
+
699
+ BlockFrequencyInfo *CalleeBFI = &(GetBFI (F));
700
+ if (!CalleeBFI)
701
+ return false ;
702
+
703
+ return true ;
704
+ }
705
+
706
+ // Determine whether we should inline the given call site, taking into account
707
+ // both the size cost and the cycle savings. Return None if we don't have
708
+ // suficient profiling information to determine.
709
+ Optional<bool > costBenefitAnalysis () {
710
+ if (!CostBenefitAnalysisEnabled)
711
+ return None;
712
+
713
+ // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
714
+ // for the prelink phase of the AutoFDO + ThinLTO build. Honor the logic by
715
+ // falling back to the cost-based metric.
716
+ // TODO: Improve this hacky condition.
717
+ if (Threshold == 0 )
718
+ return None;
719
+
720
+ assert (GetBFI);
721
+ BlockFrequencyInfo *CalleeBFI = &(GetBFI (F));
722
+ assert (CalleeBFI);
723
+
724
+ // The cycle savings expressed as the sum of InlineConstants::InstrCost
725
+ // multiplied by the estimated dynamic count of each instruction we can
726
+ // avoid. Savings come from the call site cost, such as argument setup and
727
+ // the call instruction, as well as the instructions that are folded.
728
+ //
729
+ // We use 128-bit APInt here to avoid potential overflow. This variable
730
+ // should stay well below 10^^24 (or 2^^80) in practice. This "worst" case
731
+ // assumes that we can avoid or fold a billion instructions, each with a
732
+ // profile count of 10^^15 -- roughly the number of cycles for a 24-hour
733
+ // period on a 4GHz machine.
734
+ APInt CycleSavings (128 , 0 );
735
+
736
+ for (auto &BB : F) {
737
+ APInt CurrentSavings (128 , 0 );
738
+ for (auto &I : BB) {
739
+ if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
740
+ // Count a conditional branch as savings if it becomes unconditional.
741
+ if (BI->isConditional () &&
742
+ dyn_cast_or_null<ConstantInt>(
743
+ SimplifiedValues.lookup (BI->getCondition ()))) {
744
+ CurrentSavings += InlineConstants::InstrCost;
745
+ }
746
+ } else if (Value *V = dyn_cast<Value>(&I)) {
747
+ // Count an instruction as savings if we can fold it.
748
+ if (SimplifiedValues.count (V)) {
749
+ CurrentSavings += InlineConstants::InstrCost;
750
+ }
751
+ }
752
+ // TODO: Consider other forms of savings like switch statements,
753
+ // indirect calls becoming direct, SROACostSavings, LoadEliminationCost,
754
+ // etc.
755
+ }
756
+
757
+ auto ProfileCount = CalleeBFI->getBlockProfileCount (&BB);
758
+ assert (ProfileCount.hasValue ());
759
+ CurrentSavings *= ProfileCount.getValue ();
760
+ CycleSavings += CurrentSavings;
761
+ }
762
+
763
+ // Compute the cycle savings per call.
764
+ auto EntryProfileCount = F.getEntryCount ();
765
+ assert (EntryProfileCount.hasValue ());
766
+ auto EntryCount = EntryProfileCount.getCount ();
767
+ CycleSavings += EntryCount / 2 ;
768
+ CycleSavings = CycleSavings.udiv (EntryCount);
769
+
770
+ // Compute the total savings for the call site.
771
+ auto *CallerBB = CandidateCall.getParent ();
772
+ BlockFrequencyInfo *CallerBFI = &(GetBFI (*(CallerBB->getParent ())));
773
+ CycleSavings += getCallsiteCost (this ->CandidateCall , DL);
774
+ CycleSavings *= CallerBFI->getBlockProfileCount (CallerBB).getValue ();
775
+
776
+ // Remove the cost of the cold basic blocks.
777
+ int Size = Cost - ColdSize;
778
+
779
+ // Allow tiny callees to be inlined regardless of whether they meet the
780
+ // savings threshold.
781
+ Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1 ;
782
+
783
+ // Return true if the savings justify the cost of inlining. Specifically,
784
+ // we evaluate the following inequality:
785
+ //
786
+ // CycleSavings PSI->getOrCompHotCountThreshold()
787
+ // -------------- >= -----------------------------------
788
+ // Size InlineSavingsMultiplier
789
+ //
790
+ // Note that the left hand side is specific to a call site. The right hand
791
+ // side is a constant for the entire executable.
792
+ APInt LHS = CycleSavings;
793
+ LHS *= InlineSavingsMultiplier;
794
+ APInt RHS (128 , PSI->getOrCompHotCountThreshold ());
795
+ RHS *= Size;
796
+ return LHS.uge (RHS);
797
+ }
798
+
631
799
InlineResult finalizeAnalysis () override {
632
800
// Loops generally act a lot like calls in that they act like barriers to
633
801
// movement, require a certain amount of setup, etc. So when optimising for
@@ -656,6 +824,13 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
656
824
else if (NumVectorInstructions <= NumInstructions / 2 )
657
825
Threshold -= VectorBonus / 2 ;
658
826
827
+ if (auto Result = costBenefitAnalysis ()) {
828
+ if (Result.getValue ())
829
+ return InlineResult::success ();
830
+ else
831
+ return InlineResult::failure (" Cost over threshold." );
832
+ }
833
+
659
834
if (IgnoreThreshold || Cost < std::max (1 , Threshold))
660
835
return InlineResult::success ();
661
836
return InlineResult::failure (" Cost over threshold." );
@@ -729,6 +904,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
729
904
Params.ComputeFullInlineCost || ORE),
730
905
Params(Params), Threshold(Params.DefaultThreshold),
731
906
BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
907
+ CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
732
908
Writer(this ) {}
733
909
734
910
// / Annotation Writer for instruction details
@@ -2146,6 +2322,8 @@ InlineResult CallAnalyzer::analyze() {
2146
2322
if (BB->empty ())
2147
2323
continue ;
2148
2324
2325
+ onBlockStart (BB);
2326
+
2149
2327
// Disallow inlining a blockaddress with uses other than strictly callbr.
2150
2328
// A blockaddress only has defined behavior for an indirect branch in the
2151
2329
// same function, and we do not currently support inlining indirect
0 commit comments