[InlineCost] Implement cost-benefit-based inliner

kazutakahirata · kazutakahirata · commit 9895c7012d61 · 2020-12-18T00:37:24.000-08:00
This patch adds an alternative cost metric for the inliner to take into account both the cost (i.e. size) and cycle count savings into account. Without this patch, we decide to inline a given call site if the size of inlining the call site is below the threshold that is computed according to the hotness of the call site. This patch adds a new cost metric, turned off by default, to take over the handling of hot call sites. Specifically, with the new cost metric, we decide to inline a given call site if the ratio of cycle savings to size exceeds a threshold. The cycle savings are computed from call site costs, parameter propagation, folded conditional branches, etc, all weighted by their respective profile counts. The size is primarily the callee size, but we subtract call site costs and the size of basic blocks that are never executed. The new cost metric implicitly takes advantage of the machine function splitter recently introduced by Snehasish Kumar, which dramatically reduces the cost of duplicating (e.g. inlining) cold basic blocks by placing cold basic blocks of hot functions in the .text.split section. We evaluated the new cost metric on clang bootstrap and SPECInt 2017. For clang bootstrap, we observe 0.69% runtime improvement. For SPECInt we report the change in IntRate the C/C++ benchmarks. All benchmarks apart from perlbench and omnetpp improve, on average by 0.21% with the max for mcf at 1.96%. Benchmark % Change 500.perlbench_r -0.45 502.gcc_r 0.13 505.mcf_r 1.96 520.omnetpp_r -0.28 523.xalancbmk_r 0.49 525.x264_r 0.00 531.deepsjeng_r 0.00 541.leela_r 0.35 557.xz_r 0.21 Differential Revision: https://reviews.llvm.org/D92780
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
@@ -71,6 +71,20 @@ static cl::opt<int>
                           cl::init(45), cl::ZeroOrMore,
                           cl::desc("Threshold for inlining cold callsites"));
 
+static cl::opt<bool> InlineEnableCostBenefitAnalysis(
+    "inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
+    cl::desc("Enable the cost-benefit analysis for the inliner"));
+
+static cl::opt<int> InlineSavingsMultiplier(
+    "inline-savings-multiplier", cl::Hidden, cl::init(8), cl::ZeroOrMore,
+    cl::desc("Multiplier to multiply cycle savings by during inlining"));
+
+static cl::opt<int>
+    InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
+                        cl::ZeroOrMore,
+                        cl::desc("The maximum size of a callee that get's "
+                                 "inlined without sufficient cycle savings"));
+
 // We introduce this threshold to help performance of instrumentation based
 // PGO before we actually hook up inliner with analysis passes such as BPI and
 // BFI.
@@ -183,6 +197,9 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   CallBase &CandidateCall;
 
   /// Extension points for handling callsite features.
+  // Called before a basic block was analyzed.
+  virtual void onBlockStart(const BasicBlock *BB) {}
+
   /// Called after a basic block was analyzed.
   virtual void onBlockAnalyzed(const BasicBlock *BB) {}
 
@@ -454,12 +471,24 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
   /// Ignore the threshold when finalizing analysis.
   const bool IgnoreThreshold;
 
+  // True if the cost-benefit-analysis-based inliner is enabled.
+  const bool CostBenefitAnalysisEnabled;
+
   /// Inlining cost measured in abstract units, accounts for all the
   /// instructions expected to be executed for a given function invocation.
   /// Instructions that are statically proven to be dead based on call-site
   /// arguments are not counted here.
   int Cost = 0;
 
+  // The cumulative cost at the beginning of the basic block being analyzed.  At
+  // the end of analyzing each basic block, "Cost - CostAtBBStart" represents
+  // the size of that basic block.
+  int CostAtBBStart = 0;
+
+  // The static size of live but cold basic blocks.  This is "static" in the
+  // sense that it's not weighted by profile counts at all.
+  int ColdSize = 0;
+
   bool SingleBB = true;
 
   unsigned SROACostSavings = 0;
@@ -597,7 +626,21 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
     SROACostSavings += InlineConstants::InstrCost;
   }
 
+  void onBlockStart(const BasicBlock *BB) override { CostAtBBStart = Cost; }
+
   void onBlockAnalyzed(const BasicBlock *BB) override {
+    if (CostBenefitAnalysisEnabled) {
+      // Keep track of the static size of live but cold basic blocks.  For now,
+      // we define a cold basic block to be one that's never executed.
+      assert(GetBFI && "GetBFI must be available");
+      BlockFrequencyInfo *BFI = &(GetBFI(F));
+      assert(BFI && "BFI must be available");
+      auto ProfileCount = BFI->getBlockProfileCount(BB);
+      assert(ProfileCount.hasValue());
+      if (ProfileCount.getValue() == 0)
+        ColdSize += Cost - CostAtBBStart;
+    }
+
     auto *TI = BB->getTerminator();
     // If we had any successors at this point, than post-inlining is likely to
     // have them as well. Note that we assume any basic blocks which existed
@@ -628,6 +671,131 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
     InstructionCostDetailMap[I].ThresholdAfter = Threshold;
   }
 
+  bool isCostBenefitAnalysisEnabled() {
+    if (!InlineEnableCostBenefitAnalysis)
+      return false;
+
+    if (!PSI || !PSI->hasProfileSummary())
+      return false;
+
+    if (!GetBFI)
+      return false;
+
+    auto *Caller = CandidateCall.getParent()->getParent();
+    if (!Caller->getEntryCount())
+      return false;
+
+    BlockFrequencyInfo *CallerBFI = &(GetBFI(*Caller));
+    if (!CallerBFI)
+      return false;
+
+    // For now, limit to hot call site.
+    if (!PSI->isHotCallSite(CandidateCall, CallerBFI))
+      return false;
+
+    if (!F.getEntryCount())
+      return false;
+
+    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
+    if (!CalleeBFI)
+      return false;
+
+    return true;
+  }
+
+  // Determine whether we should inline the given call site, taking into account
+  // both the size cost and the cycle savings.  Return None if we don't have
+  // suficient profiling information to determine.
+  Optional<bool> costBenefitAnalysis() {
+    if (!CostBenefitAnalysisEnabled)
+      return None;
+
+    // buildInlinerPipeline in the pass builder sets HotCallSiteThreshold to 0
+    // for the prelink phase of the AutoFDO + ThinLTO build.  Honor the logic by
+    // falling back to the cost-based metric.
+    // TODO: Improve this hacky condition.
+    if (Threshold == 0)
+      return None;
+
+    assert(GetBFI);
+    BlockFrequencyInfo *CalleeBFI = &(GetBFI(F));
+    assert(CalleeBFI);
+
+    // The cycle savings expressed as the sum of InlineConstants::InstrCost
+    // multiplied by the estimated dynamic count of each instruction we can
+    // avoid.  Savings come from the call site cost, such as argument setup and
+    // the call instruction, as well as the instructions that are folded.
+    //
+    // We use 128-bit APInt here to avoid potential overflow.  This variable
+    // should stay well below 10^^24 (or 2^^80) in practice.  This "worst" case
+    // assumes that we can avoid or fold a billion instructions, each with a
+    // profile count of 10^^15 -- roughly the number of cycles for a 24-hour
+    // period on a 4GHz machine.
+    APInt CycleSavings(128, 0);
+
+    for (auto &BB : F) {
+      APInt CurrentSavings(128, 0);
+      for (auto &I : BB) {
+        if (BranchInst *BI = dyn_cast<BranchInst>(&I)) {
+          // Count a conditional branch as savings if it becomes unconditional.
+          if (BI->isConditional() &&
+              dyn_cast_or_null<ConstantInt>(
+                  SimplifiedValues.lookup(BI->getCondition()))) {
+            CurrentSavings += InlineConstants::InstrCost;
+          }
+        } else if (Value *V = dyn_cast<Value>(&I)) {
+          // Count an instruction as savings if we can fold it.
+          if (SimplifiedValues.count(V)) {
+            CurrentSavings += InlineConstants::InstrCost;
+          }
+        }
+        // TODO: Consider other forms of savings like switch statements,
+        // indirect calls becoming direct, SROACostSavings, LoadEliminationCost,
+        // etc.
+      }
+
+      auto ProfileCount = CalleeBFI->getBlockProfileCount(&BB);
+      assert(ProfileCount.hasValue());
+      CurrentSavings *= ProfileCount.getValue();
+      CycleSavings += CurrentSavings;
+    }
+
+    // Compute the cycle savings per call.
+    auto EntryProfileCount = F.getEntryCount();
+    assert(EntryProfileCount.hasValue());
+    auto EntryCount = EntryProfileCount.getCount();
+    CycleSavings += EntryCount / 2;
+    CycleSavings = CycleSavings.udiv(EntryCount);
+
+    // Compute the total savings for the call site.
+    auto *CallerBB = CandidateCall.getParent();
+    BlockFrequencyInfo *CallerBFI = &(GetBFI(*(CallerBB->getParent())));
+    CycleSavings += getCallsiteCost(this->CandidateCall, DL);
+    CycleSavings *= CallerBFI->getBlockProfileCount(CallerBB).getValue();
+
+    // Remove the cost of the cold basic blocks.
+    int Size = Cost - ColdSize;
+
+    // Allow tiny callees to be inlined regardless of whether they meet the
+    // savings threshold.
+    Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;
+
+    // Return true if the savings justify the cost of inlining.  Specifically,
+    // we evaluate the following inequality:
+    //
+    //  CycleSavings      PSI->getOrCompHotCountThreshold()
+    // -------------- >= -----------------------------------
+    //       Size              InlineSavingsMultiplier
+    //
+    // Note that the left hand side is specific to a call site.  The right hand
+    // side is a constant for the entire executable.
+    APInt LHS = CycleSavings;
+    LHS *= InlineSavingsMultiplier;
+    APInt RHS(128, PSI->getOrCompHotCountThreshold());
+    RHS *= Size;
+    return LHS.uge(RHS);
+  }
+
   InlineResult finalizeAnalysis() override {
     // Loops generally act a lot like calls in that they act like barriers to
     // movement, require a certain amount of setup, etc. So when optimising for
@@ -656,6 +824,13 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
     else if (NumVectorInstructions <= NumInstructions / 2)
       Threshold -= VectorBonus / 2;
 
+    if (auto Result = costBenefitAnalysis()) {
+      if (Result.getValue())
+        return InlineResult::success();
+      else
+        return InlineResult::failure("Cost over threshold.");
+    }
+
     if (IgnoreThreshold || Cost < std::max(1, Threshold))
       return InlineResult::success();
     return InlineResult::failure("Cost over threshold.");
@@ -729,6 +904,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
                               Params.ComputeFullInlineCost || ORE),
         Params(Params), Threshold(Params.DefaultThreshold),
         BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
+        CostBenefitAnalysisEnabled(isCostBenefitAnalysisEnabled()),
         Writer(this) {}
 
   /// Annotation Writer for instruction details
@@ -2146,6 +2322,8 @@ InlineResult CallAnalyzer::analyze() {
     if (BB->empty())
       continue;
 
+    onBlockStart(BB);
+
     // Disallow inlining a blockaddress with uses other than strictly callbr.
     // A blockaddress only has defined behavior for an indirect branch in the
     // same function, and we do not currently support inlining indirect