Skip to content

Commit aa6ee03

Browse files
[NFC][Inliner] Introduce another multiplier for cost-benefit analysis and make multipliers overridable in TargetTransformInfo.
- The motivation is to expose tunable knobs that control the aggressiveness of inlining for different backends (e.g., machines with different icache sizes, and workloads with different icache/iTLB PMU counters). Tuning inlining aggressiveness shows a small (~+0.3%) but stable improvement on workloads/hardware that are more frontend bound. - Both multipliers can be overridden from the command line. Reviewed By: kazu Differential Revision: https://reviews.llvm.org/D153154
1 parent f1dbfcc commit aa6ee03

File tree

5 files changed

+170
-13
lines changed

5 files changed

+170
-13
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,9 @@ class TargetTransformInfo {
348348
/// individual classes of instructions would be better.
349349
unsigned getInliningThresholdMultiplier() const;
350350

351+
unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const;
352+
unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const;
353+
351354
/// \returns A value to be added to the inlining threshold.
352355
unsigned adjustInliningThreshold(const CallBase *CB) const;
353356

@@ -1696,6 +1699,9 @@ class TargetTransformInfo::Concept {
16961699
const TTI::PointersChainInfo &Info, Type *AccessTy,
16971700
TTI::TargetCostKind CostKind) = 0;
16981701
virtual unsigned getInliningThresholdMultiplier() const = 0;
1702+
virtual unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const = 0;
1703+
virtual unsigned
1704+
getInliningCostBenefitAnalysisProfitableMultiplier() const = 0;
16991705
virtual unsigned adjustInliningThreshold(const CallBase *CB) = 0;
17001706
virtual int getInlinerVectorBonusPercent() const = 0;
17011707
virtual unsigned getCallerAllocaCost(const CallBase *CB,
@@ -2068,6 +2074,12 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
20682074
unsigned adjustInliningThreshold(const CallBase *CB) override {
20692075
return Impl.adjustInliningThreshold(CB);
20702076
}
2077+
unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const override {
2078+
return Impl.getInliningCostBenefitAnalysisSavingsMultiplier();
2079+
}
2080+
unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const override {
2081+
return Impl.getInliningCostBenefitAnalysisProfitableMultiplier();
2082+
}
20712083
int getInlinerVectorBonusPercent() const override {
20722084
return Impl.getInlinerVectorBonusPercent();
20732085
}

llvm/include/llvm/Analysis/TargetTransformInfoImpl.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,10 @@ class TargetTransformInfoImplBase {
6969
}
7070

7171
unsigned getInliningThresholdMultiplier() const { return 1; }
72+
unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const { return 8; }
73+
unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
74+
return 8;
75+
}
7276
unsigned adjustInliningThreshold(const CallBase *CB) const { return 0; }
7377
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const {
7478
return 0;

llvm/lib/Analysis/InlineCost.cpp

Lines changed: 76 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -88,10 +88,21 @@ static cl::opt<bool> InlineEnableCostBenefitAnalysis(
8888
"inline-enable-cost-benefit-analysis", cl::Hidden, cl::init(false),
8989
cl::desc("Enable the cost-benefit analysis for the inliner"));
9090

91+
// InlineSavingsMultiplier overrides per TTI multipliers iff it is
92+
// specified explicitly in command line options. This option is exposed
93+
// for tuning and testing.
9194
static cl::opt<int> InlineSavingsMultiplier(
9295
"inline-savings-multiplier", cl::Hidden, cl::init(8),
9396
cl::desc("Multiplier to multiply cycle savings by during inlining"));
9497

98+
// InlineSavingsProfitableMultiplier overrides per TTI multipliers iff it is
99+
// specified explicitly in command line options. This option is exposed
100+
// for tuning and testing.
101+
static cl::opt<int> InlineSavingsProfitableMultiplier(
102+
"inline-savings-profitable-multiplier", cl::Hidden, cl::init(4),
103+
cl::desc("A multiplier on top of cycle savings to decide whether the "
104+
"savings won't justify the cost"));
105+
95106
static cl::opt<int>
96107
InlineSizeAllowance("inline-size-allowance", cl::Hidden, cl::init(100),
97108
cl::desc("The maximum size of a callee that get's "
@@ -815,6 +826,32 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
815826
return true;
816827
}
817828

829+
// A helper function to choose between command line override and default.
830+
unsigned getInliningCostBenefitAnalysisSavingsMultiplier() const {
831+
if (InlineSavingsMultiplier.getNumOccurrences())
832+
return InlineSavingsMultiplier;
833+
return TTI.getInliningCostBenefitAnalysisSavingsMultiplier();
834+
}
835+
836+
// A helper function to choose between command line override and default.
837+
unsigned getInliningCostBenefitAnalysisProfitableMultiplier() const {
838+
if (InlineSavingsProfitableMultiplier.getNumOccurrences())
839+
return InlineSavingsProfitableMultiplier;
840+
return TTI.getInliningCostBenefitAnalysisProfitableMultiplier();
841+
}
842+
843+
void OverrideCycleSavingsAndSizeForTesting(APInt &CycleSavings, int &Size) {
844+
if (std::optional<int> AttrCycleSavings = getStringFnAttrAsInt(
845+
CandidateCall, "inline-cycle-savings-for-test")) {
846+
CycleSavings = *AttrCycleSavings;
847+
}
848+
849+
if (std::optional<int> AttrRuntimeCost = getStringFnAttrAsInt(
850+
CandidateCall, "inline-runtime-cost-for-test")) {
851+
Size = *AttrRuntimeCost;
852+
}
853+
}
854+
818855
// Determine whether we should inline the given call site, taking into account
819856
// both the size cost and the cycle savings. Return std::nullopt if we don't
820857
// have sufficient profiling information to determine.
@@ -884,29 +921,55 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
884921
CycleSavings += getCallsiteCost(this->CandidateCall, DL);
885922
CycleSavings *= *CallerBFI->getBlockProfileCount(CallerBB);
886923

887-
// Remove the cost of the cold basic blocks.
924+
// Remove the cost of the cold basic blocks to model the runtime cost more
925+
// accurately. Both machine block placement and function splitting could
926+
// place cold blocks further from hot blocks.
888927
int Size = Cost - ColdSize;
889928

890929
// Allow tiny callees to be inlined regardless of whether they meet the
891930
// savings threshold.
892931
Size = Size > InlineSizeAllowance ? Size - InlineSizeAllowance : 1;
893932

933+
OverrideCycleSavingsAndSizeForTesting(CycleSavings, Size);
894934
CostBenefit.emplace(APInt(128, Size), CycleSavings);
895935

896-
// Return true if the savings justify the cost of inlining. Specifically,
897-
// we evaluate the following inequality:
936+
// Let R be the ratio of CycleSavings to Size. We accept the inlining
937+
// opportunity if R is really high and reject if R is really low. If R is
938+
// somewhere in the middle, we fall back to the cost-based analysis.
898939
//
899-
// CycleSavings PSI->getOrCompHotCountThreshold()
900-
// -------------- >= -----------------------------------
901-
// Size InlineSavingsMultiplier
940+
// Specifically, let R = CycleSavings / Size, we accept the inlining
941+
// opportunity if:
902942
//
903-
// Note that the left hand side is specific to a call site. The right hand
904-
// side is a constant for the entire executable.
905-
APInt LHS = CycleSavings;
906-
LHS *= InlineSavingsMultiplier;
907-
APInt RHS(128, PSI->getOrCompHotCountThreshold());
908-
RHS *= Size;
909-
return LHS.uge(RHS);
943+
// PSI->getOrCompHotCountThreshold()
944+
// R >= -------------------------------------------------
945+
// getInliningCostBenefitAnalysisSavingsMultiplier()
946+
//
947+
// and reject the inlining opportunity if:
948+
//
949+
// PSI->getOrCompHotCountThreshold()
950+
// R < ----------------------------------------------------
951+
// getInliningCostBenefitAnalysisProfitableMultiplier()
952+
//
953+
// Otherwise, we fall back to the cost-based analysis.
954+
//
955+
// Implementation-wise, use multiplication (CycleSavings * Multiplier,
956+
// HotCountThreshold * Size) rather than division to avoid precision loss.
957+
APInt Threshold(128, PSI->getOrCompHotCountThreshold());
958+
Threshold *= Size;
959+
960+
APInt UpperBoundCycleSavings = CycleSavings;
961+
UpperBoundCycleSavings *= getInliningCostBenefitAnalysisSavingsMultiplier();
962+
if (UpperBoundCycleSavings.uge(Threshold))
963+
return true;
964+
965+
APInt LowerBoundCycleSavings = CycleSavings;
966+
LowerBoundCycleSavings *=
967+
getInliningCostBenefitAnalysisProfitableMultiplier();
968+
if (LowerBoundCycleSavings.ult(Threshold))
969+
return false;
970+
971+
// Otherwise, fall back to the cost-based analysis.
972+
return std::nullopt;
910973
}
911974

912975
InlineResult finalizeAnalysis() override {

llvm/lib/Analysis/TargetTransformInfo.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -212,6 +212,17 @@ unsigned TargetTransformInfo::getInliningThresholdMultiplier() const {
212212
return TTIImpl->getInliningThresholdMultiplier();
213213
}
214214

215+
unsigned
216+
TargetTransformInfo::getInliningCostBenefitAnalysisSavingsMultiplier() const {
217+
return TTIImpl->getInliningCostBenefitAnalysisSavingsMultiplier();
218+
}
219+
220+
unsigned
221+
TargetTransformInfo::getInliningCostBenefitAnalysisProfitableMultiplier()
222+
const {
223+
return TTIImpl->getInliningCostBenefitAnalysisProfitableMultiplier();
224+
}
225+
215226
unsigned
216227
TargetTransformInfo::adjustInliningThreshold(const CallBase *CB) const {
217228
return TTIImpl->adjustInliningThreshold(CB);
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -pass-remarks=inline -pass-remarks-missed=inline -inline-savings-multiplier=4 -inline-savings-profitable-multiplier=5 -S 2>&1| FileCheck %s
2+
3+
; Test that the inline cost-benefit multipliers can be configured from the command line.
4+
5+
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
6+
target triple = "x86_64-unknown-linux-gnu"
7+
8+
; @inlined_callee is inlined by cost-benefit analysis.
9+
; @not_inlined_callee is not inlined, decided by cost-benefit analysis
10+
; CHECK: remark: <unknown>:0:0: 'inlined_callee' inlined into 'caller' with (cost=always): benefit over cost
11+
; CHECK: remark: <unknown>:0:0: 'not_inlined_callee' not inlined into 'caller' because it should never be inlined (cost=never): cost over benefit
12+
13+
define i32 @inlined_callee(i32 %c) !prof !17 {
14+
entry:
15+
%mul = mul nsw i32 %c, %c
16+
ret i32 %mul
17+
}
18+
19+
define i32 @not_inlined_callee(i32 %c) !prof !18 {
20+
entry:
21+
%add = add nsw i32 %c, 2
22+
ret i32 %add
23+
}
24+
25+
define i32 @caller(i32 %a, i32 %c) !prof !15 {
26+
entry:
27+
%rem = srem i32 %a, 3
28+
%cmp = icmp eq i32 %rem, 0
29+
br i1 %cmp, label %if.then, label %if.end, !prof !16
30+
31+
if.then:
32+
; CHECK-LABEL: if.then:
33+
; CHECK-NOT: call i32 @inlined_callee
34+
%call = tail call i32 @inlined_callee(i32 %c) "inline-cycle-savings-for-test"="26" "inline-runtime-cost-for-test"="1"
35+
br label %return
36+
37+
if.end:
38+
; CHECK-LABEL: if.end:
39+
; CHECK: call i32 @not_inlined_callee
40+
%call1 = tail call i32 @not_inlined_callee(i32 %c) "inline-cycle-savings-for-test"="19" "inline-runtime-cost-for-test"="1"
41+
br label %return
42+
43+
return:
44+
%retval.0 = phi i32 [ %call, %if.then ], [ %call1, %if.end ]
45+
ret i32 %retval.0
46+
}
47+
48+
!llvm.module.flags = !{!1}
49+
50+
!1 = !{i32 1, !"ProfileSummary", !2}
51+
!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
52+
!3 = !{!"ProfileFormat", !"InstrProf"}
53+
!4 = !{!"TotalCount", i64 10000}
54+
!5 = !{!"MaxCount", i64 1000}
55+
!6 = !{!"MaxInternalCount", i64 1}
56+
!7 = !{!"MaxFunctionCount", i64 1000}
57+
!8 = !{!"NumCounts", i64 3}
58+
!9 = !{!"NumFunctions", i64 3}
59+
!10 = !{!"DetailedSummary", !11}
60+
!11 = !{!12, !13, !14}
61+
!12 = !{i32 10000, i64 100, i32 1}
62+
!13 = !{i32 990000, i64 100, i32 1}
63+
!14 = !{i32 999999, i64 1, i32 2}
64+
!15 = !{!"function_entry_count", i64 500}
65+
!16 = !{!"branch_weights", i32 1, i32 2}
66+
!17 = !{!"function_entry_count", i64 200}
67+
!18 = !{!"function_entry_count", i64 400}

0 commit comments

Comments
 (0)