Skip to content

Commit a45d72e

Browse files
committed
[CSSPGO] Add switch for sample loader to honor global pre-inliner decision from llvm-profgen
The change adds a switch that allows the sample loader to use the global pre-inliner's decisions instead of making its own. The pre-inliner in llvm-profgen makes inline decisions globally, based on the whole-program profile and using function byte size as a cost proxy. Since the pre-inliner also adjusts/merges context profiles based on its inline decisions, honoring those decisions in the sample loader leads to better post-inline profile quality, especially for ThinLTO, where cross-module profile merging isn't possible without the pre-inliner. A minor fix in the profile reader is also included. When the pre-inliner is used, we now also turn off the default merging and trimming logic unless it is explicitly requested. Differential Revision: https://reviews.llvm.org/D108677
1 parent 4a66a11 commit a45d72e

File tree

5 files changed

+227
-6
lines changed

5 files changed

+227
-6
lines changed

llvm/lib/ProfileData/SampleProfReader.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -793,7 +793,7 @@ std::error_code SampleProfileReaderExtBinaryBase::readFuncProfiles() {
793793
}
794794
assert((CSProfileCount == 0 || CSProfileCount == Profiles.size()) &&
795795
"Cannot have both context-sensitive and regular profile");
796-
assert(ProfileIsCS == (CSProfileCount > 0) &&
796+
assert((!CSProfileCount || ProfileIsCS) &&
797797
"Section flag should be consistent with actual profile");
798798
return sampleprof_error::success;
799799
}

llvm/lib/Transforms/IPO/SampleProfile.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,11 @@ static cl::opt<bool> CallsitePrioritizedInline(
214214
cl::desc("Use call site prioritized inlining for sample profile loader."
215215
"Currently only CSSPGO is supported."));
216216

217+
static cl::opt<bool> UsePreInlinerDecision(
218+
"sample-profile-use-preinliner", cl::Hidden, cl::ZeroOrMore,
219+
cl::init(false),
220+
cl::desc("Use the preinliner decisions stored in profile context."));
221+
217222
static cl::opt<std::string> ProfileInlineReplayFile(
218223
"sample-profile-inline-replay", cl::init(""), cl::value_desc("filename"),
219224
cl::desc(
@@ -1285,6 +1290,21 @@ SampleProfileLoader::shouldInlineCandidate(InlineCandidate &Candidate) {
12851290
if (Cost.isNever() || Cost.isAlways())
12861291
return Cost;
12871292

1293+
// With CSSPGO, the preinliner in llvm-profgen can estimate global inline
1294+
// decisions based on hotness as well as accurate function byte sizes for
1295+
// given context using function/inlinee sizes from previous build. It
1296+
// stores the decision in profile, and also adjust/merge context profile
1297+
// aiming at better context-sensitive post-inline profile quality, assuming
1298+
// all inline decision estimates are going to be honored by compiler. Here
1299+
// we replay that inline decision under `sample-profile-use-preinliner`.
1300+
if (UsePreInlinerDecision) {
1301+
if (Candidate.CalleeSamples->getContext().hasAttribute(
1302+
ContextShouldBeInlined))
1303+
return InlineCost::getAlways("preinliner");
1304+
else
1305+
return InlineCost::getNever("preinliner");
1306+
}
1307+
12881308
// For old FDO inliner, we inline the call site as long as cost is not
12891309
// "Never". The cost-benefit check is done earlier.
12901310
if (!CallsitePrioritizedInline) {
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]:1467299:11
2+
0: 6
3+
1: 6
4+
3: 287884
5+
4: 287864 _Z3fibi:315608
6+
15: 23
7+
!Attributes: 3
8+
[main:3.1 @ _Z5funcBi:1 @ _Z8funcLeafi]:500853:20
9+
0: 15
10+
1: 15
11+
3: 74946
12+
4: 74941 _Z3fibi:82359
13+
10: 23324
14+
11: 23327 _Z3fibi:25228
15+
15: 11
16+
!Attributes: 3
17+
[main]:154:0
18+
2: 12
19+
3: 18 _Z5funcAi:11
20+
3.1: 18 _Z5funcBi:19
21+
[external:12 @ main]:154:12
22+
2: 12
23+
3: 10 _Z5funcAi:7
24+
3.1: 10 _Z5funcBi:11
25+
[main:3.1 @ _Z5funcBi]:120:19
26+
0: 19
27+
1: 19 _Z8funcLeafi:20
28+
3: 12
29+
!Attributes: 3
30+
[externalA:17 @ _Z5funcBi]:120:3
31+
0: 3
32+
1: 3
33+
[external:10 @ _Z5funcBi]:120:10
34+
0: 10
35+
1: 10
36+
[main:3 @ _Z5funcAi]:99:11
37+
0: 10
38+
1: 10 _Z8funcLeafi:11
39+
3: 24
Lines changed: 158 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,158 @@
1+
; Test for CSSPGO's sample loader inlining to make sure it can use llvm-profgen preinliner's decision
2+
3+
; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/csspgo-use-preinliner.prof -pass-remarks=inline -sample-profile-prioritized-inline -profile-sample-accurate -sample-profile-use-preinliner=0 -S 2>&1 | FileCheck %s --check-prefix=DEFAULT
4+
; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/csspgo-use-preinliner.prof -pass-remarks=inline -sample-profile-prioritized-inline -profile-sample-accurate -sample-profile-use-preinliner=1 -S 2>&1 | FileCheck %s --check-prefix=PREINLINE
5+
6+
7+
; DEFAULT: '_Z5funcAi' inlined into 'main'
8+
; DEFAULT-NOT: inlined into
9+
10+
; PREINLINE-NOT: inlined into
11+
; PREINLINE: '_Z8funcLeafi' inlined into '_Z5funcBi'
12+
; PREINLINE: '_Z8funcLeafi' inlined into '_Z5funcAi'
13+
; PREINLINE-NOT: inlined into
14+
15+
@factor = dso_local global i32 3, align 4, !dbg !0
16+
17+
define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 {
18+
entry:
19+
br label %for.body, !dbg !25
20+
21+
for.cond.cleanup: ; preds = %for.body
22+
ret i32 %add3, !dbg !27
23+
24+
for.body: ; preds = %for.body, %entry
25+
%x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ]
26+
%r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
27+
%call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32
28+
%add = add nuw nsw i32 %x.011, 1, !dbg !31
29+
%call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28
30+
%add2 = add i32 %call, %r.010, !dbg !34
31+
%add3 = add i32 %add2, %call1, !dbg !35
32+
%dec = add nsw i32 %x.011, -1, !dbg !36
33+
%cmp = icmp eq i32 %x.011, 0, !dbg !38
34+
br i1 %cmp, label %for.cond.cleanup, label %for.body, !dbg !25
35+
}
36+
37+
define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #1 !dbg !40 {
38+
entry:
39+
%add = add nsw i32 %x, 100000, !dbg !44
40+
%call = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !45
41+
ret i32 %call, !dbg !46
42+
}
43+
44+
define dso_local i32 @_Z8funcLeafi(i32 %x) local_unnamed_addr #1 !dbg !54 {
45+
entry:
46+
%cmp = icmp sgt i32 %x, 0, !dbg !57
47+
br i1 %cmp, label %while.body, label %while.cond2.preheader, !dbg !59
48+
49+
while.cond2.preheader: ; preds = %entry
50+
%cmp313 = icmp slt i32 %x, 0, !dbg !60
51+
br i1 %cmp313, label %while.body4, label %if.end, !dbg !63
52+
53+
while.body: ; preds = %while.body, %entry
54+
%x.addr.016 = phi i32 [ %sub, %while.body ], [ %x, %entry ]
55+
%tmp = load volatile i32, i32* @factor, align 4, !dbg !64
56+
%call = tail call i32 @_Z3fibi(i32 %tmp), !dbg !67
57+
%sub = sub nsw i32 %x.addr.016, %call, !dbg !68
58+
%cmp1 = icmp sgt i32 %sub, 0, !dbg !69
59+
br i1 %cmp1, label %while.body, label %if.end, !dbg !71
60+
61+
while.body4: ; preds = %while.body4, %while.cond2.preheader
62+
%x.addr.114 = phi i32 [ %add, %while.body4 ], [ %x, %while.cond2.preheader ]
63+
%tmp1 = load volatile i32, i32* @factor, align 4, !dbg !72
64+
%call5 = tail call i32 @_Z3fibi(i32 %tmp1), !dbg !74
65+
%add = add nsw i32 %call5, %x.addr.114, !dbg !75
66+
%cmp3 = icmp slt i32 %add, 0, !dbg !60
67+
br i1 %cmp3, label %while.body4, label %if.end, !dbg !63
68+
69+
if.end: ; preds = %while.body4, %while.body, %while.cond2.preheader
70+
%x.addr.2 = phi i32 [ 0, %while.cond2.preheader ], [ %sub, %while.body ], [ %add, %while.body4 ]
71+
ret i32 %x.addr.2, !dbg !76
72+
}
73+
74+
define dso_local i32 @_Z5funcBi(i32 %x) local_unnamed_addr #0 !dbg !47 {
75+
entry:
76+
%sub = add nsw i32 %x, -100000, !dbg !51
77+
%call = tail call i32 @_Z8funcLeafi(i32 %sub), !dbg !52
78+
ret i32 %call, !dbg !53
79+
}
80+
81+
declare i32 @_Z3fibi(i32)
82+
83+
attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
84+
attributes #1 = { nofree norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
85+
86+
!llvm.dbg.cu = !{!2}
87+
!llvm.module.flags = !{!14, !15, !16}
88+
!llvm.ident = !{!17}
89+
90+
!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
91+
!1 = distinct !DIGlobalVariable(name: "factor", scope: !2, file: !3, line: 21, type: !13, isLocal: false, isDefinition: true)
92+
!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !12, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
93+
!3 = !DIFile(filename: "merged.cpp", directory: "/local/autofdo")
94+
!4 = !{}
95+
!5 = !{!6, !10, !11}
96+
!6 = !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 6, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
97+
!7 = !DISubroutineType(types: !8)
98+
!8 = !{!9, !9}
99+
!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
100+
!10 = !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 7, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
101+
!11 = !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 22, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
102+
!12 = !{!0}
103+
!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !9)
104+
!14 = !{i32 7, !"Dwarf Version", i32 4}
105+
!15 = !{i32 2, !"Debug Info Version", i32 3}
106+
!16 = !{i32 1, !"wchar_size", i32 4}
107+
!17 = !{!"clang version 11.0.0"}
108+
!18 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !19, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21)
109+
!19 = !DISubroutineType(types: !20)
110+
!20 = !{!9}
111+
!21 = !{!22, !23}
112+
!22 = !DILocalVariable(name: "r", scope: !18, file: !3, line: 12, type: !9)
113+
!23 = !DILocalVariable(name: "x", scope: !24, file: !3, line: 13, type: !9)
114+
!24 = distinct !DILexicalBlock(scope: !18, file: !3, line: 13, column: 3)
115+
!25 = !DILocation(line: 13, column: 3, scope: !26)
116+
!26 = !DILexicalBlockFile(scope: !24, file: !3, discriminator: 2)
117+
!27 = !DILocation(line: 17, column: 3, scope: !18)
118+
!28 = !DILocation(line: 14, column: 10, scope: !29)
119+
!29 = distinct !DILexicalBlock(scope: !30, file: !3, line: 13, column: 37)
120+
!30 = distinct !DILexicalBlock(scope: !24, file: !3, line: 13, column: 3)
121+
!31 = !DILocation(line: 14, column: 29, scope: !29)
122+
!32 = !DILocation(line: 14, column: 21, scope: !33)
123+
!33 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2)
124+
!34 = !DILocation(line: 14, column: 19, scope: !29)
125+
!35 = !DILocation(line: 14, column: 7, scope: !29)
126+
!36 = !DILocation(line: 13, column: 33, scope: !37)
127+
!37 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 6)
128+
!38 = !DILocation(line: 13, column: 26, scope: !39)
129+
!39 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 2)
130+
!40 = distinct !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 26, type: !7, scopeLine: 26, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
131+
!44 = !DILocation(line: 27, column: 22, scope: !40)
132+
!45 = !DILocation(line: 27, column: 11, scope: !40)
133+
!46 = !DILocation(line: 29, column: 3, scope: !40)
134+
!47 = distinct !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 32, type: !7, scopeLine: 32, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
135+
!51 = !DILocation(line: 33, column: 22, scope: !47)
136+
!52 = !DILocation(line: 33, column: 11, scope: !47)
137+
!53 = !DILocation(line: 35, column: 3, scope: !47)
138+
!54 = distinct !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 48, type: !7, scopeLine: 48, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2)
139+
!57 = !DILocation(line: 49, column: 9, scope: !58)
140+
!58 = distinct !DILexicalBlock(scope: !54, file: !3, line: 49, column: 7)
141+
!59 = !DILocation(line: 49, column: 7, scope: !54)
142+
!60 = !DILocation(line: 58, column: 14, scope: !61)
143+
!61 = !DILexicalBlockFile(scope: !62, file: !3, discriminator: 2)
144+
!62 = distinct !DILexicalBlock(scope: !58, file: !3, line: 56, column: 8)
145+
!63 = !DILocation(line: 58, column: 5, scope: !61)
146+
!64 = !DILocation(line: 52, column: 16, scope: !65)
147+
!65 = distinct !DILexicalBlock(scope: !66, file: !3, line: 51, column: 19)
148+
!66 = distinct !DILexicalBlock(scope: !58, file: !3, line: 49, column: 14)
149+
!67 = !DILocation(line: 52, column: 12, scope: !65)
150+
!68 = !DILocation(line: 52, column: 9, scope: !65)
151+
!69 = !DILocation(line: 51, column: 14, scope: !70)
152+
!70 = !DILexicalBlockFile(scope: !66, file: !3, discriminator: 2)
153+
!71 = !DILocation(line: 51, column: 5, scope: !70)
154+
!72 = !DILocation(line: 59, column: 16, scope: !73)
155+
!73 = distinct !DILexicalBlock(scope: !62, file: !3, line: 58, column: 19)
156+
!74 = !DILocation(line: 59, column: 12, scope: !73)
157+
!75 = !DILocation(line: 59, column: 9, scope: !73)
158+
!76 = !DILocation(line: 63, column: 3, scope: !54)

llvm/tools/llvm-profgen/ProfileGenerator.cpp

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -407,11 +407,15 @@ void CSProfileGenerator::postProcessProfiles() {
407407
.run();
408408
}
409409

410-
// Trim and merge cold context profile using cold threshold above;
411-
SampleContextTrimmer(ProfileMap)
412-
.trimAndMergeColdContextProfiles(
413-
ColdCountThreshold, CSProfTrimColdContext, CSProfMergeColdContext,
414-
CSProfMaxColdContextDepth);
410+
// Trim and merge cold context profile using cold threshold above. By default,
411+
// we skip such merging and trimming when preinliner is on.
412+
if (!EnableCSPreInliner || CSProfTrimColdContext.getNumOccurrences() ||
413+
CSProfMergeColdContext.getNumOccurrences()) {
414+
SampleContextTrimmer(ProfileMap)
415+
.trimAndMergeColdContextProfiles(
416+
ColdCountThreshold, CSProfTrimColdContext, CSProfMergeColdContext,
417+
CSProfMaxColdContextDepth);
418+
}
415419
}
416420

417421
void CSProfileGenerator::computeSummaryAndThreshold() {

0 commit comments

Comments
 (0)