Skip to content

Commit 3b51b51

Browse files
committed
[CSSPGO][llvm-profgen] Report samples for untrackable frames.
Fixing an issue where samples collected for an untrackable frame are not reported. An untrackable frame refers to a frame whose caller is untrackable due to missing debug info or pseudo probes. Though the frame is connected to its parent frame through the frame pointer chain at runtime, the compiler cannot build the connection without debug info or pseudo probes. In such a case we just need to report the untrackable frame as the base frame, together with all of its child frames. With more samples reported, I'm seeing this improve the performance of an internal benchmark by 2.5%. Reviewed By: wenlei, wlei Differential Revision: https://reviews.llvm.org/D102961
1 parent 7cd07d3 commit 3b51b51

File tree

6 files changed

+250
-4
lines changed

6 files changed

+250
-4
lines changed
Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,178 @@
1+
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
2+
target triple = "x86_64-unknown-linux-gnu"
3+
4+
@.str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1
5+
6+
; Function Attrs: nounwind readnone uwtable willreturn
7+
define dso_local i32 @bar(i32 %x, i32 %y) local_unnamed_addr #0 !dbg !10 {
8+
entry:
9+
call void @llvm.dbg.value(metadata i32 %x, metadata !15, metadata !DIExpression()), !dbg !17
10+
call void @llvm.dbg.value(metadata i32 %y, metadata !16, metadata !DIExpression()), !dbg !17
11+
call void @llvm.pseudoprobe(i64 -2012135647395072713, i64 1, i32 0, i64 -1), !dbg !18
12+
%rem = srem i32 %x, 3, !dbg !20
13+
%tobool.not = icmp eq i32 %rem, 0, !dbg !20
14+
call void @llvm.pseudoprobe(i64 -2012135647395072713, i64 2, i32 2, i64 -1), !dbg !21
15+
call void @llvm.pseudoprobe(i64 -2012135647395072713, i64 3, i32 2, i64 -1), !dbg !23
16+
%0 = sub i32 0, %y, !dbg !24
17+
%retval.0.p = select i1 %tobool.not, i32 %y, i32 %0, !dbg !24
18+
%retval.0 = add i32 %retval.0.p, %x, !dbg !24
19+
call void @llvm.pseudoprobe(i64 -2012135647395072713, i64 4, i32 0, i64 -1), !dbg !25
20+
ret i32 %retval.0, !dbg !25
21+
}
22+
23+
; Function Attrs: noinline nounwind uwtable
24+
define dso_local void @foo() local_unnamed_addr #1 !dbg !26 {
25+
entry:
26+
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 1, i32 0, i64 -1), !dbg !32
27+
call void @llvm.dbg.value(metadata i32 0, metadata !30, metadata !DIExpression()), !dbg !33
28+
call void @llvm.dbg.value(metadata i32 0, metadata !31, metadata !DIExpression()), !dbg !33
29+
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 2, i32 0, i64 -1), !dbg !34
30+
call void @llvm.dbg.value(metadata i32 1, metadata !31, metadata !DIExpression()), !dbg !33
31+
br label %while.body, !dbg !35
32+
33+
while.body: ; preds = %entry, %if.end
34+
%inc8 = phi i32 [ 1, %entry ], [ %inc, %if.end ]
35+
%s.07 = phi i32 [ 0, %entry ], [ %s.1, %if.end ]
36+
call void @llvm.dbg.value(metadata i32 %s.07, metadata !30, metadata !DIExpression()), !dbg !33
37+
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 3, i32 0, i64 -1), !dbg !36
38+
%rem = urem i32 %inc8, 91, !dbg !38
39+
%tobool.not = icmp eq i32 %rem, 0, !dbg !38
40+
br i1 %tobool.not, label %if.else, label %if.then, !dbg !39
41+
42+
if.then: ; preds = %while.body
43+
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 4, i32 0, i64 -1), !dbg !40
44+
call void @llvm.dbg.value(metadata i32 %inc8, metadata !15, metadata !DIExpression()) #6, !dbg !41
45+
call void @llvm.dbg.value(metadata i32 %s.07, metadata !16, metadata !DIExpression()) #6, !dbg !41
46+
call void @llvm.pseudoprobe(i64 -2012135647395072713, i64 1, i32 0, i64 -1) #6, !dbg !44
47+
%rem.i = urem i32 %inc8, 3, !dbg !45
48+
%tobool.not.i = icmp eq i32 %rem.i, 0, !dbg !45
49+
call void @llvm.pseudoprobe(i64 -2012135647395072713, i64 2, i32 2, i64 -1) #6, !dbg !46
50+
call void @llvm.pseudoprobe(i64 -2012135647395072713, i64 3, i32 2, i64 -1) #6, !dbg !47
51+
%0 = sub i32 0, %s.07, !dbg !48
52+
%retval.0.p.i = select i1 %tobool.not.i, i32 %s.07, i32 %0, !dbg !48
53+
%retval.0.i = add i32 %retval.0.p.i, %inc8, !dbg !48
54+
call void @llvm.pseudoprobe(i64 -2012135647395072713, i64 4, i32 0, i64 -1) #6, !dbg !49
55+
call void @llvm.dbg.value(metadata i32 %retval.0.i, metadata !30, metadata !DIExpression()), !dbg !33
56+
br label %if.end, !dbg !50
57+
58+
if.else: ; preds = %while.body
59+
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 5, i32 0, i64 -1), !dbg !51
60+
%add = add nsw i32 %s.07, 30, !dbg !51
61+
call void @llvm.dbg.value(metadata i32 %add, metadata !30, metadata !DIExpression()), !dbg !33
62+
br label %if.end
63+
64+
if.end: ; preds = %if.else, %if.then
65+
%s.1 = phi i32 [ %retval.0.i, %if.then ], [ %add, %if.else ], !dbg !52
66+
call void @llvm.dbg.value(metadata i32 %s.1, metadata !30, metadata !DIExpression()), !dbg !33
67+
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 6, i32 0, i64 -1), !dbg !35
68+
call void @llvm.dbg.value(metadata i32 %inc8, metadata !31, metadata !DIExpression()), !dbg !33
69+
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 2, i32 0, i64 -1), !dbg !34
70+
%inc = add nuw nsw i32 %inc8, 1, !dbg !34
71+
call void @llvm.dbg.value(metadata i32 %inc, metadata !31, metadata !DIExpression()), !dbg !33
72+
%exitcond.not = icmp eq i32 %inc, 16000001, !dbg !53
73+
br i1 %exitcond.not, label %while.end, label %while.body, !dbg !35, !llvm.loop !54
74+
75+
while.end: ; preds = %if.end
76+
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 7, i32 0, i64 -1), !dbg !57
77+
%call1 = call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i64 0, i64 0), i32 %s.1), !dbg !58
78+
ret void, !dbg !60
79+
}
80+
81+
; Function Attrs: nofree nounwind
82+
declare dso_local noundef i32 @printf(i8* nocapture noundef readonly, ...) local_unnamed_addr #2
83+
84+
; Function Attrs: nounwind uwtable
85+
define dso_local i32 @main() local_unnamed_addr #3 !dbg !61 {
86+
entry:
87+
call void @llvm.pseudoprobe(i64 -2624081020897602054, i64 1, i32 0, i64 -1), !dbg !64
88+
call void @foo(), !dbg !65
89+
ret i32 0, !dbg !67
90+
}
91+
92+
; Function Attrs: inaccessiblememonly nounwind willreturn
93+
declare void @llvm.pseudoprobe(i64, i64, i32, i64) #4
94+
95+
; Function Attrs: nofree nosync nounwind readnone speculatable willreturn
96+
declare void @llvm.dbg.value(metadata, metadata, metadata) #5
97+
98+
attributes #0 = { nounwind readnone uwtable willreturn "disable-tail-calls"="true" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
99+
attributes #1 = { noinline nounwind uwtable "disable-tail-calls"="true" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
100+
attributes #2 = { nofree nounwind "disable-tail-calls"="true" "frame-pointer"="all" "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
101+
attributes #3 = { nounwind uwtable "disable-tail-calls"="true" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
102+
attributes #4 = { inaccessiblememonly nounwind willreturn }
103+
attributes #5 = { nofree nosync nounwind readnone speculatable willreturn }
104+
attributes #6 = { nounwind }
105+
106+
!llvm.dbg.cu = !{!0}
107+
!llvm.module.flags = !{!3, !4, !5}
108+
!llvm.ident = !{!6}
109+
!llvm.pseudo_probe_desc = !{!7, !8, !9}
110+
111+
!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 12.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, splitDebugInlining: false, nameTableKind: None)
112+
!1 = !DIFile(filename: "test.c", directory: "test")
113+
!2 = !{}
114+
!3 = !{i32 7, !"Dwarf Version", i32 4}
115+
!4 = !{i32 2, !"Debug Info Version", i32 3}
116+
!5 = !{i32 1, !"wchar_size", i32 4}
117+
!6 = !{!"clang version 12.0.0"}
118+
!7 = !{i64 -2012135647395072713, i64 72617220756, !"bar", null}
119+
!8 = !{i64 6699318081062747564, i64 563088904013236, !"foo", null}
120+
!9 = !{i64 -2624081020897602054, i64 281479271677951, !"main", null}
121+
!10 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 3, type: !11, scopeLine: 3, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !14)
122+
!11 = !DISubroutineType(types: !12)
123+
!12 = !{!13, !13, !13}
124+
!13 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
125+
!14 = !{!15, !16}
126+
!15 = !DILocalVariable(name: "x", arg: 1, scope: !10, file: !1, line: 3, type: !13)
127+
!16 = !DILocalVariable(name: "y", arg: 2, scope: !10, file: !1, line: 3, type: !13)
128+
!17 = !DILocation(line: 0, scope: !10)
129+
!18 = !DILocation(line: 4, column: 9, scope: !19)
130+
!19 = distinct !DILexicalBlock(scope: !10, file: !1, line: 4, column: 9)
131+
!20 = !DILocation(line: 4, column: 11, scope: !19)
132+
!21 = !DILocation(line: 5, column: 18, scope: !22)
133+
!22 = distinct !DILexicalBlock(scope: !19, file: !1, line: 4, column: 16)
134+
!23 = !DILocation(line: 7, column: 14, scope: !10)
135+
!24 = !DILocation(line: 4, column: 9, scope: !10)
136+
!25 = !DILocation(line: 8, column: 1, scope: !10)
137+
!26 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 11, type: !27, scopeLine: 11, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !29)
138+
!27 = !DISubroutineType(types: !28)
139+
!28 = !{null}
140+
!29 = !{!30, !31}
141+
!30 = !DILocalVariable(name: "s", scope: !26, file: !1, line: 12, type: !13)
142+
!31 = !DILocalVariable(name: "i", scope: !26, file: !1, line: 12, type: !13)
143+
!32 = !DILocation(line: 12, column: 5, scope: !26)
144+
!33 = !DILocation(line: 0, scope: !26)
145+
!34 = !DILocation(line: 13, column: 15, scope: !26)
146+
!35 = !DILocation(line: 13, column: 7, scope: !26)
147+
!36 = !DILocation(line: 14, column: 17, scope: !37)
148+
!37 = distinct !DILexicalBlock(scope: !26, file: !1, line: 14, column: 17)
149+
!38 = !DILocation(line: 14, column: 19, scope: !37)
150+
!39 = !DILocation(line: 14, column: 17, scope: !26)
151+
!40 = !DILocation(line: 14, column: 33, scope: !37)
152+
!41 = !DILocation(line: 0, scope: !10, inlinedAt: !42)
153+
!42 = distinct !DILocation(line: 14, column: 29, scope: !43)
154+
!43 = !DILexicalBlockFile(scope: !37, file: !1, discriminator: 186646599)
155+
!44 = !DILocation(line: 4, column: 9, scope: !19, inlinedAt: !42)
156+
!45 = !DILocation(line: 4, column: 11, scope: !19, inlinedAt: !42)
157+
!46 = !DILocation(line: 5, column: 18, scope: !22, inlinedAt: !42)
158+
!47 = !DILocation(line: 7, column: 14, scope: !10, inlinedAt: !42)
159+
!48 = !DILocation(line: 4, column: 9, scope: !10, inlinedAt: !42)
160+
!49 = !DILocation(line: 8, column: 1, scope: !10, inlinedAt: !42)
161+
!50 = !DILocation(line: 14, column: 25, scope: !37)
162+
!51 = !DILocation(line: 14, column: 47, scope: !37)
163+
!52 = !DILocation(line: 0, scope: !37)
164+
!53 = !DILocation(line: 13, column: 18, scope: !26)
165+
!54 = distinct !{!54, !35, !55, !56}
166+
!55 = !DILocation(line: 14, column: 50, scope: !26)
167+
!56 = !{!"llvm.loop.mustprogress"}
168+
!57 = !DILocation(line: 15, column: 31, scope: !26)
169+
!58 = !DILocation(line: 15, column: 9, scope: !59)
170+
!59 = !DILexicalBlockFile(scope: !26, file: !1, discriminator: 186646607)
171+
!60 = !DILocation(line: 16, column: 1, scope: !26)
172+
!61 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 18, type: !62, scopeLine: 18, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !2)
173+
!62 = !DISubroutineType(types: !63)
174+
!63 = !{!13}
175+
!64 = !DILocation(line: 19, column: 5, scope: !61)
176+
!65 = !DILocation(line: 19, column: 5, scope: !66)
177+
!66 = !DILexicalBlockFile(scope: !61, file: !1, discriminator: 7)
178+
!67 = !DILocation(line: 20, column: 7, scope: !61)
Binary file not shown.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
PERF_RECORD_MMAP2 1664112/1664112: [0x400000(0x1000) @ 0 08:11 806256818 82060973]: r-xp truncated-pseudoprobe.perfbin
2+
40057d
3+
4005b9
4+
7f67469af555
5+
0x40059f/0x400553/P/-/-/0 0x40059f/0x400553/P/-/-/0 0x40059f/0x400553/P/-/-/0 0x40059f/0x400553/P/-/-/0 0x40059f/0x400553/P/-/-/0 0x40059f/0x400553/P/-/-/0 0x40059f/0x400553/P/-/-/0 0x40059f/0x400553/P/-/-/0 0x40059f/0x400553/P/-/-/0 0x40059f/0x400553/P/-/-/0 0x40059f/0x400553/P/-/-/0 0x40059f/0x400553/P/-/-/0 0x40059f/0x400553/P/-/-/0 0x40059f/0x400553/P/-/-/0 0x40059f/0x400553/P/-/-/0 0x40059f/0x400553/P/-/-/0
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
; RUN: llvm-profgen --perfscript=%S/Inputs/truncated-pseudoprobe.perfscript --binary=%S/Inputs/truncated-pseudoprobe.perfbin --output=%t
2+
; RUN: FileCheck %s --input-file %t
3+
4+
; CHECK: [foo]:75:0
5+
; CHECK-NEXT: 2: 15
6+
; CHECK-NEXT: 3: 15
7+
; CHECK-NEXT: 4: 15
8+
; CHECK-NEXT: 6: 15
9+
; CHECK-NEXT: 8: 15 bar:15
10+
; CHECK-NEXT: !CFGChecksum: 563088904013236
11+
; CHECK-NEXT: !Attributes: 0
12+
; CHECK: [foo:8 @ bar]:30:15
13+
; CHECK-NEXT: 1: 15
14+
; CHECK-NEXT: 2: 18446744073709551615
15+
; CHECK-NEXT: 3: 18446744073709551615
16+
; CHECK-NEXT: 4: 15
17+
; CHECK-NEXT: !CFGChecksum: 72617220756
18+
; CHECK-NEXT: !Attributes: 1
19+
20+
; truncated-pseudoprobe.perfbin is from the following compile commands:
21+
; llc -pseudo-probe-for-profiling truncated-pseudoprobe.ll -filetype=obj -o truncated-pseudoprobe.o
22+
; clang truncated-pseudoprobe.o -o truncated-pseudoprobe.perfbin

llvm/tools/llvm-profgen/PerfReader.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -143,9 +143,11 @@ void VirtualUnwinder::collectSamplesFromFrameTrie(
143143
if (!Cur->isDummyRoot()) {
144144
if (!Stack.pushFrame(Cur)) {
145145
// Process truncated context
146+
// Start a new traversal ignoring its bottom context
147+
T EmptyStack(Binary);
148+
collectSamplesFromFrame(Cur, EmptyStack);
146149
for (const auto &Item : Cur->Children) {
147-
// Start a new traversal ignoring its bottom context
148-
collectSamplesFromFrameTrie(Item.second.get());
150+
collectSamplesFromFrameTrie(Item.second.get(), EmptyStack);
149151
}
150152
return;
151153
}

llvm/tools/llvm-profgen/PerfReader.h

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,8 +75,33 @@ struct LBREntry {
7575
bool IsArtificial = false;
7676
LBREntry(uint64_t S, uint64_t T, bool I)
7777
: Source(S), Target(T), IsArtificial(I) {}
78+
79+
#ifndef NDEBUG
80+
void print() const {
81+
dbgs() << "from " << format("%#010x", Source) << " to "
82+
<< format("%#010x", Target);
83+
if (IsArtificial)
84+
dbgs() << " Artificial";
85+
}
86+
#endif
7887
};
7988

89+
#ifndef NDEBUG
90+
static inline void printLBRStack(const SmallVectorImpl<LBREntry> &LBRStack) {
91+
for (size_t I = 0; I < LBRStack.size(); I++) {
92+
dbgs() << "[" << I << "] ";
93+
LBRStack[I].print();
94+
dbgs() << "\n";
95+
}
96+
}
97+
98+
static inline void printCallStack(const SmallVectorImpl<uint64_t> &CallStack) {
99+
for (size_t I = 0; I < CallStack.size(); I++) {
100+
dbgs() << "[" << I << "] " << format("%#010x", CallStack[I]) << "\n";
101+
}
102+
}
103+
#endif
104+
80105
// Hash interface for generic data of type T
81106
// Data should implement a \fn getHashCode and a \fn isEqual
82107
// Currently getHashCode is non-virtual to avoid the overhead of calling vtable,
@@ -185,6 +210,15 @@ struct HybridSample : public PerfSample {
185210
}
186211
HashCode = Hash;
187212
}
213+
214+
#ifndef NDEBUG
215+
__attribute__((used)) void print() const {
216+
dbgs() << "LBR stack\n";
217+
printLBRStack(LBRStack);
218+
dbgs() << "Call stack\n";
219+
printCallStack(CallStack);
220+
}
221+
#endif
188222
};
189223

190224
// After parsing the sample, we record the samples by aggregating them
@@ -224,6 +258,7 @@ struct UnwindState {
224258
BranchSamples.emplace_back(std::make_tuple(Source, Target, Count));
225259
}
226260
bool isDummyRoot() { return Address == 0; }
261+
bool isLeafFrame() { return Children.empty(); }
227262
};
228263

229264
ProfiledFrame DummyTrieRoot;
@@ -406,8 +441,13 @@ struct ProbeStack {
406441
// Callsite merging may cause the loss of original probe IDs.
407442
// Cutting off the context from here since the inliner will
408443
// not know how to consume a context with unknown callsites.
409-
if (!CallProbe)
444+
if (!CallProbe) {
445+
if (!Cur->isLeafFrame())
446+
WithColor::warning()
447+
<< "Untracked frame at " << format("%" PRIx64, Cur->Address)
448+
<< " due to missing call probe\n";
410449
return false;
450+
}
411451
Stack.push_back(CallProbe);
412452
return true;
413453
}
@@ -608,7 +648,6 @@ class PerfReader {
608648
BinaryMap BinaryTable;
609649
AddressBinaryMap AddrToBinaryMap; // Used by address-based lookup.
610650

611-
private:
612651
BinarySampleCounterMap BinarySampleCounters;
613652
// Samples with the repeating time generated by the perf reader
614653
AggregatedCounter AggregatedSamples;

0 commit comments

Comments
 (0)