Skip to content

Commit 87f5e22

Browse files
[MemProf] Tolerate missing leaf debug frames (#71233)
Loosen up the matching so that a missing leaf debug frame in the profile does not prevent matching an allocation context if we can match further up the inlined call context. This relies on the pre-inliner, which was already the default when performing normal PGO feedback along with the MemProf feedback, but to ensure matching is not affected by the presence of PGO, enable the pre-inliner for MemProf feedback as well.
1 parent 5548bcd commit 87f5e22

File tree

8 files changed

+198
-49
lines changed

8 files changed

+198
-49
lines changed

llvm/include/llvm/Passes/PassBuilder.h

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -628,11 +628,16 @@ class PassBuilder {
628628
Error parseModulePassPipeline(ModulePassManager &MPM,
629629
ArrayRef<PipelineElement> Pipeline);
630630

631+
// Adds passes to do pre-inlining and related cleanup passes before
632+
// profile instrumentation/matching (to enable better context sensitivity),
633+
// and for memprof to enable better matching with missing debug frames.
634+
void addPreInlinerPasses(ModulePassManager &MPM, OptimizationLevel Level,
635+
ThinOrFullLTOPhase LTOPhase);
636+
631637
void addPGOInstrPasses(ModulePassManager &MPM, OptimizationLevel Level,
632638
bool RunProfileGen, bool IsCS,
633639
bool AtomicCounterUpdate, std::string ProfileFile,
634640
std::string ProfileRemappingFile,
635-
ThinOrFullLTOPhase LTOPhase,
636641
IntrusiveRefCntPtr<vfs::FileSystem> FS);
637642

638643
// Extension Point callbacks

llvm/lib/Passes/PassBuilderPipelines.cpp

Lines changed: 50 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -733,47 +733,52 @@ void PassBuilder::addRequiredLTOPreLinkPasses(ModulePassManager &MPM) {
733733
MPM.addPass(NameAnonGlobalPass());
734734
}
735735

736-
void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM,
737-
OptimizationLevel Level, bool RunProfileGen,
738-
bool IsCS, bool AtomicCounterUpdate,
739-
std::string ProfileFile,
740-
std::string ProfileRemappingFile,
741-
ThinOrFullLTOPhase LTOPhase,
742-
IntrusiveRefCntPtr<vfs::FileSystem> FS) {
736+
void PassBuilder::addPreInlinerPasses(ModulePassManager &MPM,
737+
OptimizationLevel Level,
738+
ThinOrFullLTOPhase LTOPhase) {
743739
assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!");
744-
if (!IsCS && !DisablePreInliner) {
745-
InlineParams IP;
740+
if (DisablePreInliner)
741+
return;
742+
InlineParams IP;
746743

747-
IP.DefaultThreshold = PreInlineThreshold;
744+
IP.DefaultThreshold = PreInlineThreshold;
748745

749-
// FIXME: The hint threshold has the same value used by the regular inliner
750-
// when not optimzing for size. This should probably be lowered after
751-
// performance testing.
752-
// FIXME: this comment is cargo culted from the old pass manager, revisit).
753-
IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325;
754-
ModuleInlinerWrapperPass MIWP(
755-
IP, /* MandatoryFirst */ true,
756-
InlineContext{LTOPhase, InlinePass::EarlyInliner});
757-
CGSCCPassManager &CGPipeline = MIWP.getPM();
746+
// FIXME: The hint threshold has the same value used by the regular inliner
747+
// when not optimzing for size. This should probably be lowered after
748+
// performance testing.
749+
// FIXME: this comment is cargo culted from the old pass manager, revisit).
750+
IP.HintThreshold = Level.isOptimizingForSize() ? PreInlineThreshold : 325;
751+
ModuleInlinerWrapperPass MIWP(
752+
IP, /* MandatoryFirst */ true,
753+
InlineContext{LTOPhase, InlinePass::EarlyInliner});
754+
CGSCCPassManager &CGPipeline = MIWP.getPM();
758755

759-
FunctionPassManager FPM;
760-
FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
761-
FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies.
762-
FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(
763-
true))); // Merge & remove basic blocks.
764-
FPM.addPass(InstCombinePass()); // Combine silly sequences.
765-
invokePeepholeEPCallbacks(FPM, Level);
756+
FunctionPassManager FPM;
757+
FPM.addPass(SROAPass(SROAOptions::ModifyCFG));
758+
FPM.addPass(EarlyCSEPass()); // Catch trivial redundancies.
759+
FPM.addPass(SimplifyCFGPass(SimplifyCFGOptions().convertSwitchRangeToICmp(
760+
true))); // Merge & remove basic blocks.
761+
FPM.addPass(InstCombinePass()); // Combine silly sequences.
762+
invokePeepholeEPCallbacks(FPM, Level);
766763

767-
CGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
768-
std::move(FPM), PTO.EagerlyInvalidateAnalyses));
764+
CGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
765+
std::move(FPM), PTO.EagerlyInvalidateAnalyses));
769766

770-
MPM.addPass(std::move(MIWP));
767+
MPM.addPass(std::move(MIWP));
771768

772-
// Delete anything that is now dead to make sure that we don't instrument
773-
// dead code. Instrumentation can end up keeping dead code around and
774-
// dramatically increase code size.
775-
MPM.addPass(GlobalDCEPass());
776-
}
769+
// Delete anything that is now dead to make sure that we don't instrument
770+
// dead code. Instrumentation can end up keeping dead code around and
771+
// dramatically increase code size.
772+
MPM.addPass(GlobalDCEPass());
773+
}
774+
775+
void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM,
776+
OptimizationLevel Level, bool RunProfileGen,
777+
bool IsCS, bool AtomicCounterUpdate,
778+
std::string ProfileFile,
779+
std::string ProfileRemappingFile,
780+
IntrusiveRefCntPtr<vfs::FileSystem> FS) {
781+
assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!");
777782

778783
if (!RunProfileGen) {
779784
assert(!ProfileFile.empty() && "Profile use expecting a profile file!");
@@ -1104,14 +1109,20 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
11041109
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM),
11051110
PTO.EagerlyInvalidateAnalyses));
11061111

1112+
// Invoke the pre-inliner passes for instrumentation PGO or MemProf.
1113+
if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink &&
1114+
(PGOOpt->Action == PGOOptions::IRInstr ||
1115+
PGOOpt->Action == PGOOptions::IRUse || !PGOOpt->MemoryProfile.empty()))
1116+
addPreInlinerPasses(MPM, Level, Phase);
1117+
11071118
// Add all the requested passes for instrumentation PGO, if requested.
11081119
if (PGOOpt && Phase != ThinOrFullLTOPhase::ThinLTOPostLink &&
11091120
(PGOOpt->Action == PGOOptions::IRInstr ||
11101121
PGOOpt->Action == PGOOptions::IRUse)) {
11111122
addPGOInstrPasses(MPM, Level,
11121123
/*RunProfileGen=*/PGOOpt->Action == PGOOptions::IRInstr,
11131124
/*IsCS=*/false, PGOOpt->AtomicCounterUpdate,
1114-
PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile, Phase,
1125+
PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile,
11151126
PGOOpt->FS);
11161127
MPM.addPass(PGOIndirectCallPromotion(false, false));
11171128
}
@@ -1332,12 +1343,12 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level,
13321343
addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/true,
13331344
/*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
13341345
PGOOpt->CSProfileGenFile, PGOOpt->ProfileRemappingFile,
1335-
LTOPhase, PGOOpt->FS);
1346+
PGOOpt->FS);
13361347
else if (PGOOpt->CSAction == PGOOptions::CSIRUse)
13371348
addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/false,
13381349
/*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
13391350
PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile,
1340-
LTOPhase, PGOOpt->FS);
1351+
PGOOpt->FS);
13411352
}
13421353

13431354
// Re-compute GlobalsAA here prior to function passes. This is particularly
@@ -1831,12 +1842,12 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
18311842
addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/true,
18321843
/*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
18331844
PGOOpt->CSProfileGenFile, PGOOpt->ProfileRemappingFile,
1834-
ThinOrFullLTOPhase::FullLTOPostLink, PGOOpt->FS);
1845+
PGOOpt->FS);
18351846
else if (PGOOpt->CSAction == PGOOptions::CSIRUse)
18361847
addPGOInstrPasses(MPM, Level, /*RunProfileGen=*/false,
18371848
/*IsCS=*/true, PGOOpt->AtomicCounterUpdate,
18381849
PGOOpt->ProfileFile, PGOOpt->ProfileRemappingFile,
1839-
ThinOrFullLTOPhase::FullLTOPostLink, PGOOpt->FS);
1850+
PGOOpt->FS);
18401851
}
18411852

18421853
// Break up allocas

llvm/lib/Transforms/Instrumentation/MemProfiler.cpp

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -808,19 +808,19 @@ static void readMemprof(Module &M, Function &F,
808808
auto CalleeGUID = Function::getGUID(Name);
809809
auto StackId = computeStackId(CalleeGUID, GetOffset(DIL),
810810
ProfileHasColumns ? DIL->getColumn() : 0);
811-
// LeafFound will only be false on the first iteration, since we either
812-
// set it true or break out of the loop below.
811+
// Check if we have found the profile's leaf frame. If yes, collect
812+
// the rest of the call's inlined context starting here. If not, see if
813+
// we find a match further up the inlined context (in case the profile
814+
// was missing debug frames at the leaf).
813815
if (!LeafFound) {
814816
AllocInfoIter = LocHashToAllocInfo.find(StackId);
815817
CallSitesIter = LocHashToCallSites.find(StackId);
816-
// Check if the leaf is in one of the maps. If not, no need to look
817-
// further at this call.
818-
if (AllocInfoIter == LocHashToAllocInfo.end() &&
819-
CallSitesIter == LocHashToCallSites.end())
820-
break;
821-
LeafFound = true;
818+
if (AllocInfoIter != LocHashToAllocInfo.end() ||
819+
CallSitesIter != LocHashToCallSites.end())
820+
LeafFound = true;
822821
}
823-
InlinedCallStack.push_back(StackId);
822+
if (LeafFound)
823+
InlinedCallStack.push_back(StackId);
824824
}
825825
// If leaf not in either of the maps, skip inst.
826826
if (!LeafFound)

llvm/test/Other/new-pm-memprof.ll

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
;; Ensure we invoke the preinliner when feeding back a memprof profile.
2+
3+
;; The opt invocation will fail as the profdata file is empty, which is fine
4+
;; since we are simply testing the pass pipeline below.
5+
; RUN: not opt -debug-pass-manager -passes='default<O2>' -memory-profile-file=/dev/null %s 2>&1 | FileCheck %s
6+
7+
; CHECK: Running pass: InlinerPass on (foo)
8+
; CHECK: Running pass: MemProfUsePass
9+
10+
define void @foo() {
11+
ret void
12+
}
Binary file not shown.
Binary file not shown.

llvm/test/Transforms/PGOProfile/Inputs/update_memprof_inputs.sh

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,3 +94,36 @@ ${LLVMPROFDATA} merge --text ${OUTDIR}/memprof_pgo.profraw -o ${OUTDIR}/memprof_
9494
rm ${OUTDIR}/memprof.cc
9595
rm ${OUTDIR}/pgo.exe
9696
rm ${OUTDIR}/memprof_pgo.profraw
97+
98+
# Use musttail to simulate a missing leaf debug frame in the profiled binary.
99+
# Note we don't currently match onto explicit ::operator new calls, which is
100+
# why the non-musttail case uses implicit new (which doesn't support musttail).
101+
# Note that changes in the code below which affect relative line number
102+
# offsets of calls from their parent function can affect callsite matching in
103+
# the LLVM IR.
104+
cat > ${OUTDIR}/memprof_missing_leaf.cc << EOF
105+
#include <new>
106+
#ifndef USE_MUSTTAIL
107+
#define USE_MUSTTAIL 0
108+
#endif
109+
110+
// clang::musttail requires that the argument signature matches that of the caller.
111+
void *bar(std::size_t s) {
112+
#if USE_MUSTTAIL
113+
[[clang::musttail]] return ::operator new (s);
114+
#else
115+
return new char[s];
116+
#endif
117+
}
118+
119+
int main() {
120+
char *a = (char *)bar(1);
121+
delete a;
122+
return 0;
123+
}
124+
EOF
125+
126+
${CLANG} ${COMMON_FLAGS} -fmemory-profile -DUSE_MUSTTAIL=1 ${OUTDIR}/memprof_missing_leaf.cc -o ${OUTDIR}/memprof_missing_leaf.exe
127+
env MEMPROF_OPTIONS=log_path=stdout ${OUTDIR}/memprof_missing_leaf.exe > ${OUTDIR}/memprof_missing_leaf.memprofraw
128+
129+
rm ${OUTDIR}/memprof_missing_leaf.cc
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
;; Tests memprof profile matching when the leaf frame is missing in the
2+
;; profile. In this case the call to operator new was inlined before
3+
;; matching and we are able to match the next call frame up the inlined
4+
;; context.
5+
6+
;; Avoid failures on big-endian systems that can't read the profile properly
7+
; REQUIRES: x86_64-linux
8+
9+
;; TODO: Use text profile inputs once that is available for memprof.
10+
;; # To update the Inputs below, run Inputs/update_memprof_inputs.sh.
11+
;; # To generate below LLVM IR for use in matching.
12+
;; $ clang++ -gmlt -fdebug-info-for-profiling -S memprof_missing_leaf.cc \
13+
;; -O2 -emit-llvm
14+
15+
; RUN: llvm-profdata merge %S/Inputs/memprof_missing_leaf.memprofraw --profiled-binary %S/Inputs/memprof_missing_leaf.exe -o %t.memprofdata
16+
; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -S | FileCheck %s
17+
18+
; CHECK: call {{.*}} @_Znam{{.*}} #[[ATTR:[0-9]+]]
19+
; CHECK: attributes #[[ATTR]] = {{.*}} "memprof"="notcold"
20+
21+
; ModuleID = '<stdin>'
22+
source_filename = "memprof_missing_leaf.cc"
23+
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
24+
target triple = "x86_64-unknown-linux-gnu"
25+
26+
; Function Attrs: nobuiltin allocsize(0)
27+
declare noundef nonnull ptr @_Znam(i64 noundef) #0
28+
29+
; Function Attrs: mustprogress norecurse uwtable
30+
define dso_local noundef i32 @main() #1 !dbg !8 {
31+
entry:
32+
%s.addr.i = alloca i64, align 8
33+
%retval = alloca i32, align 4
34+
%a = alloca ptr, align 8
35+
store i32 0, ptr %retval, align 4
36+
store i64 1, ptr %s.addr.i, align 8, !tbaa !11
37+
%0 = load i64, ptr %s.addr.i, align 8, !dbg !15, !tbaa !11
38+
%call.i = call noalias noundef nonnull ptr @_Znam(i64 noundef %0) #3, !dbg !18
39+
store ptr %call.i, ptr %a, align 8, !dbg !19, !tbaa !20
40+
%1 = load ptr, ptr %a, align 8, !dbg !22, !tbaa !20
41+
%isnull = icmp eq ptr %1, null, !dbg !23
42+
br i1 %isnull, label %delete.end, label %delete.notnull, !dbg !23
43+
44+
delete.notnull: ; preds = %entry
45+
call void @_ZdlPv(ptr noundef %1) #4, !dbg !23
46+
br label %delete.end, !dbg !23
47+
48+
delete.end: ; preds = %delete.notnull, %entry
49+
ret i32 0, !dbg !24
50+
}
51+
52+
; Function Attrs: nobuiltin nounwind
53+
declare void @_ZdlPv(ptr noundef) #2
54+
55+
attributes #0 = { nobuiltin allocsize(0) "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
56+
attributes #1 = { mustprogress norecurse uwtable "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
57+
attributes #2 = { nobuiltin nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" }
58+
attributes #3 = { builtin allocsize(0) }
59+
attributes #4 = { builtin nounwind }
60+
61+
!llvm.dbg.cu = !{!0}
62+
!llvm.module.flags = !{!2, !3, !4, !5, !6, !7}
63+
64+
!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 18.0.0 ([email protected]:llvm/llvm-project.git 71bf052ec90e77cb4aa66505d47cbc4b6016ac1d)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
65+
!1 = !DIFile(filename: "memprof_missing_leaf.cc", directory: ".", checksumkind: CSK_MD5, checksum: "f1445a8699406a6b826128704d257677")
66+
!2 = !{i32 7, !"Dwarf Version", i32 5}
67+
!3 = !{i32 2, !"Debug Info Version", i32 3}
68+
!4 = !{i32 1, !"wchar_size", i32 4}
69+
!5 = !{i32 8, !"PIC Level", i32 2}
70+
!6 = !{i32 7, !"PIE Level", i32 2}
71+
!7 = !{i32 7, !"uwtable", i32 2}
72+
!8 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 15, type: !9, scopeLine: 15, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
73+
!9 = !DISubroutineType(types: !10)
74+
!10 = !{}
75+
!11 = !{!12, !12, i64 0}
76+
!12 = !{!"long", !13, i64 0}
77+
!13 = !{!"omnipotent char", !14, i64 0}
78+
!14 = !{!"Simple C++ TBAA"}
79+
!15 = !DILocation(line: 11, column: 19, scope: !16, inlinedAt: !17)
80+
!16 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barm", scope: !1, file: !1, line: 7, type: !9, scopeLine: 7, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0)
81+
!17 = distinct !DILocation(line: 16, column: 21, scope: !8)
82+
!18 = !DILocation(line: 11, column: 10, scope: !16, inlinedAt: !17)
83+
!19 = !DILocation(line: 16, column: 9, scope: !8)
84+
!20 = !{!21, !21, i64 0}
85+
!21 = !{!"any pointer", !13, i64 0}
86+
!22 = !DILocation(line: 17, column: 10, scope: !8)
87+
!23 = !DILocation(line: 17, column: 3, scope: !8)
88+
!24 = !DILocation(line: 18, column: 3, scope: !8)

0 commit comments

Comments
 (0)