Skip to content

Commit ae6d5dd

Browse files
[MemProf] Prune unneeded non-cold contexts (#124823)
We can take advantage of the fact that we subsequently only clone cold allocation contexts, since not cold behavior is the default, and significantly reduce the amount of metadata (and later ThinLTO summary and MemProfContextDisambiguation graph nodes) by pruning unnecessary not cold contexts when building metadata from the trie. Specifically, we only need to keep notcold contexts that overlap the longest with cold allocations, to know how deeply to clone those contexts to expose the cold allocation behavior. For a large target this reduced ThinLTO bitcode object sizes by about 35%. It reduced the ThinLTO indexing time by about half and the peak ThinLTO indexing memory by about 20%.
1 parent bda1976 commit ae6d5dd

File tree

5 files changed

+203
-47
lines changed

5 files changed

+203
-47
lines changed

llvm/include/llvm/Analysis/MemoryProfileInfo.h

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,37 @@ class CallStackTrie {
5656
// Allocation types for call context sharing the context prefix at this
5757
// node.
5858
uint8_t AllocTypes;
59+
// Updated as we add allocations to note if this is the deepest point in the
60+
// trie that has an ambiguous allocation type (both Cold and NotCold). It is
61+
// used to prune unneeded NotCold contexts, taking advantage of the fact
62+
// that we later will only clone Cold contexts, as NotCold is the allocation
63+
// default. We only need to keep as metadata the NotCold contexts that
64+
// overlap the longest with Cold allocations, so that we know how deeply we
65+
// need to clone. For example, assume we add the following contexts to the
66+
// trie:
67+
// 1 3 (notcold)
68+
// 1 2 4 (cold)
69+
// 1 2 5 (notcold)
70+
// 1 2 6 (notcold)
71+
// the trie looks like:
72+
// 1
73+
// / \
74+
// 2 3
75+
// /|\
76+
// 4 5 6
77+
//
78+
// It is sufficient to keep just one of the not cold contexts (either 1,2,5 or
79+
// 1,2,6, we arbitrarily keep the first one we encounter which will be
80+
// 1,2,5). We'll initially have DeepestAmbiguousAllocType set false for trie
81+
// node 1 after the trie is built, and true for node 2. This indicates that
82+
// the not cold context ending in 3 is not needed (its immediate callee has
83+
// this value set false). The first notcold context we encounter when
84+
// iterating the callers of node 2 will be the context ending in 5 (since
85+
// std::map iteration is in sorted order of key), which will see that this
86+
// field is true for its callee, so we will keep it. But at that point we
87+
// set the callee's flag to false which prevents adding the not cold context
88+
// ending in 6 unnecessarily.
89+
bool DeepestAmbiguousAllocType = true;
5990
// If the user has requested reporting of hinted sizes, keep track of the
6091
// associated full stack id and profiled sizes. Can have more than one
6192
// after trimming (e.g. when building from metadata). This is only placed on
@@ -103,7 +134,8 @@ class CallStackTrie {
103134
bool buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
104135
std::vector<uint64_t> &MIBCallStack,
105136
std::vector<Metadata *> &MIBNodes,
106-
bool CalleeHasAmbiguousCallerContext);
137+
bool CalleeHasAmbiguousCallerContext,
138+
bool &CalleeDeepestAmbiguousAllocType);
107139

108140
public:
109141
CallStackTrie() = default;

llvm/lib/Analysis/MemoryProfileInfo.cpp

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,13 @@ cl::opt<bool> MemProfReportHintedSizes(
5151
"memprof-report-hinted-sizes", cl::init(false), cl::Hidden,
5252
cl::desc("Report total allocation sizes of hinted allocations"));
5353

54+
// This is useful if we have enabled reporting of hinted sizes, and want to get
55+
// information from the indexing step for all contexts (especially for testing),
56+
// or have specified a value less than 100% for -memprof-cloning-cold-threshold.
57+
cl::opt<bool> MemProfKeepAllNotColdContexts(
58+
"memprof-keep-all-not-cold-contexts", cl::init(false), cl::Hidden,
59+
cl::desc("Keep all non-cold contexts (increases cloning overheads)"));
60+
5461
AllocationType llvm::memprof::getAllocType(uint64_t TotalLifetimeAccessDensity,
5562
uint64_t AllocCount,
5663
uint64_t TotalLifetime) {
@@ -156,10 +163,16 @@ void CallStackTrie::addCallStack(
156163
continue;
157164
}
158165
// Update existing caller node if it exists.
166+
CallStackTrieNode *Prev = nullptr;
159167
auto Next = Curr->Callers.find(StackId);
160168
if (Next != Curr->Callers.end()) {
169+
Prev = Curr;
161170
Curr = Next->second;
162171
Curr->addAllocType(AllocType);
172+
// If this node has an ambiguous alloc type, its callee is not the deepest
173+
// point where we have an ambiguous allocation type.
174+
if (!hasSingleAllocType(Curr->AllocTypes))
175+
Prev->DeepestAmbiguousAllocType = false;
163176
continue;
164177
}
165178
// Otherwise add a new caller node.
@@ -243,14 +256,35 @@ void CallStackTrie::convertHotToNotCold(CallStackTrieNode *Node) {
243256
bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
244257
std::vector<uint64_t> &MIBCallStack,
245258
std::vector<Metadata *> &MIBNodes,
246-
bool CalleeHasAmbiguousCallerContext) {
259+
bool CalleeHasAmbiguousCallerContext,
260+
bool &CalleeDeepestAmbiguousAllocType) {
247261
// Trim context below the first node in a prefix with a single alloc type.
248262
// Add an MIB record for the current call stack prefix.
249263
if (hasSingleAllocType(Node->AllocTypes)) {
250-
std::vector<ContextTotalSize> ContextSizeInfo;
251-
collectContextSizeInfo(Node, ContextSizeInfo);
252-
MIBNodes.push_back(createMIBNode(
253-
Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, ContextSizeInfo));
264+
// Because we only clone cold contexts (we don't clone for exposing NotCold
265+
// contexts as that is the default allocation behavior), we create MIB
266+
// metadata for this context if any of the following are true:
267+
// 1) It is cold.
268+
// 2) The immediate callee is the deepest point where we have an ambiguous
269+
// allocation type (i.e. the other callers that are cold need to know
270+
// that we have a not cold context overlapping to this point so that we
271+
// know how deep to clone).
272+
// 3) MemProfKeepAllNotColdContexts is enabled, which is useful if we are
273+
// reporting hinted sizes, and want to get information from the indexing
274+
// step for all contexts, or have specified a value less than 100% for
275+
// -memprof-cloning-cold-threshold.
276+
if (Node->hasAllocType(AllocationType::Cold) ||
277+
CalleeDeepestAmbiguousAllocType || MemProfKeepAllNotColdContexts) {
278+
std::vector<ContextTotalSize> ContextSizeInfo;
279+
collectContextSizeInfo(Node, ContextSizeInfo);
280+
MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack,
281+
(AllocationType)Node->AllocTypes,
282+
ContextSizeInfo));
283+
// If we just emitted an MIB for a not cold caller, don't need to emit
284+
// another one for the callee to correctly disambiguate its cold callers.
285+
if (!Node->hasAllocType(AllocationType::Cold))
286+
CalleeDeepestAmbiguousAllocType = false;
287+
}
254288
return true;
255289
}
256290

@@ -261,9 +295,9 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
261295
bool AddedMIBNodesForAllCallerContexts = true;
262296
for (auto &Caller : Node->Callers) {
263297
MIBCallStack.push_back(Caller.first);
264-
AddedMIBNodesForAllCallerContexts &=
265-
buildMIBNodes(Caller.second, Ctx, MIBCallStack, MIBNodes,
266-
NodeHasAmbiguousCallerContext);
298+
AddedMIBNodesForAllCallerContexts &= buildMIBNodes(
299+
Caller.second, Ctx, MIBCallStack, MIBNodes,
300+
NodeHasAmbiguousCallerContext, Node->DeepestAmbiguousAllocType);
267301
// Remove Caller.
268302
MIBCallStack.pop_back();
269303
}
@@ -337,10 +371,16 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) {
337371
MIBCallStack.push_back(AllocStackId);
338372
std::vector<Metadata *> MIBNodes;
339373
assert(!Alloc->Callers.empty() && "addCallStack has not been called yet");
340-
// The last parameter is meant to say whether the callee of the given node
341-
// has more than one caller. Here the node being passed in is the alloc
342-
// and it has no callees. So it's false.
343-
if (buildMIBNodes(Alloc, Ctx, MIBCallStack, MIBNodes, false)) {
374+
// The CalleeHasAmbiguousCallerContext flag is meant to say whether the
375+
// callee of the given node has more than one caller. Here the node being
376+
// passed in is the alloc and it has no callees. So it's false.
377+
// Similarly, the last parameter is meant to say whether the callee of the
378+
// given node is the deepest point where we have ambiguous alloc types, which
379+
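// is not meaningful here as the alloc has no callees. It is initialized to
// true below, so buildMIBNodes never prunes an MIB at the alloc node itself.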
380+
bool DeepestAmbiguousAllocType = true;
381+
if (buildMIBNodes(Alloc, Ctx, MIBCallStack, MIBNodes,
382+
/*CalleeHasAmbiguousCallerContext=*/false,
383+
DeepestAmbiguousAllocType)) {
344384
assert(MIBCallStack.size() == 1 &&
345385
"Should only be left with Alloc's location in stack");
346386
CI->setMetadata(LLVMContext::MD_memprof, MDNode::get(Ctx, MIBNodes));

llvm/test/Transforms/PGOProfile/memprof.ll

Lines changed: 7 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -64,14 +64,14 @@
6464
; RUN: opt < %s -passes='pgo-instr-use,memprof-use<profile-filename=%t.pgomemprofdata>' -pgo-test-profile-file=%t.pgomemprofdata -pgo-warn-missing-function -S 2>&1 | FileCheck %s --check-prefixes=MEMPROF,ALL,PGO
6565

6666
;; Check that the total sizes are reported if requested.
67-
; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-report-hinted-sizes 2>&1 | FileCheck %s --check-prefixes=TOTALSIZESSINGLE,TOTALSIZES
67+
; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-report-hinted-sizes -memprof-keep-all-not-cold-contexts 2>&1 | FileCheck %s --check-prefixes=TOTALSIZESSINGLE,TOTALSIZES
6868

6969
;; Check that we hint additional allocations with a threshold < 100%
7070
; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-report-hinted-sizes -memprof-matching-cold-threshold=60 2>&1 | FileCheck %s --check-prefixes=TOTALSIZESSINGLE,TOTALSIZESTHRESH60
7171

7272
;; Make sure that the -memprof-cloning-cold-threshold flag is enough to cause
7373
;; the size metadata to be generated for the LTO link.
74-
; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-cloning-cold-threshold=80 2>&1 | FileCheck %s --check-prefixes=TOTALSIZES
74+
; RUN: opt < %s -passes='memprof-use<profile-filename=%t.memprofdata>' -pgo-warn-missing-function -S -memprof-cloning-cold-threshold=80 -memprof-keep-all-not-cold-contexts 2>&1 | FileCheck %s --check-prefixes=TOTALSIZES
7575

7676
;; Make sure we emit a random hotness seed if requested.
7777
; RUN: llvm-profdata merge -memprof-random-hotness %S/Inputs/memprof.memprofraw --profiled-binary %S/Inputs/memprof.exe -o %t.memprofdatarand 2>&1 | FileCheck %s --check-prefix=RAND
@@ -339,7 +339,7 @@ for.end: ; preds = %for.cond
339339

340340
; MEMPROF: #[[A1]] = { builtin allocsize(0) "memprof"="notcold" }
341341
; MEMPROF: #[[A2]] = { builtin allocsize(0) "memprof"="cold" }
342-
; MEMPROF: ![[M1]] = !{![[MIB1:[0-9]+]], ![[MIB2:[0-9]+]], ![[MIB3:[0-9]+]], ![[MIB4:[0-9]+]], ![[MIB5:[0-9]+]]}
342+
; MEMPROF: ![[M1]] = !{![[MIB1:[0-9]+]], ![[MIB2:[0-9]+]], ![[MIB3:[0-9]+]], ![[MIB4:[0-9]+]]}
343343
; MEMPROF: ![[MIB1]] = !{![[STACK1:[0-9]+]], !"cold"}
344344
; MEMPROF: ![[STACK1]] = !{i64 2732490490862098848, i64 748269490701775343}
345345
; MEMPROF: ![[MIB2]] = !{![[STACK2:[0-9]+]], !"cold"}
@@ -348,8 +348,6 @@ for.end: ; preds = %for.cond
348348
; MEMPROF: ![[STACK3]] = !{i64 2732490490862098848, i64 2104812325165620841, i64 6281715513834610934, i64 6281715513834610934, i64 6281715513834610934, i64 6281715513834610934}
349349
; MEMPROF: ![[MIB4]] = !{![[STACK4:[0-9]+]], !"cold"}
350350
; MEMPROF: ![[STACK4]] = !{i64 2732490490862098848, i64 8467819354083268568}
351-
; MEMPROF: ![[MIB5]] = !{![[STACK5:[0-9]+]], !"notcold"}
352-
; MEMPROF: ![[STACK5]] = !{i64 2732490490862098848, i64 8690657650969109624}
353351
; MEMPROF: ![[C1]] = !{i64 2732490490862098848}
354352
; MEMPROF: ![[C2]] = !{i64 8467819354083268568}
355353
; MEMPROF: ![[C3]] = !{i64 9086428284934609951}
@@ -390,17 +388,15 @@ for.end: ; preds = %for.cond
390388

391389
; MEMPROFNOCOLINFO: #[[A1]] = { builtin allocsize(0) "memprof"="notcold" }
392390
; MEMPROFNOCOLINFO: #[[A2]] = { builtin allocsize(0) "memprof"="cold" }
393-
; MEMPROFNOCOLINFO: ![[M1]] = !{![[MIB1:[0-9]+]], ![[MIB2:[0-9]+]], ![[MIB3:[0-9]+]], ![[MIB4:[0-9]+]], ![[MIB5:[0-9]+]]}
391+
; MEMPROFNOCOLINFO: ![[M1]] = !{![[MIB1:[0-9]+]], ![[MIB2:[0-9]+]], ![[MIB3:[0-9]+]], ![[MIB4:[0-9]+]]}
394392
; MEMPROFNOCOLINFO: ![[MIB1]] = !{![[STACK1:[0-9]+]], !"cold"}
395393
; MEMPROFNOCOLINFO: ![[STACK1]] = !{i64 5281664982037379640, i64 6362220161075421157, i64 -5772587307814069790, i64 -5772587307814069790, i64 -5772587307814069790, i64 3577763375057267810}
396394
; MEMPROFNOCOLINFO: ![[MIB2]] = !{![[STACK2:[0-9]+]], !"notcold"}
397395
; MEMPROFNOCOLINFO: ![[STACK2]] = !{i64 5281664982037379640, i64 6362220161075421157, i64 -5772587307814069790, i64 -5772587307814069790, i64 -5772587307814069790, i64 -5772587307814069790}
398-
; MEMPROFNOCOLINFO: ![[MIB3]] = !{![[STACK3:[0-9]+]], !"notcold"}
399-
; MEMPROFNOCOLINFO: ![[STACK3]] = !{i64 5281664982037379640, i64 -6896091699916449732}
396+
; MEMPROFNOCOLINFO: ![[MIB3]] = !{![[STACK3:[0-9]+]], !"cold"}
397+
; MEMPROFNOCOLINFO: ![[STACK3]] = !{i64 5281664982037379640, i64 -6871734214936418908}
400398
; MEMPROFNOCOLINFO: ![[MIB4]] = !{![[STACK4:[0-9]+]], !"cold"}
401-
; MEMPROFNOCOLINFO: ![[STACK4]] = !{i64 5281664982037379640, i64 -6871734214936418908}
402-
; MEMPROFNOCOLINFO: ![[MIB5]] = !{![[STACK5:[0-9]+]], !"cold"}
403-
; MEMPROFNOCOLINFO: ![[STACK5]] = !{i64 5281664982037379640, i64 -6201180255894224618}
399+
; MEMPROFNOCOLINFO: ![[STACK4]] = !{i64 5281664982037379640, i64 -6201180255894224618}
404400
; MEMPROFNOCOLINFO: ![[C1]] = !{i64 5281664982037379640}
405401
; MEMPROFNOCOLINFO: ![[C2]] = !{i64 -6871734214936418908}
406402
; MEMPROFNOCOLINFO: ![[C3]] = !{i64 -5588766871448036195}
@@ -420,7 +416,6 @@ for.end: ; preds = %for.cond
420416
; MEMPROFRAND2: !"cold"
421417
; MEMPROFRAND2: !"cold"
422418
; MEMPROFRAND2: !"notcold"
423-
; MEMPROFRAND2: !"notcold"
424419

425420
; MEMPROFSTATS: 8 memprof - Number of alloc contexts in memory profile.
426421
; MEMPROFSTATS: 10 memprof - Number of callsites in memory profile.

llvm/test/Transforms/PGOProfile/memprof_match_hot_cold_new_calls.ll

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -105,7 +105,7 @@ entry:
105105

106106
declare noundef ptr @_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 noundef, i64 noundef, ptr noundef nonnull align 1 dereferenceable(1), i8 noundef zeroext)
107107

108-
; MEMPROF: ![[M1]] = !{![[MIB1:[0-9]+]], ![[MIB2:[0-9]+]], ![[MIB3:[0-9]+]], ![[MIB4:[0-9]+]], ![[MIB5:[0-9]+]]}
108+
; MEMPROF: ![[M1]] = !{![[MIB1:[0-9]+]], ![[MIB2:[0-9]+]], ![[MIB3:[0-9]+]], ![[MIB4:[0-9]+]]}
109109
; MEMPROF: ![[MIB1]] = !{![[STACK1:[0-9]+]], !"cold"}
110110
; MEMPROF: ![[STACK1]] = !{i64 2732490490862098848, i64 748269490701775343}
111111
; MEMPROF: ![[MIB2]] = !{![[STACK2:[0-9]+]], !"cold"}
@@ -114,8 +114,6 @@ declare noundef ptr @_ZnwmSt11align_val_tRKSt9nothrow_t12__hot_cold_t(i64 nounde
114114
; MEMPROF: ![[STACK3]] = !{i64 2732490490862098848, i64 2104812325165620841, i64 6281715513834610934, i64 6281715513834610934, i64 6281715513834610934, i64 6281715513834610934}
115115
; MEMPROF: ![[MIB4]] = !{![[STACK4:[0-9]+]], !"cold"}
116116
; MEMPROF: ![[STACK4]] = !{i64 2732490490862098848, i64 8467819354083268568}
117-
; MEMPROF: ![[MIB5]] = !{![[STACK5:[0-9]+]], !"notcold"}
118-
; MEMPROF: ![[STACK5]] = !{i64 2732490490862098848, i64 8690657650969109624}
119117
; MEMPROF: ![[C1]] = !{i64 2732490490862098848}
120118

121119
!llvm.dbg.cu = !{!0}

0 commit comments

Comments
 (0)