Skip to content

Commit 8836d68

Browse files
[MemProf] Optionally discard small non-cold contexts (#139113)
Adds a new option -memprof-callsite-cold-threshold that allows specifying a percentage threshold: non-cold contexts are discarded if the percentage of cold bytes at a callsite, including that context, exceeds the given threshold. Default is 100% (no discarding). This reduces the amount of cloning needed to expose cold allocation contexts when parts of the context are dominantly cold. This motivated the change in PR138792, since discarding a context might require a different decision about which not-cold contexts must be kept to expose cloning requirements, so we need to determine that on the fly. Additionally, this required a change to include the context size information in the alloc trie in more cases, so we now guard the inclusion of this information in the generated metadata on the option values.
1 parent 7517a1b commit 8836d68

File tree

4 files changed

+264
-22
lines changed

4 files changed

+264
-22
lines changed

llvm/include/llvm/Analysis/MemoryProfileInfo.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,8 @@ class CallStackTrie {
103103
bool buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
104104
std::vector<uint64_t> &MIBCallStack,
105105
std::vector<Metadata *> &MIBNodes,
106-
bool CalleeHasAmbiguousCallerContext);
106+
bool CalleeHasAmbiguousCallerContext, uint64_t &TotalBytes,
107+
uint64_t &ColdBytes);
107108

108109
public:
109110
CallStackTrie() = default;

llvm/lib/Analysis/MemoryProfileInfo.cpp

Lines changed: 88 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include "llvm/Analysis/MemoryProfileInfo.h"
1414
#include "llvm/IR/Constants.h"
1515
#include "llvm/Support/CommandLine.h"
16+
#include "llvm/Support/Format.h"
1617

1718
using namespace llvm;
1819
using namespace llvm::memprof;
@@ -58,6 +59,19 @@ cl::opt<bool> MemProfKeepAllNotColdContexts(
5859
"memprof-keep-all-not-cold-contexts", cl::init(false), cl::Hidden,
5960
cl::desc("Keep all non-cold contexts (increases cloning overheads)"));
6061

62+
cl::opt<unsigned> MinClonedColdBytePercent(
63+
"memprof-cloning-cold-threshold", cl::init(100), cl::Hidden,
64+
cl::desc("Min percent of cold bytes to hint alloc cold during cloning"));
65+
66+
// Discard non-cold contexts if they overlap with much larger cold contexts,
67+
// specifically, if all contexts reaching a given callsite are at least this
68+
// percent cold byte allocations. This reduces the amount of cloning required
69+
// to expose the cold contexts when they greatly dominate non-cold contexts.
70+
cl::opt<unsigned> MinCallsiteColdBytePercent(
71+
"memprof-callsite-cold-threshold", cl::init(100), cl::Hidden,
72+
cl::desc("Min percent of cold bytes at a callsite to discard non-cold "
73+
"contexts"));
74+
6175
AllocationType llvm::memprof::getAllocType(uint64_t TotalLifetimeAccessDensity,
6276
uint64_t AllocCount,
6377
uint64_t TotalLifetime) {
@@ -208,13 +222,29 @@ void CallStackTrie::addCallStack(MDNode *MIB) {
208222

209223
static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack,
210224
AllocationType AllocType,
211-
ArrayRef<ContextTotalSize> ContextSizeInfo) {
225+
ArrayRef<ContextTotalSize> ContextSizeInfo,
226+
uint64_t &TotalBytes, uint64_t &ColdBytes) {
212227
SmallVector<Metadata *> MIBPayload(
213228
{buildCallstackMetadata(MIBCallStack, Ctx)});
214229
MIBPayload.push_back(
215230
MDString::get(Ctx, getAllocTypeAttributeString(AllocType)));
216-
if (!ContextSizeInfo.empty()) {
217-
for (const auto &[FullStackId, TotalSize] : ContextSizeInfo) {
231+
232+
if (ContextSizeInfo.empty()) {
233+
// The profile matcher should have provided context size info if there was a
234+
// MinCallsiteColdBytePercent < 100. Here we check >=100 to gracefully
235+
// handle a user-provided percent larger than 100.
236+
assert(MinCallsiteColdBytePercent >= 100);
237+
return MDNode::get(Ctx, MIBPayload);
238+
}
239+
240+
for (const auto &[FullStackId, TotalSize] : ContextSizeInfo) {
241+
TotalBytes += TotalSize;
242+
if (AllocType == AllocationType::Cold)
243+
ColdBytes += TotalSize;
244+
// Only add the context size info as metadata if we need it in the thin
245+
// link (currently if reporting of hinted sizes is enabled or we have
246+
// specified a threshold for marking allocations cold after cloning).
247+
if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) {
218248
auto *FullStackIdMD = ValueAsMetadata::get(
219249
ConstantInt::get(Type::getInt64Ty(Ctx), FullStackId));
220250
auto *TotalSizeMD = ValueAsMetadata::get(
@@ -223,6 +253,7 @@ static MDNode *createMIBNode(LLVMContext &Ctx, ArrayRef<uint64_t> MIBCallStack,
223253
MIBPayload.push_back(ContextSizeMD);
224254
}
225255
}
256+
assert(TotalBytes > 0);
226257
return MDNode::get(Ctx, MIBPayload);
227258
}
228259

@@ -246,9 +277,14 @@ void CallStackTrie::convertHotToNotCold(CallStackTrieNode *Node) {
246277
// on options that enable filtering out some NotCold contexts.
247278
static void saveFilteredNewMIBNodes(std::vector<Metadata *> &NewMIBNodes,
248279
std::vector<Metadata *> &SavedMIBNodes,
249-
unsigned CallerContextLength) {
280+
unsigned CallerContextLength,
281+
uint64_t TotalBytes, uint64_t ColdBytes) {
282+
const bool MostlyCold =
283+
MinCallsiteColdBytePercent < 100 &&
284+
ColdBytes * 100 >= MinCallsiteColdBytePercent * TotalBytes;
285+
250286
// In the simplest case, with pruning disabled, keep all the new MIB nodes.
251-
if (MemProfKeepAllNotColdContexts) {
287+
if (MemProfKeepAllNotColdContexts && !MostlyCold) {
252288
append_range(SavedMIBNodes, NewMIBNodes);
253289
return;
254290
}
@@ -271,6 +307,30 @@ static void saveFilteredNewMIBNodes(std::vector<Metadata *> &NewMIBNodes,
271307
}
272308
};
273309

310+
// If the cold bytes at the current callsite exceed the given threshold, we
311+
// discard all non-cold contexts so do not need any of the later pruning
312+
// handling. We can simply copy over all the cold contexts and return early.
313+
if (MostlyCold) {
314+
auto NewColdMIBNodes =
315+
make_filter_range(NewMIBNodes, [&](const Metadata *M) {
316+
auto MIBMD = cast<MDNode>(M);
317+
// Only append cold contexts.
318+
if (getMIBAllocType(MIBMD) == AllocationType::Cold)
319+
return true;
320+
if (MemProfReportHintedSizes) {
321+
const float PercentCold = ColdBytes * 100.0 / TotalBytes;
322+
std::string PercentStr;
323+
llvm::raw_string_ostream OS(PercentStr);
324+
OS << format(" for %5.2f%% cold bytes", PercentCold);
325+
EmitMessageForRemovedContexts(MIBMD, "discarded", OS.str());
326+
}
327+
return false;
328+
});
329+
for (auto *M : NewColdMIBNodes)
330+
SavedMIBNodes.push_back(M);
331+
return;
332+
}
333+
274334
// Prune unneeded NotCold contexts, taking advantage of the fact
275335
// that we later will only clone Cold contexts, as NotCold is the allocation
276336
// default. We only need to keep as metadata the NotCold contexts that
@@ -341,17 +401,20 @@ static void saveFilteredNewMIBNodes(std::vector<Metadata *> &NewMIBNodes,
341401
// Recursive helper to trim contexts and create metadata nodes.
342402
// Caller should have pushed Node's loc to MIBCallStack. Doing this in the
343403
// caller makes it simpler to handle the many early returns in this method.
404+
// Updates the total and cold profiled bytes in the subtrie rooted at this node.
344405
bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
345406
std::vector<uint64_t> &MIBCallStack,
346407
std::vector<Metadata *> &MIBNodes,
347-
bool CalleeHasAmbiguousCallerContext) {
408+
bool CalleeHasAmbiguousCallerContext,
409+
uint64_t &TotalBytes, uint64_t &ColdBytes) {
348410
// Trim context below the first node in a prefix with a single alloc type.
349411
// Add an MIB record for the current call stack prefix.
350412
if (hasSingleAllocType(Node->AllocTypes)) {
351413
std::vector<ContextTotalSize> ContextSizeInfo;
352414
collectContextSizeInfo(Node, ContextSizeInfo);
353-
MIBNodes.push_back(createMIBNode(
354-
Ctx, MIBCallStack, (AllocationType)Node->AllocTypes, ContextSizeInfo));
415+
MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack,
416+
(AllocationType)Node->AllocTypes,
417+
ContextSizeInfo, TotalBytes, ColdBytes));
355418
return true;
356419
}
357420

@@ -364,17 +427,25 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
364427
// that will later be filtered before adding to the caller's MIBNodes
365428
// vector.
366429
std::vector<Metadata *> NewMIBNodes;
430+
// Determine the total and cold byte counts for all callers, then add to the
431+
// caller's counts further below.
432+
uint64_t CallerTotalBytes = 0;
433+
uint64_t CallerColdBytes = 0;
367434
for (auto &Caller : Node->Callers) {
368435
MIBCallStack.push_back(Caller.first);
369-
AddedMIBNodesForAllCallerContexts &=
370-
buildMIBNodes(Caller.second, Ctx, MIBCallStack, NewMIBNodes,
371-
NodeHasAmbiguousCallerContext);
436+
AddedMIBNodesForAllCallerContexts &= buildMIBNodes(
437+
Caller.second, Ctx, MIBCallStack, NewMIBNodes,
438+
NodeHasAmbiguousCallerContext, CallerTotalBytes, CallerColdBytes);
372439
// Remove Caller.
373440
MIBCallStack.pop_back();
374441
}
375442
// Pass in the stack length of the MIB nodes added for the immediate caller,
376443
// which is the current stack length plus 1.
377-
saveFilteredNewMIBNodes(NewMIBNodes, MIBNodes, MIBCallStack.size() + 1);
444+
saveFilteredNewMIBNodes(NewMIBNodes, MIBNodes, MIBCallStack.size() + 1,
445+
CallerTotalBytes, CallerColdBytes);
446+
TotalBytes += CallerTotalBytes;
447+
ColdBytes += CallerColdBytes;
448+
378449
if (AddedMIBNodesForAllCallerContexts)
379450
return true;
380451
// We expect that the callers should be forced to add MIBs to disambiguate
@@ -397,7 +468,7 @@ bool CallStackTrie::buildMIBNodes(CallStackTrieNode *Node, LLVMContext &Ctx,
397468
std::vector<ContextTotalSize> ContextSizeInfo;
398469
collectContextSizeInfo(Node, ContextSizeInfo);
399470
MIBNodes.push_back(createMIBNode(Ctx, MIBCallStack, AllocationType::NotCold,
400-
ContextSizeInfo));
471+
ContextSizeInfo, TotalBytes, ColdBytes));
401472
return true;
402473
}
403474

@@ -444,12 +515,15 @@ bool CallStackTrie::buildAndAttachMIBMetadata(CallBase *CI) {
444515
std::vector<uint64_t> MIBCallStack;
445516
MIBCallStack.push_back(AllocStackId);
446517
std::vector<Metadata *> MIBNodes;
518+
uint64_t TotalBytes = 0;
519+
uint64_t ColdBytes = 0;
447520
assert(!Alloc->Callers.empty() && "addCallStack has not been called yet");
448521
// The CalleeHasAmbiguousCallerContext flag is meant to say whether the
449522
// callee of the given node has more than one caller. Here the node being
450523
// passed in is the alloc and it has no callees. So it's false.
451524
if (buildMIBNodes(Alloc, Ctx, MIBCallStack, MIBNodes,
452-
/*CalleeHasAmbiguousCallerContext=*/false)) {
525+
/*CalleeHasAmbiguousCallerContext=*/false, TotalBytes,
526+
ColdBytes)) {
453527
assert(MIBCallStack.size() == 1 &&
454528
"Should only be left with Alloc's location in stack");
455529
CI->setMetadata(LLVMContext::MD_memprof, MDNode::get(Ctx, MIBNodes));

llvm/lib/Transforms/Instrumentation/MemProfiler.cpp

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -176,11 +176,9 @@ static cl::opt<bool>
176176
cl::desc("Salvage stale MemProf profile"),
177177
cl::init(false), cl::Hidden);
178178

179-
cl::opt<unsigned> MinClonedColdBytePercent(
180-
"memprof-cloning-cold-threshold", cl::init(100), cl::Hidden,
181-
cl::desc("Min percent of cold bytes to hint alloc cold during cloning"));
182-
183179
extern cl::opt<bool> MemProfReportHintedSizes;
180+
extern cl::opt<unsigned> MinClonedColdBytePercent;
181+
extern cl::opt<unsigned> MinCallsiteColdBytePercent;
184182

185183
static cl::opt<unsigned> MinMatchedColdBytePercent(
186184
"memprof-matching-cold-threshold", cl::init(100), cl::Hidden,
@@ -293,6 +291,13 @@ class ModuleMemProfiler {
293291
Function *MemProfCtorFunction = nullptr;
294292
};
295293

294+
// Options under which we need to record the context size info in the alloc trie
295+
// used to build metadata.
296+
bool recordContextSizeInfo() {
297+
return MemProfReportHintedSizes || MinClonedColdBytePercent < 100 ||
298+
MinCallsiteColdBytePercent < 100;
299+
}
300+
296301
} // end anonymous namespace
297302

298303
MemProfilerPass::MemProfilerPass() = default;
@@ -758,7 +763,7 @@ static AllocationType addCallStack(CallStackTrie &AllocTrie,
758763
AllocInfo->Info.getAllocCount(),
759764
AllocInfo->Info.getTotalLifetime());
760765
std::vector<ContextTotalSize> ContextSizeInfo;
761-
if (MemProfReportHintedSizes || MinClonedColdBytePercent < 100) {
766+
if (recordContextSizeInfo()) {
762767
auto TotalSize = AllocInfo->Info.getTotalSize();
763768
assert(TotalSize);
764769
assert(FullStackId != 0);
@@ -1141,8 +1146,7 @@ readMemprof(Module &M, Function &F, IndexedInstrProfReader *MemProfReader,
11411146
InlinedCallStack)) {
11421147
NumOfMemProfMatchedAllocContexts++;
11431148
uint64_t FullStackId = 0;
1144-
if (ClPrintMemProfMatchInfo || MemProfReportHintedSizes ||
1145-
MinClonedColdBytePercent < 100)
1149+
if (ClPrintMemProfMatchInfo || recordContextSizeInfo())
11461150
FullStackId = computeFullStackId(AllocInfo->CallStack);
11471151
auto AllocType = addCallStack(AllocTrie, AllocInfo, FullStackId);
11481152
TotalSize += AllocInfo->Info.getTotalSize();

0 commit comments

Comments
 (0)