Skip to content

Commit 856a6a5

Browse files
committed
[CSSPGO][llvm-profgen] Trim and merge context beforehand to reduce memory usage
Currently we use a centralized string map (StringMap<FunctionSamples> ProfileMap) to store the profile while populating the samples, which can become a memory usage bottleneck. In one extreme case, I saw thousands of samples whose context stack depth was >= 100, and the memory consumption could exceed 100GB. Since the context is used for inlining, we can assume that there won't be that many inlinees staying inlined into the same root function, so this change caps the context stack depth and merges the samples to reduce peak memory; this is done after recursion compression. The default value is -1, meaning no depth limit; in the future we can tune it to a smaller one. Reviewed By: hoy, wenlei Differential Revision: https://reviews.llvm.org/D107800
1 parent 35d6e75 commit 856a6a5

File tree

7 files changed

+66
-6
lines changed

7 files changed

+66
-6
lines changed

llvm/test/tools/llvm-profgen/merge-cold-profile.test

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
; RUN: FileCheck %s --input-file %t3 --check-prefix=CHECK-UNMERGED
1212

1313
; Test --csprof-frame-depth-for-cold-context
14-
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t2 --compress-recursion=-1 --profile-summary-cold-count=100 --csprof-trim-cold-context=0 --csprof-frame-depth-for-cold-context=2
14+
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t2 --compress-recursion=-1 --profile-summary-cold-count=100 --csprof-trim-cold-context=0 --csprof-max-cold-context-depth=2
1515
; RUN: FileCheck %s --input-file %t2 --check-prefix=CHECK-COLD-CONTEXT-LENGTH
1616

1717
; CHECK: [fa]:14:4

llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-UNCOMPRESS
44
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-noprobe.perfscript --binary=%S/Inputs/recursion-compression-noprobe.perfbin --output=%t --profile-summary-cold-count=0
55
; RUN: FileCheck %s --input-file %t
6+
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-noprobe.perfscript --binary=%S/Inputs/recursion-compression-noprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-cold-count=0 --csprof-max-context-depth=2
7+
; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-CTX-DEPTH
68

79
; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb]:48:0
810
; CHECK-UNCOMPRESS: 1: 11
@@ -21,6 +23,20 @@
2123
; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa:2 @ fb:2 @ fa:2 @ fb]:2:0
2224
; CHECK-UNCOMPRESS: 2: 1 fa:1
2325

26+
; CHECK-MAX-CTX-DEPTH:[foo:3 @ fa:2 @ fb]:47:0
27+
; CHECK-MAX-CTX-DEPTH: 1: 11
28+
; CHECK-MAX-CTX-DEPTH:[main:1 @ foo:3 @ fa]:13:0
29+
; CHECK-MAX-CTX-DEPTH: 1: 1
30+
; CHECK-MAX-CTX-DEPTH: 2: 2
31+
; CHECK-MAX-CTX-DEPTH:[fa:2 @ fb:2 @ fa]:8:0
32+
; CHECK-MAX-CTX-DEPTH: 1: 1
33+
; CHECK-MAX-CTX-DEPTH: 2: 1
34+
; CHECK-MAX-CTX-DEPTH: 4: 1
35+
; CHECK-MAX-CTX-DEPTH:[main:1 @ foo]:7:0
36+
; CHECK-MAX-CTX-DEPTH: 2: 1
37+
; CHECK-MAX-CTX-DEPTH: 3: 2 fa:1
38+
; CHECK-MAX-CTX-DEPTH:[fb:2 @ fa:2 @ fb]:1:0
39+
2440

2541
; CHECK: [main:1 @ foo:3 @ fa:2 @ fb]:48:0
2642
; CHECK: 1: 11

llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
; RUN: FileCheck %s --input-file %t
66
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe-nommap.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --show-unwinder-output --profile-summary-cold-count=0 | FileCheck %s --check-prefix=CHECK-UNWINDER
77
; RUN: FileCheck %s --input-file %t
8-
8+
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --profile-summary-cold-count=0 --csprof-max-context-depth=0
9+
; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-MAX-CTX-DEPTH
910

1011
; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa]:4:1
1112
; CHECK-UNCOMPRESS: 1: 1
@@ -64,6 +65,25 @@
6465
; CHECK-UNCOMPRESS: !CFGChecksum: 563022570642068
6566

6667

68+
; CHECK-MAX-CTX-DEPTH: [fb]:19:6
69+
; CHECK-MAX-CTX-DEPTH: 1: 6
70+
; CHECK-MAX-CTX-DEPTH: 2: 3
71+
; CHECK-MAX-CTX-DEPTH: 3: 3
72+
; CHECK-MAX-CTX-DEPTH: 4: 0
73+
; CHECK-MAX-CTX-DEPTH: 5: 4 fb:4
74+
; CHECK-MAX-CTX-DEPTH: 6: 3 fa:3
75+
; CHECK-MAX-CTX-DEPTH: !CFGChecksum: 563022570642068
76+
; CHECK-MAX-CTX-DEPTH: [fa]:14:4
77+
; CHECK-MAX-CTX-DEPTH: 1: 4
78+
; CHECK-MAX-CTX-DEPTH: 3: 4
79+
; CHECK-MAX-CTX-DEPTH: 4: 2
80+
; CHECK-MAX-CTX-DEPTH: 5: 1
81+
; CHECK-MAX-CTX-DEPTH: 6: 0
82+
; CHECK-MAX-CTX-DEPTH: 7: 2 fb:2
83+
; CHECK-MAX-CTX-DEPTH: 8: 1 fa:1
84+
; CHECK-MAX-CTX-DEPTH: !CFGChecksum: 563070469352221
85+
86+
6787
; CHECK: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb]:13:4
6888
; CHECK: 1: 4
6989
; CHECK: 2: 3

llvm/tools/llvm-profgen/PerfReader.cpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,9 @@ std::shared_ptr<ProbeBasedCtxKey> ProbeStack::getContextKey() {
109109
}
110110
CSProfileGenerator::compressRecursionContext<const MCDecodedPseudoProbe *>(
111111
ProbeBasedKey->Probes);
112+
CSProfileGenerator::trimContext<const MCDecodedPseudoProbe *>(
113+
ProbeBasedKey->Probes);
114+
112115
ProbeBasedKey->genHashCode();
113116
return ProbeBasedKey;
114117
}

llvm/tools/llvm-profgen/ProfileGenerator.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,11 +44,17 @@ static cl::opt<bool> CSProfTrimColdContext(
4444
cl::desc("If the total count of the profile after all merge is done "
4545
"is still smaller than threshold, it will be trimmed."));
4646

47-
static cl::opt<uint32_t> CSProfColdContextFrameDepth(
48-
"csprof-frame-depth-for-cold-context", cl::init(1), cl::ZeroOrMore,
49-
cl::desc("Keep the last K frames while merging cold profile. 1 means the "
47+
static cl::opt<uint32_t> CSProfMaxColdContextDepth(
48+
"csprof-max-cold-context-depth", cl::init(1), cl::ZeroOrMore,
49+
cl::desc("Keep the last K contexts while merging cold profile. 1 means the "
5050
"context-less base profile"));
5151

52+
static cl::opt<int, true> CSProfMaxContextDepth(
53+
"csprof-max-context-depth", cl::ZeroOrMore,
54+
cl::desc("Keep the last K contexts while merging profile. -1 means no "
55+
"depth limit."),
56+
cl::location(llvm::sampleprof::CSProfileGenerator::MaxContextDepth));
57+
5258
static cl::opt<bool> EnableCSPreInliner(
5359
"csspgo-preinliner", cl::Hidden, cl::init(false),
5460
cl::desc("Run a global pre-inliner to merge context profile based on "
@@ -65,6 +71,8 @@ namespace sampleprof {
6571
// Initialize the MaxCompressionSize to -1 which means no size limit
6672
int32_t CSProfileGenerator::MaxCompressionSize = -1;
6773

74+
int CSProfileGenerator::MaxContextDepth = -1;
75+
6876
static bool
6977
usePseudoProbes(const BinarySampleCounterMap &BinarySampleCounters) {
7078
return BinarySampleCounters.size() &&
@@ -415,7 +423,7 @@ void CSProfileGenerator::postProcessProfiles() {
415423
SampleContextTrimmer(ProfileMap)
416424
.trimAndMergeColdContextProfiles(
417425
ColdCountThreshold, CSProfTrimColdContext, CSProfMergeColdContext,
418-
CSProfColdContextFrameDepth);
426+
CSProfMaxColdContextDepth);
419427
}
420428

421429
void CSProfileGenerator::computeSummaryAndThreshold() {
@@ -608,6 +616,7 @@ FunctionSamples &PseudoProbeCSProfileGenerator::getFunctionProfileForLeafProbe(
608616
std::string LeafFrame = ContextStrStack.back();
609617
ContextStrStack.pop_back();
610618
CSProfileGenerator::compressRecursionContext(ContextStrStack);
619+
CSProfileGenerator::trimContext(ContextStrStack);
611620

612621
std::ostringstream OContextStr;
613622
for (uint32_t I = 0; I < ContextStrStack.size(); I++) {

llvm/tools/llvm-profgen/ProfileGenerator.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,16 @@ class CSProfileGenerator : public ProfileGenerator {
7070
public:
7171
void generateProfile() override;
7272

73+
// Trim the context stack at a given depth.
74+
template <typename T>
75+
static void trimContext(SmallVectorImpl<T> &S, int Depth = MaxContextDepth) {
76+
if (Depth < 0 || static_cast<size_t>(Depth) >= S.size())
77+
return;
78+
std::copy(S.begin() + S.size() - static_cast<size_t>(Depth), S.end(),
79+
S.begin());
80+
S.resize(Depth);
81+
}
82+
7383
// Remove adjacent repeated context sequences up to a given sequence length,
7484
// -1 means no size limit. Note that repeated sequences are identified based
7585
// on the exact call site, this is finer granularity than function recursion.
@@ -212,6 +222,7 @@ class CSProfileGenerator : public ProfileGenerator {
212222
// Deduplicate adjacent repeated context sequences up to a given sequence
213223
// length. -1 means no size limit.
214224
static int32_t MaxCompressionSize;
225+
static int MaxContextDepth;
215226
};
216227

217228
using ProbeCounterMap =

llvm/tools/llvm-profgen/ProfiledBinary.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -125,6 +125,7 @@ ProfiledBinary::getExpandedContextStr(const SmallVectorImpl<uint64_t> &Stack,
125125
std::string LeafFrame = ContextVec.back();
126126
ContextVec.pop_back();
127127
CSProfileGenerator::compressRecursionContext<std::string>(ContextVec);
128+
CSProfileGenerator::trimContext<std::string>(ContextVec);
128129

129130
std::ostringstream OContextStr;
130131
for (uint32_t I = 0; I < (uint32_t)ContextVec.size(); I++) {

0 commit comments

Comments
 (0)