Skip to content

Commit e10b73f

Browse files
committed
[CSSPGO][llvm-profgen] Merge and trim profile for cold context to reduce profile size
This change allows merging and trimming cold context profiles in llvm-profgen to solve the profile size bloat problem. Currently, when a profile's total sample count is below a threshold (configurable via a switch), it is considered cold and merged into a base context-less profile, which keeps the profile quality at least as good as the baseline (non-CS). For example, given two input profiles: [main @ foo @ bar]:60 [main @ bar]:50 Under threshold = 100, the two profiles will be merged into one with the base context, giving the result: [bar]:110 Added two switches: `--csprof-cold-thres=<value>`: Specifies the total samples threshold for a context profile to be considered cold, with 100 being the default. Any cold context profiles will be merged into the context-less base profile by default. `--csprof-keep-cold`: Forces profile generation to keep cold context profiles instead of dropping them. By default, any cold context will not be written to the output profile. Results: Though we have not yet evaluated it with the latest CSSPGO, our internal branch shows neutral performance with a significantly reduced profile size. Detailed evaluation of llvm-profgen with CSSPGO will come later. Differential Revision: https://reviews.llvm.org/D94111
1 parent 36496cc commit e10b73f

9 files changed

+142
-9
lines changed

llvm/test/tools/llvm-profgen/inline-cs-noprobe.test

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: llvm-profgen --perfscript=%S/Inputs/inline-cs-noprobe.perfscript --binary=%S/Inputs/inline-cs-noprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER
1+
; RUN: llvm-profgen --perfscript=%S/Inputs/inline-cs-noprobe.perfscript --binary=%S/Inputs/inline-cs-noprobe.perfbin --output=%t --show-unwinder-output --csprof-cold-thres=0 | FileCheck %s --check-prefix=CHECK-UNWINDER
22
; RUN: FileCheck %s --input-file %t
33

44
; CHECK:[main:1 @ foo]:44:0

llvm/test/tools/llvm-profgen/inline-cs-pseudoprobe.test

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: llvm-profgen --perfscript=%S/Inputs/inline-cs-pseudoprobe.perfscript --binary=%S/Inputs/inline-cs-pseudoprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER
1+
; RUN: llvm-profgen --perfscript=%S/Inputs/inline-cs-pseudoprobe.perfscript --binary=%S/Inputs/inline-cs-pseudoprobe.perfbin --output=%t --show-unwinder-output --csprof-cold-thres=0 | FileCheck %s --check-prefix=CHECK-UNWINDER
22
; RUN: FileCheck %s --input-file %t
33

44
; CHECK: [main:2 @ foo]:74:0
Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
; Uses the data from recursion-compression.test; refer to it for the unmerged output
2+
; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=-1 --csprof-cold-thres=8
3+
; RUN: FileCheck %s --input-file %t
4+
5+
; Test --csprof-keep-cold
6+
; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=-1 --csprof-cold-thres=100 --csprof-keep-cold
7+
; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-KEEP-COLD
8+
9+
; CHECK: [fa]:14:4
10+
; CHECK-NEXT: 1: 4
11+
; CHECK-NEXT: 3: 4
12+
; CHECK-NEXT: 4: 2
13+
; CHECK-NEXT: 5: 1
14+
; CHECK-NEXT: 7: 2 fb:2
15+
; CHECK-NEXT: 8: 1 fa:1
16+
; CHECK-NEXT: !CFGChecksum: 120515930909
17+
; CHECK-NEXT:[main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb]:13:4
18+
; CHECK-NEXT: 1: 4
19+
; CHECK-NEXT: 2: 3
20+
; CHECK-NEXT: 3: 1
21+
; CHECK-NEXT: 5: 4 fb:4
22+
; CHECK-NEXT: 6: 1 fa:1
23+
; CHECK-NEXT: !CFGChecksum: 72617220756
24+
25+
; CHECK-KEEP-COLD: [fb]:19:6
26+
; CHECK-KEEP-COLD-NEXT: 1: 6
27+
; CHECK-KEEP-COLD-NEXT: 2: 3
28+
; CHECK-KEEP-COLD-NEXT: 3: 3
29+
; CHECK-KEEP-COLD-NEXT: 5: 4 fb:4
30+
; CHECK-KEEP-COLD-NEXT: 6: 3 fa:3
31+
; CHECK-KEEP-COLD-NEXT: !CFGChecksum: 72617220756
32+
; CHECK-KEEP-COLD-NEXT:[fa]:14:4
33+
; CHECK-KEEP-COLD-NEXT: 1: 4
34+
; CHECK-KEEP-COLD-NEXT: 3: 4
35+
; CHECK-KEEP-COLD-NEXT: 4: 2
36+
; CHECK-KEEP-COLD-NEXT: 5: 1
37+
; CHECK-KEEP-COLD-NEXT: 7: 2 fb:2
38+
; CHECK-KEEP-COLD-NEXT: 8: 1 fa:1
39+
; CHECK-KEEP-COLD-NEXT: !CFGChecksum: 120515930909
40+
41+
42+
; clang -O3 -fexperimental-new-pass-manager -fuse-ld=lld -fpseudo-probe-for-profiling
43+
; -fno-omit-frame-pointer -mno-omit-leaf-frame-pointer -Xclang -mdisable-tail-calls
44+
; -g test.c -o a.out
45+
46+
; Copied from recursion-compression.test
47+
#include <stdio.h>
48+
49+
int fb(int n) {
50+
if(n > 10) return fb(n / 2);
51+
return fa(n - 1);
52+
}
53+
54+
int fa(int n) {
55+
if(n < 2) return n;
56+
if(n % 2) return fb(n - 1);
57+
return fa(n - 1);
58+
}
59+
60+
void foo() {
61+
int s, i = 0;
62+
while (i++ < 10000)
63+
s += fa(i);
64+
printf("sum is %d\n", s);
65+
}
66+
67+
int main() {
68+
foo();
69+
return 0;
70+
}

llvm/test/tools/llvm-profgen/noinline-cs-noprobe.test

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: llvm-profgen --perfscript=%S/Inputs/noinline-cs-noprobe.perfscript --binary=%S/Inputs/noinline-cs-noprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER
1+
; RUN: llvm-profgen --perfscript=%S/Inputs/noinline-cs-noprobe.perfscript --binary=%S/Inputs/noinline-cs-noprobe.perfbin --output=%t --show-unwinder-output --csprof-cold-thres=0 | FileCheck %s --check-prefix=CHECK-UNWINDER
22
; RUN: FileCheck %s --input-file %t
33

44
; CHECK:[main:1 @ foo:3 @ bar]:12:3

llvm/test/tools/llvm-profgen/noinline-cs-pseudoprobe.test

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; RUN: llvm-profgen --perfscript=%S/Inputs/noinline-cs-pseudoprobe.perfscript --binary=%S/Inputs/noinline-cs-pseudoprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER
1+
; RUN: llvm-profgen --perfscript=%S/Inputs/noinline-cs-pseudoprobe.perfscript --binary=%S/Inputs/noinline-cs-pseudoprobe.perfbin --output=%t --show-unwinder-output --csprof-cold-thres=0 | FileCheck %s --check-prefix=CHECK-UNWINDER
22
; RUN: FileCheck %s --input-file %t
33

44
; CHECK: [main:2 @ foo]:75:0

llvm/test/tools/llvm-profgen/recursion-compression-noprobe.test

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; Firstly test uncompression(--compress-recursion=0)
2-
; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-noprobe.perfscript --binary=%S/Inputs/recursion-compression-noprobe.perfbin --output=%t --compress-recursion=0
2+
; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-noprobe.perfscript --binary=%S/Inputs/recursion-compression-noprobe.perfbin --output=%t --compress-recursion=0 --csprof-cold-thres=0
33
; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-UNCOMPRESS
4-
; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-noprobe.perfscript --binary=%S/Inputs/recursion-compression-noprobe.perfbin --output=%t
4+
; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-noprobe.perfscript --binary=%S/Inputs/recursion-compression-noprobe.perfbin --output=%t --csprof-cold-thres=0
55
; RUN: FileCheck %s --input-file %t
66

77
; CHECK-UNCOMPRESS:[main:1 @ foo:3 @ fa]:14:0

llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
; Firstly test uncompression(--compress-recursion=0)
2-
; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0
2+
; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --compress-recursion=0 --csprof-cold-thres=0
33
; RUN: FileCheck %s --input-file %t -check-prefix=CHECK-UNCOMPRESS
4-
; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --show-unwinder-output | FileCheck %s --check-prefix=CHECK-UNWINDER
4+
; RUN: llvm-profgen --perfscript=%S/Inputs/recursion-compression-pseudoprobe.perfscript --binary=%S/Inputs/recursion-compression-pseudoprobe.perfbin --output=%t --show-unwinder-output --csprof-cold-thres=0 | FileCheck %s --check-prefix=CHECK-UNWINDER
55
; RUN: FileCheck %s --input-file %t
66

77
; CHECK-UNCOMPRESS: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa]:4:1

llvm/tools/llvm-profgen/ProfileGenerator.cpp

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,19 @@ static cl::opt<int32_t, true> RecursionCompression(
2929
cl::Hidden,
3030
cl::location(llvm::sampleprof::CSProfileGenerator::MaxCompressionSize));
3131

32+
// Total-sample threshold below which a context profile is treated as cold.
// Defaults to 100; a value of 0 makes mergeAndTrimColdProfile() return
// without merging anything.
static cl::opt<uint64_t> CSProfColdThres(
33+
"csprof-cold-thres", cl::init(100), cl::ZeroOrMore,
34+
cl::desc("Specify the total samples threshold for a context profile to "
35+
"be considered cold, any cold profiles will be merged into "
36+
"context-less base profiles"));
37+
38+
// When set, merged base profiles that are still below --csprof-cold-thres are
// kept instead of being skipped by mergeAndTrimColdProfile(). Defaults to
// false.
static cl::opt<bool> CSProfKeepCold(
39+
"csprof-keep-cold", cl::init(false), cl::ZeroOrMore,
40+
cl::desc("This works together with --csprof-cold-thres. If the total count "
41+
"of the profile after all merge is done is still smaller than the "
42+
"csprof-cold-thres, it will be trimmed unless csprof-keep-cold "
43+
"flag is specified."));
44+
3245
using namespace llvm;
3346
using namespace sampleprof;
3447

@@ -68,6 +81,7 @@ void ProfileGenerator::write() {
6881
if (std::error_code EC = WriterOrErr.getError())
6982
exitWithError(EC, OutputFilename);
7083
auto Writer = std::move(WriterOrErr.get());
84+
mergeAndTrimColdProfile(ProfileMap);
7185
Writer->write(ProfileMap);
7286
}
7387

@@ -329,6 +343,49 @@ void CSProfileGenerator::populateInferredFunctionSamples() {
329343
}
330344
}
331345

346+
// Merge every profile in ProfileMap whose total sample count is below
// --csprof-cold-thres into a base profile keyed by the profile's getName(),
// erase the cold entries from ProfileMap, and then merge the base profiles
// back in through getFunctionProfileForContext(). A base profile that is
// itself still below the threshold and has no existing entry in ProfileMap is
// skipped (i.e. trimmed) unless --csprof-keep-cold is set.
void CSProfileGenerator::mergeAndTrimColdProfile(
347+
StringMap<FunctionSamples> &ProfileMap) {
348+
// Nothing to merge if the sample threshold is zero
349+
if (!CSProfColdThres)
350+
return;
351+
352+
// Collect the cold profiles from ProfileMap (key plus pointer to the entry)
353+
// into a temporary container so ProfileMap is not modified while iterating
354+
std::vector<std::pair<StringRef, const FunctionSamples *>> ToRemoveVec;
355+
for (const auto &I : ProfileMap) {
356+
const FunctionSamples &FunctionProfile = I.second;
357+
if (FunctionProfile.getTotalSamples() >= CSProfColdThres)
358+
continue;
359+
ToRemoveVec.emplace_back(I.getKey(), &I.second);
360+
}
361+
362+
// Remove the cold profiles from ProfileMap and merge them into BaseProfileMap
363+
StringMap<FunctionSamples> BaseProfileMap;
364+
for (const auto &I : ToRemoveVec) {
365+
auto Ret =
366+
BaseProfileMap.try_emplace(I.second->getName(), FunctionSamples());
367+
FunctionSamples &BaseProfile = Ret.first->second;
368+
BaseProfile.merge(*I.second);
369+
ProfileMap.erase(I.first);
370+
}
371+
372+
// Merge the base profiles back into ProfileMap
373+
for (const auto &I : BaseProfileMap) {
374+
// Skip a base profile that is still cold and has no existing entry in
// ProfileMap, unless --csprof-keep-cold was given
375+
if (!CSProfKeepCold && I.second.getTotalSamples() < CSProfColdThres &&
376+
ProfileMap.find(I.getKey()) == ProfileMap.end())
377+
continue;
378+
// Merge the profile if the original profile exists, otherwise just insert
379+
// as a new profile
380+
FunctionSamples &OrigProfile = getFunctionProfileForContext(I.getKey());
381+
StringRef TmpName = OrigProfile.getName();
382+
OrigProfile.merge(I.second);
383+
// Restore the name captured before merge() so it keeps using the name ref
384+
// from ProfileMap's key; BaseProfileMap is local and is freed on return
385+
OrigProfile.setName(TmpName);
386+
}
387+
}
388+
332389
// Helper function to extract context prefix string stack
333390
// Extract context stack for reusing, leaf context stack will
334391
// be added compressed while looking up function profile

llvm/tools/llvm-profgen/ProfileGenerator.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,10 @@ class ProfileGenerator {
2828
create(const BinarySampleCounterMap &BinarySampleCounters,
2929
enum PerfScriptType SampleType);
3030
virtual void generateProfile() = 0;
31-
31+
// Merge and trim profile with cold context before serialization,
32+
// only eligible for CS profile
33+
virtual void
34+
mergeAndTrimColdProfile(StringMap<FunctionSamples> &ProfileMap){};
3235
// Use SampleProfileWriter to serialize profile map
3336
void write();
3437

@@ -200,6 +203,9 @@ class CSProfileGenerator : public ProfileGenerator {
200203
protected:
201204
// Lookup or create FunctionSamples for the context
202205
FunctionSamples &getFunctionProfileForContext(StringRef ContextId);
206+
// Merge cold context profile whose total sample is below threshold
207+
// into base profile.
208+
void mergeAndTrimColdProfile(StringMap<FunctionSamples> &ProfileMap) override;
203209

204210
private:
205211
// Helper function for updating body sample for a leaf location in

0 commit comments

Comments
 (0)