Skip to content

Commit 1cbe49f

Browse files
tcreech-intel and yuxuanchen1997
authored and committed
[llvm-profgen] Add --sample-period to estimate absolute counts (#99826)
Summary: Without `--sample-period`, no assumptions are made about perf profile sample frequencies. This is useful for comparing relative hotness of different program locations within the same profile. With `--sample-period`, LBR- and IP-based profile hit counts are adjusted to estimate the absolute total event count for each program location. This makes it reasonable to compare hit counts between different profiles, e.g., between two LBR-based execution frequency profiles with different sampling periods or between LBR-based execution frequency profiles and IP-based branch mispredict profiles. This functionality is in support of HWPGO[^1], which aims to enable feedback from a wider range of hardware events. [^1]: https://llvm.org/devmtg/2024-04/slides/TechnicalTalks/Xiao-EnablingHW-BasedPGO.pdf Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251524
1 parent c021892 commit 1cbe49f

File tree

2 files changed

+98
-0
lines changed

2 files changed

+98
-0
lines changed
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --skip-symbolization --perf-event=br_inst_retired.near_taken:upp --sample-period=1000003
2+
// RUN: FileCheck %s --input-file %t --check-prefix=CHECK-RAW-PROFILE
3+
// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --perf-event=br_inst_retired.near_taken:upp --sample-period=1000003
4+
// RUN: FileCheck %s --input-file %t --check-prefix=CHECK
5+
6+
// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --skip-symbolization --perf-event=br_misp_retired.all_branches:upp --leading-ip-only --sample-period=1000003
7+
// RUN: FileCheck %s --input-file %t --check-prefix=UNPRED-RAW-PROFILE
8+
// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --perf-event=br_misp_retired.all_branches:upp --leading-ip-only --sample-period=1000003
9+
// RUN: FileCheck %s --input-file %t --check-prefix=UNPRED
10+
11+
// Check that we can use perf event filtering to generate multiple types of
12+
// source-level profiles from a single perf profile. In this case, we generate
13+
// a typical execution frequency profile using br_inst_retired.near_taken LBRs,
14+
// and a branch mispredict profile using br_misp_retired.all_branches sample
15+
// IPs.
16+
17+
// Check that we can use --sample-period to compute LBR and IP-based profiles
18+
// which have comparable and absolute magnitudes. For example, in this case the
19+
// branch of interest (at source line offset 4) is in a loop body which is
20+
// executed ~20M times in total, and it's mispredicted about 9M times, yielding
21+
// a mispredict rate of roughly 0.45.
22+
23+
// The source example below is based on perfKernelCpp/cmov_3, except a
24+
// misleading builtin is used to persuade the compiler not to use cmov, which
25+
// induces branch mispredicts.
26+
27+
// CHECK: sel_arr:652547082:0
28+
// CHECK: 3.1: 20225766
29+
// CHECK: 3.2: 20225766
30+
// CHECK: 4: 19838670
31+
// CHECK: 5: 20225766
32+
33+
// UNPRED: sel_arr:18000054:0
34+
// UNPRED: 3.1: 0
35+
// UNPRED: 3.2: 0
36+
// UNPRED: 4: 9000027
37+
// UNPRED: 5: 0
38+
39+
// CHECK-RAW-PROFILE: 3
40+
// CHECK-RAW-PROFILE-NEXT: 2f0-2fa:9774174
41+
// CHECK-RAW-PROFILE-NEXT: 2f0-310:10064496
42+
// CHECK-RAW-PROFILE-NEXT: 2ff-310:10161270
43+
44+
// UNPRED-RAW-PROFILE: 1
45+
// UNPRED-RAW-PROFILE-NEXT: 2fa-2fa:9000027
46+
47+
// original code:
48+
// icx -fprofile-sample-generate lit.c
49+
#include <stdlib.h>
50+
51+
#define N 20000 // element count of each of the four arrays; also the loop bound in init() and sel_arr()
52+
#define ITERS 10000 // number of times main() calls sel_arr()
53+
54+
static int *m_s1, *m_s2, *m_s3, *m_dst; // allocated in init(); handed to sel_arr() by main()
55+
56+
void init(void) { // Allocates the four N-element int arrays and fills the three source arrays.
57+
m_s1 = malloc(sizeof(int)*N); // malloc results are not checked for NULL
58+
m_s2 = malloc(sizeof(int)*N);
59+
m_s3 = malloc(sizeof(int)*N);
60+
m_dst = malloc(sizeof(int)*N); // contents are not initialized here; sel_arr() writes every element
61+
srand(42); // fixed seed for the rand() calls below
62+
63+
for (int i = 0; i < N; i++) {
64+
m_s1[i] = rand() % N; // pseudo-random value in [0, N); sel_arr() compares it against 10035
65+
m_s2[i] = 0; // value selected when the sel_arr() condition is true
66+
m_s3[i] = 1; // value selected when the sel_arr() condition is false
67+
}
68+
}
69+
70+
void __attribute__((noinline)) sel_arr(int *dst, int *s1, int *s2, int *s3) { // For each i in [0, N): dst[i] = s2[i] if s1[i] < 10035, otherwise s3[i].
71+
#pragma nounroll
72+
#pragma clang loop vectorize(disable) interleave(disable)
73+
for (int i = 0; i < N; i++) {
74+
int *p = __builtin_expect((s1[i] < 10035), 0) ? &s2[i] : &s3[i]; // hint marks the condition as expected-false; per the header comment it is deliberately misleading so the compiler emits a branch instead of cmov
75+
dst[i] = *p; // load through the selected pointer and store the result
76+
}
77+
}
78+
79+
int main(void) { // Sets up the arrays once, then runs the selection kernel ITERS times.
80+
init();
81+
for(int i=0; i<ITERS; ++i)
82+
sel_arr(m_dst, m_s1, m_s2, m_s3); // same inputs on every call; the loop body totals N * ITERS iterations
83+
return 0;
84+
}

llvm/tools/llvm-profgen/PerfReader.cpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,10 @@ static cl::alias
5353
cl::desc("Comma-delimited version of -perf-event"),
5454
cl::aliasopt(PerfEventFilter));
5555

56+
static cl::opt<uint64_t>
57+
SamplePeriod("sample-period", cl::init(1),
58+
cl::desc("The sampling period (-c) used for perf data")); // Defaults to 1; LBRPerfReader::parseSample scales counts by it only when the flag was given on the command line (getNumOccurrences() != 0).
59+
5660
extern cl::opt<std::string> PerfTraceFilename;
5761
extern cl::opt<bool> ShowDisassemblyOnly;
5862
extern cl::opt<bool> ShowSourceLocations;
@@ -1000,6 +1004,16 @@ void LBRPerfReader::parseSample(TraceStream &TraceIt, uint64_t Count) {
10001004
if (extractLBRStack(TraceIt, Sample->LBRStack)) {
10011005
warnIfMissingMMap();
10021006
// Record LBR only samples by aggregation
1007+
// If a sampling period is given we can adjust the magnitude of sample
1008+
// counts to estimate the absolute magnitude.
1009+
if (SamplePeriod.getNumOccurrences()) {
1010+
Count *= SamplePeriod;
1011+
// If counts are LBR-based, as opposed to IP-based, then the magnitude is
1012+
// now amplified by roughly the LBR stack size. By adjusting this down, we
1013+
// can produce LBR-based and IP-based profiles with comparable magnitudes.
1014+
if (!LeadingIPOnly && Sample->LBRStack.size() > 1)
1015+
Count /= (Sample->LBRStack.size() - 1);
1016+
}
10031017
AggregatedSamples[Hashable<PerfSample>(Sample)] += Count;
10041018
}
10051019
}

0 commit comments

Comments
 (0)