Skip to content

Commit eda9653

Browse files
tcreech-intel authored and yuxuanchen1997 committed
[llvm-profgen] Support creating profiles of arbitrary events (#99026)
Summary: This change introduces two options which may be used to create profiles of arbitrary PMU events. 1. `--leading-ip-only` provides a simple sample-IP-based profile mode. This is not useful for building a profile of execution frequency, but it is useful for building new types of profiles. For example, to build a profile of unpredictable branches: perf record -b -e branch-misses:upp -o perf.data ... llvm-profgen --perfdata perf.data --leading-ip-only ... 2. `--perf-event=event` enables the creation of a profile concerned with a specific event or set of events. The names given should match the "event" field as emitted by perf-script(1). This option has two spellings: `--perf-event` and `--perf-events`. The plural spelling accepts a comma-separated list. The singular spelling appends a single event name to the set of events which will be used. This is meant to accommodate event names containing commas. Combined, these options allow generating multiple kinds of profiles from a single `perf record` collection. For example, to generate both execution frequency and branch mispredict profiles: perf record -c 1000003 -b -e br_inst_retired.near_taken:upp,br_misp_retired.all_branches:upp ... llvm-profgen --output execution.prof --perf-event=br_inst_retired.near_taken:upp ... llvm-profgen --leading-ip-only --output unpredictable.prof --perf-event=br_misp_retired.all_branches:upp ... These additions are in support of more general HWPGO[^1], allowing feedback from a wider range of hardware events. [^1]: https://llvm.org/devmtg/2024-04/slides/TechnicalTalks/Xiao-EnablingHW-BasedPGO.pdf --------- Co-authored-by: Tim Creech <[email protected]> Test Plan: Reviewers: Subscribers: Tasks: Tags: Differential Revision: https://phabricator.intern.facebook.com/D60251204
1 parent a50f1e4 commit eda9653

File tree

9 files changed

+331
-20
lines changed

9 files changed

+331
-20
lines changed
Binary file not shown.

llvm/test/tools/llvm-profgen/Inputs/cmov_3.perfscript

Lines changed: 39 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
4006b7 0x4006b7/0x40068b/P/-/-/1 0x4006c8/0x4006b0/P/-/-/1 0x400689/0x4006b9/P/-/-/1 0x40066d/0x400686/P/-/-/2 0x4007a6/0x400650/P/-/-/9 0x4007ca/0x400790/P/-/-/8 0x4007d7/0x4007bd/P/-/-/1 0x400792/0x4007d7/P/-/-/1 0x4007b8/0x400790/P/-/-/2 0x4006a2/0x4007a8/P/-/-/3
2+
40065d 40065d/0x40068f/M/-/-/1
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
// Invalid perf line
2+
40062f 0x40062f/0x4005b0/P/-/-/9 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005d7/0x4005e5/P/-/-/6 0x40062f/0x4005b0/P/-/-/16 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005d7/0x4005e5/P/-/-/6 0x40062f/0x4005b0/P/-/-/6 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005c8/0x4005dc/P/-/-/8 0x40062f/0x4005b0/P/-/-/9 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005d7/0x4005e5/P/-/-/10 0x40062f/0x4005b0/P/-/-/14 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005d7/0x4005e5/P/-/-/7 0x40062f/0x4005b0/P/-/-/8 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005c8/0x4005dc/P/-/-/7 0x40062f/0x4005b0/P/-/-/15 0x400645/0x4005ff/P/-/-/1
3+
4005d7 0x4005d7/0x4005e5/P/-/-/8 0x40062f/0x4005b0/P/-/-/6 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/2 0x4005c8/0x4005dc/P/-/-/7 0x40062f/0x4005b0/P/-/-/11 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005d7/0x4005e5/P/-/-/8 0x40062f/0x4005b0/P/-/-/9 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005d7/0x4005e5/P/-/-/5 0x40062f/0x4005b0/P/-/-/11 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/2 0x4005c8/0x4005dc/P/-/-/7 0x40062f/0x4005b0/P/-/-/10 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005d7/0x4005e5/P/-/-/8 0x40062f/0x4005b0/P/-/-/9 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005d7/0x4005e5/P/-/-/13 0x40062f/0x4005b0/P/-/-/9
4+
4005c8 0x4005c8/0x4005dc/P/-/-/11 0x40062f/0x4005b0/P/-/-/8 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005d7/0x4005e5/P/-/-/5 0x40062f/0x4005b0/P/-/-/6 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005d7/0x4005e5/P/-/-/12 0x40062f/0x4005b0/P/-/-/6 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/2 0x4005c8/0x4005dc/P/-/-/7 0x40062f/0x4005b0/P/-/-/10 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005d7/0x4005e5/P/-/-/8 0x40062f/0x4005b0/P/-/-/9 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005d7/0x4005e5/P/-/-/12 0x40062f/0x4005b0/P/-/-/6 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/2 0x4005c8/0x4005dc/P/-/-/8 0x40062f/0x4005b0/P/-/-/8
5+
4005c5 0x4005c8/0x4005dc/P/-/-/11 0x40062f/0x4005b0/P/-/-/8 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005d7/0x4005e5/P/-/-/5 0x40062f/0x4005b0/P/-/-/6 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005d7/0x4005e5/P/-/-/12 0x40062f/0x4005b0/P/-/-/6 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/2 0x4005c8/0x4005dc/P/-/-/7 0x40062f/0x4005b0/P/-/-/10 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005d7/0x4005e5/P/-/-/8 0x40062f/0x4005b0/P/-/-/9 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/1 0x4005d7/0x4005e5/P/-/-/12 0x40062f/0x4005b0/P/-/-/6 0x400645/0x4005ff/P/-/-/1 0x400637/0x400645/P/-/-/1 0x4005e9/0x400634/P/-/-/2 0x4005c8/0x4005dc/P/-/-/8 0x40062f/0x4005b0/P/-/-/8
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --skip-symbolization --perf-event=br_inst_retired.near_taken:upp
2+
// RUN: FileCheck %s --input-file %t --check-prefix=CHECK-RAW-PROFILE
3+
// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --perf-event=br_inst_retired.near_taken:upp
4+
// RUN: FileCheck %s --input-file %t --check-prefix=CHECK
5+
6+
// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --skip-symbolization --perf-event=br_misp_retired.all_branches:upp --leading-ip-only
7+
// RUN: FileCheck %s --input-file %t --check-prefix=UNPRED-RAW-PROFILE
8+
// RUN: llvm-profgen --format=text --perfscript=%S/Inputs/cmov_3.perfscript --binary=%S/Inputs/cmov_3.perfbin --output=%t --perf-event=br_misp_retired.all_branches:upp --leading-ip-only
9+
// RUN: FileCheck %s --input-file %t --check-prefix=UNPRED
10+
11+
// Check that we can use perf event filtering to generate multiple types of
12+
// source-level profiles from a single perf profile. In this case, we generate
13+
// a typical execution frequency profile using br_inst_retired.near_taken LBRs,
14+
// and a branch mispredict profile using br_misp_retired.all_branches sample
15+
// IPs.
16+
17+
// The source example below is based on perfKernelCpp/cmov_3, except a
18+
// misleading builtin is used to persuade the compiler not to use cmov, which
19+
// induces branch mispredicts.
20+
21+
// CHECK: sel_arr:20229:0
22+
// CHECK: 3.1: 627
23+
// CHECK: 3.2: 627
24+
// CHECK: 4: 615
25+
// CHECK: 5: 627
26+
27+
// UNPRED: sel_arr:18:0
28+
// UNPRED: 3.1: 0
29+
// UNPRED: 3.2: 0
30+
// UNPRED: 4: 9
31+
// UNPRED: 5: 0
32+
33+
// CHECK-RAW-PROFILE: 3
34+
// CHECK-RAW-PROFILE-NEXT: 2f0-2fa:303
35+
// CHECK-RAW-PROFILE-NEXT: 2f0-310:312
36+
// CHECK-RAW-PROFILE-NEXT: 2ff-310:315
37+
38+
// UNPRED-RAW-PROFILE: 1
39+
// UNPRED-RAW-PROFILE-NEXT: 2fa-2fa:9
40+
41+
// original code:
42+
// clang -O2 -gline-tables-only -fdebug-info-for-profiling lit.c
43+
#include <stdlib.h>
44+
45+
#define N 20000
46+
#define ITERS 10000
47+
48+
static int *m_s1, *m_s2, *m_s3, *m_dst;
49+
50+
void init(void) {
51+
m_s1 = malloc(sizeof(int)*N);
52+
m_s2 = malloc(sizeof(int)*N);
53+
m_s3 = malloc(sizeof(int)*N);
54+
m_dst = malloc(sizeof(int)*N);
55+
srand(42);
56+
57+
for (int i = 0; i < N; i++) {
58+
m_s1[i] = rand() % N;
59+
m_s2[i] = 0;
60+
m_s3[i] = 1;
61+
}
62+
}
63+
64+
void __attribute__((noinline)) sel_arr(int *dst, int *s1, int *s2, int *s3) {
65+
#pragma nounroll
66+
#pragma clang loop vectorize(disable) interleave(disable)
67+
for (int i = 0; i < N; i++) {
68+
int *p = __builtin_expect((s1[i] < 10035), 0) ? &s2[i] : &s3[i];
69+
dst[i] = *p;
70+
}
71+
}
72+
73+
int main(void) {
74+
init();
75+
for(int i=0; i<ITERS; ++i)
76+
sel_arr(m_dst, m_s1, m_s2, m_s3);
77+
return 0;
78+
}
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/ip-duplication.perfscript --binary=%S/Inputs/inline-noprobe2.perfbin --output=%t --use-offset=0 --leading-ip-only
2+
; RUN: FileCheck %s --input-file %t --check-prefix=CHECK
3+
4+
; Test that we don't over-count samples for duplicated source code when
5+
; building an IP-based profile.
6+
7+
; The inline-noprobe2.perfbin binary is used for this test because one of the
8+
; partition_pivot_last+3.1 debug locations has a duplication factor of 2
9+
; encoded into its discriminator. In IP-sample mode, a hit in one instruction
10+
; in the duplicated code does not imply a hit to the other duplicates.
11+
12+
; The perfscript input includes 1 sample at a location with duplication factor
13+
; of 2, and another sample at the same source location but with no duplication
14+
; factor. These should be summed without duplication factors. Ensure we record
15+
; a count of 1+1=2 (and not 2+1=3) for the 3.1 location.
16+
17+
;CHECK-LABEL: partition_pivot_last
18+
;CHECK-NEXT: 1: 0
19+
;CHECK-NEXT: 2: 0
20+
;CHECK-NEXT: 3: 0
21+
;CHECK-NEXT: 3.1: 2
22+
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/noprobe-skid.perfscript --binary=%S/Inputs/noprobe.perfbin --output=%t --skip-symbolization --leading-ip-only
2+
; RUN: FileCheck %s --input-file %t --check-prefix=CHECK-RAW-PROFILE
3+
; RUN: llvm-profgen --format=text --perfscript=%S/Inputs/noprobe-skid.perfscript --binary=%S/Inputs/noprobe.perfbin --output=%t --leading-ip-only
4+
; RUN: FileCheck %s --input-file %t --check-prefix=CHECK
5+
6+
; Here we check the ability to ignore LBRs, which is useful for generating
7+
; profiles where only the precise PMU sample IP is of interest. In general the
8+
; IPs need not identify a branch. In this case there are exactly 4 samples, so
9+
; we see only these 4 locations as "hot" and none of the LBR history.
10+
; Compare with noinline-noprobe.test, which includes LBR history.
11+
12+
; Note that there are two different IPs (5c5 and 5c8) contributing to line
13+
; offset 1 in bar. This tests that sample counts corresponding to the same
14+
; debug location are summed into that location in the profile rather than the
15+
; maximum being taken, as happens with basic block execution count profiles.
16+
17+
;CHECK: bar:14:0
18+
;CHECK: 0: 0
19+
;CHECK: 1: 2
20+
;CHECK: 2: 1
21+
;CHECK: 4: 0
22+
;CHECK: 5: 0
23+
;CHECK: foo:5:0
24+
;CHECK: 0: 0
25+
;CHECK: 1: 0
26+
;CHECK: 2: 0
27+
;CHECK: 3: 1
28+
;CHECK: 4: 0
29+
;CHECK: 5: 0
30+
31+
CHECK-RAW-PROFILE: 4
32+
CHECK-RAW-PROFILE-NEXT: 5c5-5c5:1
33+
CHECK-RAW-PROFILE-NEXT: 5c8-5c8:1
34+
CHECK-RAW-PROFILE-NEXT: 5d7-5d7:1
35+
CHECK-RAW-PROFILE-NEXT: 62f-62f:1
36+
37+
; original code:
38+
; clang -O3 -g -fdebug-info-for-profiling test.c -fno-inline -o a.out
39+
#include <stdio.h>
40+
41+
int bar(int x, int y) {
42+
if (x % 3) {
43+
return x - y;
44+
}
45+
return x + y;
46+
}
47+
48+
void foo() {
49+
int s, i = 0;
50+
while (i++ < 4000 * 4000)
51+
if (i % 91) s = bar(i, s); else s += 30;
52+
printf("sum is %d\n", s);
53+
}
54+
55+
int main() {
56+
foo();
57+
return 0;
58+
}

llvm/tools/llvm-profgen/PerfReader.cpp

Lines changed: 107 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,17 @@ static cl::opt<bool>
4141
"and produce context-insensitive profile."));
4242
cl::opt<bool> ShowDetailedWarning("show-detailed-warning",
4343
cl::desc("Show detailed warning message."));
44+
cl::opt<bool>
45+
LeadingIPOnly("leading-ip-only",
46+
cl::desc("Form a profile based only on sample IPs"));
47+
48+
static cl::list<std::string> PerfEventFilter(
49+
"perf-event",
50+
cl::desc("Ignore samples not matching the given event names"));
51+
static cl::alias
52+
PerfEventFilterPlural("perf-events", cl::CommaSeparated,
53+
cl::desc("Comma-delimited version of -perf-event"),
54+
cl::aliasopt(PerfEventFilter));
4455

4556
extern cl::opt<std::string> PerfTraceFilename;
4657
extern cl::opt<bool> ShowDisassemblyOnly;
@@ -404,13 +415,18 @@ PerfScriptReader::convertPerfDataToTrace(ProfiledBinary *Binary, bool SkipPID,
404415
}
405416
}
406417

418+
// If filtering by events was requested, additionally request the "event"
419+
// field.
420+
const std::string FieldList =
421+
PerfEventFilter.empty() ? "ip,brstack" : "event,ip,brstack";
422+
407423
// Run perf script again to retrieve events for PIDs collected above
408424
SmallVector<StringRef, 8> ScriptSampleArgs;
409425
ScriptSampleArgs.push_back(PerfPath);
410426
ScriptSampleArgs.push_back("script");
411427
ScriptSampleArgs.push_back("--show-mmap-events");
412428
ScriptSampleArgs.push_back("-F");
413-
ScriptSampleArgs.push_back("ip,brstack");
429+
ScriptSampleArgs.push_back(FieldList);
414430
ScriptSampleArgs.push_back("-i");
415431
ScriptSampleArgs.push_back(PerfData);
416432
if (!PIDs.empty()) {
@@ -575,14 +591,54 @@ bool PerfScriptReader::extractLBRStack(TraceStream &TraceIt,
575591

576592
// Skip the leading instruction pointer.
577593
size_t Index = 0;
594+
595+
StringRef EventName;
596+
// Skip a perf event name. This may or may not exist.
597+
if (Records.size() > Index && Records[Index].ends_with(":")) {
598+
EventName = Records[Index].ltrim().rtrim(':');
599+
Index++;
600+
601+
if (PerfEventFilter.empty()) {
602+
WithColor::warning() << "No --perf-event filter was specified, but an "
603+
"\"event\" field was found in line "
604+
<< TraceIt.getLineNumber() << ": "
605+
<< TraceIt.getCurrentLine() << "\n";
606+
} else if (std::find(PerfEventFilter.begin(), PerfEventFilter.end(),
607+
EventName) == PerfEventFilter.end()) {
608+
TraceIt.advance();
609+
return false;
610+
}
611+
612+
} else if (!PerfEventFilter.empty()) {
613+
WithColor::warning() << "A --perf-event filter was specified, but no "
614+
"\"event\" field found in line "
615+
<< TraceIt.getLineNumber() << ": "
616+
<< TraceIt.getCurrentLine() << "\n";
617+
}
618+
578619
uint64_t LeadingAddr;
579-
if (!Records.empty() && !Records[0].contains('/')) {
580-
if (Records[0].getAsInteger(16, LeadingAddr)) {
620+
if (Records.size() > Index && !Records[Index].contains('/')) {
621+
if (Records[Index].getAsInteger(16, LeadingAddr)) {
581622
WarnInvalidLBR(TraceIt);
582623
TraceIt.advance();
583624
return false;
584625
}
585-
Index = 1;
626+
Index++;
627+
}
628+
629+
// We assume that if we saw an event name we also saw a leading addr.
630+
// In other words, LeadingAddr is set if Index is 1 or 2.
631+
if (LeadingIPOnly && Index > 0) {
632+
// Form a profile only from the sample IP. Do not assume an LBR stack
633+
// follows, and ignore it if it does.
634+
uint64_t SampleIP = Binary->canonicalizeVirtualAddress(LeadingAddr);
635+
bool SampleIPIsInternal = Binary->addressIsCode(SampleIP);
636+
if (SampleIPIsInternal) {
637+
// Form a half LBR entry where the sample IP is the destination.
638+
LBRStack.emplace_back(LBREntry(SampleIP, SampleIP));
639+
}
640+
TraceIt.advance();
641+
return !LBRStack.empty();
586642
}
587643

588644
// Now extract LBR samples - note that we do not reverse the
@@ -902,6 +958,20 @@ void PerfScriptReader::computeCounterFromLBR(const PerfSample *Sample,
902958
uint64_t Repeat) {
903959
SampleCounter &Counter = SampleCounters.begin()->second;
904960
uint64_t EndAddress = 0;
961+
962+
if (LeadingIPOnly) {
963+
assert(Sample->LBRStack.size() == 1 &&
964+
"Expected only half LBR entries for ip-only mode");
965+
const LBREntry &LBR = *(Sample->LBRStack.begin());
966+
uint64_t SourceAddress = LBR.Source;
967+
uint64_t TargetAddress = LBR.Target;
968+
if (SourceAddress == TargetAddress &&
969+
Binary->addressIsCode(TargetAddress)) {
970+
Counter.recordRangeCount(SourceAddress, TargetAddress, Repeat);
971+
}
972+
return;
973+
}
974+
905975
for (const LBREntry &LBR : Sample->LBRStack) {
906976
uint64_t SourceAddress = LBR.Source;
907977
uint64_t TargetAddress = LBR.Target;
@@ -1062,6 +1132,18 @@ bool PerfScriptReader::isLBRSample(StringRef Line) {
10621132
Line.trim().split(Records, " ", 2, false);
10631133
if (Records.size() < 2)
10641134
return false;
1135+
// Check if there is an event name before the leading IP.
1136+
// If there is, it will be in Records[0]. To skip it, we'll re-split on
1137+
// Records[1], which should contain the rest of the line.
1138+
if (Records[0].contains(":")) {
1139+
// If so, consume the event name and continue processing the rest of the
1140+
// line.
1141+
StringRef IPAndLBR = Records[1].ltrim();
1142+
Records.clear();
1143+
IPAndLBR.split(Records, " ", 2, false);
1144+
if (Records.size() < 2)
1145+
return false;
1146+
}
10651147
if (Records[1].starts_with("0x") && Records[1].contains('/'))
10661148
return true;
10671149
return false;
@@ -1152,6 +1234,18 @@ void PerfScriptReader::warnInvalidRange() {
11521234
const PerfSample *Sample = Item.first.getPtr();
11531235
uint64_t Count = Item.second;
11541236
uint64_t EndAddress = 0;
1237+
1238+
if (LeadingIPOnly) {
1239+
assert(Sample->LBRStack.size() == 1 &&
1240+
"Expected only half LBR entries for ip-only mode");
1241+
const LBREntry &LBR = *(Sample->LBRStack.begin());
1242+
if (LBR.Source == LBR.Target && LBR.Source != ExternalAddr) {
1243+
// This is a leading-addr-only profile.
1244+
Ranges[{LBR.Source, LBR.Source}] += Count;
1245+
}
1246+
continue;
1247+
}
1248+
11551249
for (const LBREntry &LBR : Sample->LBRStack) {
11561250
uint64_t SourceAddress = LBR.Source;
11571251
uint64_t StartAddress = LBR.Target;
@@ -1199,11 +1293,15 @@ void PerfScriptReader::warnInvalidRange() {
11991293
!Binary->addressIsCode(EndAddress))
12001294
continue;
12011295

1202-
if (!Binary->addressIsCode(StartAddress) ||
1203-
!Binary->addressIsTransfer(EndAddress)) {
1204-
InstNotBoundary += I.second;
1205-
WarnInvalidRange(StartAddress, EndAddress, EndNotBoundaryMsg);
1206-
}
1296+
// IP samples can indicate activity on individual instructions rather than
1297+
// basic blocks/edges. In this mode, don't warn if sampled IPs aren't
1298+
// branches.
1299+
if (!LeadingIPOnly)
1300+
if (!Binary->addressIsCode(StartAddress) ||
1301+
!Binary->addressIsTransfer(EndAddress)) {
1302+
InstNotBoundary += I.second;
1303+
WarnInvalidRange(StartAddress, EndAddress, EndNotBoundaryMsg);
1304+
}
12071305

12081306
auto *FRange = Binary->findFuncRange(StartAddress);
12091307
if (!FRange) {

llvm/tools/llvm-profgen/ProfileGenerator.cpp

Lines changed: 20 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,8 @@ cl::opt<bool> InferMissingFrames(
104104
"Infer missing call frames due to compiler tail call elimination."),
105105
llvm::cl::Optional);
106106

107+
extern cl::opt<bool> LeadingIPOnly;
108+
107109
using namespace llvm;
108110
using namespace sampleprof;
109111

@@ -388,18 +390,25 @@ void ProfileGeneratorBase::updateBodySamplesforFunctionProfile(
388390
// Use the maximum count of samples with same line location
389391
uint32_t Discriminator = getBaseDiscriminator(LeafLoc.Location.Discriminator);
390392

391-
// Use duplication factor to compensated for loop unroll/vectorization.
392-
// Note that this is only needed when we're taking MAX of the counts at
393-
// the location instead of SUM.
394-
Count *= getDuplicationFactor(LeafLoc.Location.Discriminator);
395-
396-
ErrorOr<uint64_t> R =
397-
FunctionProfile.findSamplesAt(LeafLoc.Location.LineOffset, Discriminator);
398-
399-
uint64_t PreviousCount = R ? R.get() : 0;
400-
if (PreviousCount <= Count) {
393+
if (LeadingIPOnly) {
394+
// When computing an IP-based profile we take the SUM of counts at the
395+
// location instead of applying duplication factors and taking the MAX.
401396
FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator,
402-
Count - PreviousCount);
397+
Count);
398+
} else {
399+
// Otherwise, use duplication factor to compensate for loop
400+
// unroll/vectorization. Note that this is only needed when we're taking
401+
// MAX of the counts at the location instead of SUM.
402+
Count *= getDuplicationFactor(LeafLoc.Location.Discriminator);
403+
404+
ErrorOr<uint64_t> R = FunctionProfile.findSamplesAt(
405+
LeafLoc.Location.LineOffset, Discriminator);
406+
407+
uint64_t PreviousCount = R ? R.get() : 0;
408+
if (PreviousCount <= Count) {
409+
FunctionProfile.addBodySamples(LeafLoc.Location.LineOffset, Discriminator,
410+
Count - PreviousCount);
411+
}
403412
}
404413
}
405414

0 commit comments

Comments
 (0)