Skip to content

Commit 0668bb2

Browse files
authored
[ctxprof] Track unhandled call targets (#131417)
Collect profiles for functions we encounter when collecting a contextual profile, that are not associated with a call site. This is expected to happen for signal handlers, but it also - problematically - currently happens for mem{memset|copy|move|set}, which are currently inserted after profile instrumentation. Collecting a "regular" flat profile in these cases would hide the problem - that we loose better profile opportunities.
1 parent bd2d8c0 commit 0668bb2

File tree

14 files changed

+211
-56
lines changed

14 files changed

+211
-56
lines changed

compiler-rt/lib/ctx_profile/CtxInstrContextNode.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ class ProfileWriter {
121121
public:
122122
virtual void startContextSection() = 0;
123123
virtual void writeContextual(const ctx_profile::ContextNode &RootNode,
124+
const ctx_profile::ContextNode *Unhandled,
124125
uint64_t TotalRootEntryCount) = 0;
125126
virtual void endContextSection() = 0;
126127

compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp

Lines changed: 27 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -246,22 +246,37 @@ ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
246246

247247
ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
248248
uint32_t NumCounters) {
249-
// 1) if we are under a root (regardless if this thread is collecting or not a
249+
250+
// 1) if we are currently collecting a contextual profile, fetch a ContextNode
251+
// in the `Unhandled` set. We want to do this regardless of `ProfilingStarted`
252+
// to (hopefully) offset the penalty of creating these contexts to before
253+
// profiling.
254+
//
255+
// 2) if we are under a root (regardless if this thread is collecting or not a
250256
// contextual profile for that root), do not collect a flat profile. We want
251257
// to keep flat profiles only for activations that can't happen under a root,
252258
// to avoid confusing profiles. We can, for example, combine flattened and
253259
// flat profiles meaningfully, as we wouldn't double-count anything.
254260
//
255-
// 2) to avoid lengthy startup, don't bother with flat profiles until the
256-
// profiling started. We would reset them anyway when profiling starts.
261+
// 3) to avoid lengthy startup, don't bother with flat profiles until the
262+
// profiling has started. We would reset them anyway when profiling starts.
257263
// HOWEVER. This does lose profiling for message pumps: those functions are
258264
// entered once and never exit. They should be assumed to be entered before
259265
// profiling starts - because profiling should start after the server is up
260266
// and running (which is equivalent to "message pumps are set up").
261-
if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
262-
return TheScratchContext;
263-
return markAsScratch(
264-
onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
267+
ContextRoot *R = __llvm_ctx_profile_current_context_root;
268+
if (!R) {
269+
if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
270+
return TheScratchContext;
271+
else
272+
return markAsScratch(
273+
onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
274+
}
275+
auto [Iter, Ins] = R->Unhandled.insert({Guid, nullptr});
276+
if (Ins)
277+
Iter->second =
278+
getCallsiteSlow(Guid, &R->FirstUnhandledCalleeNode, NumCounters, 0);
279+
return markAsScratch(onContextEnter(*Iter->second));
265280
}
266281

267282
ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
@@ -396,6 +411,8 @@ void __llvm_ctx_profile_start_collection() {
396411
++NumMemUnits;
397412

398413
resetContextNode(*Root->FirstNode);
414+
if (Root->FirstUnhandledCalleeNode)
415+
resetContextNode(*Root->FirstUnhandledCalleeNode);
399416
__sanitizer::atomic_store_relaxed(&Root->TotalEntries, 0);
400417
}
401418
__sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
@@ -416,8 +433,9 @@ bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
416433
__sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
417434
return false;
418435
}
419-
Writer.writeContextual(*Root->FirstNode, __sanitizer::atomic_load_relaxed(
420-
&Root->TotalEntries));
436+
Writer.writeContextual(
437+
*Root->FirstNode, Root->FirstUnhandledCalleeNode,
438+
__sanitizer::atomic_load_relaxed(&Root->TotalEntries));
421439
}
422440
Writer.endContextSection();
423441
Writer.startFlatSection();

compiler-rt/lib/ctx_profile/CtxInstrProfiling.h

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
#define CTX_PROFILE_CTXINSTRPROFILING_H_
1111

1212
#include "CtxInstrContextNode.h"
13+
#include "sanitizer_common/sanitizer_dense_map.h"
1314
#include "sanitizer_common/sanitizer_mutex.h"
1415
#include <sanitizer/common_interface_defs.h>
1516

@@ -83,6 +84,20 @@ struct ContextRoot {
8384
// Count the number of entries - regardless if we could take the `Taken` mutex
8485
::__sanitizer::atomic_uint64_t TotalEntries = {};
8586

87+
// Profiles for functions we encounter when collecting a contexutal profile,
88+
// that are not associated with a callsite. This is expected to happen for
89+
// signal handlers, but it also - problematically - currently happens for
90+
// call sites generated after profile instrumentation, primarily
91+
// mem{memset|copy|move|set}.
92+
// `Unhandled` serves 2 purposes:
93+
// 1. identifying such cases (like the memops)
94+
// 2. collecting a profile for them, which can be at least used as a flat
95+
// profile
96+
::__sanitizer::DenseMap<GUID, ContextNode *> Unhandled;
97+
// Keep the unhandled contexts in a list, as we allocate them, as it makes it
98+
// simpler to send to the writer when the profile is fetched.
99+
ContextNode *FirstUnhandledCalleeNode = nullptr;
100+
86101
// Taken is used to ensure only one thread traverses the contextual graph -
87102
// either to read it or to write it. On server side, the same entrypoint will
88103
// be entered by numerous threads, but over time, the profile aggregated by

compiler-rt/lib/ctx_profile/tests/CtxInstrProfilingTest.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,7 @@ TEST_F(ContextTest, Dump) {
240240
TestProfileWriter(ContextRoot *Root, size_t Entries)
241241
: Root(Root), Entries(Entries) {}
242242

243-
void writeContextual(const ContextNode &Node,
243+
void writeContextual(const ContextNode &Node, const ContextNode *Unhandled,
244244
uint64_t TotalRootEntryCount) override {
245245
EXPECT_EQ(TotalRootEntryCount, Entries);
246246
EXPECT_EQ(EnteredSectionCount, 1);

compiler-rt/test/ctx_profile/TestCases/generate-context.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,8 @@
55
// RUN: cp %llvm_src/include/llvm/ProfileData/CtxInstrContextNode.h %t_include/
66
//
77
// Compile with ctx instrumentation "on". We treat "theRoot" as callgraph root.
8-
// RUN: %clangxx %s %ctxprofilelib -I%t_include -O2 -o %t.bin -mllvm -profile-context-root=theRoot
8+
// RUN: %clangxx %s %ctxprofilelib -I%t_include -O2 -o %t.bin -mllvm -profile-context-root=theRoot \
9+
// RUN: -mllvm -ctx-prof-skip-callsite-instr=skip_me
910
//
1011
// Run the binary, and observe the profile fetch handler's output.
1112
// RUN: %t.bin | FileCheck %s
@@ -20,11 +21,14 @@ extern "C" bool __llvm_ctx_profile_fetch(ProfileWriter &);
2021

2122
// avoid name mangling
2223
extern "C" {
24+
__attribute__((noinline)) void skip_me() {}
25+
2326
__attribute__((noinline)) void someFunction(int I) {
2427
if (I % 2)
2528
printf("check odd\n");
2629
else
2730
printf("check even\n");
31+
skip_me();
2832
}
2933

3034
// block inlining because the pre-inliner otherwise will inline this - it's
@@ -36,6 +40,7 @@ __attribute__((noinline)) void theRoot() {
3640
for (auto I = 0; I < 2; ++I) {
3741
someFunction(I);
3842
}
43+
skip_me();
3944
}
4045

4146
__attribute__((noinline)) void flatFct() {
@@ -85,9 +90,13 @@ class TestProfileWriter : public ProfileWriter {
8590
}
8691

8792
void writeContextual(const ContextNode &RootNode,
93+
const ContextNode *Unhandled,
8894
uint64_t EntryCount) override {
8995
std::cout << "Entering Root " << RootNode.guid()
9096
<< " with total entry count " << EntryCount << std::endl;
97+
for (const auto *P = Unhandled; P; P = P->next())
98+
std::cout << "Unhandled GUID: " << P->guid() << " entered "
99+
<< P->entrycount() << " times" << std::endl;
91100
printProfile(RootNode, "", "");
92101
}
93102

@@ -119,6 +128,8 @@ class TestProfileWriter : public ProfileWriter {
119128
// branches would be taken once, so the second counter is 1.
120129
// CHECK-NEXT: Entered Context Section
121130
// CHECK-NEXT: Entering Root 8657661246551306189 with total entry count 1
131+
// skip_me is entered 4 times: 3 via `someFunction`, and once from `theRoot`
132+
// CHECK-NEXT: Unhandled GUID: 17928815489886282963 entered 4 times
122133
// CHECK-NEXT: Guid: 8657661246551306189
123134
// CHECK-NEXT: Entries: 1
124135
// CHECK-NEXT: 2 counters and 3 callsites
@@ -135,6 +146,8 @@ class TestProfileWriter : public ProfileWriter {
135146
// CHECK-NEXT: Counter values: 2 1
136147
// CHECK-NEXT: Exited Context Section
137148
// CHECK-NEXT: Entered Flat Section
149+
// This is `skip_me`. Entered 3 times via `someFunction`
150+
// CHECK-NEXT: Flat: 17928815489886282963 3
138151
// CHECK-NEXT: Flat: 6759619411192316602 3,1
139152
// This is flatFct (guid: 14569438697463215220)
140153
// CHECK-NEXT: Flat: 14569438697463215220 1,2

llvm/include/llvm/ProfileData/CtxInstrContextNode.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,7 @@ class ProfileWriter {
121121
public:
122122
virtual void startContextSection() = 0;
123123
virtual void writeContextual(const ctx_profile::ContextNode &RootNode,
124+
const ctx_profile::ContextNode *Unhandled,
124125
uint64_t TotalRootEntryCount) = 0;
125126
virtual void endContextSection() = 0;
126127

llvm/include/llvm/ProfileData/PGOCtxProfReader.h

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,12 @@ class IndexNode {
7474
};
7575
} // namespace internal
7676

77+
// Setting initial capacity to 1 because all contexts must have at least 1
78+
// counter, and then, because all contexts belonging to a function have the same
79+
// size, there'll be at most one other heap allocation.
80+
using CtxProfFlatProfile =
81+
std::map<GlobalValue::GUID, SmallVector<uint64_t, 1>>;
82+
7783
/// A node (context) in the loaded contextual profile, suitable for mutation
7884
/// during IPO passes. We generally expect a fraction of counters and
7985
/// callsites to be populated. We continue to model counters as vectors, but
@@ -93,11 +99,16 @@ class PGOCtxProfContext final : public internal::IndexNode {
9399
GlobalValue::GUID GUID = 0;
94100
SmallVector<uint64_t, 16> Counters;
95101
const std::optional<uint64_t> RootEntryCount{};
102+
std::optional<CtxProfFlatProfile> Unhandled{};
96103
CallsiteMapTy Callsites;
97104

98-
PGOCtxProfContext(GlobalValue::GUID G, SmallVectorImpl<uint64_t> &&Counters,
99-
std::optional<uint64_t> RootEntryCount = std::nullopt)
100-
: GUID(G), Counters(std::move(Counters)), RootEntryCount(RootEntryCount) {
105+
PGOCtxProfContext(
106+
GlobalValue::GUID G, SmallVectorImpl<uint64_t> &&Counters,
107+
std::optional<uint64_t> RootEntryCount = std::nullopt,
108+
std::optional<CtxProfFlatProfile> &&Unhandled = std::nullopt)
109+
: GUID(G), Counters(std::move(Counters)), RootEntryCount(RootEntryCount),
110+
Unhandled(std::move(Unhandled)) {
111+
assert(RootEntryCount.has_value() == Unhandled.has_value());
101112
}
102113

103114
Expected<PGOCtxProfContext &>
@@ -121,6 +132,8 @@ class PGOCtxProfContext final : public internal::IndexNode {
121132
bool isRoot() const { return RootEntryCount.has_value(); }
122133
uint64_t getTotalRootEntryCount() const { return RootEntryCount.value(); }
123134

135+
const CtxProfFlatProfile &getUnhandled() const { return Unhandled.value(); }
136+
124137
uint64_t getEntrycount() const {
125138
assert(!Counters.empty() &&
126139
"Functions are expected to have at their entry BB instrumented, so "
@@ -170,12 +183,6 @@ class PGOCtxProfContext final : public internal::IndexNode {
170183
}
171184
};
172185

173-
// Setting initial capacity to 1 because all contexts must have at least 1
174-
// counter, and then, because all contexts belonging to a function have the same
175-
// size, there'll be at most one other heap allocation.
176-
using CtxProfFlatProfile =
177-
std::map<GlobalValue::GUID, SmallVector<uint64_t, 1>>;
178-
179186
using CtxProfContextualProfiles =
180187
std::map<GlobalValue::GUID, PGOCtxProfContext>;
181188
struct PGOCtxProfile {
@@ -205,6 +212,7 @@ class PGOCtxProfileReader final {
205212

206213
Error loadContexts(CtxProfContextualProfiles &P);
207214
Error loadFlatProfiles(CtxProfFlatProfile &P);
215+
Error loadFlatProfileList(CtxProfFlatProfile &P);
208216

209217
public:
210218
PGOCtxProfileReader(StringRef Buffer)

llvm/include/llvm/ProfileData/PGOCtxProfWriter.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@ enum PGOCtxProfileBlockIDs {
3636
ContextNodeBlockID = ContextRootBlockID + 1,
3737
FlatProfilesSectionBlockID = ContextNodeBlockID + 1,
3838
FlatProfileBlockID = FlatProfilesSectionBlockID + 1,
39-
LAST_VALID = FlatProfileBlockID
39+
UnhandledBlockID = FlatProfileBlockID + 1,
40+
LAST_VALID = UnhandledBlockID
4041
};
4142

4243
/// Write one or more ContextNodes to the provided raw_fd_stream.
@@ -94,6 +95,7 @@ class PGOCtxProfileWriter final : public ctx_profile::ProfileWriter {
9495

9596
void startContextSection() override;
9697
void writeContextual(const ctx_profile::ContextNode &RootNode,
98+
const ctx_profile::ContextNode *Unhandled,
9799
uint64_t TotalRootEntryCount) override;
98100
void endContextSection() override;
99101

@@ -104,7 +106,7 @@ class PGOCtxProfileWriter final : public ctx_profile::ProfileWriter {
104106

105107
// constants used in writing which a reader may find useful.
106108
static constexpr unsigned CodeLen = 2;
107-
static constexpr uint32_t CurrentVersion = 3;
109+
static constexpr uint32_t CurrentVersion = 4;
108110
static constexpr unsigned VBREncodingBits = 6;
109111
static constexpr StringRef ContainerMagic = "CTXP";
110112
};

0 commit comments

Comments
 (0)