Skip to content

Commit 1efa69b

Browse files
committed
[ctxprof] Flat profile collection
1 parent a316516 commit 1efa69b

File tree

8 files changed

+286
-36
lines changed

8 files changed

+286
-36
lines changed

compiler-rt/lib/ctx_profile/CtxInstrContextNode.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,12 @@ class ProfileWriter {
122122
virtual void startContextSection() = 0;
123123
virtual void writeContextual(const ctx_profile::ContextNode &RootNode) = 0;
124124
virtual void endContextSection() = 0;
125+
126+
virtual void startFlatSection() = 0;
127+
virtual void writeFlat(ctx_profile::GUID Guid, const uint64_t *Buffer,
128+
size_t BufferSize) = 0;
129+
virtual void endFlatSection() = 0;
130+
125131
virtual ~ProfileWriter() = default;
126132
};
127133
} // namespace ctx_profile

compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp

Lines changed: 122 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88

99
#include "CtxInstrProfiling.h"
1010
#include "sanitizer_common/sanitizer_allocator_internal.h"
11+
#include "sanitizer_common/sanitizer_atomic.h"
12+
#include "sanitizer_common/sanitizer_atomic_clang.h"
1113
#include "sanitizer_common/sanitizer_common.h"
1214
#include "sanitizer_common/sanitizer_dense_map.h"
1315
#include "sanitizer_common/sanitizer_libc.h"
@@ -27,6 +29,20 @@ __sanitizer::SpinMutex AllContextsMutex;
2729
SANITIZER_GUARDED_BY(AllContextsMutex)
2830
__sanitizer::Vector<ContextRoot *> AllContextRoots;
2931

32+
__sanitizer::atomic_uintptr_t AllFunctionsData = {};
33+
34+
// Keep all the functions for which we collect a flat profile in a linked list.
35+
__sanitizer::SpinMutex FlatCtxArenaMutex;
36+
SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
37+
Arena* FlatCtxArenaHead = nullptr;
38+
SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
39+
Arena* FlatCtxArena = nullptr;
40+
41+
// Set to true when we enter a root, and false when we exit - regardless of whether this
42+
// thread collects a contextual profile for that root.
43+
__thread bool IsUnderContext = false;
44+
__sanitizer::atomic_uint8_t ProfilingStarted = {};
45+
3046
// utility to taint a pointer by setting the LSB. There is an assumption
3147
// throughout that the addresses of contexts are even (really, they should be
3248
// align(8), but "even"-ness is the minimum assumption)
@@ -109,7 +125,10 @@ void resetContextNode(ContextNode &Node) {
109125
resetContextNode(*Next);
110126
}
111127

112-
void onContextEnter(ContextNode &Node) { ++Node.counters()[0]; }
128+
ContextNode *onContextEnter(ContextNode &Node) {
129+
++Node.counters()[0];
130+
return &Node;
131+
}
113132

114133
} // namespace
115134

@@ -182,12 +201,74 @@ ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
182201
return Ret;
183202
}
184203

185-
ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
186-
uint32_t NumCounters,
204+
ContextNode *getFlatProfile(FunctionData &Data, GUID Guid, uint32_t NumCounters) {
205+
if (ContextNode *Existing = Data.FlatCtx)
206+
return Existing;
207+
{
208+
// We could instead try to take the lock and, if that fails, return
209+
// TheScratchContext. But that could leave message pump loops more sparsely
210+
// profiled than everything else. Maybe that doesn't matter, and we can
211+
// optimize this later.
212+
__sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> L(&Data.Mutex);
213+
if (ContextNode *Existing = Data.FlatCtx)
214+
return Existing;
215+
216+
auto NeededSize = ContextNode::getAllocSize(NumCounters, 0);
217+
char *AllocBuff = nullptr;
218+
{
219+
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> FL(
220+
&FlatCtxArenaMutex);
221+
if (FlatCtxArena)
222+
AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
223+
if (!AllocBuff) {
224+
FlatCtxArena = Arena::allocateNewArena(getArenaAllocSize(NeededSize),
225+
FlatCtxArena);
226+
AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
227+
}
228+
if (!FlatCtxArenaHead)
229+
FlatCtxArenaHead = FlatCtxArena;
230+
}
231+
auto *Ret = allocContextNode(AllocBuff, Guid, NumCounters, 0);
232+
Data.FlatCtx = Ret;
233+
234+
Data.Next = reinterpret_cast<FunctionData *>(
235+
__sanitizer::atomic_load_relaxed(&AllFunctionsData));
236+
while (!__sanitizer::atomic_compare_exchange_strong(
237+
&AllFunctionsData, reinterpret_cast<uintptr_t *>(&Data.Next),
238+
reinterpret_cast<uintptr_t>(&Data),
239+
__sanitizer::memory_order_release)) {
240+
}
241+
}
242+
243+
return Data.FlatCtx;
244+
}
245+
246+
ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
247+
uint32_t NumCounters) {
248+
// 1) if we are under a root (regardless of whether this thread is collecting a
249+
// contextual profile for that root), do not collect a flat profile. We want
250+
// to keep flat profiles only for activations that can't happen under a root,
251+
// to avoid confusing profiles. We can, for example, combine flattened and
252+
// flat profiles meaningfully, as we wouldn't double-count anything.
253+
//
254+
// 2) to avoid lengthy startup, don't bother with flat profiles until the
255+
// profiling started. We would reset them anyway when profiling starts.
256+
// HOWEVER. This does lose profiling for message pumps: those functions are
257+
// entered once and never exit. They should be assumed to be entered before
258+
// profiling starts - because profiling should start after the server is up
259+
// and running (which is equivalent to "message pumps are set up").
260+
if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
261+
return TheScratchContext;
262+
return markAsScratch(
263+
onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
264+
}
265+
266+
ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
267+
GUID Guid, uint32_t NumCounters,
187268
uint32_t NumCallsites) {
188269
// fast "out" if we're not even doing contextual collection.
189270
if (!__llvm_ctx_profile_current_context_root)
190-
return TheScratchContext;
271+
return getUnhandledContext(*Data, Guid, NumCounters);
191272

192273
// also fast "out" if the caller is scratch. We can see if it's scratch by
193274
// looking at the interior pointer into the subcontexts vector that the caller
@@ -196,7 +277,7 @@ ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
196277
// precisely, aligned - 8 values)
197278
auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
198279
if (!CallsiteContext || isScratch(CallsiteContext))
199-
return TheScratchContext;
280+
return getUnhandledContext(*Data, Guid, NumCounters);
200281

201282
// if the callee isn't the expected one, return scratch.
202283
// Signal handler(s) could have been invoked at any point in the execution.
@@ -214,7 +295,7 @@ ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
214295
// for that case.
215296
auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
216297
if (ExpectedCallee != Callee)
217-
return TheScratchContext;
298+
return getUnhandledContext(*Data, Guid, NumCounters);
218299

219300
auto *Callsite = *CallsiteContext;
220301
// in the case of indirect calls, we will have all seen targets forming a
@@ -257,6 +338,7 @@ void setupContext(ContextRoot *Root, GUID Guid, uint32_t NumCounters,
257338
ContextNode *__llvm_ctx_profile_start_context(
258339
ContextRoot *Root, GUID Guid, uint32_t Counters,
259340
uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
341+
IsUnderContext = true;
260342
if (!Root->FirstMemBlock) {
261343
setupContext(Root, Guid, Counters, Callsites);
262344
}
@@ -272,6 +354,7 @@ ContextNode *__llvm_ctx_profile_start_context(
272354

273355
void __llvm_ctx_profile_release_context(ContextRoot *Root)
274356
SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
357+
IsUnderContext = false;
275358
if (__llvm_ctx_profile_current_context_root) {
276359
__llvm_ctx_profile_current_context_root = nullptr;
277360
Root->Taken.Unlock();
@@ -291,10 +374,12 @@ void __llvm_ctx_profile_start_collection() {
291374

292375
resetContextNode(*Root->FirstNode);
293376
}
377+
__sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
294378
__sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
295379
}
296380

297381
bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
382+
__sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
298383
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
299384
&AllContextsMutex);
300385

@@ -310,17 +395,42 @@ bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
310395
Writer.writeContextual(*Root->FirstNode);
311396
}
312397
Writer.endContextSection();
398+
Writer.startFlatSection();
399+
// The list progresses behind the head, so taking this snapshot allows the
400+
// list to grow concurrently without causing a race condition with our
401+
// traversing it.
402+
const auto *Pos = reinterpret_cast<const FunctionData *>(
403+
__sanitizer::atomic_load_relaxed(&AllFunctionsData));
404+
for (; Pos; Pos = Pos->Next)
405+
Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
406+
Pos->FlatCtx->counters_size());
407+
Writer.endFlatSection();
313408
return true;
314409
}
315410

316411
void __llvm_ctx_profile_free() {
317-
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
318-
&AllContextsMutex);
319-
for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
320-
for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
412+
{
413+
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
414+
&AllContextsMutex);
415+
for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
416+
for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
417+
auto *C = A;
418+
A = A->next();
419+
__sanitizer::InternalFree(C);
420+
}
421+
AllContextRoots.Reset();
422+
}
423+
__sanitizer::atomic_store_relaxed(&AllFunctionsData, 0U);
424+
{
425+
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
426+
&FlatCtxArenaMutex);
427+
FlatCtxArena = nullptr;
428+
for (auto *A = FlatCtxArenaHead; A;) {
321429
auto *C = A;
322-
A = A->next();
430+
A = C->next();
323431
__sanitizer::InternalFree(C);
324432
}
325-
AllContextRoots.Reset();
433+
434+
FlatCtxArenaHead = nullptr;
435+
}
326436
}

compiler-rt/lib/ctx_profile/CtxInstrProfiling.h

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,28 @@ struct ContextRoot {
113113
static_assert(sizeof(Taken) == 1);
114114
};
115115

116+
// This is allocated and zero-initialized by the compiler, the in-place
117+
// initialization serves mostly as self-documentation and for testing.
118+
// The design is influenced by the observation that typically (at least for
119+
// datacenter binaries, which is the motivating target of this profiler) less
120+
// than 10% of functions in a binary even appear in a profile (of any kind).
121+
//
122+
// 1) We could pre-allocate the flat profile storage in the compiler, just like
123+
// the flat instrumented profiling does. But that penalizes the static size of
124+
// the binary for little reason
125+
//
126+
// 2) We could do the above but zero-initialize the buffers, and dynamically
127+
// populate them. This, though, would page-in more memory upfront for the
128+
// binary's runtime
129+
//
130+
// The current design trades off a bit of overhead at the first time a function
131+
// is encountered *for flat profiling* for avoiding size penalties.
132+
struct FunctionData {
133+
FunctionData *Next = nullptr;
134+
ContextNode *volatile FlatCtx = nullptr;
135+
::__sanitizer::StaticSpinMutex Mutex;
136+
};
137+
116138
/// This API is exposed for testing. See the APIs below about the contract with
117139
/// LLVM.
118140
inline bool isScratch(const void *Ctx) {
@@ -152,7 +174,8 @@ void __llvm_ctx_profile_release_context(__ctx_profile::ContextRoot *Root);
152174

153175
/// called for any other function than entry points, in the entry BB of such
154176
/// function. Same consideration about LSB of returned value as .._start_context
155-
ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
177+
ContextNode *__llvm_ctx_profile_get_context(__ctx_profile::FunctionData *Data,
178+
void *Callee, GUID Guid,
156179
uint32_t NumCounters,
157180
uint32_t NumCallsites);
158181

0 commit comments

Comments
 (0)