
Commit d4fe31b

[ctxprof] Flat profile collection
1 parent 0a1f7f0 commit d4fe31b

8 files changed (+292, -33 lines)

compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
6 additions, 0 deletions

@@ -122,6 +122,12 @@ class ProfileWriter {
   virtual void startContextSection() = 0;
   virtual void writeContextual(const ctx_profile::ContextNode &RootNode) = 0;
   virtual void endContextSection() = 0;
+
+  virtual void startFlatSection() = 0;
+  virtual void writeFlat(ctx_profile::GUID Guid, const uint64_t *Buffer,
+                         size_t BufferSize) = 0;
+  virtual void endFlatSection() = 0;
+
   virtual ~ProfileWriter() = default;
 };
 } // namespace ctx_profile
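
For orientation, a minimal sketch of what a consumer of the new callbacks could look like. This is an illustration, not part of the commit: PrintfWriter is a hypothetical writer, and it assumes the class lives in llvm::ctx_profile as declared in this header. The runtime invokes writeFlat once per function, between startFlatSection() and endFlatSection().

#include <cinttypes>
#include <cstddef>
#include <cstdio>

#include "CtxInstrContextNode.h"

// Hypothetical writer: dumps each flat profile as "guid: counter counter ...".
class PrintfWriter : public llvm::ctx_profile::ProfileWriter {
  using GUID = llvm::ctx_profile::GUID;
  using ContextNode = llvm::ctx_profile::ContextNode;

public:
  // Context-section callbacks elided; this sketch only cares about the flat
  // section.
  void startContextSection() override {}
  void writeContextual(const ContextNode &RootNode) override {}
  void endContextSection() override {}

  void startFlatSection() override { std::puts("flat section: start"); }
  void writeFlat(GUID Guid, const uint64_t *Buffer,
                 size_t BufferSize) override {
    std::printf("%" PRIu64 ":", Guid);
    for (size_t I = 0; I < BufferSize; ++I)
      std::printf(" %" PRIu64, Buffer[I]);
    std::printf("\n");
  }
  void endFlatSection() override { std::puts("flat section: end"); }
};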

compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
124 additions, 12 deletions
@@ -8,6 +8,8 @@

 #include "CtxInstrProfiling.h"
 #include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "sanitizer_common/sanitizer_atomic.h"
+#include "sanitizer_common/sanitizer_atomic_clang.h"
 #include "sanitizer_common/sanitizer_common.h"
 #include "sanitizer_common/sanitizer_dense_map.h"
 #include "sanitizer_common/sanitizer_libc.h"

@@ -27,6 +29,20 @@ __sanitizer::SpinMutex AllContextsMutex;
 SANITIZER_GUARDED_BY(AllContextsMutex)
 __sanitizer::Vector<ContextRoot *> AllContextRoots;

+__sanitizer::atomic_uintptr_t AllFunctionsData = {};
+
+// Keep all the functions for which we collect a flat profile in a linked list.
+__sanitizer::SpinMutex FlatCtxArenaMutex;
+SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
+Arena *FlatCtxArenaHead = nullptr;
+SANITIZER_GUARDED_BY(FlatCtxArenaMutex)
+Arena *FlatCtxArena = nullptr;
+
+// Set to true when we enter a root, and false when we exit - regardless of
+// whether this thread collects a contextual profile for that root.
+__thread bool IsUnderContext = false;
+__sanitizer::atomic_uint8_t ProfilingStarted = {};
+
 // utility to taint a pointer by setting the LSB. There is an assumption
 // throughout that the addresses of contexts are even (really, they should be
 // align(8), but "even"-ness is the minimum assumption)
@@ -109,7 +125,10 @@ void resetContextNode(ContextNode &Node) {
     resetContextNode(*Next);
 }

-void onContextEnter(ContextNode &Node) { ++Node.counters()[0]; }
+ContextNode *onContextEnter(ContextNode &Node) {
+  ++Node.counters()[0];
+  return &Node;
+}

 } // namespace

@@ -182,12 +201,75 @@ ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
   return Ret;
 }

-ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
-                                            uint32_t NumCounters,
+ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
+                            uint32_t NumCounters) {
+  if (ContextNode *Existing = Data.FlatCtx)
+    return Existing;
+  {
+    // We could instead try to take the lock and, if that fails, return
+    // TheScratchContext. But that could leave message pump loops more sparsely
+    // profiled than everything else. Maybe that doesn't matter, and we can
+    // optimize this later.
+    __sanitizer::GenericScopedLock<__sanitizer::StaticSpinMutex> L(&Data.Mutex);
+    if (ContextNode *Existing = Data.FlatCtx)
+      return Existing;
+
+    auto NeededSize = ContextNode::getAllocSize(NumCounters, 0);
+    char *AllocBuff = nullptr;
+    {
+      __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> FL(
+          &FlatCtxArenaMutex);
+      if (FlatCtxArena)
+        AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
+      if (!AllocBuff) {
+        FlatCtxArena = Arena::allocateNewArena(getArenaAllocSize(NeededSize),
+                                               FlatCtxArena);
+        AllocBuff = FlatCtxArena->tryBumpAllocate(NeededSize);
+      }
+      if (!FlatCtxArenaHead)
+        FlatCtxArenaHead = FlatCtxArena;
+    }
+    auto *Ret = allocContextNode(AllocBuff, Guid, NumCounters, 0);
+    Data.FlatCtx = Ret;
+
+    Data.Next = reinterpret_cast<FunctionData *>(
+        __sanitizer::atomic_load_relaxed(&AllFunctionsData));
+    while (!__sanitizer::atomic_compare_exchange_strong(
+        &AllFunctionsData, reinterpret_cast<uintptr_t *>(&Data.Next),
+        reinterpret_cast<uintptr_t>(&Data),
+        __sanitizer::memory_order_release)) {
+    }
+  }
+
+  return Data.FlatCtx;
+}
+
+ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
+                                 uint32_t NumCounters) {
+  // 1) If we are under a root (regardless of whether this thread is collecting
+  // a contextual profile for that root), do not collect a flat profile. We
+  // want to keep flat profiles only for activations that can't happen under a
+  // root, to avoid confusing profiles. We can then, for example, combine
+  // flattened and flat profiles meaningfully, as we wouldn't double-count
+  // anything.
+  //
+  // 2) To avoid lengthy startup, don't bother with flat profiles until
+  // profiling has started. We would reset them anyway when profiling starts.
+  // HOWEVER, this does lose profiling for message pumps: those functions are
+  // entered once and never exit. They should be assumed to be entered before
+  // profiling starts - because profiling should start after the server is up
+  // and running (which is equivalent to "message pumps are set up").
+  if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
+    return TheScratchContext;
+  return markAsScratch(
+      onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
+}
+
+ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
+                                            GUID Guid, uint32_t NumCounters,
                                             uint32_t NumCallsites) {
   // fast "out" if we're not even doing contextual collection.
   if (!__llvm_ctx_profile_current_context_root)
-    return TheScratchContext;
+    return getUnhandledContext(*Data, Guid, NumCounters);

   // also fast "out" if the caller is scratch. We can see if it's scratch by
   // looking at the interior pointer into the subcontexts vector that the caller
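
The registration step at the end of getFlatProfile above is a lock-free push onto an intrusive singly linked list headed by AllFunctionsData: Data.Next is seeded with the current head, and on CAS failure the compare-exchange writes the freshly observed head back into Data.Next, so the loop retries until Data becomes the new head. Here is a minimal standalone sketch of the same pattern, using std::atomic rather than the sanitizer_common primitives the runtime uses:

#include <atomic>

struct Node {
  Node *Next = nullptr;
};

std::atomic<Node *> Head{nullptr};

// Lock-free push: on CAS failure, compare_exchange_weak stores the currently
// observed head back into N->Next, so each retry links against fresh state.
void Push(Node *N) {
  N->Next = Head.load(std::memory_order_relaxed);
  while (!Head.compare_exchange_weak(N->Next, N, std::memory_order_release,
                                     std::memory_order_relaxed)) {
  }
}

Nodes are never removed, only pushed, which is why readers can traverse a snapshot of the head without synchronization (as __llvm_ctx_profile_fetch does below).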
@@ -196,7 +278,7 @@ ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
   // precisely, aligned - 8 values)
   auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
   if (!CallsiteContext || isScratch(CallsiteContext))
-    return TheScratchContext;
+    return getUnhandledContext(*Data, Guid, NumCounters);

   // if the callee isn't the expected one, return scratch.
   // Signal handler(s) could have been invoked at any point in the execution.

@@ -214,7 +296,7 @@ ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
   // for that case.
   auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
   if (ExpectedCallee != Callee)
-    return TheScratchContext;
+    return getUnhandledContext(*Data, Guid, NumCounters);

   auto *Callsite = *CallsiteContext;
   // in the case of indirect calls, we will have all seen targets forming a
@@ -257,6 +339,7 @@ void setupContext(ContextRoot *Root, GUID Guid, uint32_t NumCounters,
 ContextNode *__llvm_ctx_profile_start_context(
     ContextRoot *Root, GUID Guid, uint32_t Counters,
     uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  IsUnderContext = true;
   if (!Root->FirstMemBlock) {
     setupContext(Root, Guid, Counters, Callsites);
   }

@@ -272,6 +355,7 @@ ContextNode *__llvm_ctx_profile_start_context(

 void __llvm_ctx_profile_release_context(ContextRoot *Root)
     SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
+  IsUnderContext = false;
   if (__llvm_ctx_profile_current_context_root) {
     __llvm_ctx_profile_current_context_root = nullptr;
     Root->Taken.Unlock();
@@ -291,10 +375,12 @@ void __llvm_ctx_profile_start_collection() {

     resetContextNode(*Root->FirstNode);
   }
+  __sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
   __sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
 }

 bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
+  __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
   __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
       &AllContextsMutex);

@@ -310,17 +396,43 @@ bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
     Writer.writeContextual(*Root->FirstNode);
   }
   Writer.endContextSection();
+  Writer.startFlatSection();
+  // The list progresses behind the head, so taking this snapshot allows the
+  // list to grow concurrently without causing a race condition with our
+  // traversal of it.
+  const auto *Pos = reinterpret_cast<const FunctionData *>(
+      __sanitizer::atomic_load_relaxed(&AllFunctionsData));
+  for (; Pos; Pos = Pos->Next)
+    Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
+                     Pos->FlatCtx->counters_size());
+  Writer.endFlatSection();
   return true;
 }

 void __llvm_ctx_profile_free() {
-  __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
-      &AllContextsMutex);
-  for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
-    for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
+  __sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
+  {
+    __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+        &AllContextsMutex);
+    for (int I = 0, E = AllContextRoots.Size(); I < E; ++I)
+      for (auto *A = AllContextRoots[I]->FirstMemBlock; A;) {
+        auto *C = A;
+        A = A->next();
+        __sanitizer::InternalFree(C);
+      }
+    AllContextRoots.Reset();
+  }
+  __sanitizer::atomic_store_relaxed(&AllFunctionsData, 0U);
+  {
+    __sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
+        &FlatCtxArenaMutex);
+    FlatCtxArena = nullptr;
+    for (auto *A = FlatCtxArenaHead; A;) {
       auto *C = A;
-      A = A->next();
+      A = C->next();
       __sanitizer::InternalFree(C);
     }
-    AllContextRoots.Reset();
+
+    FlatCtxArenaHead = nullptr;
+  }
 }
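
Putting the runtime API together, a collection cycle could look like the sketch below. This is hypothetical driver code, not part of the commit: RunWorkload is a made-up stand-in for exercising the binary, and PrintfWriter is the illustrative writer sketched earlier.

// Hypothetical driver (illustration only). __llvm_ctx_profile_fetch first
// flips ProfilingStarted off, then emits the context section followed by the
// flat section through the supplied ProfileWriter.
void CollectAndDump() {
  __llvm_ctx_profile_start_collection(); // resets counters, starts profiling
  RunWorkload();                         // hypothetical: exercise the binary
  PrintfWriter W;
  if (__llvm_ctx_profile_fetch(W))
    std::puts("profile captured");
}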

compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
24 additions, 1 deletion
@@ -113,6 +113,28 @@ struct ContextRoot {
   static_assert(sizeof(Taken) == 1);
 };

+// This is allocated and zero-initialized by the compiler; the in-place
+// initialization serves mostly as self-documentation and for testing.
+// The design is influenced by the observation that typically (at least for
+// datacenter binaries, which are the motivating target of this profiler) less
+// than 10% of the functions in a binary even appear in a profile (of any
+// kind). Alternatives:
+//
+// 1) We could pre-allocate the flat profile storage in the compiler, just like
+// flat instrumented profiling does. But that penalizes the static size of the
+// binary for little reason.
+//
+// 2) We could do the above but zero-initialize the buffers and dynamically
+// populate them. This, though, would page in more memory upfront at runtime.
+//
+// The current design trades off a bit of overhead the first time a function is
+// encountered *for flat profiling* against avoiding those size penalties.
+struct FunctionData {
+  FunctionData *Next = nullptr;
+  ContextNode *volatile FlatCtx = nullptr;
+  ::__sanitizer::StaticSpinMutex Mutex;
+};
+
 /// This API is exposed for testing. See the APIs below about the contract with
 /// LLVM.
 inline bool isScratch(const void *Ctx) {
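
Since the comment above says the compiler allocates and zero-initializes one FunctionData per instrumented function, the lowering for a non-root function would look roughly like the hand-written sketch below. This is an approximation for illustration only: the function name, GUID, and counter/callsite counts are made up, and real instrumentation is emitted as IR, not C++.

// Hand-written approximation of the instrumentation for a non-root function.
static __ctx_profile::FunctionData MyFuncData; // zero-initialized, like BSS

void MyFunc() {
  ContextNode *Ctx = __llvm_ctx_profile_get_context(
      &MyFuncData, reinterpret_cast<void *>(&MyFunc), /*Guid=*/0xabcd,
      /*NumCounters=*/2, /*NumCallsites=*/1);
  // ... instrumented counter updates against Ctx follow, with the usual
  // handling of the taint (scratch) bit on the returned pointer ...
}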
@@ -152,7 +174,8 @@ void __llvm_ctx_profile_release_context(__ctx_profile::ContextRoot *Root);

 /// called for any other function than entry points, in the entry BB of such
 /// function. Same consideration about LSB of returned value as .._start_context
-ContextNode *__llvm_ctx_profile_get_context(void *Callee, GUID Guid,
+ContextNode *__llvm_ctx_profile_get_context(__ctx_profile::FunctionData *Data,
+                                            void *Callee, GUID Guid,
                                             uint32_t NumCounters,
                                             uint32_t NumCallsites);
