Skip to content

Commit 7f3dd7f

Browse files
committed
RootAutodetect
1 parent 2abcdd8 commit 7f3dd7f

File tree

11 files changed

+449
-42
lines changed

11 files changed

+449
-42
lines changed

compiler-rt/lib/ctx_profile/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ endif()
2727
add_compiler_rt_runtime(clang_rt.ctx_profile
2828
STATIC
2929
ARCHS ${CTX_PROFILE_SUPPORTED_ARCH}
30-
OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc
30+
OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc RTSanitizerCommonSymbolizer
3131
CFLAGS ${EXTRA_FLAGS}
3232
SOURCES ${CTX_PROFILE_SOURCES}
3333
ADDITIONAL_HEADERS ${CTX_PROFILE_HEADERS}

compiler-rt/lib/ctx_profile/CtxInstrContextNode.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ class ContextNode final {
127127
/// MUTEXDECL takes one parameter, the name of a field that is a mutex.
128128
#define CTXPROF_FUNCTION_DATA(PTRDECL, VOLATILE_PTRDECL, MUTEXDECL) \
129129
PTRDECL(FunctionData, Next) \
130+
VOLATILE_PTRDECL(void, EntryAddress) \
130131
VOLATILE_PTRDECL(ContextRoot, CtxRoot) \
131132
VOLATILE_PTRDECL(ContextNode, FlatCtx) \
132133
MUTEXDECL(Mutex)

compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp

Lines changed: 57 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
//===----------------------------------------------------------------------===//
88

99
#include "CtxInstrProfiling.h"
10+
#include "RootAutoDetector.h"
1011
#include "sanitizer_common/sanitizer_allocator_internal.h"
1112
#include "sanitizer_common/sanitizer_atomic.h"
1213
#include "sanitizer_common/sanitizer_atomic_clang.h"
@@ -43,6 +44,12 @@ Arena *FlatCtxArena = nullptr;
4344
__thread bool IsUnderContext = false;
4445
__sanitizer::atomic_uint8_t ProfilingStarted = {};
4546

47+
__sanitizer::atomic_uintptr_t RootDetector = {};
48+
RootAutoDetector *getRootDetector() {
49+
return reinterpret_cast<RootAutoDetector *>(
50+
__sanitizer::atomic_load_relaxed(&RootDetector));
51+
}
52+
4653
// utility to taint a pointer by setting the LSB. There is an assumption
4754
// throughout that the addresses of contexts are even (really, they should be
4855
// align(8), but "even"-ness is the minimum assumption)
@@ -201,7 +208,7 @@ ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
201208
return Ret;
202209
}
203210

204-
ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
211+
ContextNode *getFlatProfile(FunctionData &Data, void *Callee, GUID Guid,
205212
uint32_t NumCounters) {
206213
if (ContextNode *Existing = Data.FlatCtx)
207214
return Existing;
@@ -232,6 +239,7 @@ ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
232239
auto *Ret = allocContextNode(AllocBuff, Guid, NumCounters, 0);
233240
Data.FlatCtx = Ret;
234241

242+
Data.EntryAddress = Callee;
235243
Data.Next = reinterpret_cast<FunctionData *>(
236244
__sanitizer::atomic_load_relaxed(&AllFunctionsData));
237245
while (!__sanitizer::atomic_compare_exchange_strong(
@@ -296,8 +304,9 @@ ContextNode *tryStartContextGivenRoot(ContextRoot *Root, GUID Guid,
296304
return TheScratchContext;
297305
}
298306

299-
ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
300-
uint32_t NumCounters) {
307+
ContextNode *getUnhandledContext(FunctionData &Data, void *Callee, GUID Guid,
308+
uint32_t NumCounters, uint32_t NumCallsites,
309+
ContextRoot *CtxRoot) {
301310

302311
// 1) if we are currently collecting a contextual profile, fetch a ContextNode
303312
// in the `Unhandled` set. We want to do this regardless of `ProfilingStarted`
@@ -316,27 +325,32 @@ ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
316325
// entered once and never exit. They should be assumed to be entered before
317326
// profiling starts - because profiling should start after the server is up
318327
// and running (which is equivalent to "message pumps are set up").
319-
ContextRoot *R = __llvm_ctx_profile_current_context_root;
320-
if (!R) {
328+
if (!CtxRoot) {
329+
if (auto *RAD = getRootDetector())
330+
RAD->sample();
331+
else if (auto *CR = Data.CtxRoot)
332+
return tryStartContextGivenRoot(CR, Guid, NumCounters, NumCallsites);
321333
if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
322334
return TheScratchContext;
323335
else
324336
return markAsScratch(
325-
onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
337+
onContextEnter(*getFlatProfile(Data, Callee, Guid, NumCounters)));
326338
}
327-
auto [Iter, Ins] = R->Unhandled.insert({Guid, nullptr});
339+
auto [Iter, Ins] = CtxRoot->Unhandled.insert({Guid, nullptr});
328340
if (Ins)
329-
Iter->second =
330-
getCallsiteSlow(Guid, &R->FirstUnhandledCalleeNode, NumCounters, 0);
341+
Iter->second = getCallsiteSlow(Guid, &CtxRoot->FirstUnhandledCalleeNode,
342+
NumCounters, 0);
331343
return markAsScratch(onContextEnter(*Iter->second));
332344
}
333345

334346
ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
335347
GUID Guid, uint32_t NumCounters,
336348
uint32_t NumCallsites) {
349+
auto *CtxRoot = __llvm_ctx_profile_current_context_root;
337350
// fast "out" if we're not even doing contextual collection.
338-
if (!__llvm_ctx_profile_current_context_root)
339-
return getUnhandledContext(*Data, Guid, NumCounters);
351+
if (!CtxRoot)
352+
return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
353+
nullptr);
340354

341355
// also fast "out" if the caller is scratch. We can see if it's scratch by
342356
// looking at the interior pointer into the subcontexts vector that the caller
@@ -345,7 +359,8 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
345359
// precisely, aligned - 8 values)
346360
auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
347361
if (!CallsiteContext || isScratch(CallsiteContext))
348-
return getUnhandledContext(*Data, Guid, NumCounters);
362+
return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
363+
CtxRoot);
349364

350365
// if the callee isn't the expected one, return scratch.
351366
// Signal handler(s) could have been invoked at any point in the execution.
@@ -363,7 +378,8 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
363378
// for that case.
364379
auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
365380
if (ExpectedCallee != Callee)
366-
return getUnhandledContext(*Data, Guid, NumCounters);
381+
return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
382+
CtxRoot);
367383

368384
auto *Callsite = *CallsiteContext;
369385
// in the case of indirect calls, we will have all seen targets forming a
@@ -388,21 +404,23 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
388404
ContextNode *__llvm_ctx_profile_start_context(FunctionData *FData, GUID Guid,
389405
uint32_t Counters,
390406
uint32_t Callsites) {
407+
391408
return tryStartContextGivenRoot(FData->getOrAllocateContextRoot(), Guid,
392409
Counters, Callsites);
393410
}
394411

395412
void __llvm_ctx_profile_release_context(FunctionData *FData)
396413
SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
414+
const auto *CurrentRoot = __llvm_ctx_profile_current_context_root;
415+
if (!CurrentRoot || FData->CtxRoot != CurrentRoot)
416+
return;
397417
IsUnderContext = false;
398-
if (__llvm_ctx_profile_current_context_root) {
399-
__llvm_ctx_profile_current_context_root = nullptr;
400-
assert(FData->CtxRoot);
401-
FData->CtxRoot->Taken.Unlock();
402-
}
418+
assert(FData->CtxRoot);
419+
__llvm_ctx_profile_current_context_root = nullptr;
420+
FData->CtxRoot->Taken.Unlock();
403421
}
404422

405-
void __llvm_ctx_profile_start_collection() {
423+
void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration) {
406424
size_t NumMemUnits = 0;
407425
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
408426
&AllContextsMutex);
@@ -418,12 +436,28 @@ void __llvm_ctx_profile_start_collection() {
418436
resetContextNode(*Root->FirstUnhandledCalleeNode);
419437
__sanitizer::atomic_store_relaxed(&Root->TotalEntries, 0);
420438
}
439+
if (AutodetectDuration) {
440+
// we leak RD intentionally. Knowing when to free it is tricky, there's a
441+
// race condition with functions observing the `RootDetector` as non-null.
442+
// This can be addressed but the alternatives have some added complexity and
443+
// it's not (yet) worth it.
444+
auto *RD = new (__sanitizer::InternalAlloc(sizeof(RootAutoDetector)))
445+
RootAutoDetector(AllFunctionsData, RootDetector, AutodetectDuration);
446+
RD->start();
447+
} else {
448+
__sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
449+
}
421450
__sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
422-
__sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
423451
}
424452

425453
bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
426454
__sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
455+
if (auto *RD = getRootDetector()) {
456+
__sanitizer::Printf("[ctxprof] Expected the root autodetector to have "
457+
"finished well before attempting to fetch a context");
458+
RD->join();
459+
}
460+
427461
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
428462
&AllContextsMutex);
429463

@@ -448,8 +482,9 @@ bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
448482
const auto *Pos = reinterpret_cast<const FunctionData *>(
449483
__sanitizer::atomic_load_relaxed(&AllFunctionsData));
450484
for (; Pos; Pos = Pos->Next)
451-
Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
452-
Pos->FlatCtx->counters_size());
485+
if (!Pos->CtxRoot)
486+
Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
487+
Pos->FlatCtx->counters_size());
453488
Writer.endFlatSection();
454489
return true;
455490
}

compiler-rt/lib/ctx_profile/CtxInstrProfiling.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ ContextNode *__llvm_ctx_profile_get_context(__ctx_profile::FunctionData *FData,
207207

208208
/// Prepares for collection. Currently this resets counter values but preserves
209209
/// internal context tree structure.
210-
void __llvm_ctx_profile_start_collection();
210+
void __llvm_ctx_profile_start_collection(unsigned AutodetectDuration = 0);
211211

212212
/// Completely free allocated memory.
213213
void __llvm_ctx_profile_free();

compiler-rt/lib/ctx_profile/RootAutoDetector.cpp

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
#include "RootAutoDetector.h"
1010

11+
#include "CtxInstrProfiling.h"
1112
#include "sanitizer_common/sanitizer_common.h"
1213
#include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap)
1314
#include <assert.h>
@@ -17,6 +18,99 @@
1718
using namespace __ctx_profile;
1819
template <typename T> using Set = DenseMap<T, bool>;
1920

21+
namespace __sanitizer {
22+
void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context,
23+
bool request_fast, u32 max_depth) {
24+
// We can't implement the fast variant. The fast variant ends up invoking an
25+
// external allocator, because of pthread_attr_getstack. If this happens
26+
// during an allocation of the program being instrumented, a non-reentrant
27+
// lock may be taken (this was observed). The allocator called by
28+
// pthread_attr_getstack will also try to take that lock.
29+
UnwindSlow(pc, max_depth);
30+
}
31+
} // namespace __sanitizer
32+
33+
RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) {
34+
GenericScopedLock<SpinMutex> L(&Parent.AllSamplesMutex);
35+
Parent.AllSamples.PushBack(this);
36+
}
37+
38+
void RootAutoDetector::start() {
39+
atomic_store_relaxed(&Self, reinterpret_cast<uintptr_t>(this));
40+
pthread_create(
41+
&WorkerThread, nullptr,
42+
+[](void *Ctx) -> void * {
43+
RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx);
44+
SleepForSeconds(RAD->WaitSeconds);
45+
// To avoid holding the AllSamplesMutex, make a snapshot of all the
46+
// thread samples collected so far
47+
Vector<PerThreadSamples *> SamplesSnapshot;
48+
{
49+
GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex);
50+
SamplesSnapshot.Resize(RAD->AllSamples.Size());
51+
for (uptr I = 0; I < RAD->AllSamples.Size(); ++I)
52+
SamplesSnapshot[I] = RAD->AllSamples[I];
53+
}
54+
DenseMap<uptr, uint64_t> AllRoots;
55+
for (uptr I = 0; I < SamplesSnapshot.Size(); ++I) {
56+
GenericScopedLock<SpinMutex>(&SamplesSnapshot[I]->M);
57+
SamplesSnapshot[I]->TrieRoot.determineRoots().forEach([&](auto &KVP) {
58+
auto [FAddr, Count] = KVP;
59+
AllRoots[FAddr] += Count;
60+
return true;
61+
});
62+
}
63+
// FIXME: as a next step, establish a minimum relative nr of samples
64+
// per root that would qualify it as a root.
65+
for (auto *FD = reinterpret_cast<FunctionData *>(
66+
atomic_load_relaxed(&RAD->FunctionDataListHead));
67+
FD; FD = FD->Next) {
68+
if (AllRoots.contains(reinterpret_cast<uptr>(FD->EntryAddress))) {
69+
FD->getOrAllocateContextRoot();
70+
}
71+
}
72+
atomic_store_relaxed(&RAD->Self, 0);
73+
return nullptr;
74+
},
75+
this);
76+
}
77+
78+
void RootAutoDetector::join() { pthread_join(WorkerThread, nullptr); }
79+
80+
void RootAutoDetector::sample() {
81+
// tracking reentry in case we want to re-explore fast stack unwind - which
82+
// does potentially re-enter the runtime because it calls the instrumented
83+
// allocator because of pthread_attr_getstack. See the notes also on
84+
// UnwindImpl above.
85+
static thread_local bool Entered = false;
86+
static thread_local uint64_t Entries = 0;
87+
if (Entered || (++Entries % SampleRate))
88+
return;
89+
Entered = true;
90+
collectStack();
91+
Entered = false;
92+
}
93+
94+
void RootAutoDetector::collectStack() {
95+
GET_CALLER_PC_BP;
96+
BufferedStackTrace CurrentStack;
97+
CurrentStack.Unwind(pc, bp, nullptr, false);
98+
// 2 stack frames would be very unlikely to mean anything, since at least the
99+
// compiler-rt frame - which can't be inlined - should be observable, which
100+
// counts as 1; we can be even more aggressive with this number.
101+
if (CurrentStack.size <= 2)
102+
return;
103+
static thread_local PerThreadSamples *ThisThreadSamples =
104+
new (__sanitizer::InternalAlloc(sizeof(PerThreadSamples)))
105+
PerThreadSamples(*this);
106+
107+
if (!ThisThreadSamples->M.TryLock())
108+
return;
109+
110+
ThisThreadSamples->TrieRoot.insertStack(CurrentStack);
111+
ThisThreadSamples->M.Unlock();
112+
}
113+
20114
uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const {
21115
// this requires --linkopt=-Wl,--export-dynamic
22116
Dl_info Info;

compiler-rt/lib/ctx_profile/RootAutoDetector.h

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "sanitizer_common/sanitizer_dense_map.h"
1313
#include "sanitizer_common/sanitizer_internal_defs.h"
1414
#include "sanitizer_common/sanitizer_stacktrace.h"
15+
#include "sanitizer_common/sanitizer_vector.h"
1516
#include <pthread.h>
1617
#include <sanitizer/common_interface_defs.h>
1718

@@ -53,5 +54,47 @@ class PerThreadCallsiteTrie {
5354
/// thread, together with the number of samples that included them.
5455
DenseMap<uptr, uint64_t> determineRoots() const;
5556
};
57+
58+
class RootAutoDetector final {
59+
// A prime number. We may want to make this configurable at collection start.
60+
static const uint64_t SampleRate = 6113;
61+
const unsigned WaitSeconds;
62+
pthread_t WorkerThread;
63+
64+
struct PerThreadSamples {
65+
PerThreadSamples(RootAutoDetector &Parent);
66+
67+
PerThreadCallsiteTrie TrieRoot;
68+
SpinMutex M;
69+
};
70+
SpinMutex AllSamplesMutex;
71+
SANITIZER_GUARDED_BY(AllSamplesMutex)
72+
Vector<PerThreadSamples *> AllSamples;
73+
atomic_uintptr_t &FunctionDataListHead;
74+
atomic_uintptr_t &Self;
75+
void collectStack();
76+
77+
public:
78+
RootAutoDetector(atomic_uintptr_t &FunctionDataListHead,
79+
atomic_uintptr_t &Self, unsigned WaitSeconds)
80+
: WaitSeconds(WaitSeconds), FunctionDataListHead(FunctionDataListHead),
81+
Self(Self) {}
82+
83+
// Samples the stack at `SampleRate` (rate observed independently on each
84+
// thread) in thread local `PerThreadCallsiteTrie`s.
85+
void sample();
86+
87+
// Start a thread waiting `WaitSeconds`, after which it uses the
88+
// `PerThreadCallsiteTrie` data observed so far over all threads to determine
89+
// roots. Marks those roots by traversing the linked list of FunctionData that
90+
// starts at `FunctionDataListHead`, and assigning their `CtxRoot`. Finally,
91+
// resets the `Self` atomic, so that other threads don't continue calling
92+
// `sample`.
93+
void start();
94+
95+
// join the waiting thread.
96+
void join();
97+
};
98+
5699
} // namespace __ctx_profile
57100
#endif

0 commit comments

Comments
 (0)