Skip to content

Commit 7182bae

Browse files
committed
RootAutodetect
1 parent bbe97a8 commit 7182bae

File tree

8 files changed

+195
-47
lines changed

8 files changed

+195
-47
lines changed

compiler-rt/lib/ctx_profile/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ endif()
2727
add_compiler_rt_runtime(clang_rt.ctx_profile
2828
STATIC
2929
ARCHS ${CTX_PROFILE_SUPPORTED_ARCH}
30-
OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc
30+
OBJECT_LIBS RTSanitizerCommon RTSanitizerCommonLibc RTSanitizerCommonSymbolizer
3131
CFLAGS ${EXTRA_FLAGS}
3232
SOURCES ${CTX_PROFILE_SOURCES}
3333
ADDITIONAL_HEADERS ${CTX_PROFILE_HEADERS}

compiler-rt/lib/ctx_profile/CtxInstrContextNode.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ class ContextNode final {
127127
/// MUTEXDECL takes one parameter, the name of a field that is a mutex.
128128
#define CTXPROF_FUNCTION_DATA(PTRDECL, VOLATILE_PTRDECL, MUTEXDECL) \
129129
PTRDECL(FunctionData, Next) \
130+
PTRDECL(void, EntryAddress) \
130131
VOLATILE_PTRDECL(ContextRoot, CtxRoot) \
131132
VOLATILE_PTRDECL(ContextNode, FlatCtx) \
132133
MUTEXDECL(Mutex)

compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp

Lines changed: 76 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
//===----------------------------------------------------------------------===//
88

99
#include "CtxInstrProfiling.h"
10+
#include "RootAutoDetector.h"
1011
#include "sanitizer_common/sanitizer_allocator_internal.h"
1112
#include "sanitizer_common/sanitizer_atomic.h"
1213
#include "sanitizer_common/sanitizer_atomic_clang.h"
@@ -43,6 +44,12 @@ Arena *FlatCtxArena = nullptr;
4344
__thread bool IsUnderContext = false;
4445
__sanitizer::atomic_uint8_t ProfilingStarted = {};
4546

47+
__sanitizer::atomic_uintptr_t RootDetector = {};
48+
/// Returns the root autodetector currently published in `RootDetector`, or
/// nullptr when the slot holds 0. The slot is read with a relaxed atomic load.
RootAutoDetector *getRootDetector() {
  const auto Raw = __sanitizer::atomic_load_relaxed(&RootDetector);
  return reinterpret_cast<RootAutoDetector *>(Raw);
}
52+
4653
// utility to taint a pointer by setting the LSB. There is an assumption
4754
// throughout that the addresses of contexts are even (really, they should be
4855
// align(8), but "even"-ness is the minimum assumption)
@@ -201,7 +208,7 @@ ContextNode *getCallsiteSlow(GUID Guid, ContextNode **InsertionPoint,
201208
return Ret;
202209
}
203210

204-
ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
211+
ContextNode *getFlatProfile(FunctionData &Data, void *Callee, GUID Guid,
205212
uint32_t NumCounters) {
206213
if (ContextNode *Existing = Data.FlatCtx)
207214
return Existing;
@@ -232,6 +239,7 @@ ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,
232239
auto *Ret = allocContextNode(AllocBuff, Guid, NumCounters, 0);
233240
Data.FlatCtx = Ret;
234241

242+
Data.EntryAddress = Callee;
235243
Data.Next = reinterpret_cast<FunctionData *>(
236244
__sanitizer::atomic_load_relaxed(&AllFunctionsData));
237245
while (!__sanitizer::atomic_compare_exchange_strong(
@@ -277,8 +285,29 @@ ContextRoot *FunctionData::getOrAllocateContextRoot() {
277285
return Root;
278286
}
279287

280-
ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
281-
uint32_t NumCounters) {
288+
/// Tries to begin contextual collection under `Root` on the calling thread.
///
/// Marks the thread as being under a context and bumps `Root->TotalEntries`
/// unconditionally. If `Root` has no memory block yet, it is set up first
/// using `Guid`, `Counters` and `Callsites`.
///
/// \returns `Root->FirstNode` when this thread acquired `Root->Taken` (the
/// thread's current context root is then set to `Root`); otherwise
/// `TheScratchContext`, with the thread's current context root cleared.
ContextNode *tryStartContextGivenRoot(ContextRoot *Root, GUID Guid,
                                      uint32_t Counters, uint32_t Callsites)
    SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
  IsUnderContext = true;
  __sanitizer::atomic_fetch_add(&Root->TotalEntries, 1,
                                __sanitizer::memory_order_relaxed);

  // Lazily set up the root the first time it is entered.
  if (!Root->FirstMemBlock)
    setupContext(Root, Guid, Counters, Callsites);

  // Another thread already holds this root: collect into scratch instead.
  if (!Root->Taken.TryLock()) {
    __llvm_ctx_profile_current_context_root = nullptr;
    return TheScratchContext;
  }

  __llvm_ctx_profile_current_context_root = Root;
  onContextEnter(*Root->FirstNode);
  return Root->FirstNode;
}
307+
308+
ContextNode *getUnhandledContext(FunctionData &Data, void *Callee, GUID Guid,
309+
uint32_t NumCounters, uint32_t NumCallsites,
310+
ContextRoot *CtxRoot) {
282311

283312
// 1) if we are currently collecting a contextual profile, fetch a ContextNode
284313
// in the `Unhandled` set. We want to do this regardless of `ProfilingStarted`
@@ -297,27 +326,30 @@ ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
297326
// entered once and never exit. They should be assumed to be entered before
298327
// profiling starts - because profiling should start after the server is up
299328
// and running (which is equivalent to "message pumps are set up").
300-
ContextRoot *R = __llvm_ctx_profile_current_context_root;
301-
if (!R) {
329+
if (!CtxRoot) {
330+
if (auto *RAD = getRootDetector())
331+
RAD->sample();
302332
if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
303333
return TheScratchContext;
304334
else
305335
return markAsScratch(
306-
onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
336+
onContextEnter(*getFlatProfile(Data, Callee, Guid, NumCounters)));
307337
}
308-
auto [Iter, Ins] = R->Unhandled.insert({Guid, nullptr});
338+
auto [Iter, Ins] = CtxRoot->Unhandled.insert({Guid, nullptr});
309339
if (Ins)
310-
Iter->second =
311-
getCallsiteSlow(Guid, &R->FirstUnhandledCalleeNode, NumCounters, 0);
340+
Iter->second = getCallsiteSlow(Guid, &CtxRoot->FirstUnhandledCalleeNode,
341+
NumCounters, 0);
312342
return markAsScratch(onContextEnter(*Iter->second));
313343
}
314344

315345
ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
316346
GUID Guid, uint32_t NumCounters,
317347
uint32_t NumCallsites) {
348+
auto *CtxRoot = __llvm_ctx_profile_current_context_root;
318349
// fast "out" if we're not even doing contextual collection.
319-
if (!__llvm_ctx_profile_current_context_root)
320-
return getUnhandledContext(*Data, Guid, NumCounters);
350+
if (!CtxRoot)
351+
return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
352+
nullptr);
321353

322354
// also fast "out" if the caller is scratch. We can see if it's scratch by
323355
// looking at the interior pointer into the subcontexts vector that the caller
@@ -326,7 +358,8 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
326358
// precisely, aligned - 8 values)
327359
auto **CallsiteContext = consume(__llvm_ctx_profile_callsite[0]);
328360
if (!CallsiteContext || isScratch(CallsiteContext))
329-
return getUnhandledContext(*Data, Guid, NumCounters);
361+
return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
362+
CtxRoot);
330363

331364
// if the callee isn't the expected one, return scratch.
332365
// Signal handler(s) could have been invoked at any point in the execution.
@@ -344,7 +377,8 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
344377
// for that case.
345378
auto *ExpectedCallee = consume(__llvm_ctx_profile_expected_callee[0]);
346379
if (ExpectedCallee != Callee)
347-
return getUnhandledContext(*Data, Guid, NumCounters);
380+
return getUnhandledContext(*Data, Callee, Guid, NumCounters, NumCallsites,
381+
CtxRoot);
348382

349383
auto *Callsite = *CallsiteContext;
350384
// in the case of indirect calls, we will have all seen targets forming a
@@ -366,40 +400,26 @@ ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
366400
return Ret;
367401
}
368402

369-
ContextNode *__llvm_ctx_profile_start_context(
370-
FunctionData *FData, GUID Guid, uint32_t Counters,
371-
uint32_t Callsites) SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
372-
IsUnderContext = true;
373-
374-
auto *Root = FData->getOrAllocateContextRoot();
375-
376-
__sanitizer::atomic_fetch_add(&Root->TotalEntries, 1,
377-
__sanitizer::memory_order_relaxed);
403+
ContextNode *__llvm_ctx_profile_start_context(FunctionData *FData, GUID Guid,
404+
uint32_t Counters,
405+
uint32_t Callsites) {
378406

379-
if (!Root->FirstMemBlock) {
380-
setupContext(Root, Guid, Counters, Callsites);
381-
}
382-
if (Root->Taken.TryLock()) {
383-
__llvm_ctx_profile_current_context_root = Root;
384-
onContextEnter(*Root->FirstNode);
385-
return Root->FirstNode;
386-
}
387-
// If this thread couldn't take the lock, return scratch context.
388-
__llvm_ctx_profile_current_context_root = nullptr;
389-
return TheScratchContext;
407+
return tryStartContextGivenRoot(FData->getOrAllocateContextRoot(), Guid,
408+
Counters, Callsites);
390409
}
391410

392411
void __llvm_ctx_profile_release_context(FunctionData *FData)
393412
SANITIZER_NO_THREAD_SAFETY_ANALYSIS {
413+
const auto *CurrentRoot = __llvm_ctx_profile_current_context_root;
414+
if (!CurrentRoot || FData->CtxRoot != CurrentRoot)
415+
return;
394416
IsUnderContext = false;
395-
if (__llvm_ctx_profile_current_context_root) {
396-
__llvm_ctx_profile_current_context_root = nullptr;
397-
assert(FData->CtxRoot);
398-
FData->CtxRoot->Taken.Unlock();
399-
}
417+
assert(FData->CtxRoot);
418+
__llvm_ctx_profile_current_context_root = nullptr;
419+
FData->CtxRoot->Taken.Unlock();
400420
}
401421

402-
void __llvm_ctx_profile_start_collection() {
422+
void __llvm_ctx_profile_start_collection(bool AutodetectRoots) {
403423
size_t NumMemUnits = 0;
404424
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
405425
&AllContextsMutex);
@@ -415,12 +435,24 @@ void __llvm_ctx_profile_start_collection() {
415435
resetContextNode(*Root->FirstUnhandledCalleeNode);
416436
__sanitizer::atomic_store_relaxed(&Root->TotalEntries, 0);
417437
}
418-
__sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
419-
__sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
438+
if (AutodetectRoots) {
439+
auto *RD = new (__sanitizer::InternalAlloc(sizeof(RootAutoDetector)))
440+
RootAutoDetector(AllFunctionsData, RootDetector);
441+
RD->start();
442+
} else {
443+
__sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
444+
__sanitizer::Printf("[ctxprof] Initial NumMemUnits: %zu \n", NumMemUnits);
445+
}
420446
}
421447

422448
bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
423449
__sanitizer::atomic_store_relaxed(&ProfilingStarted, false);
450+
if (auto *RD = getRootDetector()) {
451+
__sanitizer::Printf("[ctxprof] Expected the root autodetector to have "
452+
"finished well before attempting to fetch a context");
453+
RD->join();
454+
}
455+
424456
__sanitizer::GenericScopedLock<__sanitizer::SpinMutex> Lock(
425457
&AllContextsMutex);
426458

@@ -445,8 +477,9 @@ bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
445477
const auto *Pos = reinterpret_cast<const FunctionData *>(
446478
__sanitizer::atomic_load_relaxed(&AllFunctionsData));
447479
for (; Pos; Pos = Pos->Next)
448-
Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
449-
Pos->FlatCtx->counters_size());
480+
if (!Pos->CtxRoot)
481+
Writer.writeFlat(Pos->FlatCtx->guid(), Pos->FlatCtx->counters(),
482+
Pos->FlatCtx->counters_size());
450483
Writer.endFlatSection();
451484
return true;
452485
}

compiler-rt/lib/ctx_profile/CtxInstrProfiling.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ ContextNode *__llvm_ctx_profile_get_context(__ctx_profile::FunctionData *FData,
207207

208208
/// Prepares for collection. Currently this resets counter values but preserves
209209
/// internal context tree structure.
210-
void __llvm_ctx_profile_start_collection();
210+
void __llvm_ctx_profile_start_collection(bool AutodetectRoots = false);
211211

212212
/// Completely free allocated memory.
213213
void __llvm_ctx_profile_free();

compiler-rt/lib/ctx_profile/RootAutoDetector.cpp

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,90 @@
1818

1919
using namespace __ctx_profile;
2020

21+
namespace __sanitizer {
22+
void BufferedStackTrace::UnwindImpl(uptr pc, uptr bp, void *context,
23+
bool request_fast, u32 max_depth) {
24+
// We can't implement the fast variant. The fast variant ends up invoking an
25+
// external allocator, because of pthread_attr_getstack. If this happens
26+
// during an allocation of the program being instrumented, a non-reentrant
27+
// lock may be taken (this was observed). The allocator called by
28+
// pthread_attr_getstack will also try to take that lock.
29+
UnwindSlow(pc, max_depth);
30+
}
31+
} // namespace __sanitizer
32+
33+
// Registers this per-thread sample container with `Parent` by appending it to
// Parent.AllSamples while holding Parent.AllSamplesMutex.
RootAutoDetector::PerThreadSamples::PerThreadSamples(RootAutoDetector &Parent) {
  GenericScopedLock<SpinMutex> Guard(&Parent.AllSamplesMutex);
  Parent.AllSamples.PushBack(this);
}
37+
38+
void RootAutoDetector::start() {
39+
atomic_store_relaxed(&Self, reinterpret_cast<uintptr_t>(this));
40+
pthread_create(
41+
&WorkerThread, nullptr,
42+
+[](void *Ctx) -> void * {
43+
RootAutoDetector *RAD = reinterpret_cast<RootAutoDetector *>(Ctx);
44+
SleepForSeconds(30);
45+
Vector<PerThreadSamples*> Copy;
46+
{
47+
GenericScopedLock<SpinMutex> M(&RAD->AllSamplesMutex);
48+
Copy.Resize(RAD->AllSamples.Size());
49+
for (uptr I = 0; I < RAD->AllSamples.Size(); ++I)
50+
Copy[I] = RAD->AllSamples[I];
51+
}
52+
DenseMap<uptr, uint64_t> AllRoots;
53+
for (uptr I = 0; I < Copy.Size(); ++I) {
54+
GenericScopedLock<SpinMutex>(&Copy[I]->M);
55+
Copy[I]->TrieRoot.determineRoots().forEach([&](auto &KVP) {
56+
auto [FAddr, Count] = KVP;
57+
AllRoots[FAddr] += Count;
58+
return true;
59+
});
60+
}
61+
for (auto *FD = reinterpret_cast<FunctionData *>(
62+
atomic_load_relaxed(&RAD->FunctionDataListHead));
63+
FD; FD = FD->Next) {
64+
if (AllRoots.contains(reinterpret_cast<uptr>(FD->EntryAddress))) {
65+
GenericScopedLock<SpinMutex> M(&FD->Mutex);
66+
FD->getOrAllocateContextRoot();
67+
}
68+
}
69+
atomic_store_relaxed(&RAD->Self, 0);
70+
return nullptr;
71+
},
72+
this);
73+
}
74+
75+
// Blocks until the worker thread referenced by WorkerThread has finished. The
// thread's return value and pthread_join's result are both discarded.
void RootAutoDetector::join() {
  (void)pthread_join(WorkerThread, /*retval=*/nullptr);
}
78+
79+
void RootAutoDetector::sample() {
80+
static thread_local bool Entered = false;
81+
static thread_local uint64_t Entries = 0;
82+
if (Entered || (++Entries % SampleRate))
83+
return;
84+
Entered = true;
85+
collectStack();
86+
Entered = false;
87+
}
88+
89+
void RootAutoDetector::collectStack() {
90+
GET_CALLER_PC_BP;
91+
BufferedStackTrace CurrentStack;
92+
CurrentStack.Unwind(pc, bp, nullptr, false);
93+
if (CurrentStack.size <= 2) return;
94+
static thread_local PerThreadSamples *ThisThreadSamples =
95+
new (__sanitizer::InternalAlloc(sizeof(PerThreadSamples)))
96+
PerThreadSamples(*this);
97+
98+
if (!ThisThreadSamples->M.TryLock())
99+
return;
100+
101+
ThisThreadSamples->TrieRoot.insertStack(CurrentStack);
102+
ThisThreadSamples->M.Unlock();
103+
}
104+
21105
uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const {
22106
// this requires --linkopt=-Wl,--export-dynamic
23107
Dl_info Info;

compiler-rt/lib/ctx_profile/RootAutoDetector.h

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "sanitizer_common/sanitizer_dense_map.h"
1313
#include "sanitizer_common/sanitizer_internal_defs.h"
1414
#include "sanitizer_common/sanitizer_stacktrace.h"
15+
#include "sanitizer_common/sanitizer_vector.h"
1516
#include <pthread.h>
1617
#include <sanitizer/common_interface_defs.h>
1718

@@ -64,5 +65,33 @@ class PerThreadCallsiteTrie {
6465

6566
const Trie &start() const { return T; }
6667
};
68+
69+
class RootAutoDetector final {
70+
static const uint64_t SampleRate = 6113;
71+
pthread_t WorkerThread;
72+
73+
struct PerThreadSamples {
74+
PerThreadSamples(RootAutoDetector &Parent);
75+
76+
PerThreadCallsiteTrie TrieRoot;
77+
SpinMutex M;
78+
};
79+
SpinMutex AllSamplesMutex;
80+
SANITIZER_GUARDED_BY(AllSamplesMutex)
81+
Vector<PerThreadSamples*> AllSamples;
82+
atomic_uintptr_t &FunctionDataListHead;
83+
atomic_uintptr_t &Self;
84+
void collectStack();
85+
86+
public:
87+
RootAutoDetector(atomic_uintptr_t &FunctionDataListHead,
88+
atomic_uintptr_t &Self)
89+
: FunctionDataListHead(FunctionDataListHead), Self(Self) {}
90+
91+
void sample();
92+
void start();
93+
void join();
94+
};
95+
6796
} // namespace __ctx_profile
6897
#endif

compiler-rt/test/ctx_profile/TestCases/generate-context.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
#include <iostream>
1717

1818
using namespace llvm::ctx_profile;
19-
extern "C" void __llvm_ctx_profile_start_collection();
19+
extern "C" void __llvm_ctx_profile_start_collection(bool);
2020
extern "C" bool __llvm_ctx_profile_fetch(ProfileWriter &);
2121

2222
// avoid name mangling
@@ -159,7 +159,7 @@ bool profileWriter() {
159159
}
160160

161161
int main(int argc, char **argv) {
162-
__llvm_ctx_profile_start_collection();
162+
__llvm_ctx_profile_start_collection(false);
163163
theRoot();
164164
flatFct();
165165
// This would be implemented in a specific RPC handler, but here we just call

0 commit comments

Comments
 (0)