Skip to content

[ctxprof] Auto root detection: trie for stack samples #133106

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 29, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions compiler-rt/lib/ctx_profile/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@ add_compiler_rt_component(ctx_profile)

set(CTX_PROFILE_SOURCES
CtxInstrProfiling.cpp
RootAutoDetector.cpp
)

set(CTX_PROFILE_HEADERS
CtxInstrContextNode.h
CtxInstrProfiling.h
RootAutoDetector.h
)

include_directories(..)
Expand Down
90 changes: 90 additions & 0 deletions compiler-rt/lib/ctx_profile/RootAutoDetector.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
//===- RootAutoDetector.cpp - detect contextual profiling roots -----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "RootAutoDetector.h"

#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_placement_new.h" // IWYU pragma: keep (DenseMap)
#include <assert.h>
#include <dlfcn.h>
#include <pthread.h>

using namespace __ctx_profile;
// A set emulated with a DenseMap: only the keys carry information, the mapped
// bool is never inspected by the code in this file.
template <typename T> using Set = DenseMap<T, bool>;

/// Map the runtime address of a callsite to the start address of the function
/// that contains it, using dladdr.
///
/// \param CallsiteAddress runtime address inside some function.
/// \returns the symbol start address reported by dladdr, or 0 when dladdr
/// reports failure.
uptr PerThreadCallsiteTrie::getFctStartAddr(uptr CallsiteAddress) const {
  // this requires --linkopt=-Wl,--export-dynamic
  Dl_info SymInfo;
  const void *Addr = reinterpret_cast<const void *>(CallsiteAddress);
  if (dladdr(Addr, &SymInfo) == 0)
    return 0; // dladdr could not resolve the address.
  return reinterpret_cast<uptr>(SymInfo.dli_saddr);
}

void PerThreadCallsiteTrie::insertStack(const StackTrace &ST) {
++TheTrie.Count;
auto *Current = &TheTrie;
// the stack is backwards - the first callsite is at the top.
for (int I = ST.size - 1; I >= 0; --I) {
uptr ChildAddr = ST.trace[I];
auto [Iter, _] = Current->Children.insert({ChildAddr, Trie(ChildAddr)});
++Iter->second.Count;
Current = &Iter->second;
}
}

/// Walk the callsite trie level by level and report the first level at which
/// more than one distinct function appears.
///
/// \returns a map from function start address (as produced by
/// getFctStartAddr) to the summed sample count of the nodes of that function
/// at the reported level; empty if no level ever holds more than one
/// function.
DenseMap<uptr, uint64_t> PerThreadCallsiteTrie::determineRoots() const {
  // The underlying assumption is a message-pump design: a (practically)
  // endless loop pulls work items off a queue and dispatches them. The
  // functions it dispatches to must return for the pump to keep working, and
  // those are the roots we want. They show up as the first place, going down
  // from the trie root, where one function is seen calling 2 or more
  // functions.
  //
  // The trie is keyed by callsite, not by function, so sibling nodes may
  // belong to the same function. Using the notation function(callsite):
  //   f1(csf1_1) -> f2(csf2_1) -> f3
  //              -> f2(csf2_2) -> f4
  //
  // is stored as:
  //   csf1_1 -> csf2_1 -> f3
  //          -> csf2_2 -> f4
  //
  // Control flow certainly comes back to f2, but nothing says it ever comes
  // back to f1 - f2 may well be the message pump.
  //
  // Rather than materializing a function tree out of the callsite tree, it is
  // enough to count the distinct functions at each depth. The first depth
  // with more than one function holds the potential roots; everything above
  // it is treated as a non-root.
  DenseMap<uptr, uint64_t> Result;
  Set<const Trie *> Level;
  Level.insert({&TheTrie, {}});

  while (!Level.empty()) {
    Set<const Trie *> Deeper;
    DenseMap<uptr, uint64_t> SamplesPerFct;
    Level.forEach([&](const auto &Entry) {
      const Trie *Node = Entry.first;
      // Fold all callsites of the same function into a single candidate.
      const uptr FctStart = getFctStartAddr(Node->CallsiteAddress);
      SamplesPerFct[FctStart] += Node->Count;
      // Queue this node's children for the next depth.
      Node->Children.forEach([&](auto &Child) {
        Deeper.insert({&Child.second, true});
        return true;
      });
      return true;
    });
    if (SamplesPerFct.size() > 1) {
      // First depth with several distinct functions: these are the roots.
      Result.swap(SamplesPerFct);
      break;
    }
    Level.swap(Deeper);
  }
  return Result;
}
57 changes: 57 additions & 0 deletions compiler-rt/lib/ctx_profile/RootAutoDetector.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
/*===- RootAutoDetector.h - auto-detect roots for ctxprof ----------------===*\
|*
|* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|* See https://llvm.org/LICENSE.txt for license information.
|* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|*
\*===----------------------------------------------------------------------===*/

#ifndef CTX_PROFILE_ROOTAUTODETECTOR_H_
#define CTX_PROFILE_ROOTAUTODETECTOR_H_

#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_internal_defs.h"
#include "sanitizer_common/sanitizer_stacktrace.h"
#include <pthread.h>
#include <sanitizer/common_interface_defs.h>

// NOTE(review): these using-directives sit at global scope in a header, so
// they take effect in every translation unit that includes this file.
using namespace __asan;
using namespace __sanitizer;

namespace __ctx_profile {

/// Capture all the stack traces observed for a specific thread. The "for a
/// specific thread" part is not enforced, but assumed in determineRoots.
///
/// Stacks are added with insertStack; determineRoots then inspects the
/// accumulated trie. The class is movable but not copyable.
class PerThreadCallsiteTrie {
protected:
/// A trie. A node is the address of a callsite in a function activation. A
/// child is a callsite in the activation made from the callsite
/// corresponding to the parent.
struct Trie final {
/// Runtime address of the callsite this node represents. Defaults to 0,
/// which is the value the root node (TheTrie) is constructed with.
const uptr CallsiteAddress;
/// Number of inserted stacks that went through this node.
uint64_t Count = 0;
/// Child nodes, keyed by their callsite address.
DenseMap<uptr, Trie> Children;

Trie(uptr CallsiteAddress = 0) : CallsiteAddress(CallsiteAddress) {}
};
/// Root of the trie; its Count is incremented once per inserted stack.
Trie TheTrie;

/// Return the runtime start address of the function that contains the call at
/// the runtime address CallsiteAddress. May be overridden for easy testing.
virtual uptr getFctStartAddr(uptr CallsiteAddress) const;

public:
PerThreadCallsiteTrie(const PerThreadCallsiteTrie &) = delete;
PerThreadCallsiteTrie(PerThreadCallsiteTrie &&) = default;
PerThreadCallsiteTrie() = default;

virtual ~PerThreadCallsiteTrie() = default;

/// Add one sampled stack to the trie, incrementing the count of every node
/// along its path.
void insertStack(const StackTrace &ST);

/// Return the runtime address of root functions, as determined for this
/// thread, together with the number of samples that included them.
DenseMap<uptr, uint64_t> determineRoots() const;
};
} // namespace __ctx_profile
#endif
4 changes: 3 additions & 1 deletion compiler-rt/lib/ctx_profile/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,12 @@ append_list_if(COMPILER_RT_HAS_WVARIADIC_MACROS_FLAG -Wno-variadic-macros CTX_PR
file(GLOB CTX_PROFILE_HEADERS ../*.h)

set(CTX_PROFILE_SOURCES
../CtxInstrProfiling.cpp)
../CtxInstrProfiling.cpp
../RootAutoDetector.cpp)

set(CTX_PROFILE_UNITTESTS
CtxInstrProfilingTest.cpp
RootAutoDetectorTest.cpp
driver.cpp)

include_directories(../../../include)
Expand Down
155 changes: 155 additions & 0 deletions compiler-rt/lib/ctx_profile/tests/RootAutoDetectorTest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
#include "../RootAutoDetector.h"
#include "sanitizer_common/sanitizer_array_ref.h"
#include "gmock/gmock.h"
#include "gtest/gtest.h"

using namespace __ctx_profile;
using ::testing::IsEmpty;
using ::testing::Not;
using ::testing::SizeIs;

// Utility for describing a preorder traversal. By default it captures the
// address and count at a callsite node. Implicitly nodes are expected to have 1
// child. If they have none, we place a Marker::term and if they have more than
// one, we place a Marker::split(nr_of_children) For example, using a list
// notation, and letters to denote a pair of address and count:
// (A (B C) (D (E F))) is a list of markers: A, split(2), B, term, C,
// term, D, split(2), E, term, F, term
class Marker {
enum class Kind { End, Value, Split };
const uptr Value;
const uptr Count;
const Kind K;
Marker(uptr V, uptr C, Kind S) : Value(V), Count(C), K(S) {}

public:
Marker(uptr V, uptr C) : Marker(V, C, Kind::Value) {}

static Marker split(uptr V) { return Marker(V, 0, Kind::Split); }
static Marker term() { return Marker(0, 0, Kind::End); }

bool isSplit() const { return K == Kind::Split; }
bool isTerm() const { return K == Kind::End; }
bool isVal() const { return K == Kind::Value; }

bool operator==(const Marker &M) const {
return Value == M.Value && Count == M.Count && K == M.K;
}
};

// Test double for PerThreadCallsiteTrie: replaces the address-to-function
// lookup with simple arithmetic and exposes a way to compare the trie against
// an expected preorder sequence of Markers.
class MockCallsiteTrie final : public PerThreadCallsiteTrie {
// Fake function lookup: round the callsite address down to a multiple of 100,
// so e.g. 302 belongs to "function" 300 and any address below 100 maps to 0.
uptr getFctStartAddr(uptr CallsiteAddress) const override {
return (CallsiteAddress / 100) * 100;
}

// Check that the next expected marker equals M and consume it.
// NOTE(review): gtest ASSERT_* macros return only from the function they
// appear in, so a failure here does not by itself stop the callers.
static void popAndCheck(ArrayRef<Marker> &Preorder, Marker M) {
ASSERT_THAT(Preorder, Not(IsEmpty()));
ASSERT_EQ(Preorder[0], M);
Preorder = Preorder.drop_front();
}

// Recursively compare the subtree rooted at T with the front of Preorder,
// consuming the markers that describe it.
static void checkSameImpl(const Trie &T, ArrayRef<Marker> &Preorder) {
// Every node starts with its own (address, count) value marker.
popAndCheck(Preorder, {T.CallsiteAddress, T.Count});

// A leaf is closed by a term marker.
if (T.Children.empty()) {
popAndCheck(Preorder, Marker::term());
return;
}

// More than one child is announced by a split marker; a single child has no
// marker of its own.
if (T.Children.size() > 1)
popAndCheck(Preorder, Marker::split(T.Children.size()));

// Children are visited in the order forEach yields them.
T.Children.forEach([&](const auto &KVP) {
checkSameImpl(KVP.second, Preorder);
return true;
});
}

public:
// Assert that the whole trie matches Preorder exactly, with no markers left
// over.
void checkSame(ArrayRef<Marker> Preorder) const {
checkSameImpl(TheTrie, Preorder);
ASSERT_THAT(Preorder, IsEmpty());
}
};

// Inserts four stacks one after another and checks the full shape of the trie
// (addresses and counts) after each insertion. Stacks are listed with the last
// array entry closest to the trie root.
TEST(PerThreadCallsiteTrieTest, Insert) {
MockCallsiteTrie R;
// First stack: a single chain root(0) -> 1 -> 2 -> 3 -> 4, every count 1.
uptr Stack1[]{4, 3, 2, 1};
R.insertStack(StackTrace(Stack1, 4));
R.checkSame(ArrayRef<Marker>(
{{0, 1}, {1, 1}, {2, 1}, {3, 1}, {4, 1}, Marker::term()}));

// Second stack shares the whole chain and extends it with 5 below 4.
uptr Stack2[]{5, 4, 3, 2, 1};
R.insertStack(StackTrace(Stack2, 5));
R.checkSame(ArrayRef<Marker>(
{{0, 2}, {1, 2}, {2, 2}, {3, 2}, {4, 2}, {5, 1}, Marker::term()}));

// Third stack diverges below 3, which now has two children: 4 and 6.
uptr Stack3[]{6, 3, 2, 1};
R.insertStack(StackTrace(Stack3, 4));
R.checkSame(ArrayRef<Marker>({{0, 3},
{1, 3},
{2, 3},
{3, 3},
Marker::split(2),
{4, 2},
{5, 1},
Marker::term(),
{6, 1},
Marker::term()}));
// Fourth stack diverges below 2, which now has two children: 7 and 3.
// NOTE(review): the expected sibling order (7 before 3) presumably mirrors
// the iteration order of the children map - confirm if this test gets flaky.
uptr Stack4[]{7, 2, 1};
R.insertStack(StackTrace(Stack4, 3));
R.checkSame(ArrayRef<Marker>({{0, 4},
{1, 4},
{2, 4},
Marker::split(2),
{7, 1},
Marker::term(),
{3, 3},
Marker::split(2),
{4, 2},
{5, 1},
Marker::term(),
{6, 1},
Marker::term()}));
}

// Two samples whose outer frames resolve to the same mock functions (100, then
// 200 via callsites 202 and 203) and which diverge into functions 300 and 400
// at the next depth. That depth is the first with more than one function, so
// 300 and 400 are the detected roots.
TEST(PerThreadCallsiteTrieTest, DetectRoots) {
  const uptr FirstSample[] = {501, 302, 202, 102};
  const uptr SecondSample[] = {601, 402, 203, 102};

  MockCallsiteTrie T;
  T.insertStack(StackTrace(FirstSample, 4));
  T.insertStack(StackTrace(SecondSample, 4));

  auto R = T.determineRoots();
  EXPECT_THAT(R, SizeIs(2U));
  EXPECT_TRUE(R.contains(300));
  EXPECT_TRUE(R.contains(400));
}

// With a single sample every depth of the trie holds exactly one function, so
// no roots are reported.
TEST(PerThreadCallsiteTrieTest, DetectRootsNoBranches) {
  const uptr OnlySample[] = {501, 302, 202, 102};

  MockCallsiteTrie T;
  T.insertStack(StackTrace(OnlySample, 4));

  auto R = T.determineRoots();
  EXPECT_THAT(R, IsEmpty());
}

// A callsite that cannot be attributed to a function still takes part in root
// detection, under function address 0.
TEST(PerThreadCallsiteTrieTest, DetectRootsUnknownFct) {
  const uptr KnownSample[] = {501, 302, 202, 102};
  // The MockCallsiteTrie address resolver rounds addresses down to a multiple
  // of 100, so callsite 40 is mapped to 0.
  const uptr UnknownSample[] = {601, 40, 203, 102};

  MockCallsiteTrie T;
  T.insertStack(StackTrace(KnownSample, 4));
  T.insertStack(StackTrace(UnknownSample, 4));

  auto R = T.determineRoots();
  ASSERT_THAT(R, SizeIs(2U));
  EXPECT_TRUE(R.contains(300));
  EXPECT_TRUE(R.contains(0));
}