Skip to content

[ctxprof] Track unhandled call targets #131417

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ class ProfileWriter {
public:
virtual void startContextSection() = 0;
virtual void writeContextual(const ctx_profile::ContextNode &RootNode,
const ctx_profile::ContextNode *Unhandled,
uint64_t TotalRootEntryCount) = 0;
virtual void endContextSection() = 0;

Expand Down
36 changes: 27 additions & 9 deletions compiler-rt/lib/ctx_profile/CtxInstrProfiling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,22 +246,37 @@ ContextNode *getFlatProfile(FunctionData &Data, GUID Guid,

// Returns the ContextNode to use for a callee reached without going through an
// instrumented callsite (e.g. signal handlers, or calls the compiler inserted
// after instrumentation). Depending on profiling state, this is either a node
// in the current root's `Unhandled` set, a flat profile, or scratch.
ContextNode *getUnhandledContext(FunctionData &Data, GUID Guid,
                                 uint32_t NumCounters) {

  // 1) if we are currently collecting a contextual profile, fetch a ContextNode
  // in the `Unhandled` set. We want to do this regardless of `ProfilingStarted`
  // to (hopefully) offset the penalty of creating these contexts to before
  // profiling.
  //
  // 2) if we are under a root (regardless if this thread is collecting or not a
  // contextual profile for that root), do not collect a flat profile. We want
  // to keep flat profiles only for activations that can't happen under a root,
  // to avoid confusing profiles. We can, for example, combine flattened and
  // flat profiles meaningfully, as we wouldn't double-count anything.
  //
  // 3) to avoid lengthy startup, don't bother with flat profiles until the
  // profiling has started. We would reset them anyway when profiling starts.
  // HOWEVER. This does lose profiling for message pumps: those functions are
  // entered once and never exit. They should be assumed to be entered before
  // profiling starts - because profiling should start after the server is up
  // and running (which is equivalent to "message pumps are set up").
  ContextRoot *R = __llvm_ctx_profile_current_context_root;
  if (!R) {
    if (IsUnderContext || !__sanitizer::atomic_load_relaxed(&ProfilingStarted))
      return TheScratchContext;
    else
      return markAsScratch(
          onContextEnter(*getFlatProfile(Data, Guid, NumCounters)));
  }
  // Lazily create one ContextNode per unhandled GUID under this root; nodes
  // are chained off R->FirstUnhandledCalleeNode so the writer can walk them.
  auto [Iter, Ins] = R->Unhandled.insert({Guid, nullptr});
  if (Ins)
    Iter->second =
        getCallsiteSlow(Guid, &R->FirstUnhandledCalleeNode, NumCounters, 0);
  return markAsScratch(onContextEnter(*Iter->second));
}

ContextNode *__llvm_ctx_profile_get_context(FunctionData *Data, void *Callee,
Expand Down Expand Up @@ -396,6 +411,8 @@ void __llvm_ctx_profile_start_collection() {
++NumMemUnits;

resetContextNode(*Root->FirstNode);
if (Root->FirstUnhandledCalleeNode)
resetContextNode(*Root->FirstUnhandledCalleeNode);
__sanitizer::atomic_store_relaxed(&Root->TotalEntries, 0);
}
__sanitizer::atomic_store_relaxed(&ProfilingStarted, true);
Expand All @@ -416,8 +433,9 @@ bool __llvm_ctx_profile_fetch(ProfileWriter &Writer) {
__sanitizer::Printf("[ctxprof] Contextual Profile is %s\n", "invalid");
return false;
}
Writer.writeContextual(*Root->FirstNode, __sanitizer::atomic_load_relaxed(
&Root->TotalEntries));
Writer.writeContextual(
*Root->FirstNode, Root->FirstUnhandledCalleeNode,
__sanitizer::atomic_load_relaxed(&Root->TotalEntries));
}
Writer.endContextSection();
Writer.startFlatSection();
Expand Down
15 changes: 15 additions & 0 deletions compiler-rt/lib/ctx_profile/CtxInstrProfiling.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#define CTX_PROFILE_CTXINSTRPROFILING_H_

#include "CtxInstrContextNode.h"
#include "sanitizer_common/sanitizer_dense_map.h"
#include "sanitizer_common/sanitizer_mutex.h"
#include <sanitizer/common_interface_defs.h>

Expand Down Expand Up @@ -83,6 +84,20 @@ struct ContextRoot {
// Count the number of entries - regardless if we could take the `Taken` mutex
::__sanitizer::atomic_uint64_t TotalEntries = {};

// Profiles for functions we encounter when collecting a contextual profile,
// that are not associated with a callsite. This is expected to happen for
// signal handlers, but it also - problematically - currently happens for
// call sites generated after profile instrumentation, primarily
// mem{cpy|move|set} (i.e. memcpy, memmove, memset).
// `Unhandled` serves 2 purposes:
// 1. identifying such cases (like the memops)
// 2. collecting a profile for them, which can be at least used as a flat
// profile
::__sanitizer::DenseMap<GUID, ContextNode *> Unhandled;
// Keep the unhandled contexts in a list, as we allocate them, as it makes it
// simpler to send to the writer when the profile is fetched.
ContextNode *FirstUnhandledCalleeNode = nullptr;

// Taken is used to ensure only one thread traverses the contextual graph -
// either to read it or to write it. On server side, the same entrypoint will
// be entered by numerous threads, but over time, the profile aggregated by
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ TEST_F(ContextTest, Dump) {
TestProfileWriter(ContextRoot *Root, size_t Entries)
: Root(Root), Entries(Entries) {}

void writeContextual(const ContextNode &Node,
void writeContextual(const ContextNode &Node, const ContextNode *Unhandled,
uint64_t TotalRootEntryCount) override {
EXPECT_EQ(TotalRootEntryCount, Entries);
EXPECT_EQ(EnteredSectionCount, 1);
Expand Down
15 changes: 14 additions & 1 deletion compiler-rt/test/ctx_profile/TestCases/generate-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@
// RUN: cp %llvm_src/include/llvm/ProfileData/CtxInstrContextNode.h %t_include/
//
// Compile with ctx instrumentation "on". We treat "theRoot" as callgraph root.
// RUN: %clangxx %s %ctxprofilelib -I%t_include -O2 -o %t.bin -mllvm -profile-context-root=theRoot
// RUN: %clangxx %s %ctxprofilelib -I%t_include -O2 -o %t.bin -mllvm -profile-context-root=theRoot \
// RUN: -mllvm -ctx-prof-skip-callsite-instr=skip_me
//
// Run the binary, and observe the profile fetch handler's output.
// RUN: %t.bin | FileCheck %s
Expand All @@ -20,11 +21,14 @@ extern "C" bool __llvm_ctx_profile_fetch(ProfileWriter &);

// avoid name mangling
extern "C" {
__attribute__((noinline)) void skip_me() {}

__attribute__((noinline)) void someFunction(int I) {
if (I % 2)
printf("check odd\n");
else
printf("check even\n");
skip_me();
}

// block inlining because the pre-inliner otherwise will inline this - it's
Expand All @@ -36,6 +40,7 @@ __attribute__((noinline)) void theRoot() {
for (auto I = 0; I < 2; ++I) {
someFunction(I);
}
skip_me();
}

__attribute__((noinline)) void flatFct() {
Expand Down Expand Up @@ -85,9 +90,13 @@ class TestProfileWriter : public ProfileWriter {
}

// Test override: prints the root context header, then each unhandled callee
// node (a singly-linked list walked via next()), then the profile tree rooted
// at RootNode. Output is matched by the FileCheck lines in this file.
void writeContextual(const ContextNode &RootNode,
const ContextNode *Unhandled,
uint64_t EntryCount) override {
std::cout << "Entering Root " << RootNode.guid()
<< " with total entry count " << EntryCount << std::endl;
// Dump profiles collected for callees not attributed to any callsite.
for (const auto *P = Unhandled; P; P = P->next())
std::cout << "Unhandled GUID: " << P->guid() << " entered "
<< P->entrycount() << " times" << std::endl;
printProfile(RootNode, "", "");
}

Expand Down Expand Up @@ -119,6 +128,8 @@ class TestProfileWriter : public ProfileWriter {
// branches would be taken once, so the second counter is 1.
// CHECK-NEXT: Entered Context Section
// CHECK-NEXT: Entering Root 8657661246551306189 with total entry count 1
// skip_me is entered 4 times: 3 via `someFunction`, and once from `theRoot`
// CHECK-NEXT: Unhandled GUID: 17928815489886282963 entered 4 times
// CHECK-NEXT: Guid: 8657661246551306189
// CHECK-NEXT: Entries: 1
// CHECK-NEXT: 2 counters and 3 callsites
Expand All @@ -135,6 +146,8 @@ class TestProfileWriter : public ProfileWriter {
// CHECK-NEXT: Counter values: 2 1
// CHECK-NEXT: Exited Context Section
// CHECK-NEXT: Entered Flat Section
// This is `skip_me`. Entered 3 times via `someFunction`
// CHECK-NEXT: Flat: 17928815489886282963 3
// CHECK-NEXT: Flat: 6759619411192316602 3,1
// This is flatFct (guid: 14569438697463215220)
// CHECK-NEXT: Flat: 14569438697463215220 1,2
Expand Down
1 change: 1 addition & 0 deletions llvm/include/llvm/ProfileData/CtxInstrContextNode.h
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,7 @@ class ProfileWriter {
public:
virtual void startContextSection() = 0;
virtual void writeContextual(const ctx_profile::ContextNode &RootNode,
const ctx_profile::ContextNode *Unhandled,
uint64_t TotalRootEntryCount) = 0;
virtual void endContextSection() = 0;

Expand Down
26 changes: 17 additions & 9 deletions llvm/include/llvm/ProfileData/PGOCtxProfReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ class IndexNode {
};
} // namespace internal

// Setting initial capacity to 1 because all contexts must have at least 1
// counter, and then, because all contexts belonging to a function have the same
// size, there'll be at most one other heap allocation.
using CtxProfFlatProfile =
std::map<GlobalValue::GUID, SmallVector<uint64_t, 1>>;

/// A node (context) in the loaded contextual profile, suitable for mutation
/// during IPO passes. We generally expect a fraction of counters and
/// callsites to be populated. We continue to model counters as vectors, but
Expand All @@ -93,11 +99,16 @@ class PGOCtxProfContext final : public internal::IndexNode {
GlobalValue::GUID GUID = 0;
SmallVector<uint64_t, 16> Counters;
const std::optional<uint64_t> RootEntryCount{};
std::optional<CtxProfFlatProfile> Unhandled{};
CallsiteMapTy Callsites;

PGOCtxProfContext(GlobalValue::GUID G, SmallVectorImpl<uint64_t> &&Counters,
std::optional<uint64_t> RootEntryCount = std::nullopt)
: GUID(G), Counters(std::move(Counters)), RootEntryCount(RootEntryCount) {
PGOCtxProfContext(
GlobalValue::GUID G, SmallVectorImpl<uint64_t> &&Counters,
std::optional<uint64_t> RootEntryCount = std::nullopt,
std::optional<CtxProfFlatProfile> &&Unhandled = std::nullopt)
: GUID(G), Counters(std::move(Counters)), RootEntryCount(RootEntryCount),
Unhandled(std::move(Unhandled)) {
assert(RootEntryCount.has_value() == Unhandled.has_value());
}

Expected<PGOCtxProfContext &>
Expand All @@ -121,6 +132,8 @@ class PGOCtxProfContext final : public internal::IndexNode {
bool isRoot() const { return RootEntryCount.has_value(); }
uint64_t getTotalRootEntryCount() const { return RootEntryCount.value(); }

const CtxProfFlatProfile &getUnhandled() const { return Unhandled.value(); }

uint64_t getEntrycount() const {
assert(!Counters.empty() &&
"Functions are expected to have at their entry BB instrumented, so "
Expand Down Expand Up @@ -170,12 +183,6 @@ class PGOCtxProfContext final : public internal::IndexNode {
}
};

// Setting initial capacity to 1 because all contexts must have at least 1
// counter, and then, because all contexts belonging to a function have the same
// size, there'll be at most one other heap allocation.
using CtxProfFlatProfile =
std::map<GlobalValue::GUID, SmallVector<uint64_t, 1>>;

using CtxProfContextualProfiles =
std::map<GlobalValue::GUID, PGOCtxProfContext>;
struct PGOCtxProfile {
Expand Down Expand Up @@ -205,6 +212,7 @@ class PGOCtxProfileReader final {

Error loadContexts(CtxProfContextualProfiles &P);
Error loadFlatProfiles(CtxProfFlatProfile &P);
Error loadFlatProfileList(CtxProfFlatProfile &P);

public:
PGOCtxProfileReader(StringRef Buffer)
Expand Down
6 changes: 4 additions & 2 deletions llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@ enum PGOCtxProfileBlockIDs {
ContextNodeBlockID = ContextRootBlockID + 1,
FlatProfilesSectionBlockID = ContextNodeBlockID + 1,
FlatProfileBlockID = FlatProfilesSectionBlockID + 1,
LAST_VALID = FlatProfileBlockID
UnhandledBlockID = FlatProfileBlockID + 1,
LAST_VALID = UnhandledBlockID
};

/// Write one or more ContextNodes to the provided raw_fd_stream.
Expand Down Expand Up @@ -94,6 +95,7 @@ class PGOCtxProfileWriter final : public ctx_profile::ProfileWriter {

void startContextSection() override;
void writeContextual(const ctx_profile::ContextNode &RootNode,
const ctx_profile::ContextNode *Unhandled,
uint64_t TotalRootEntryCount) override;
void endContextSection() override;

Expand All @@ -104,7 +106,7 @@ class PGOCtxProfileWriter final : public ctx_profile::ProfileWriter {

// constants used in writing which a reader may find useful.
static constexpr unsigned CodeLen = 2;
static constexpr uint32_t CurrentVersion = 3;
static constexpr uint32_t CurrentVersion = 4;
static constexpr unsigned VBREncodingBits = 6;
static constexpr StringRef ContainerMagic = "CTXP";
};
Expand Down
Loading