[Memprof] Adds the option to collect AccessCountHistograms for memprof. #94264

Merged (1 commit, Jun 26, 2024)
2 changes: 2 additions & 0 deletions compiler-rt/include/profile/MIBEntryDef.inc
@@ -51,3 +51,5 @@ MIBEntryDef(MaxAccessDensity = 22, MaxAccessDensity, uint32_t)
MIBEntryDef(TotalLifetimeAccessDensity = 23, TotalLifetimeAccessDensity, uint64_t)
MIBEntryDef(MinLifetimeAccessDensity = 24, MinLifetimeAccessDensity, uint32_t)
MIBEntryDef(MaxLifetimeAccessDensity = 25, MaxLifetimeAccessDensity, uint32_t)
MIBEntryDef(AccessHistogramSize = 26, AccessHistogramSize, uint32_t)
MIBEntryDef(AccessHistogram = 27, AccessHistogram, uintptr_t)
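
For context, MIBEntryDef.inc is an X-macro file: a consumer defines the MIBEntryDef macro before including it, and every entry then stamps out a struct field or a serialization statement. A minimal sketch of the pattern (the struct name here is illustrative; LLVM's PortableMemInfoBlock consumes this file in a similar way), showing how the two new entries become fields:

#define MIBEntryDef(NameTag, Name, Type) Type Name;
struct MIBFields {
#include "profile/MIBEntryDef.inc"
  // ...the field list now ends with:
  // uint32_t AccessHistogramSize; uintptr_t AccessHistogram;
};
#undef MIBEntryDef
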
29 changes: 27 additions & 2 deletions compiler-rt/include/profile/MemProfData.inc
@@ -33,7 +33,11 @@
(uint64_t)'o' << 24 | (uint64_t)'f' << 16 | (uint64_t)'r' << 8 | (uint64_t)129)

// The version number of the raw binary format.
#define MEMPROF_RAW_VERSION 3ULL
#define MEMPROF_RAW_VERSION 4ULL

// Currently supported versions.
#define MEMPROF_RAW_SUPPORTED_VERSIONS \
{ 3ULL, 4ULL }

#define MEMPROF_BUILDID_MAX_SIZE 32ULL

@@ -119,7 +123,8 @@ MemInfoBlock() {
}

MemInfoBlock(uint32_t Size, uint64_t AccessCount, uint32_t AllocTs,
uint32_t DeallocTs, uint32_t AllocCpu, uint32_t DeallocCpu)
uint32_t DeallocTs, uint32_t AllocCpu, uint32_t DeallocCpu,
uintptr_t Histogram, uint32_t HistogramSize)
: MemInfoBlock() {
AllocCount = 1U;
TotalAccessCount = AccessCount;
@@ -149,6 +154,8 @@ MemInfoBlock(uint32_t Size, uint64_t AccessCount, uint32_t AllocTs,
AllocCpuId = AllocCpu;
DeallocCpuId = DeallocCpu;
NumMigratedCpu = AllocCpuId != DeallocCpuId;
AccessHistogramSize = HistogramSize;
AccessHistogram = Histogram;
}

void Merge(const MemInfoBlock &newMIB) {
@@ -194,6 +201,24 @@ void Merge(const MemInfoBlock &newMIB) {
NumSameDeallocCpu += DeallocCpuId == newMIB.DeallocCpuId;
AllocCpuId = newMIB.AllocCpuId;
DeallocCpuId = newMIB.DeallocCpuId;

// When merging histograms, we always keep the longer histogram and add the
// values of the shorter histogram to it.
uintptr_t ShorterHistogram;
uint32_t ShorterHistogramSize;
if (newMIB.AccessHistogramSize > AccessHistogramSize) {
ShorterHistogram = AccessHistogram;
ShorterHistogramSize = AccessHistogramSize;
// Replace the current histogram with the larger one.
AccessHistogram = newMIB.AccessHistogram;
AccessHistogramSize = newMIB.AccessHistogramSize;
} else {
ShorterHistogram = newMIB.AccessHistogram;
ShorterHistogramSize = newMIB.AccessHistogramSize;
}
for (size_t i = 0; i < ShorterHistogramSize; ++i) {
((uint64_t *)AccessHistogram)[i] += ((uint64_t *)ShorterHistogram)[i];
}
}

#ifdef _MSC_VER
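
To make the histogram merge rule above concrete, here is a tiny standalone sketch (hypothetical values; plain calloc/free stand in for the runtime's InternalAlloc/InternalFree, and freeing the shorter histogram actually happens in the caller, InsertOrMerge):

#include <cstdint>
#include <cstdio>
#include <cstdlib>

int main() {
  // Longer histogram (4 chunks) and shorter histogram (2 chunks).
  uint64_t *longer = (uint64_t *)calloc(4, sizeof(uint64_t));
  uint64_t *shorter = (uint64_t *)calloc(2, sizeof(uint64_t));
  longer[0] = 1; longer[1] = 2; longer[2] = 3; longer[3] = 4;
  shorter[0] = 10; shorter[1] = 20;
  // Merge: the longer histogram survives; the shorter one's counters fold in.
  for (int i = 0; i < 2; ++i)
    longer[i] += shorter[i];
  free(shorter); // only the shorter histogram is released
  for (int i = 0; i < 4; ++i)
    printf("%llu ", (unsigned long long)longer[i]); // prints: 11 22 3 4
  printf("\n");
  free(longer);
  return 0;
}
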
84 changes: 73 additions & 11 deletions compiler-rt/lib/memprof/memprof_allocator.cpp
@@ -34,6 +34,10 @@
#include <sched.h>
#include <time.h>

#define MAX_HISTOGRAM_PRINT_SIZE 32U

extern bool __memprof_histogram;

namespace __memprof {
namespace {
using ::llvm::memprof::MemInfoBlock;
@@ -68,6 +72,14 @@ void Print(const MemInfoBlock &M, const u64 id, bool print_terse) {
"cpu: %u, num same dealloc_cpu: %u\n",
M.NumMigratedCpu, M.NumLifetimeOverlaps, M.NumSameAllocCpu,
M.NumSameDeallocCpu);
Printf("AccessCountHistogram[%u]: ", M.AccessHistogramSize);
uint32_t PrintSize = M.AccessHistogramSize > MAX_HISTOGRAM_PRINT_SIZE
? MAX_HISTOGRAM_PRINT_SIZE
: M.AccessHistogramSize;
for (size_t i = 0; i < PrintSize; ++i) {
Printf("%llu ", ((uint64_t *)M.AccessHistogram)[i]);
}
Printf("\n");
}
}
} // namespace
@@ -216,15 +228,34 @@ u64 GetShadowCount(uptr p, u32 size) {
return count;
}

// Accumulates the access count from the shadow for the given pointer and size.
// See memprof_mapping.h for an overview of histogram counters.
u64 GetShadowCountHistogram(uptr p, u32 size) {
u8 *shadow = (u8 *)HISTOGRAM_MEM_TO_SHADOW(p);
u8 *shadow_end = (u8 *)HISTOGRAM_MEM_TO_SHADOW(p + size);
u64 count = 0;
for (; shadow <= shadow_end; shadow++)
count += *shadow;
return count;
}

// Clears the shadow counters (when memory is allocated).
void ClearShadow(uptr addr, uptr size) {
Contributor:

Looking at this function more, there are a lot of uses of the MEM_TO_SHADOW computed values, which is not the right mapping function to use with the smaller granularity of the histogram case. I think you probably need to version the calls to MEM_TO_SHADOW here, then all of the rest of the code can work as is? I.e. you wouldn't need the 2 versions of clearCounters*

Contributor Author:

This function uses the MEM_TO_SHADOW computed pointers "kind of", but in reality it just rounds these pointers up to the nearest page size. So in effect, full pages should be cleared no matter if it is with histogram or without.

Contributor:

I guess it works, because the shadow granularities are less than the page size, and because they are both scaling by the same 8 to 1 scale, and also I guess because the clear_shadow_mmap_threshold is an optimization and doesn't need to be exact. However, it still feels a little wonky to me (and also means that we have to do extra mapping operations here and again in clearCounters*). I do think I would prefer to have it be something like:

uptr shadow_beg;
uptr shadow_end;
if (__memprof_histogram) {
  shadow_beg = HISTOGRAM_MEM_TO_SHADOW(addr);
  shadow_end = HISTOGRAM_MEM_TO_SHADOW(addr + size);
} else {
  shadow_beg = MEM_TO_SHADOW(addr);
  shadow_end = MEM_TO_SHADOW(addr + size - SHADOW_GRANULARITY) + 1;
}
if (shadow_end - shadow_beg < common_flags()->clear_shadow_mmap_threshold) {
  REAL(memset)((void *)shadow_beg, 0, shadow_end - shadow_beg);
} else {
...

I.e. set shadow_beg/end based on whether we are doing the histogramming or not, leave the rest as-is. Would that work?

Contributor Author:

Yes I agree, this is nicer.

Changed.

CHECK(AddrIsAlignedByGranularity(addr));
CHECK(AddrIsInMem(addr));
CHECK(AddrIsAlignedByGranularity(addr + size));
CHECK(AddrIsInMem(addr + size - SHADOW_GRANULARITY));
CHECK(REAL(memset));
uptr shadow_beg = MEM_TO_SHADOW(addr);
uptr shadow_end = MEM_TO_SHADOW(addr + size - SHADOW_GRANULARITY) + 1;
uptr shadow_beg;
uptr shadow_end;
if (__memprof_histogram) {
shadow_beg = HISTOGRAM_MEM_TO_SHADOW(addr);
shadow_end = HISTOGRAM_MEM_TO_SHADOW(addr + size);
} else {
shadow_beg = MEM_TO_SHADOW(addr);
shadow_end = MEM_TO_SHADOW(addr + size - SHADOW_GRANULARITY) + 1;
}

if (shadow_end - shadow_beg < common_flags()->clear_shadow_mmap_threshold) {
REAL(memset)((void *)shadow_beg, 0, shadow_end - shadow_beg);
} else {
@@ -279,6 +310,44 @@ struct Allocator {
Print(Value->mib, Key, bool(Arg));
}

// See memprof_mapping.h for an overview of histogram counters.
static MemInfoBlock CreateNewMIB(uptr p, MemprofChunk *m, u64 user_size) {
if (__memprof_histogram) {
return CreateNewMIBWithHistogram(p, m, user_size);
} else {
return CreateNewMIBWithoutHistogram(p, m, user_size);
}
}

static MemInfoBlock CreateNewMIBWithHistogram(uptr p, MemprofChunk *m,
u64 user_size) {

u64 c = GetShadowCountHistogram(p, user_size);
long curtime = GetTimestamp();
uint32_t HistogramSize =
RoundUpTo(user_size, HISTOGRAM_GRANULARITY) / HISTOGRAM_GRANULARITY;
uintptr_t Histogram =
(uintptr_t)InternalAlloc(HistogramSize * sizeof(uint64_t));
memset((void *)Histogram, 0, HistogramSize * sizeof(uint64_t));
for (size_t i = 0; i < HistogramSize; ++i) {
u8 Counter =
*((u8 *)HISTOGRAM_MEM_TO_SHADOW(p + HISTOGRAM_GRANULARITY * i));
((uint64_t *)Histogram)[i] = (uint64_t)Counter;
}
MemInfoBlock newMIB(user_size, c, m->timestamp_ms, curtime, m->cpu_id,
GetCpuId(), Histogram, HistogramSize);
return newMIB;
}

static MemInfoBlock CreateNewMIBWithoutHistogram(uptr p, MemprofChunk *m,
u64 user_size) {
u64 c = GetShadowCount(p, user_size);
long curtime = GetTimestamp();
MemInfoBlock newMIB(user_size, c, m->timestamp_ms, curtime, m->cpu_id,
GetCpuId(), 0, 0);
return newMIB;
}

void FinishAndWrite() {
if (print_text && common_flags()->print_module_map)
DumpProcessMap();
@@ -319,10 +388,7 @@ struct Allocator {
if (!m)
return;
uptr user_beg = ((uptr)m) + kChunkHeaderSize;
u64 c = GetShadowCount(user_beg, user_requested_size);
long curtime = GetTimestamp();
MemInfoBlock newMIB(user_requested_size, c, m->timestamp_ms, curtime,
m->cpu_id, GetCpuId());
MemInfoBlock newMIB = CreateNewMIB(user_beg, m, user_requested_size);
InsertOrMerge(m->alloc_context_id, newMIB, A->MIBMap);
},
this);
@@ -451,11 +517,7 @@ struct Allocator {
atomic_exchange(&m->user_requested_size, 0, memory_order_acquire);
if (memprof_inited && atomic_load_relaxed(&constructed) &&
!atomic_load_relaxed(&destructing)) {
u64 c = GetShadowCount(p, user_requested_size);
long curtime = GetTimestamp();

MemInfoBlock newMIB(user_requested_size, c, m->timestamp_ms, curtime,
m->cpu_id, GetCpuId());
MemInfoBlock newMIB = this->CreateNewMIB(p, m, user_requested_size);
InsertOrMerge(m->alloc_context_id, newMIB, MIBMap);
}

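
A quick check of the sizing arithmetic in CreateNewMIBWithHistogram above: HistogramSize is RoundUpTo(user_size, HISTOGRAM_GRANULARITY) / HISTOGRAM_GRANULARITY, i.e. one counter per started 8-byte chunk of the allocation. A standalone sketch (HistogramSlots is a hypothetical helper, not part of the patch):

#include <cstdint>
#include <cstdio>

// One counter per started 8-byte chunk (HISTOGRAM_GRANULARITY == 8U).
uint32_t HistogramSlots(uint64_t user_size) {
  const uint64_t G = 8;
  return (uint32_t)((user_size + G - 1) / G); // == RoundUpTo(user_size, G) / G
}

int main() {
  const uint64_t sizes[] = {1, 8, 20, 64};
  for (int i = 0; i < 4; ++i)
    printf("user_size=%llu -> %u counters\n",
           (unsigned long long)sizes[i], HistogramSlots(sizes[i]));
  // 1 -> 1, 8 -> 1, 20 -> 3, 64 -> 8; each shadow counter saturates at
  // HISTOGRAM_MAX_COUNTER (255) accesses to its 8-byte chunk.
  return 0;
}
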
2 changes: 1 addition & 1 deletion compiler-rt/lib/memprof/memprof_flags.inc
@@ -38,4 +38,4 @@ MEMPROF_FLAG(bool, allocator_frees_and_returns_null_on_realloc_zero, true,
MEMPROF_FLAG(bool, print_text, false,
"If set, prints the heap profile in text format. Else use the raw binary serialization format.")
MEMPROF_FLAG(bool, print_terse, false,
"If set, prints memory profile in a terse format. Only applicable if print_text = true.")
"If set, prints memory profile in a terse format. Only applicable if print_text = true.")
37 changes: 36 additions & 1 deletion compiler-rt/lib/memprof/memprof_mapping.h
@@ -22,7 +22,6 @@ static const u64 kDefaultShadowScale = 3;

#define SHADOW_GRANULARITY (1ULL << SHADOW_SCALE)
#define MEMPROF_ALIGNMENT 32

namespace __memprof {

extern uptr kHighMemEnd; // Initialized in __memprof_init.
@@ -37,6 +36,34 @@ extern uptr kHighMemEnd; // Initialized in __memprof_init.
#define MEM_TO_SHADOW(mem) \
((((mem) & SHADOW_MASK) >> SHADOW_SCALE) + (SHADOW_OFFSET))

// Histogram shadow memory is laid out differently from the standard configuration:

// 8 bytes
// +---+---+---+ +---+---+---+ +---+---+---+
// Memory | a | | b | | c |
// +---+---+---+ +---+---+---+ +---+---+---+

// +---+ +---+ +---+
// Shadow | a | | b | | c |
// +---+ +---+ +---+
// 1 byte
//
// Here we have a 1-byte counter for each 8 bytes. HISTOGRAM_MEM_TO_SHADOW
// translates a memory address to the address of its corresponding shadow
// counter. The same data is still provided in the MIB whether histograms
// are used or not. Total access counts per allocation are computed by
// summing up all individual 1-byte counters. This can incur an accuracy
// penalty, since each counter saturates at HISTOGRAM_MAX_COUNTER.

#define HISTOGRAM_GRANULARITY 8U

#define HISTOGRAM_MAX_COUNTER 255U

#define HISTOGRAM_SHADOW_MASK ~(HISTOGRAM_GRANULARITY - 1)

#define HISTOGRAM_MEM_TO_SHADOW(mem) \
((((mem) & HISTOGRAM_SHADOW_MASK) >> SHADOW_SCALE) + (SHADOW_OFFSET))

#define SHADOW_ENTRY_SIZE (MEM_GRANULARITY >> SHADOW_SCALE)

#define kLowMemBeg 0
@@ -108,6 +135,14 @@ inline void RecordAccess(uptr a) {
(*shadow_address)++;
}

inline void RecordAccessHistogram(uptr a) {
CHECK_EQ(SHADOW_ENTRY_SIZE, 8);
u8 *shadow_address = (u8 *)HISTOGRAM_MEM_TO_SHADOW(a);
if (*shadow_address < HISTOGRAM_MAX_COUNTER) {
(*shadow_address)++;
}
}

} // namespace __memprof

#endif // MEMPROF_MAPPING_H
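
To see the two mappings side by side, a standalone sketch of the arithmetic (SHADOW_SCALE = 3 per kDefaultShadowScale above; the SHADOW_OFFSET value and MEM_GRANULARITY = 64 are assumptions for illustration, the latter consistent with CHECK_EQ(SHADOW_ENTRY_SIZE, 8)):

#include <cstdint>
#include <cstdio>

constexpr uint64_t kShadowScale = 3;           // kDefaultShadowScale
constexpr uint64_t kShadowOffset = 0x7fff8000; // hypothetical value
constexpr uint64_t kMemGranularity = 64;       // assumed; 64 >> 3 == 8-byte entries
constexpr uint64_t kHistGranularity = 8;       // HISTOGRAM_GRANULARITY

// Standard mapping: one 8-byte (u64) counter per 64 bytes of memory.
uint64_t MemToShadow(uint64_t mem) {
  return ((mem & ~(kMemGranularity - 1)) >> kShadowScale) + kShadowOffset;
}
// Histogram mapping: one 1-byte (u8) counter per 8 bytes of memory.
uint64_t HistogramMemToShadow(uint64_t mem) {
  return ((mem & ~(kHistGranularity - 1)) >> kShadowScale) + kShadowOffset;
}

int main() {
  const uint64_t p = 0x10000;
  const uint64_t offsets[] = {0, 8, 63, 64};
  for (int i = 0; i < 4; ++i)
    printf("mem+%-2llu -> shadow %#llx, histogram shadow %#llx\n",
           (unsigned long long)offsets[i],
           (unsigned long long)MemToShadow(p + offsets[i]),
           (unsigned long long)HistogramMemToShadow(p + offsets[i]));
  // mem+0 and mem+63 share one standard u64 counter but hit different
  // histogram bytes; mem+0 and mem+8 already map to distinct histogram bytes.
  return 0;
}
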
11 changes: 11 additions & 0 deletions compiler-rt/lib/memprof/memprof_mibmap.cpp
@@ -30,7 +30,18 @@ void InsertOrMerge(const uptr Id, const MemInfoBlock &Block, MIBMapTy &Map) {
} else {
LockedMemInfoBlock *lmib = *h;
SpinMutexLock lock(&lmib->mutex);
uintptr_t ShorterHistogram;
if (Block.AccessHistogramSize > lmib->mib.AccessHistogramSize)
ShorterHistogram = lmib->mib.AccessHistogram;
else
ShorterHistogram = Block.AccessHistogram;

lmib->mib.Merge(Block);
// The larger histogram is kept and the shorter histogram is discarded after
// its counters have been added into the larger one. Free only the shorter
// histogram.
if (Block.AccessHistogramSize > 0 || lmib->mib.AccessHistogramSize > 0)
InternalFree((void *)ShorterHistogram);
}
}

53 changes: 45 additions & 8 deletions compiler-rt/lib/memprof/memprof_rawprofile.cpp
@@ -146,24 +146,38 @@ void SerializeStackToBuffer(const Vector<u64> &StackIds,
// ---------- MIB Entry 0
// Alloc Count
// ...
// ---- AccessHistogram Entry 0
// ...
// ---- AccessHistogram Entry AccessHistogramSize - 1
// ---------- MIB Entry 1
// Alloc Count
// ...
// ---- AccessHistogram Entry 0
// ...
// ---- AccessHistogram Entry AccessHistogramSize - 1
// ----------
void SerializeMIBInfoToBuffer(MIBMapTy &MIBMap, const Vector<u64> &StackIds,
const u64 ExpectedNumBytes, char *&Buffer) {
char *Ptr = Buffer;
const u64 NumEntries = StackIds.Size();
Ptr = WriteBytes(NumEntries, Ptr);

for (u64 i = 0; i < NumEntries; i++) {
const u64 Key = StackIds[i];
MIBMapTy::Handle h(&MIBMap, Key, /*remove=*/true, /*create=*/false);
CHECK(h.exists());
Ptr = WriteBytes(Key, Ptr);
// FIXME: We unnecessarily serialize the AccessHistogram pointer. Adding a
// serialization schema will fix this issue. See also FIXME in
// deserialization.
Ptr = WriteBytes((*h)->mib, Ptr);
Contributor:
Do we end up serializing out the AccessHistogram pointer unnecessarily?

Contributor Author:
Yes, if we change to a schema-based serialization (similar to the PortableMIB), it would fix it. Alternatively, we can write down each field separately.

Contributor Author:
added FIXME here as well to mention this
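
As an illustration of the schema-based alternative discussed in this thread, a standalone sketch (all names below are hypothetical; LLVM's PortableMIB is the model): writing field by field means the in-memory histogram pointer is never serialized, only its payload.

#include <cstdint>
#include <cstring>

// Hypothetical MIB stand-in; the real field list comes from MIBEntryDef.inc.
struct MIB {
  uint64_t TotalAccessCount;
  uint32_t AccessHistogramSize;
  uint64_t *AccessHistogram; // in-memory only; must not be written verbatim
};

template <class T> char *WriteBytes(const T &v, char *ptr) {
  memcpy(ptr, &v, sizeof(v));
  return ptr + sizeof(v);
}

char *WriteMIBPayload(char *ptr, const MIB &m) {
  ptr = WriteBytes(m.TotalAccessCount, ptr); // one WriteBytes per schema field
  ptr = WriteBytes(m.AccessHistogramSize, ptr);
  for (uint32_t j = 0; j < m.AccessHistogramSize; ++j)
    ptr = WriteBytes(m.AccessHistogram[j], ptr); // payload, not the pointer
  return ptr;
}
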

for (u64 j = 0; j < (*h)->mib.AccessHistogramSize; ++j) {
u64 HistogramEntry = ((u64 *)((*h)->mib.AccessHistogram))[j];
Ptr = WriteBytes(HistogramEntry, Ptr);
}
if ((*h)->mib.AccessHistogramSize > 0) {
InternalFree((void *)((*h)->mib.AccessHistogram));
}
}

CHECK(ExpectedNumBytes >= static_cast<u64>(Ptr - Buffer) &&
"Expected num bytes != actual bytes written");
}
@@ -192,7 +206,15 @@ void SerializeMIBInfoToBuffer(MIBMapTy &MIBMap, const Vector<u64> &StackIds,
// ---------- MIB Entry
// Alloc Count
// ...
// ----------
// ---- AccessHistogram Entry 0
// ...
// ---- AccessHistogram Entry AccessHistogramSize - 1
// ---------- MIB Entry 1
// Alloc Count
// ...
// ---- AccessHistogram Entry 0
// ...
// ---- AccessHistogram Entry AccessHistogramSize - 1
// Optional Padding Bytes
// ---------- Stack Info
// Num Entries
@@ -218,13 +240,26 @@ u64 SerializeToRawProfile(MIBMapTy &MIBMap, ArrayRef<LoadedModule> Modules,
const u64 NumMIBInfoBytes = RoundUpTo(
sizeof(u64) + StackIds.Size() * (sizeof(u64) + sizeof(MemInfoBlock)), 8);

// Get the total number of AccessHistogram entries.
u64 TotalAccessHistogramEntries = 0;
MIBMap.ForEach(
[](const uptr Key, UNUSED LockedMemInfoBlock *const &MIB, void *Arg) {
u64 *TotalAccessHistogramEntries = (u64 *)Arg;
*TotalAccessHistogramEntries += MIB->mib.AccessHistogramSize;
},
reinterpret_cast<void *>(&TotalAccessHistogramEntries));
const u64 NumHistogramBytes =
RoundUpTo(TotalAccessHistogramEntries * sizeof(uint64_t), 8);

const u64 NumStackBytes = RoundUpTo(StackSizeBytes(StackIds), 8);

// Ensure that the profile is 8b aligned. We allow for some optional padding
// at the end so that any subsequent profile serialized to the same file does
// not incur unaligned accesses.
const u64 TotalSizeBytes = RoundUpTo(
sizeof(Header) + NumSegmentBytes + NumStackBytes + NumMIBInfoBytes, 8);
const u64 TotalSizeBytes =
RoundUpTo(sizeof(Header) + NumSegmentBytes + NumStackBytes +
NumMIBInfoBytes + NumHistogramBytes,
8);

// Allocate the memory for the entire buffer incl. info blocks.
Buffer = (char *)InternalAlloc(TotalSizeBytes);
@@ -235,14 +270,16 @@
static_cast<u64>(TotalSizeBytes),
sizeof(Header),
sizeof(Header) + NumSegmentBytes,
sizeof(Header) + NumSegmentBytes + NumMIBInfoBytes};
sizeof(Header) + NumSegmentBytes + NumMIBInfoBytes +
NumHistogramBytes};
Ptr = WriteBytes(header, Ptr);

SerializeSegmentsToBuffer(Modules, NumSegmentBytes, Ptr);
Ptr += NumSegmentBytes;

SerializeMIBInfoToBuffer(MIBMap, StackIds, NumMIBInfoBytes, Ptr);
Ptr += NumMIBInfoBytes;
SerializeMIBInfoToBuffer(MIBMap, StackIds,
NumMIBInfoBytes + NumHistogramBytes, Ptr);
Ptr += NumMIBInfoBytes + NumHistogramBytes;

SerializeStackToBuffer(StackIds, NumStackBytes, Ptr);
