[Memprof] Adds the option to collect AccessCountHistograms for memprof. #94264
Conversation
Thank you for submitting a Pull Request (PR) to the LLVM Project! This PR will be automatically labeled and the relevant teams will be notified. If you wish to, you can add reviewers by using the "Reviewers" section on this page. If this is not working for you, it is probably because you do not have write permissions for the repository. If you have received no comments on your PR for a week, you can request a review by "ping"ing the PR. If you have further questions, they may be answered by the LLVM GitHub User Guide. You can also ask questions in a comment on this PR, on the LLVM Discord or on the forums.
@llvm/pr-subscribers-llvm-transforms @llvm/pr-subscribers-pgo

Author: Matthew Weingarten (mattweingarten)

Changes

Adds the compile-time flag -mllvm -memprof-histogram and the runtime flag histogram=true|false to turn histogram collection on and off. The -memprof-histogram flag relies on -memprof-use-callbacks=true to work.

Updates the shadow mapping logic in histogram mode from one 8-byte counter per 64 bytes to one 1-byte counter per 8 bytes, capped at 255. Only this granularity is supported for now.

Updates the RawMemprofReader and the serialization of MemoryInfoBlocks to the binary format, including bumping the raw binary format from version 3 to version 4.

Updates the creation of MemoryInfoBlocks with and without histograms. When two MemoryInfoBlocks are merged, the AccessCounts are summed and the shorter histogram is removed.

Adds a memprof_histogram test case.

Initial commit for adding AccessCountHistograms up to the RawProfile stage for memprof.

Patch is 65.75 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/94264.diff

30 Files Affected:
diff --git a/compiler-rt/include/profile/MIBEntryDef.inc b/compiler-rt/include/profile/MIBEntryDef.inc
index 794163ae10386..58c1fc4de4aba 100644
--- a/compiler-rt/include/profile/MIBEntryDef.inc
+++ b/compiler-rt/include/profile/MIBEntryDef.inc
@@ -51,3 +51,5 @@ MIBEntryDef(MaxAccessDensity = 22, MaxAccessDensity, uint32_t)
MIBEntryDef(TotalLifetimeAccessDensity = 23, TotalLifetimeAccessDensity, uint64_t)
MIBEntryDef(MinLifetimeAccessDensity = 24, MinLifetimeAccessDensity, uint32_t)
MIBEntryDef(MaxLifetimeAccessDensity = 25, MaxLifetimeAccessDensity, uint32_t)
+MIBEntryDef(AccessHistogramSize = 26, AccessHistogramSize, uint32_t)
+MIBEntryDef(AccessHistogram = 27, AccessHistogram, uintptr_t)
\ No newline at end of file
diff --git a/compiler-rt/include/profile/MemProfData.inc b/compiler-rt/include/profile/MemProfData.inc
index b82a4baf6dd74..d4356275d1609 100644
--- a/compiler-rt/include/profile/MemProfData.inc
+++ b/compiler-rt/include/profile/MemProfData.inc
@@ -22,24 +22,27 @@
#include <string.h>
#ifdef _MSC_VER
-#define PACKED(...) __pragma(pack(push,1)) __VA_ARGS__ __pragma(pack(pop))
+#define PACKED(...) __pragma(pack(push, 1)) __VA_ARGS__ __pragma(pack(pop))
#else
#define PACKED(...) __VA_ARGS__ __attribute__((__packed__))
#endif
-// A 64-bit magic number to uniquely identify the raw binary memprof profile file.
-#define MEMPROF_RAW_MAGIC_64 \
- ((uint64_t)255 << 56 | (uint64_t)'m' << 48 | (uint64_t)'p' << 40 | (uint64_t)'r' << 32 | \
- (uint64_t)'o' << 24 | (uint64_t)'f' << 16 | (uint64_t)'r' << 8 | (uint64_t)129)
+// A 64-bit magic number to uniquely identify the raw binary memprof profile
+// file.
+#define MEMPROF_RAW_MAGIC_64 \
+ ((uint64_t)255 << 56 | (uint64_t)'m' << 48 | (uint64_t)'p' << 40 | \
+ (uint64_t)'r' << 32 | (uint64_t)'o' << 24 | (uint64_t)'f' << 16 | \
+ (uint64_t)'r' << 8 | (uint64_t)129)
// The version number of the raw binary format.
-#define MEMPROF_RAW_VERSION 3ULL
+#define MEMPROF_RAW_VERSION 4ULL
#define MEMPROF_BUILDID_MAX_SIZE 32ULL
namespace llvm {
namespace memprof {
-// A struct describing the header used for the raw binary memprof profile format.
+// A struct describing the header used for the raw binary memprof profile
+// format.
PACKED(struct Header {
uint64_t Magic;
uint64_t Version;
@@ -62,7 +65,7 @@ PACKED(struct SegmentEntry {
SegmentEntry(uint64_t S, uint64_t E, uint64_t O)
: Start(S), End(E), Offset(O), BuildIdSize(0) {}
- SegmentEntry(const SegmentEntry& S) {
+ SegmentEntry(const SegmentEntry &S) {
Start = S.Start;
End = S.End;
Offset = S.Offset;
@@ -70,7 +73,7 @@ PACKED(struct SegmentEntry {
memcpy(BuildId, S.BuildId, S.BuildIdSize);
}
- SegmentEntry& operator=(const SegmentEntry& S) {
+ SegmentEntry &operator=(const SegmentEntry &S) {
Start = S.Start;
End = S.End;
Offset = S.Offset;
@@ -79,7 +82,7 @@ PACKED(struct SegmentEntry {
return *this;
}
- bool operator==(const SegmentEntry& S) const {
+ bool operator==(const SegmentEntry &S) const {
return Start == S.Start && End == S.End && Offset == S.Offset &&
BuildIdSize == S.BuildIdSize &&
memcmp(BuildId, S.BuildId, S.BuildIdSize) == 0;
@@ -90,111 +93,143 @@ PACKED(struct SegmentEntry {
// MemProfData.inc since it would mean we are embedding a directive (the
// #include for MIBEntryDef) into the macros which is undefined behaviour.
#ifdef _MSC_VER
-__pragma(pack(push,1))
+__pragma(pack(push, 1))
#endif
-// A struct representing the heap allocation characteristics of a particular
-// runtime context. This struct is shared between the compiler-rt runtime and
-// the raw profile reader. The indexed format uses a separate, self-describing
-// backwards compatible format.
-struct MemInfoBlock{
+ // A struct representing the heap allocation characteristics of a particular
+ // runtime context. This struct is shared between the compiler-rt runtime
+ // and the raw profile reader. The indexed format uses a separate,
+ // self-describing backwards compatible format.
+ struct MemInfoBlock {
#define MIBEntryDef(NameTag, Name, Type) Type Name;
#include "MIBEntryDef.inc"
#undef MIBEntryDef
-bool operator==(const MemInfoBlock& Other) const {
- bool IsEqual = true;
-#define MIBEntryDef(NameTag, Name, Type) \
+ bool operator==(const MemInfoBlock &Other) const {
+ bool IsEqual = true;
+#define MIBEntryDef(NameTag, Name, Type) \
IsEqual = (IsEqual && Name == Other.Name);
#include "MIBEntryDef.inc"
#undef MIBEntryDef
- return IsEqual;
-}
+ return IsEqual;
+ }
-MemInfoBlock() {
+ MemInfoBlock() {
#define MIBEntryDef(NameTag, Name, Type) Name = Type();
#include "MIBEntryDef.inc"
#undef MIBEntryDef
-}
-
-MemInfoBlock(uint32_t Size, uint64_t AccessCount, uint32_t AllocTs,
- uint32_t DeallocTs, uint32_t AllocCpu, uint32_t DeallocCpu)
- : MemInfoBlock() {
- AllocCount = 1U;
- TotalAccessCount = AccessCount;
- MinAccessCount = AccessCount;
- MaxAccessCount = AccessCount;
- TotalSize = Size;
- MinSize = Size;
- MaxSize = Size;
- AllocTimestamp = AllocTs;
- DeallocTimestamp = DeallocTs;
- TotalLifetime = DeallocTimestamp - AllocTimestamp;
- MinLifetime = TotalLifetime;
- MaxLifetime = TotalLifetime;
- // Access density is accesses per byte. Multiply by 100 to include the
- // fractional part.
- TotalAccessDensity = AccessCount * 100 / Size;
- MinAccessDensity = TotalAccessDensity;
- MaxAccessDensity = TotalAccessDensity;
- // Lifetime access density is the access density per second of lifetime.
- // Multiply by 1000 to convert denominator lifetime to seconds (using a
- // minimum lifetime of 1ms to avoid divide by 0. Do the multiplication first
- // to reduce truncations to 0.
- TotalLifetimeAccessDensity =
- TotalAccessDensity * 1000 / (TotalLifetime ? TotalLifetime : 1);
- MinLifetimeAccessDensity = TotalLifetimeAccessDensity;
- MaxLifetimeAccessDensity = TotalLifetimeAccessDensity;
- AllocCpuId = AllocCpu;
- DeallocCpuId = DeallocCpu;
- NumMigratedCpu = AllocCpuId != DeallocCpuId;
-}
-
-void Merge(const MemInfoBlock &newMIB) {
- AllocCount += newMIB.AllocCount;
-
- TotalAccessCount += newMIB.TotalAccessCount;
- MinAccessCount = newMIB.MinAccessCount < MinAccessCount ? newMIB.MinAccessCount : MinAccessCount;
- MaxAccessCount = newMIB.MaxAccessCount > MaxAccessCount ? newMIB.MaxAccessCount : MaxAccessCount;
-
- TotalSize += newMIB.TotalSize;
- MinSize = newMIB.MinSize < MinSize ? newMIB.MinSize : MinSize;
- MaxSize = newMIB.MaxSize > MaxSize ? newMIB.MaxSize : MaxSize;
-
- TotalLifetime += newMIB.TotalLifetime;
- MinLifetime = newMIB.MinLifetime < MinLifetime ? newMIB.MinLifetime : MinLifetime;
- MaxLifetime = newMIB.MaxLifetime > MaxLifetime ? newMIB.MaxLifetime : MaxLifetime;
-
- TotalAccessDensity += newMIB.TotalAccessDensity;
- MinAccessDensity = newMIB.MinAccessDensity < MinAccessDensity
- ? newMIB.MinAccessDensity
- : MinAccessDensity;
- MaxAccessDensity = newMIB.MaxAccessDensity > MaxAccessDensity
- ? newMIB.MaxAccessDensity
- : MaxAccessDensity;
-
- TotalLifetimeAccessDensity += newMIB.TotalLifetimeAccessDensity;
- MinLifetimeAccessDensity =
- newMIB.MinLifetimeAccessDensity < MinLifetimeAccessDensity
- ? newMIB.MinLifetimeAccessDensity
- : MinLifetimeAccessDensity;
- MaxLifetimeAccessDensity =
- newMIB.MaxLifetimeAccessDensity > MaxLifetimeAccessDensity
- ? newMIB.MaxLifetimeAccessDensity
- : MaxLifetimeAccessDensity;
-
- // We know newMIB was deallocated later, so just need to check if it was
- // allocated before last one deallocated.
- NumLifetimeOverlaps += newMIB.AllocTimestamp < DeallocTimestamp;
- AllocTimestamp = newMIB.AllocTimestamp;
- DeallocTimestamp = newMIB.DeallocTimestamp;
-
- NumSameAllocCpu += AllocCpuId == newMIB.AllocCpuId;
- NumSameDeallocCpu += DeallocCpuId == newMIB.DeallocCpuId;
- AllocCpuId = newMIB.AllocCpuId;
- DeallocCpuId = newMIB.DeallocCpuId;
-}
+ }
+
+ MemInfoBlock(uint32_t Size, uint64_t AccessCount, uint32_t AllocTs,
+ uint32_t DeallocTs, uint32_t AllocCpu, uint32_t DeallocCpu,
+ uintptr_t Histogram, uint32_t HistogramSize)
+ : MemInfoBlock() {
+ AllocCount = 1U;
+ TotalAccessCount = AccessCount;
+ MinAccessCount = AccessCount;
+ MaxAccessCount = AccessCount;
+ TotalSize = Size;
+ MinSize = Size;
+ MaxSize = Size;
+ AllocTimestamp = AllocTs;
+ DeallocTimestamp = DeallocTs;
+ TotalLifetime = DeallocTimestamp - AllocTimestamp;
+ MinLifetime = TotalLifetime;
+ MaxLifetime = TotalLifetime;
+ // Access density is accesses per byte. Multiply by 100 to include the
+ // fractional part.
+ TotalAccessDensity = AccessCount * 100 / Size;
+ MinAccessDensity = TotalAccessDensity;
+ MaxAccessDensity = TotalAccessDensity;
+ // Lifetime access density is the access density per second of lifetime.
+ // Multiply by 1000 to convert denominator lifetime to seconds (using a
+ // minimum lifetime of 1ms to avoid divide by 0. Do the multiplication first
+ // to reduce truncations to 0.
+ TotalLifetimeAccessDensity =
+ TotalAccessDensity * 1000 / (TotalLifetime ? TotalLifetime : 1);
+ MinLifetimeAccessDensity = TotalLifetimeAccessDensity;
+ MaxLifetimeAccessDensity = TotalLifetimeAccessDensity;
+ AllocCpuId = AllocCpu;
+ DeallocCpuId = DeallocCpu;
+ NumMigratedCpu = AllocCpuId != DeallocCpuId;
+ // For now we assume HistogramSize is the same as user requested size
+ AccessHistogramSize = HistogramSize;
+ AccessHistogram = Histogram;
+ }
+
+ // Merge cannot free the AccessHistogram pointer, since we need to free either
+ // with InternalFree or free depending on where the allocation is made
+ // (runtime or profdata tool). The merge function expects the Histogram
+ // pointer with the smaller size to be freed.
+ void Merge(const MemInfoBlock &newMIB) {
+ AllocCount += newMIB.AllocCount;
+
+ TotalAccessCount += newMIB.TotalAccessCount;
+ MinAccessCount = newMIB.MinAccessCount < MinAccessCount
+ ? newMIB.MinAccessCount
+ : MinAccessCount;
+ MaxAccessCount = newMIB.MaxAccessCount > MaxAccessCount
+ ? newMIB.MaxAccessCount
+ : MaxAccessCount;
+
+ TotalSize += newMIB.TotalSize;
+ MinSize = newMIB.MinSize < MinSize ? newMIB.MinSize : MinSize;
+ MaxSize = newMIB.MaxSize > MaxSize ? newMIB.MaxSize : MaxSize;
+
+ TotalLifetime += newMIB.TotalLifetime;
+ MinLifetime =
+ newMIB.MinLifetime < MinLifetime ? newMIB.MinLifetime : MinLifetime;
+ MaxLifetime =
+ newMIB.MaxLifetime > MaxLifetime ? newMIB.MaxLifetime : MaxLifetime;
+
+ TotalAccessDensity += newMIB.TotalAccessDensity;
+ MinAccessDensity = newMIB.MinAccessDensity < MinAccessDensity
+ ? newMIB.MinAccessDensity
+ : MinAccessDensity;
+ MaxAccessDensity = newMIB.MaxAccessDensity > MaxAccessDensity
+ ? newMIB.MaxAccessDensity
+ : MaxAccessDensity;
+
+ TotalLifetimeAccessDensity += newMIB.TotalLifetimeAccessDensity;
+ MinLifetimeAccessDensity =
+ newMIB.MinLifetimeAccessDensity < MinLifetimeAccessDensity
+ ? newMIB.MinLifetimeAccessDensity
+ : MinLifetimeAccessDensity;
+ MaxLifetimeAccessDensity =
+ newMIB.MaxLifetimeAccessDensity > MaxLifetimeAccessDensity
+ ? newMIB.MaxLifetimeAccessDensity
+ : MaxLifetimeAccessDensity;
+
+ // We know newMIB was deallocated later, so just need to check if it was
+ // allocated before last one deallocated.
+ NumLifetimeOverlaps += newMIB.AllocTimestamp < DeallocTimestamp;
+ AllocTimestamp = newMIB.AllocTimestamp;
+ DeallocTimestamp = newMIB.DeallocTimestamp;
+
+ NumSameAllocCpu += AllocCpuId == newMIB.AllocCpuId;
+ NumSameDeallocCpu += DeallocCpuId == newMIB.DeallocCpuId;
+ AllocCpuId = newMIB.AllocCpuId;
+ DeallocCpuId = newMIB.DeallocCpuId;
+
+ // For merging histograms, we always keep the longer histogram, and add
+ // values of shorter histogram to larger one.
+ uintptr_t ShorterHistogram;
+ uint32_t ShorterHistogramSize;
+ if (newMIB.AccessHistogramSize > AccessHistogramSize) {
+ ShorterHistogram = AccessHistogram;
+ ShorterHistogramSize = AccessHistogramSize;
+ // Swap histogram of current to larger histogram
+ AccessHistogram = newMIB.AccessHistogram;
+ AccessHistogramSize = newMIB.AccessHistogramSize;
+ } else {
+ ShorterHistogram = newMIB.AccessHistogram;
+ ShorterHistogramSize = newMIB.AccessHistogramSize;
+ }
+ for (size_t i = 0; i < ShorterHistogramSize; ++i) {
+ ((uint64_t *)AccessHistogram)[i] += ((uint64_t *)ShorterHistogram)[i];
+ }
+ }
#ifdef _MSC_VER
} __pragma(pack(pop));
@@ -205,4 +240,4 @@ void Merge(const MemInfoBlock &newMIB) {
} // namespace memprof
} // namespace llvm
-#endif
+#endif
\ No newline at end of file
diff --git a/compiler-rt/lib/memprof/memprof_allocator.cpp b/compiler-rt/lib/memprof/memprof_allocator.cpp
index 35e941228525a..f93f633f0182f 100644
--- a/compiler-rt/lib/memprof/memprof_allocator.cpp
+++ b/compiler-rt/lib/memprof/memprof_allocator.cpp
@@ -34,6 +34,8 @@
#include <sched.h>
#include <time.h>
+#define MAX_HISTOGRAM_PRINT_SIZE 32U
+
namespace __memprof {
namespace {
using ::llvm::memprof::MemInfoBlock;
@@ -68,6 +70,14 @@ void Print(const MemInfoBlock &M, const u64 id, bool print_terse) {
"cpu: %u, num same dealloc_cpu: %u\n",
M.NumMigratedCpu, M.NumLifetimeOverlaps, M.NumSameAllocCpu,
M.NumSameDeallocCpu);
+ Printf("AcccessCountHistogram[%u]: ", M.AccessHistogramSize);
+ uint32_t PrintSize = M.AccessHistogramSize > MAX_HISTOGRAM_PRINT_SIZE
+ ? MAX_HISTOGRAM_PRINT_SIZE
+ : M.AccessHistogramSize;
+ for (size_t i = 0; i < PrintSize; ++i) {
+ Printf("%llu ", ((uint64_t *)M.AccessHistogram)[i]);
+ }
+ Printf("\n");
}
}
} // namespace
@@ -216,6 +226,32 @@ u64 GetShadowCount(uptr p, u32 size) {
return count;
}
+// Accumulates the access count from the shadow for the given pointer and size.
+u64 GetShadowCountHistogram(uptr p, u32 size) {
+ u8 *shadow = (u8 *)HISTOGRAM_MEM_TO_SHADOW(p);
+ u8 *shadow_end = (u8 *)MEM_TO_SHADOW(p + size);
+ u64 count = 0;
+ for (; shadow <= shadow_end; shadow++)
+ count += *shadow;
+ return count;
+}
+
+// If we use the normal approach in clearCountersWithoutHistogram, the histogram
+// will clear to much data and may overwrite shadow counters that are in use.
+void clearCountersHistogram(uptr addr, uptr size) {
+ u8 *shadow_8 = (u8 *)HISTOGRAM_MEM_TO_SHADOW(addr);
+ u8 *shadow_end_8 = (u8 *)HISTOGRAM_MEM_TO_SHADOW(addr + size);
+ for (; shadow_8 < shadow_end_8; shadow_8++) {
+ *shadow_8 = 0;
+ }
+}
+
+void clearCountersWithoutHistogram(uptr addr, uptr size) {
+ uptr shadow_beg = MEM_TO_SHADOW(addr);
+ uptr shadow_end = MEM_TO_SHADOW(addr + size - SHADOW_GRANULARITY) + 1;
+ REAL(memset)((void *)shadow_beg, 0, shadow_end - shadow_beg);
+}
+
// Clears the shadow counters (when memory is allocated).
void ClearShadow(uptr addr, uptr size) {
CHECK(AddrIsAlignedByGranularity(addr));
@@ -226,7 +262,11 @@ void ClearShadow(uptr addr, uptr size) {
uptr shadow_beg = MEM_TO_SHADOW(addr);
uptr shadow_end = MEM_TO_SHADOW(addr + size - SHADOW_GRANULARITY) + 1;
if (shadow_end - shadow_beg < common_flags()->clear_shadow_mmap_threshold) {
- REAL(memset)((void *)shadow_beg, 0, shadow_end - shadow_beg);
+ if (flags()->histogram) {
+ clearCountersHistogram(addr, size);
+ } else {
+ clearCountersWithoutHistogram(addr, size);
+ }
} else {
uptr page_size = GetPageSizeCached();
uptr page_beg = RoundUpTo(shadow_beg, page_size);
@@ -279,6 +319,43 @@ struct Allocator {
Print(Value->mib, Key, bool(Arg));
}
+ static MemInfoBlock CreateNewMIB(uptr p, MemprofChunk *m, u64 user_size) {
+ if (flags()->histogram) {
+ return CreateNewMIBWithHistogram(p, m, user_size);
+ } else {
+ return CreateNewMIBWithoutHistogram(p, m, user_size);
+ }
+ }
+
+ static MemInfoBlock CreateNewMIBWithHistogram(uptr p, MemprofChunk *m,
+ u64 user_size) {
+
+ u64 c = GetShadowCountHistogram(p, user_size);
+ long curtime = GetTimestamp();
+ uint32_t HistogramSize =
+ RoundUpTo(user_size, HISTOGRAM_GRANULARITY) / HISTOGRAM_GRANULARITY;
+ uintptr_t Histogram =
+ (uintptr_t)InternalAlloc(HistogramSize * sizeof(uint64_t));
+ memset((void *)Histogram, 0, HistogramSize * sizeof(uint64_t));
+ for (size_t i = 0; i < HistogramSize; ++i) {
+ u8 Counter =
+ *((u8 *)HISTOGRAM_MEM_TO_SHADOW(p + HISTOGRAM_GRANULARITY * i));
+ ((uint64_t *)Histogram)[i] = (uint64_t)Counter;
+ }
+ MemInfoBlock newMIB(user_size, c, m->timestamp_ms, curtime, m->cpu_id,
+ GetCpuId(), Histogram, HistogramSize);
+ return newMIB;
+ }
+
+ static MemInfoBlock CreateNewMIBWithoutHistogram(uptr p, MemprofChunk *m,
+ u64 user_size) {
+ u64 c = GetShadowCount(p, user_size);
+ long curtime = GetTimestamp();
+ MemInfoBlock newMIB(user_size, c, m->timestamp_ms, curtime, m->cpu_id,
+ GetCpuId(), 0, 0);
+ return newMIB;
+ }
+
void FinishAndWrite() {
if (print_text && common_flags()->print_module_map)
DumpProcessMap();
@@ -319,10 +396,7 @@ struct Allocator {
if (!m)
return;
uptr user_beg = ((uptr)m) + kChunkHeaderSize;
- u64 c = GetShadowCount(user_beg, user_requested_size);
- long curtime = GetTimestamp();
- MemInfoBlock newMIB(user_requested_size, c, m->timestamp_ms, curtime,
- m->cpu_id, GetCpuId());
+ MemInfoBlock newMIB = CreateNewMIB(user_beg, m, user_requested_size);
InsertOrMerge(m->alloc_context_id, newMIB, A->MIBMap);
},
this);
@@ -451,11 +525,7 @@ struct Allocator {
atomic_exchange(&m->user_requested_size, 0, memory_order_acquire);
if (memprof_inited && atomic_load_relaxed(&constructed) &&
!atomic_load_relaxed(&destructing)) {
- u64 c = GetShadowCount(p, user_requested_size);
- long curtime = GetTimestamp();
-
- MemInfoBlock newMIB(user_requested_size, c, m->timestamp_ms, curtime,
- m->cpu_id, GetCpuId());
+ MemInfoBlock newMIB = this->CreateNewMIB(p, m, user_requested_size);
InsertOrMerge(m->alloc_context_id, newMIB, MIBMap);
}
diff --git a/compiler-rt/lib/memprof/memprof_flags.inc b/compiler-rt/lib/memprof/memprof_flags.inc
index ee0760ddc302a..8d8d539ea752f 100644
--- a/compiler-rt/lib/memprof/memprof_flags.inc
+++ b/compiler-rt/lib/memprof/memprof_flags.inc
@@ -36,6 +36,11 @@ MEMPROF_FLAG(bool, allocator_frees_and_returns_null_on_realloc_zero, true,
"POSIX standard). If set to false, realloc(p, 0) will return a "
"pointer to an allocated space which can not be used.")
MEMPROF_FLAG(bool, print_text, false,
- "If set, prints the heap profile in text format. Else use the raw binary serialization format.")
+ "If set, prints the heap profile in text format. Else use the raw "
+ "binary serialization format.")
MEMPROF_FLAG(bool, print_terse, false,
- "If set, prints memory profile in a terse format. Only applicable if print_text = true.")
+ "If set, prints memory profile in a terse format. Only applicable "
+ "if print_text = true.")
+MEMPROF_FLAG(bool, histogram, false,
+ "If set, collects a histogram in memory info blocks alongside one "
+ "large counter.")
\ No newline at end of file
diff --git a/compiler-rt/lib/memprof/memprof_mapping.h b/compiler-rt/lib/memprof/memprof_mapping.h
index 1cc0836834cdf..658ed9d0e74dd 100644
--- a/compiler-rt/lib/memprof/mempr...
[truncated]
Force-pushed from ba1bcba to dd2113d (Compare)
Haven't gotten through the tests yet, but here are comments on the source code.
}

// If we use the normal approach in clearCountersWithoutHistogram, the histogram
// will clear to much data and may overwrite shadow counters that are in use.
s/to/too/
Can you add a comment about why?
(In general it would be good to have a clear comment somewhere that can be referenced from various places that describes the difference between the way the shadow memory is laid out and handled for the 2 cases)
I am actually not 100% sure (I think it is because it rounds up the shadow_end).
added comment.
  return count;
}

// If we use the normal approach in clearCountersWithoutHistogram, the histogram
"from clearCountersWithoutHistogram"?
done.
lmib->mib.Merge(Block);
// Free only the shorter Histogram
Expand on why
done.
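For readers following along, here is a minimal, self-contained sketch of the merge rule being discussed: keep the longer histogram, fold the shorter one into it, and leave the shorter buffer as the one to be freed. It uses std::vector instead of the raw pointers in MemInfoBlock, so it is illustrative rather than the actual runtime code.

#include <cstddef>
#include <cstdint>
#include <vector>

// Merge `incoming` into `kept`: the longer histogram survives, and bucket
// values from the shorter one are added into it.
void mergeHistograms(std::vector<uint64_t> &kept,
                     std::vector<uint64_t> &incoming) {
  if (incoming.size() > kept.size())
    kept.swap(incoming); // keep the longer histogram in `kept`
  for (size_t i = 0; i < incoming.size(); ++i)
    kept[i] += incoming[i]; // fold the shorter histogram into the longer one
  // After the merge, `incoming` holds the shorter histogram; in the real
  // allocator that buffer is the one released (via InternalFree or free,
  // depending on whether the runtime or the profdata tool allocated it).
}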
// ----------
void SerializeMIBInfoToBuffer(MIBMapTy &MIBMap, const Vector<u64> &StackIds,
                              const u64 ExpectedNumBytes, char *&Buffer) {
  char *Ptr = Buffer;
  const u64 NumEntries = StackIds.Size();
  Ptr = WriteBytes(NumEntries, Ptr);

  for (u64 i = 0; i < NumEntries; i++) {
    const u64 Key = StackIds[i];
    MIBMapTy::Handle h(&MIBMap, Key, /*remove=*/true, /*create=*/false);
    CHECK(h.exists());
    Ptr = WriteBytes(Key, Ptr);
    Ptr = WriteBytes((*h)->mib, Ptr);
Do we end up serializing out the AccessHistogram pointer unnecessarily?
Yes, if we change to a schema-based serialization (similar to the PortableMIB), it would fix it. Alternatively, we can write down each field separately.
added FIXME here as well to mention this
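As a rough illustration of the field-by-field alternative mentioned here (the helper, FakeMIB layout, and names are hypothetical, not the serializer that was landed): writing the histogram length and its contents explicitly keeps the in-memory AccessHistogram pointer value out of the raw profile.

#include <cstdint>
#include <cstring>

// Hypothetical helper: copy a POD value into the buffer and advance the cursor.
template <typename T> char *writeBytes(const T &V, char *Ptr) {
  std::memcpy(Ptr, &V, sizeof(T));
  return Ptr + sizeof(T);
}

// Hypothetical MIB with only the fields relevant to the point being made.
struct FakeMIB {
  uint64_t TotalAccessCount;
  uint32_t AccessHistogramSize;
  uintptr_t AccessHistogram; // points at AccessHistogramSize uint64_t buckets
};

char *serializeMIB(const FakeMIB &MIB, char *Ptr) {
  Ptr = writeBytes(MIB.TotalAccessCount, Ptr);    // ...and the other scalars...
  Ptr = writeBytes(MIB.AccessHistogramSize, Ptr); // length prefix
  const auto *Buckets = reinterpret_cast<const uint64_t *>(MIB.AccessHistogram);
  for (uint32_t I = 0; I < MIB.AccessHistogramSize; ++I)
    Ptr = writeBytes(Buckets[I], Ptr);            // payload, not the pointer
  return Ptr;
}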
@@ -610,13 +671,30 @@ RawMemProfReader::peekBuildIds(MemoryBuffer *DataBuffer) {
  return BuildIds.takeVector();
}

llvm::SmallVector<std::pair<uint64_t, MemInfoBlock>>
RawMemProfReader::readMemInfoBlocks(const char *Ptr) {
Can you add a FIXME here or somewhere else relevant that we should implement the Schema used by the indexed format to self describe which MIB fields exist in the raw profile, then gracefully handle the reading of older versions?
done.
@@ -40,6 +36,28 @@ extern uptr kHighMemEnd; // Initialized in __memprof_init.
#define MEM_TO_SHADOW(mem) \
  ((((mem) & SHADOW_MASK) >> SHADOW_SCALE) + (SHADOW_OFFSET))

// Histogram shadow memory is layed different to the standard configuration:
s/layed/laid/ (or "laid out")
done
// translates a memory address to the address of its corresponding shadow
// counter memory address. The same data is still provided in MIB whether
// histograms are used or not. Total access counts per allocations are
// computed by summing up all individual 1 byte counters. This can incurr an
s/incurr/incur/
done.
// counter memory address. The same data is still provided in MIB whether
// histograms are used or not. Total access counts per allocations are
// computed by summing up all individual 1 byte counters. This can incurr an
// accruacy penalty.
s/accruacy/accuracy/
done.
@@ -166,6 +166,9 @@ void SerializeMIBInfoToBuffer(MIBMapTy &MIBMap, const Vector<u64> &StackIds,
    MIBMapTy::Handle h(&MIBMap, Key, /*remove=*/true, /*create=*/false);
    CHECK(h.exists());
    Ptr = WriteBytes(Key, Ptr);
    // FIXME: We unnecessarily serialze the AccessHistogram pointer. Adding a
s/serialze/serialize/
done
@@ -140,12 +140,16 @@ static cl::opt<int> ClDebugMin("memprof-debug-min", cl::desc("Debug min inst"),
static cl::opt<int> ClDebugMax("memprof-debug-max", cl::desc("Debug max inst"),
                               cl::Hidden, cl::init(-1));

static cl::opt<bool> ClHistogram("memprof-histogram",
This seems unused except for the assertion, which doesn't seem necessary. I.e. compiling or not with memprof-histogram wouldn't affect/control whether someone can run with the histogram flag enabled at runtime.
Not sure if there is a good way to detect whether the callback approach was enabled when we are in the runtime, to give an error there. If we are concerned, we could insert a flag variable into the binary here and check it at runtime.
I removed the assertion, you are right it's not useful. I am using the flag to generate different versions of the callback function now (see the comment thread in memprof_mapping.h).
Used the global variable __memprof_histogram in the IR to set the flag in the runtime, and got rid of the runtime flag.
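A minimal sketch of the runtime side of that approach; the weak default and the helper name are assumptions added so the snippet stands alone, and only the __memprof_histogram symbol itself comes from the discussion above.

// The instrumentation pass defines __memprof_histogram in instrumented
// binaries with the value of the -memprof-histogram compile-time flag; the
// runtime reads it instead of consulting a runtime flag. The weak default of
// false here is an assumption to keep the snippet self-contained.
extern "C" __attribute__((weak)) bool __memprof_histogram = false;

static inline bool HistogramEnabled() { return __memprof_histogram; }

// e.g. when clearing shadow for a new allocation:
//   if (HistogramEnabled()) { /* 1-byte-per-8-bytes shadow path */ }
//   else                    { /* default 8-byte-per-64-bytes path */ }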
@@ -104,8 +130,15 @@ inline bool AddrIsAlignedByGranularity(uptr a) {
inline void RecordAccess(uptr a) {
  // If we use a different shadow size then the type below needs adjustment.
  CHECK_EQ(SHADOW_ENTRY_SIZE, 8);
  u64 *shadow_address = (u64 *)MEM_TO_SHADOW(a);
  (*shadow_address)++;
  if (flags()->histogram) {
I'm slightly concerned about the runtime overhead of the extra compare and branch for every memory access, but it probably isn't significant on top of the function call. I believe we still use the non-callback instrumentation by default, so this shouldn't affect default behavior iiuc (is that correct)? An alternative is to control this from the compiler, by having a separate version of RecordAccess (and its caller _memprof* functions that are called by the instrumented code), but imo that isn't worth it if the default is unaffected.
Solved this by having two different functions, RecordAddress and RecordAddressWithHistogram, which get used based on the compiler flag, not the runtime flag.
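A sketch of what that split can look like; the function names and the saturation constant are illustrative rather than the exact code in memprof_mapping.h. The compiler emits calls to one or the other, so the access hot path carries no runtime branch.

#include <cstdint>

// Default mode: one 64-bit counter covers a 64-byte granule.
inline void RecordAccessDefault(uint64_t *shadow_counter) {
  ++*shadow_counter;
}

// Histogram mode: one 8-bit counter covers an 8-byte granule and saturates
// at 255 instead of wrapping.
inline void RecordAccessWithHistogram(uint8_t *shadow_counter) {
  if (*shadow_counter < 255)
    ++*shadow_counter;
}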
void clearCountersHistogram(uptr addr, uptr size) {
  u8 *shadow_8 = (u8 *)HISTOGRAM_MEM_TO_SHADOW(addr);
  u8 *shadow_end_8 = (u8 *)HISTOGRAM_MEM_TO_SHADOW(addr + size);
  for (; shadow_8 < shadow_end_8; shadow_8++) {
Why not use REAL(memset) like the non-histogram version below?
changed.
  uptr shadow_end = MEM_TO_SHADOW(addr + size - SHADOW_GRANULARITY) + 1;
  REAL(memset)((void *)shadow_beg, 0, shadow_end - shadow_beg);
}

// Clears the shadow counters (when memory is allocated).
void ClearShadow(uptr addr, uptr size) {
Looking at this function more, there are a lot of uses of the MEM_TO_SHADOW computed values, which is not the right mapping function to use with the smaller granularity of the histogram case. I think you probably need to version the calls to MEM_TO_SHADOW here, then all of the rest of the code can work as is? I.e. you wouldn't need the 2 versions of clearCounters*
This function uses the MEM_TO_SHADOW computed pointers "kind of", but in reality it just rounds these pointers up to the nearest page size. So in effect, full pages should be cleared whether it is with the histogram or without.
I guess it works, because the shadow granularities are less than the page size, and because they are both scaling by the same 8 to 1 scale, and also I guess because the clear_shadow_mmap_threshold is an optimization and doesn't need to be exact. However, it still feels a little wonky to me (and also means that we have to do extra mapping operations here and again in clearCounters*). I do think I would prefer to have it be something like:
uptr shadow_beg;
uptr shadow_end;
if (__memprof_histogram) {
  shadow_beg = HISTOGRAM_MEM_TO_SHADOW(addr);
  shadow_end = HISTOGRAM_MEM_TO_SHADOW(addr + size);
} else {
  shadow_beg = MEM_TO_SHADOW(addr);
  shadow_end = MEM_TO_SHADOW(addr + size - SHADOW_GRANULARITY) + 1;
}
if (shadow_end - shadow_beg < common_flags()->clear_shadow_mmap_threshold) {
  REAL(memset)((void *)shadow_beg, 0, shadow_end - shadow_beg);
} else {
  ...
I.e. set shadow_beg/end based on whether we are doing the histogramming or not, leave the rest as-is. Would that work?
Yes I agree, this is nicer.
Changed.
@@ -38,4 +38,5 @@ MEMPROF_FLAG(bool, allocator_frees_and_returns_null_on_realloc_zero, true,
MEMPROF_FLAG(bool, print_text, false,
             "If set, prints the heap profile in text format. Else use the raw binary serialization format.")
MEMPROF_FLAG(bool, print_terse, false,
             "If set, prints memory profile in a terse format. Only applicable if print_text = true.")
             "If set, prints memory profile in a terse format. Only applicable "
Formatting change only, remove from patch
done.
// The profiled binary.
object::OwningBinary<object::Binary> Binary;
// Version of raw memprof binary currently being read. Defaults to most update
typo: s/update to date/up to date/
done.
// not segfault, since there will be callstack data placed after this in the
// binary format.
MemInfoBlock MIB = *reinterpret_cast<const MemInfoBlock *>(Ptr);
// Overwrite dirty data.
Isn't this going to overwrite some callstack data?
My thought process is that the MemInfoBlock MIB is allocated on the stack, since it's a local variable and not a pointer, meaning the reinterpret_cast copies the information from the data buffer into the stack variable. Correct me if I am wrong.
Oh yes, you are right, I missed the first *, which means it is making a copy of what was in the memory pointed to by Ptr.
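A tiny standalone illustration of that point (generic C++, not the reader code): initializing a local object from the dereferenced cast copies the bytes, so later writes touch only the stack copy and never the underlying buffer.

#include <cassert>
#include <cstdint>
#include <cstring>

struct Block {
  uint64_t A;
  uint64_t B;
};

int main() {
  alignas(Block) unsigned char Buffer[sizeof(Block)];
  Block Src{1, 2};
  std::memcpy(Buffer, &Src, sizeof(Block));

  // Copies the bytes out of Buffer into a stack-local Block.
  Block Copy = *reinterpret_cast<const Block *>(Buffer);
  Copy.B = 99; // modifies only the local copy

  Block Check;
  std::memcpy(&Check, Buffer, sizeof(Block));
  assert(Check.B == 2); // Buffer is unchanged
  return 0;
}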
auto MemprofHistogramFlag = new GlobalVariable(
    M, IntTy1, true, GlobalValue::WeakAnyLinkage,
    Constant::getIntegerValue(IntTy1, APInt(1, ClHistogram)), VarName);
// MemprofHistogramFlag->setVisibility(GlobalValue::HiddenVisibility);
remove?
done.
@@ -0,0 +1,101 @@
REQUIRES: x86_64-linux

This is a copy of memprof-basict.test with slight changes to check that we can still read v3 of memprofraw.
typo: basict
done
This is a copy of memprof-basict.test with slight changes to check that we can still read v3 of memprofraw.

To update the inputs used below run Inputs/update_memprof_inputs.sh /path/to/updated/clang
I think this comment is wrong - we can't update the v3 version, is that correct? Nor would we want to.
yes, good point!
done.
CHECK-NEXT: Function: 15505678318020221912
CHECK-NEXT: SymbolName: qux
CHECK-NEXT: Function: 3873612792189045660
CHECK-NEXT: SymbolName: _Z3quxi
Why did the function symbol names change with your patch?
Good question, I suspect this test has not been regenerated in a long time and something changed in the meantime in memprof? I think it might have to do with the SymbolNames being mangled vs. demangled, which changes the hash value. Is it a problem that the hash value changed?
Not a problem, it just struck me as odd since your patch shouldn't change them. But like you said, probably some prior change was made that didn't require updating this test to get it to pass, and now those changes are showing up.
  if (AccessHistogramSize > 0) {
    OS << " " << "AccessHistogramValues" << ":";
    for (uint32_t I = 0; I < AccessHistogramSize; ++I) {
      OS << " -" << ((uint64_t *)AccessHistogram)[I];
Why the " -" in the outputs? I noticed when looking at the text that they look like negative values as a result. Any reason not to just delimit with the space?
I thought that only having space delimiting did not produce valid YAML; I double-checked and was wrong. Changed to space delimiting, thanks!
lgtm
Force-pushed from 055a845 to 11811a4 (Compare)
Adds the compile-time flag -mllvm -memprof-histogram to turn histogram collection on and off. The -memprof-histogram flag relies on -memprof-use-callbacks=true to work. The flag also sets a flag in the IR that is used by the runtime to check whether histogram mode is enabled. When collecting histograms, the shadow mapping logic is updated from one 8-byte counter per 64 bytes to one 1-byte counter per 8 bytes, capped at 255. Only this granularity is supported for now. Updates the RawMemprofReader and the serialization of MemoryInfoBlocks to the binary format, including bumping the raw binary format from version 3 to version 4. The current RawMemprofReader remains compatible and can read memprofraw version 3, but version 3 can no longer be produced. Adds a memprofV3 test case to make sure RawMemprofReader remains backward compatible. Updates the creation of MemoryInfoBlocks with and without histograms. When two MemoryInfoBlocks are merged, the AccessCounts are summed and the shorter histogram is removed. Adds a memprof_histogram test case. Initial commit for adding AccessCountHistograms up to the RawProfile stage for memprof.
Force-pushed from 11811a4 to 1fd03fa (Compare)
@mattweingarten Congratulations on having your first Pull Request (PR) merged into the LLVM Project! Your changes will be combined with recent changes from other authors, then tested by our build bots. Please check whether problems have been caused by your change specifically, as the builds can include changes from many authors. How to do this, and the rest of the post-merge process, is covered in detail here. If your change does cause a problem, it may be reverted, or you can revert it yourself. If you don't get any reports, no action is required from you. Your changes are working as expected, well done!
LLVM Buildbot has detected a new failure on a builder. Full details are available at: https://lab.llvm.org/buildbot/#/builders/66/builds/616 Here is the relevant piece of the build log for reference:
This patch adds a bunch of 3+ MB binary files to source control. Do you really, really need to do that? I'm not sure if there's a specific LLVM policy that forbids it, but it feels like bad practice to me: it's not a very efficient way to use source control, and it possibly makes it harder to audit the whole of the LLVM code base.
We need some generally small test binaries for the memprof merging tests, but I see now that they are much larger, which is unexpected, and I believe more have been updated than needed. The input executables need to be updated whenever the raw profiles are updated (because their buildids must match), and in this case we have a new field in the raw profile. But I think we should be able to read the old format and leave the existing raw profiles as is - is that correct @mattweingarten? In that case the old ones don't need to be updated. But I'm not sure why the size of the executables is so much larger than before, which will also affect the new tests being added. @mattweingarten can you see why, for example,
Thanks for pointing it out, looking into reducing the binary sizes again.
Patch to reduce binary file size changes:
Adds the compile-time flag -mllvm -memprof-histogram and the runtime flag histogram=true|false to turn histogram collection on and off. The -memprof-histogram flag relies on -memprof-use-callbacks=true to work.
Updates the shadow mapping logic in histogram mode from one 8-byte counter per 64 bytes to one 1-byte counter per 8 bytes, capped at 255. Only this granularity is supported for now.
Updates the RawMemprofReader and the serialization of MemoryInfoBlocks to the binary format, including bumping the raw binary format from version 3 to version 4.
Updates the creation of MemoryInfoBlocks with and without histograms. When two MemoryInfoBlocks are merged, the AccessCounts are summed and the shorter histogram is removed.
Adds a memprof_histogram test case.
Initial commit for adding AccessCountHistograms up to the RawProfile stage for memprof.
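To make the shadow-counting scheme above concrete, here is a minimal, self-contained sketch (constants and names are illustrative, not the runtime's shadow implementation): one saturating 1-byte counter per 8-byte granule, with the total access count recovered by summing the counters, which is where the accuracy penalty mentioned in the review comes from.

#include <cstddef>
#include <cstdint>
#include <vector>

constexpr size_t kHistogramGranularity = 8; // bytes of user memory per counter
constexpr uint8_t kCounterMax = 255;        // counters saturate instead of wrapping

// Record one access at byte offset `Off` within an allocation.
void recordAccess(std::vector<uint8_t> &Shadow, size_t Off) {
  uint8_t &C = Shadow[Off / kHistogramGranularity];
  if (C < kCounterMax)
    ++C;
}

// The MIB's total access count is reconstructed by summing the per-granule
// counters, so it can undercount once any single counter has saturated.
uint64_t totalAccessCount(const std::vector<uint8_t> &Shadow) {
  uint64_t Total = 0;
  for (uint8_t C : Shadow)
    Total += C;
  return Total;
}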