Skip to content

[memprof] Speed up caller-callee pair extraction #116184

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions llvm/include/llvm/ProfileData/InstrProfReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -683,6 +683,8 @@ class IndexedMemProfReader {
const unsigned char *FrameBase = nullptr;
/// The starting address of the call stack array.
const unsigned char *CallStackBase = nullptr;
/// The number of elements in the radix tree array starting at CallStackBase.
unsigned RadixTreeSize = 0;

Error deserializeV012(const unsigned char *Start, const unsigned char *Ptr,
uint64_t FirstWord);
Expand Down
19 changes: 18 additions & 1 deletion llvm/lib/ProfileData/InstrProfReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1303,6 +1303,12 @@ Error IndexedMemProfReader::deserializeV3(const unsigned char *Start,
FrameBase = Ptr;
CallStackBase = Start + CallStackPayloadOffset;

// Compute the number of elements in the radix tree array. This value is only
// used to size a BitVector, so a slight overestimate caused by padding just
// before the next section is harmless.
RadixTreeSize = (RecordPayloadOffset - CallStackPayloadOffset) /
sizeof(memprof::LinearFrameId);

// Now initialize the table reader with a pointer into data buffer.
MemProfRecordTable.reset(MemProfRecordHashTable::Create(
/*Buckets=*/Start + RecordTableOffset,
Expand Down Expand Up @@ -1674,11 +1680,22 @@ IndexedMemProfReader::getMemProfCallerCalleePairs() const {
memprof::LinearFrameIdConverter FrameIdConv(FrameBase);
memprof::CallerCalleePairExtractor Extractor(CallStackBase, FrameIdConv);

// The set of linear call stack IDs that we need to traverse from. We expect
// the set to be dense, so we use a BitVector.
BitVector Worklist(RadixTreeSize);

// Collect the set of linear call stack IDs. Since we expect a lot of
// duplicates, we first collect them in the form of a bit vector before
// processing them.
for (const memprof::IndexedMemProfRecord &IndexedRecord :
MemProfRecordTable->data())
for (const memprof::IndexedAllocationInfo &IndexedAI :
IndexedRecord.AllocSites)
Extractor(IndexedAI.CSId);
Worklist.set(IndexedAI.CSId);

// Collect caller-callee pairs for each linear call stack ID in Worklist.
for (unsigned CS : Worklist.set_bits())
Extractor(CS);

DenseMap<uint64_t, SmallVector<memprof::CallEdgeTy, 0>> Pairs =
std::move(Extractor.CallerCalleePairs);
Expand Down
19 changes: 16 additions & 3 deletions llvm/lib/ProfileData/InstrProfWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -601,7 +601,8 @@ writeMemProfCallStackArray(
&MemProfCallStackData,
llvm::DenseMap<memprof::FrameId, memprof::LinearFrameId>
&MemProfFrameIndexes,
llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram) {
llvm::DenseMap<memprof::FrameId, memprof::FrameStat> &FrameHistogram,
unsigned &NumElements) {
llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
MemProfCallStackIndexes;

Expand All @@ -610,6 +611,7 @@ writeMemProfCallStackArray(
FrameHistogram);
for (auto I : Builder.getRadixArray())
OS.write32(I);
NumElements = Builder.getRadixArray().size();
MemProfCallStackIndexes = Builder.takeCallStackPos();

// Release the memory of this vector as it is no longer needed.
Expand Down Expand Up @@ -771,15 +773,26 @@ static Error writeMemProfV3(ProfOStream &OS,
writeMemProfFrameArray(OS, MemProfData.Frames, FrameHistogram);

uint64_t CallStackPayloadOffset = OS.tell();
// The number of elements in the call stack array.
unsigned NumElements = 0;
llvm::DenseMap<memprof::CallStackId, memprof::LinearCallStackId>
MemProfCallStackIndexes = writeMemProfCallStackArray(
OS, MemProfData.CallStacks, MemProfFrameIndexes, FrameHistogram);
MemProfCallStackIndexes =
writeMemProfCallStackArray(OS, MemProfData.CallStacks,
MemProfFrameIndexes, FrameHistogram,
NumElements);

uint64_t RecordPayloadOffset = OS.tell();
uint64_t RecordTableOffset =
writeMemProfRecords(OS, MemProfData.Records, &Schema, memprof::Version3,
&MemProfCallStackIndexes);

// IndexedMemProfReader::deserializeV3 computes the number of elements in the
// call stack array from the difference between CallStackPayloadOffset and
// RecordPayloadOffset. Verify that the computation works.
assert(CallStackPayloadOffset +
NumElements * sizeof(memprof::LinearFrameId) ==
RecordPayloadOffset);

uint64_t Header[] = {
CallStackPayloadOffset,
RecordPayloadOffset,
Expand Down
Loading