Skip to content

Commit 3f97016

Browse files
committed
[llvm-profgen] Decoding pseudo probe for profiled function only.
Complete pseudo probe decoding can result in large memory usage. In practice, only a small portion of the decoded probes is used in profile generation. I'm changing the full decoding mode to decode profiled functions only. We still do a full scan of the .pseudoprobe section because it has no table of contents, but we don't have to build the in-memory data structure for functions that were not sampled. To build the in-memory data structure for profiled functions only, I'm rewriting the previous non-recursive probe decoding logic to be recursive. This is easier to read and maintain. I also have to change the previous representation of the unsymbolized context from a probe-based stack to an address-based stack, since the profiled functions are not yet known at the time of virtual unwinding. The address-based stack will be converted to a probe-based stack after virtual unwinding and on-demand probe decoding. I'm seeing 20GB of memory saved for one of our large internal services. Reviewed By: wenlei Differential Revision: https://reviews.llvm.org/D121643
1 parent d90a3fc commit 3f97016

File tree

10 files changed

+286
-192
lines changed

10 files changed

+286
-192
lines changed

llvm/include/llvm/MC/MCPseudoProbe.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@
5555
#include <tuple>
5656
#include <type_traits>
5757
#include <unordered_map>
58+
#include <unordered_set>
5859
#include <vector>
5960

6061
namespace llvm {
@@ -353,6 +354,15 @@ class MCPseudoProbeDecoder {
353354
// Decode pseudo_probe section to build address to probes map.
354355
bool buildAddress2ProbeMap(const uint8_t *Start, std::size_t Size);
355356

357+
// Decode pseudo_probe section to build address to probes map for specified
358+
// functions only.
359+
bool buildAddress2ProbeMap(const uint8_t *Start, std::size_t Size,
360+
std::unordered_set<uint64_t> &GuildFilter);
361+
362+
bool buildAddress2ProbeMap(MCDecodedPseudoProbeInlineTree *Cur,
363+
uint64_t &LastAddr,
364+
std::unordered_set<uint64_t> &GuildFilter);
365+
356366
// Print pseudo_probe_desc section info
357367
void printGUID2FuncDescMap(raw_ostream &OS);
358368

llvm/lib/MC/MCPseudoProbe.cpp

Lines changed: 87 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -358,8 +358,9 @@ bool MCPseudoProbeDecoder::buildGUID2FuncDescMap(const uint8_t *Start,
358358
return true;
359359
}
360360

361-
bool MCPseudoProbeDecoder::buildAddress2ProbeMap(const uint8_t *Start,
362-
std::size_t Size) {
361+
bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
362+
MCDecodedPseudoProbeInlineTree *Cur, uint64_t &LastAddr,
363+
std::unordered_set<uint64_t> &GuildFilter) {
363364
// The pseudo_probe section encodes an inline forest and each tree has a
364365
// format like:
365366
// FUNCTION BODY (one for each uninlined function present in the text
@@ -390,101 +391,110 @@ bool MCPseudoProbeDecoder::buildAddress2ProbeMap(const uint8_t *Start,
390391
// FUNCTION BODY
391392
// A FUNCTION BODY entry describing the inlined function.
392393

393-
Data = Start;
394-
End = Data + Size;
395-
396-
MCDecodedPseudoProbeInlineTree *Root = &DummyInlineRoot;
397-
MCDecodedPseudoProbeInlineTree *Cur = &DummyInlineRoot;
398-
uint64_t LastAddr = 0;
399394
uint32_t Index = 0;
400-
// A DFS-based decoding
401-
while (Data < End) {
402-
if (Root == Cur) {
403-
// Use a sequential id for top level inliner.
404-
Index = Root->getChildren().size();
405-
} else {
406-
// Read inline site for inlinees
407-
auto ErrorOrIndex = readUnsignedNumber<uint32_t>();
408-
if (!ErrorOrIndex)
409-
return false;
410-
Index = std::move(*ErrorOrIndex);
411-
}
395+
if (Cur == &DummyInlineRoot) {
396+
// Use a sequential id for top level inliner.
397+
Index = Cur->getChildren().size();
398+
} else {
399+
// Read inline site for inlinees
400+
auto ErrorOrIndex = readUnsignedNumber<uint32_t>();
401+
if (!ErrorOrIndex)
402+
return false;
403+
Index = std::move(*ErrorOrIndex);
404+
}
405+
406+
// Read guid
407+
auto ErrorOrCurGuid = readUnencodedNumber<uint64_t>();
408+
if (!ErrorOrCurGuid)
409+
return false;
410+
uint64_t Guid = std::move(*ErrorOrCurGuid);
411+
412+
// Decide if top-level node should be discarded.
413+
if (Cur == &DummyInlineRoot && !GuildFilter.empty() &&
414+
!GuildFilter.count(Guid))
415+
Cur = nullptr;
416+
417+
// If the incoming node is null, all its child nodes should be discarded.
418+
if (Cur) {
412419
// Switch/add to a new tree node(inlinee)
413420
Cur = Cur->getOrAddNode(std::make_tuple(Cur->Guid, Index));
414-
// Read guid
415-
auto ErrorOrCurGuid = readUnencodedNumber<uint64_t>();
416-
if (!ErrorOrCurGuid)
417-
return false;
418-
Cur->Guid = std::move(*ErrorOrCurGuid);
419-
// Read number of probes in the current node.
420-
auto ErrorOrNodeCount = readUnsignedNumber<uint32_t>();
421-
if (!ErrorOrNodeCount)
421+
Cur->Guid = Guid;
422+
}
423+
424+
// Read number of probes in the current node.
425+
auto ErrorOrNodeCount = readUnsignedNumber<uint32_t>();
426+
if (!ErrorOrNodeCount)
427+
return false;
428+
uint32_t NodeCount = std::move(*ErrorOrNodeCount);
429+
// Read number of direct inlinees
430+
auto ErrorOrCurChildrenToProcess = readUnsignedNumber<uint32_t>();
431+
if (!ErrorOrCurChildrenToProcess)
432+
return false;
433+
// Read all probes in this node
434+
for (std::size_t I = 0; I < NodeCount; I++) {
435+
// Read index
436+
auto ErrorOrIndex = readUnsignedNumber<uint32_t>();
437+
if (!ErrorOrIndex)
422438
return false;
423-
uint32_t NodeCount = std::move(*ErrorOrNodeCount);
424-
// Read number of direct inlinees
425-
auto ErrorOrCurChildrenToProcess = readUnsignedNumber<uint32_t>();
426-
if (!ErrorOrCurChildrenToProcess)
439+
uint32_t Index = std::move(*ErrorOrIndex);
440+
// Read type | flag.
441+
auto ErrorOrValue = readUnencodedNumber<uint8_t>();
442+
if (!ErrorOrValue)
427443
return false;
428-
Cur->ChildrenToProcess = std::move(*ErrorOrCurChildrenToProcess);
429-
// Read all probes in this node
430-
for (std::size_t I = 0; I < NodeCount; I++) {
431-
// Read index
432-
auto ErrorOrIndex = readUnsignedNumber<uint32_t>();
433-
if (!ErrorOrIndex)
444+
uint8_t Value = std::move(*ErrorOrValue);
445+
uint8_t Kind = Value & 0xf;
446+
uint8_t Attr = (Value & 0x70) >> 4;
447+
// Read address
448+
uint64_t Addr = 0;
449+
if (Value & 0x80) {
450+
auto ErrorOrOffset = readSignedNumber<int64_t>();
451+
if (!ErrorOrOffset)
434452
return false;
435-
uint32_t Index = std::move(*ErrorOrIndex);
436-
// Read type | flag.
437-
auto ErrorOrValue = readUnencodedNumber<uint8_t>();
438-
if (!ErrorOrValue)
453+
int64_t Offset = std::move(*ErrorOrOffset);
454+
Addr = LastAddr + Offset;
455+
} else {
456+
auto ErrorOrAddr = readUnencodedNumber<int64_t>();
457+
if (!ErrorOrAddr)
439458
return false;
440-
uint8_t Value = std::move(*ErrorOrValue);
441-
uint8_t Kind = Value & 0xf;
442-
uint8_t Attr = (Value & 0x70) >> 4;
443-
// Read address
444-
uint64_t Addr = 0;
445-
if (Value & 0x80) {
446-
auto ErrorOrOffset = readSignedNumber<int64_t>();
447-
if (!ErrorOrOffset)
448-
return false;
449-
int64_t Offset = std::move(*ErrorOrOffset);
450-
Addr = LastAddr + Offset;
451-
} else {
452-
auto ErrorOrAddr = readUnencodedNumber<int64_t>();
453-
if (!ErrorOrAddr)
454-
return false;
455-
Addr = std::move(*ErrorOrAddr);
456-
}
459+
Addr = std::move(*ErrorOrAddr);
460+
}
461+
462+
if (Cur) {
457463
// Populate Address2ProbesMap
458464
auto &Probes = Address2ProbesMap[Addr];
459465
Probes.emplace_back(Addr, Cur->Guid, Index, PseudoProbeType(Kind), Attr,
460466
Cur);
461467
Cur->addProbes(&Probes.back());
462-
LastAddr = Addr;
463468
}
469+
LastAddr = Addr;
470+
}
464471

465-
// Look for the parent for the next node by subtracting the current
466-
// node count from tree counts along the parent chain. The first node
467-
// in the chain that has a non-zero tree count is the target.
468-
while (Cur != Root) {
469-
if (Cur->ChildrenToProcess == 0) {
470-
Cur = static_cast<MCDecodedPseudoProbeInlineTree *>(Cur->Parent);
471-
if (Cur != Root) {
472-
assert(Cur->ChildrenToProcess > 0 &&
473-
"Should have some unprocessed nodes");
474-
Cur->ChildrenToProcess -= 1;
475-
}
476-
} else {
477-
break;
478-
}
479-
}
472+
uint32_t ChildrenToProcess = std::move(*ErrorOrCurChildrenToProcess);
473+
for (uint32_t I = 0; I < ChildrenToProcess; I++) {
474+
buildAddress2ProbeMap(Cur, LastAddr, GuildFilter);
480475
}
481476

477+
return true;
478+
}
479+
480+
bool MCPseudoProbeDecoder::buildAddress2ProbeMap(
481+
const uint8_t *Start, std::size_t Size,
482+
std::unordered_set<uint64_t> &GuildFilter) {
483+
Data = Start;
484+
End = Data + Size;
485+
uint64_t LastAddr = 0;
486+
while (Data < End)
487+
buildAddress2ProbeMap(&DummyInlineRoot, LastAddr, GuildFilter);
482488
assert(Data == End && "Have unprocessed data in pseudo_probe section");
483-
assert(Cur == Root &&
484-
" Cur should point to root when the forest is fully built up");
485489
return true;
486490
}
487491

492+
bool MCPseudoProbeDecoder::buildAddress2ProbeMap(const uint8_t *Start,
493+
std::size_t Size) {
494+
std::unordered_set<uint64_t> GuildFilter;
495+
return buildAddress2ProbeMap(Start, Size, GuildFilter);
496+
}
497+
488498
void MCPseudoProbeDecoder::printGUID2FuncDescMap(raw_ostream &OS) {
489499
OS << "Pseudo Probe Desc:\n";
490500
// Make the output deterministic

llvm/test/tools/llvm-profgen/noinline-cs-pseudoprobe.test

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,14 @@
2424
; CHECK-NEXT: 4: 15
2525
; CHECK-NEXT: !CFGChecksum: 72617220756
2626

27-
; CHECK-UNWINDER: [main:2]
27+
; CHECK-UNWINDER: [0x7f4]
2828
; CHECK-UNWINDER-NEXT: 2
2929
; CHECK-UNWINDER-NEXT: 79e-7bf:15
3030
; CHECK-UNWINDER-NEXT: 7c4-7cf:15
3131
; CHECK-UNWINDER-NEXT: 2
3232
; CHECK-UNWINDER-NEXT: 7bf->760:15
3333
; CHECK-UNWINDER-NEXT: 7cf->79e:16
34-
; CHECK-UNWINDER-NEXT: [main:2 @ foo:8]
34+
; CHECK-UNWINDER-NEXT: [0x7f4 @ 0x7bf]
3535
; CHECK-UNWINDER-NEXT: 1
3636
; CHECK-UNWINDER-NEXT: 760-77f:15
3737
; CHECK-UNWINDER-NEXT: 1

llvm/test/tools/llvm-profgen/recursion-compression-pseudoprobe.test

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -123,7 +123,7 @@
123123
; CHECK: 6: 1 fa:1
124124
; CHECK: !CFGChecksum: 563022570642068
125125

126-
; CHECK-UNWINDER: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5]
126+
; CHECK-UNWINDER: [0x842 @ 0x7d4 @ 0x7e0 @ 0x7ab]
127127
; CHECK-UNWINDER-NEXT: 3
128128
; CHECK-UNWINDER-NEXT: 7a0-7a7:1
129129
; CHECK-UNWINDER-NEXT: 7a0-7ab:3
@@ -132,33 +132,33 @@
132132
; CHECK-UNWINDER-NEXT: 7a7->7b2:1
133133
; CHECK-UNWINDER-NEXT: 7ab->7a0:4
134134
; CHECK-UNWINDER-NEXT: 7b5->7c0:1
135-
; CHECK-UNWINDER-NEXT: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6]
135+
; CHECK-UNWINDER-NEXT: [0x842 @ 0x7d4 @ 0x7e0 @ 0x7ab @ 0x7b5]
136136
; CHECK-UNWINDER-NEXT: 1
137137
; CHECK-UNWINDER-NEXT: 7c0-7d4:1
138138
; CHECK-UNWINDER-NEXT: 1
139139
; CHECK-UNWINDER-NEXT: 7d4->7c0:1
140-
; CHECK-UNWINDER-NEXT: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8]
140+
; CHECK-UNWINDER-NEXT: [0x842 @ 0x7d4 @ 0x7e0 @ 0x7ab @ 0x7b5 @ 0x7d4]
141141
; CHECK-UNWINDER-NEXT: 2
142142
; CHECK-UNWINDER-NEXT: 7c0-7cd:1
143143
; CHECK-UNWINDER-NEXT: 7db-7e0:1
144144
; CHECK-UNWINDER-NEXT: 2
145145
; CHECK-UNWINDER-NEXT: 7cd->7db:1
146146
; CHECK-UNWINDER-NEXT: 7e0->7a0:1
147-
; CHECK-UNWINDER-NEXT: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa:7]
147+
; CHECK-UNWINDER-NEXT: [0x842 @ 0x7d4 @ 0x7e0 @ 0x7ab @ 0x7b5 @ 0x7d4 @ 0x7e0]
148148
; CHECK-UNWINDER-NEXT: 2
149149
; CHECK-UNWINDER-NEXT: 7a0-7a7:1
150150
; CHECK-UNWINDER-NEXT: 7b2-7b5:1
151151
; CHECK-UNWINDER-NEXT: 2
152152
; CHECK-UNWINDER-NEXT: 7a7->7b2:1
153153
; CHECK-UNWINDER-NEXT: 7b5->7c0:1
154-
; CHECK-UNWINDER-NEXT: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6]
154+
; CHECK-UNWINDER-NEXT: [0x842 @ 0x7d4 @ 0x7e0 @ 0x7ab @ 0x7b5 @ 0x7d4 @ 0x7e0 @ 0x7b5]
155155
; CHECK-UNWINDER-NEXT: 2
156156
; CHECK-UNWINDER-NEXT: 7c0-7cd:2
157157
; CHECK-UNWINDER-NEXT: 7db-7e0:1
158158
; CHECK-UNWINDER-NEXT: 2
159159
; CHECK-UNWINDER-NEXT: 7cd->7db:2
160160
; CHECK-UNWINDER-NEXT: 7e0->7a0:1
161-
; CHECK-UNWINDER-NEXT: [main:2 @ foo:5 @ fa:8 @ fa:7 @ fb:5 @ fb:6 @ fa:8 @ fa:7 @ fb:6 @ fa:7]
161+
; CHECK-UNWINDER-NEXT: [0x842 @ 0x7d4 @ 0x7e0 @ 0x7ab @ 0x7b5 @ 0x7d4 @ 0x7e0 @ 0x7b5 @ 0x7e0]
162162
; CHECK-UNWINDER-NEXT: 2
163163
; CHECK-UNWINDER-NEXT: 7a0-7a7:1
164164
; CHECK-UNWINDER-NEXT: 7b2-7b5:1

llvm/tools/llvm-profgen/PerfReader.cpp

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -179,17 +179,12 @@ std::shared_ptr<StringBasedCtxKey> FrameStack::getContextKey() {
179179
return KeyStr;
180180
}
181181

182-
std::shared_ptr<ProbeBasedCtxKey> ProbeStack::getContextKey() {
183-
std::shared_ptr<ProbeBasedCtxKey> ProbeBasedKey =
184-
std::make_shared<ProbeBasedCtxKey>();
185-
for (auto CallProbe : Stack) {
186-
ProbeBasedKey->Probes.emplace_back(CallProbe);
187-
}
188-
CSProfileGenerator::compressRecursionContext<const MCDecodedPseudoProbe *>(
189-
ProbeBasedKey->Probes);
190-
CSProfileGenerator::trimContext<const MCDecodedPseudoProbe *>(
191-
ProbeBasedKey->Probes);
192-
return ProbeBasedKey;
182+
std::shared_ptr<AddrBasedCtxKey> AddressStack::getContextKey() {
183+
std::shared_ptr<AddrBasedCtxKey> KeyStr = std::make_shared<AddrBasedCtxKey>();
184+
KeyStr->Context = Stack;
185+
CSProfileGenerator::compressRecursionContext<uint64_t>(KeyStr->Context);
186+
CSProfileGenerator::trimContext<uint64_t>(KeyStr->Context);
187+
return KeyStr;
193188
}
194189

195190
template <typename T>
@@ -252,8 +247,8 @@ void VirtualUnwinder::collectSamplesFromFrameTrie(
252247
void VirtualUnwinder::collectSamplesFromFrameTrie(
253248
UnwindState::ProfiledFrame *Cur) {
254249
if (Binary->usePseudoProbes()) {
255-
ProbeStack Stack(Binary);
256-
collectSamplesFromFrameTrie<ProbeStack>(Cur, Stack);
250+
AddressStack Stack(Binary);
251+
collectSamplesFromFrameTrie<AddressStack>(Cur, Stack);
257252
} else {
258253
FrameStack Stack(Binary);
259254
collectSamplesFromFrameTrie<FrameStack>(Cur, Stack);
@@ -461,14 +456,17 @@ static std::string getContextKeyStr(ContextKey *K,
461456
const ProfiledBinary *Binary) {
462457
if (const auto *CtxKey = dyn_cast<StringBasedCtxKey>(K)) {
463458
return SampleContext::getContextString(CtxKey->Context);
464-
} else if (const auto *CtxKey = dyn_cast<ProbeBasedCtxKey>(K)) {
465-
SampleContextFrameVector ContextStack;
466-
for (const auto *Probe : CtxKey->Probes) {
467-
Binary->getInlineContextForProbe(Probe, ContextStack, true);
459+
} else if (const auto *CtxKey = dyn_cast<AddrBasedCtxKey>(K)) {
460+
std::ostringstream OContextStr;
461+
for (uint32_t I = 0; I < CtxKey->Context.size(); I++) {
462+
if (OContextStr.str().size())
463+
OContextStr << " @ ";
464+
OContextStr << "0x"
465+
<< to_hexString(
466+
Binary->virtualAddrToOffset(CtxKey->Context[I]),
467+
false);
468468
}
469-
// Probe context key at this point does not have leaf probe, so do not
470-
// include the leaf inline location.
471-
return SampleContext::getContextString(ContextStack, true);
469+
return OContextStr.str();
472470
} else {
473471
llvm_unreachable("unexpected key type");
474472
}

0 commit comments

Comments
 (0)