Skip to content

Commit 74e6478

Browse files
authored
[BOLT] Set call to continuation count in pre-aggregated profile
#109683 identified an issue with pre-aggregated profile where a call to continuation fallthrough edge count is missing (profile discontinuity). This issue only affects pre-aggregated profile but not perf data since LBR stack has the necessary information to determine if the trace (fall- through) starts at call continuation, whereas pre-aggregated fallthrough lacks this information. The solution is to look at branch records in pre-aggregated profiles that correspond to returns and assign counts to call to continuation fallthrough: - BranchFrom is in another function or DSO, - BranchTo may be a call continuation site: - not an entry point/landing pad. Note that we can't directly check if BranchFrom corresponds to a return instruction if it's in external DSO. Keep call continuation handling for perf data (`getFallthroughsInTrace`) [1] as-is due to marginally better performance. The difference is that return-converted call to continuation fallthrough is slightly more frequent than other fallthroughs since the former only requires one LBR address while the latter need two that belong to the profiled binary. Hence return-converted fallthroughs have larger "weight" which affects code layout. [1] `DataAggregator::getFallthroughsInTrace` https://github.com/llvm/llvm-project/blob/fea18afeed39fe4435d67eee1834f0f34b23013d/bolt/lib/Profile/DataAggregator.cpp#L906-L915 Test Plan: added callcont-fallthru.s Reviewers: maksfb, ayermolo, ShatianWang, dcci Reviewed By: maksfb, ShatianWang Pull Request: #109486
1 parent 5b697ef commit 74e6478

File tree

4 files changed

+215
-35
lines changed

4 files changed

+215
-35
lines changed

bolt/include/bolt/Core/BinaryFunction.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -908,6 +908,10 @@ class BinaryFunction {
908908
return BB && BB->getOffset() == Offset ? BB : nullptr;
909909
}
910910

911+
const BinaryBasicBlock *getBasicBlockAtOffset(uint64_t Offset) const {
912+
return const_cast<BinaryFunction *>(this)->getBasicBlockAtOffset(Offset);
913+
}
914+
911915
/// Retrieve the landing pad BB associated with invoke instruction \p Invoke
912916
/// that is in \p BB. Return nullptr if none exists
913917
BinaryBasicBlock *getLandingPadBBFor(const BinaryBasicBlock &BB,

bolt/include/bolt/Profile/DataAggregator.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -266,7 +266,8 @@ class DataAggregator : public DataReader {
266266
uint64_t Mispreds);
267267

268268
/// Register a \p Branch.
269-
bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds);
269+
bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds,
270+
bool IsPreagg);
270271

271272
/// Register a trace between two LBR entries supplied in execution order.
272273
bool doTrace(const LBREntry &First, const LBREntry &Second,

bolt/lib/Profile/DataAggregator.cpp

Lines changed: 77 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -778,42 +778,75 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc,
778778
}
779779

780780
bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count,
781-
uint64_t Mispreds) {
782-
bool IsReturn = false;
783-
auto handleAddress = [&](uint64_t &Addr, bool IsFrom) -> BinaryFunction * {
784-
if (BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr)) {
785-
Addr -= Func->getAddress();
786-
if (IsFrom) {
787-
auto checkReturn = [&](auto MaybeInst) {
788-
IsReturn = MaybeInst && BC->MIB->isReturn(*MaybeInst);
789-
};
790-
if (Func->hasInstructions())
791-
checkReturn(Func->getInstructionAtOffset(Addr));
792-
else
793-
checkReturn(Func->disassembleInstructionAtOffset(Addr));
794-
}
781+
uint64_t Mispreds, bool IsPreagg) {
782+
// Returns whether \p Offset in \p Func contains a return instruction.
783+
auto checkReturn = [&](const BinaryFunction &Func, const uint64_t Offset) {
784+
auto isReturn = [&](auto MI) { return MI && BC->MIB->isReturn(*MI); };
785+
return Func.hasInstructions()
786+
? isReturn(Func.getInstructionAtOffset(Offset))
787+
: isReturn(Func.disassembleInstructionAtOffset(Offset));
788+
};
795789

796-
if (BAT)
797-
Addr = BAT->translate(Func->getAddress(), Addr, IsFrom);
790+
// Returns whether \p Offset in \p Func may be a call continuation excluding
791+
// entry points and landing pads.
792+
auto checkCallCont = [&](const BinaryFunction &Func, const uint64_t Offset) {
793+
// No call continuation at a function start.
794+
if (!Offset)
795+
return false;
796+
797+
// FIXME: support BAT case where the function might be in empty state
798+
// (split fragments declared non-simple).
799+
if (!Func.hasCFG())
800+
return false;
801+
802+
// The offset should not be an entry point or a landing pad.
803+
const BinaryBasicBlock *ContBB = Func.getBasicBlockAtOffset(Offset);
804+
return ContBB && !ContBB->isEntryPoint() && !ContBB->isLandingPad();
805+
};
798806

799-
if (BinaryFunction *ParentFunc = getBATParentFunction(*Func)) {
800-
Func = ParentFunc;
801-
if (IsFrom)
802-
NumColdSamples += Count;
803-
}
807+
// Mutates \p Addr to an offset into the containing function, performing BAT
808+
// offset translation and parent lookup.
809+
//
810+
// Returns the containing function (or BAT parent) and whether the address
811+
// corresponds to a return (if \p IsFrom) or a call continuation (otherwise).
812+
auto handleAddress = [&](uint64_t &Addr, bool IsFrom) {
813+
BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr);
814+
if (!Func)
815+
return std::pair{Func, false};
804816

805-
return Func;
806-
}
807-
return nullptr;
817+
Addr -= Func->getAddress();
818+
819+
bool IsRetOrCallCont =
820+
IsFrom ? checkReturn(*Func, Addr) : checkCallCont(*Func, Addr);
821+
822+
if (BAT)
823+
Addr = BAT->translate(Func->getAddress(), Addr, IsFrom);
824+
825+
BinaryFunction *ParentFunc = getBATParentFunction(*Func);
826+
if (!ParentFunc)
827+
return std::pair{Func, IsRetOrCallCont};
828+
829+
if (IsFrom)
830+
NumColdSamples += Count;
831+
832+
return std::pair{ParentFunc, IsRetOrCallCont};
808833
};
809834

810-
BinaryFunction *FromFunc = handleAddress(From, /*IsFrom=*/true);
835+
uint64_t ToOrig = To;
836+
auto [FromFunc, IsReturn] = handleAddress(From, /*IsFrom*/ true);
837+
auto [ToFunc, IsCallCont] = handleAddress(To, /*IsFrom*/ false);
838+
if (!FromFunc && !ToFunc)
839+
return false;
840+
841+
// Record call to continuation trace.
842+
if (IsPreagg && FromFunc != ToFunc && (IsReturn || IsCallCont)) {
843+
LBREntry First{ToOrig - 1, ToOrig - 1, false};
844+
LBREntry Second{ToOrig, ToOrig, false};
845+
return doTrace(First, Second, Count);
846+
}
811847
// Ignore returns.
812848
if (IsReturn)
813849
return true;
814-
BinaryFunction *ToFunc = handleAddress(To, /*IsFrom=*/false);
815-
if (!FromFunc && !ToFunc)
816-
return false;
817850

818851
// Treat recursive control transfers as inter-branches.
819852
if (FromFunc == ToFunc && To != 0) {
@@ -830,10 +863,19 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
830863
BinaryFunction *ToFunc = getBinaryFunctionContainingAddress(Second.From);
831864
if (!FromFunc || !ToFunc) {
832865
LLVM_DEBUG({
833-
dbgs() << "Out of range trace starting in " << FromFunc->getPrintName()
834-
<< formatv(" @ {0:x}", First.To - FromFunc->getAddress())
835-
<< " and ending in " << ToFunc->getPrintName()
836-
<< formatv(" @ {0:x}\n", Second.From - ToFunc->getAddress());
866+
dbgs() << "Out of range trace starting in ";
867+
if (FromFunc)
868+
dbgs() << formatv("{0} @ {1:x}", *FromFunc,
869+
First.To - FromFunc->getAddress());
870+
else
871+
dbgs() << Twine::utohexstr(First.To);
872+
dbgs() << " and ending in ";
873+
if (ToFunc)
874+
dbgs() << formatv("{0} @ {1:x}", *ToFunc,
875+
Second.From - ToFunc->getAddress());
876+
else
877+
dbgs() << Twine::utohexstr(Second.From);
878+
dbgs() << '\n';
837879
});
838880
NumLongRangeTraces += Count;
839881
return false;
@@ -1620,7 +1662,8 @@ void DataAggregator::processBranchEvents() {
16201662
for (const auto &AggrLBR : BranchLBRs) {
16211663
const Trace &Loc = AggrLBR.first;
16221664
const TakenBranchInfo &Info = AggrLBR.second;
1623-
doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount);
1665+
doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount,
1666+
/*IsPreagg*/ false);
16241667
}
16251668
}
16261669

@@ -1781,7 +1824,7 @@ void DataAggregator::processPreAggregated() {
17811824
switch (AggrEntry.EntryType) {
17821825
case AggregatedLBREntry::BRANCH:
17831826
doBranch(AggrEntry.From.Offset, AggrEntry.To.Offset, AggrEntry.Count,
1784-
AggrEntry.Mispreds);
1827+
AggrEntry.Mispreds, /*IsPreagg*/ true);
17851828
break;
17861829
case AggregatedLBREntry::FT:
17871830
case AggregatedLBREntry::FT_EXTERNAL_ORIGIN: {

bolt/test/X86/callcont-fallthru.s

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
## Ensures that a call continuation fallthrough count is set when using
2+
## pre-aggregated perf data.
3+
4+
# RUN: %clangxx %cxxflags %s -o %t -Wl,-q -nostdlib
5+
# RUN: link_fdata %s %t %t.pa1 PREAGG
6+
# RUN: link_fdata %s %t %t.pa2 PREAGG2
7+
# RUN: link_fdata %s %t %t.pa3 PREAGG3
8+
# RUN: link_fdata %s %t %t.pa4 PREAGG4
9+
10+
## Check normal case: fallthrough is not LP or secondary entry.
11+
# RUN: llvm-strip --strip-unneeded %t -o %t.exe
12+
# RUN: llvm-bolt %t.exe --pa -p %t.pa1 -o %t.out \
13+
# RUN: --print-cfg --print-only=main | FileCheck %s
14+
15+
## Check that getFallthroughsInTrace correctly handles a trace starting at plt
16+
## call continuation
17+
# RUN: llvm-bolt %t.exe --pa -p %t.pa2 -o %t.out2 \
18+
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK2
19+
20+
## Check that we don't treat secondary entry points as call continuation sites.
21+
# RUN: llvm-bolt %t --pa -p %t.pa3 -o %t.out \
22+
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK3
23+
24+
## Check fallthrough to a landing pad case.
25+
# RUN: llvm-bolt %t.exe --pa -p %t.pa4 -o %t.out \
26+
# RUN: --print-cfg --print-only=main | FileCheck %s --check-prefix=CHECK4
27+
28+
.globl foo
29+
.type foo, %function
30+
foo:
31+
pushq %rbp
32+
movq %rsp, %rbp
33+
popq %rbp
34+
Lfoo_ret:
35+
retq
36+
.size foo, .-foo
37+
38+
.globl main
39+
.type main, %function
40+
main:
41+
.Lfunc_begin0:
42+
.cfi_startproc
43+
.cfi_personality 155, DW.ref.__gxx_personality_v0
44+
.cfi_lsda 27, .Lexception0
45+
pushq %rbp
46+
movq %rsp, %rbp
47+
subq $0x20, %rsp
48+
movl $0x0, -0x4(%rbp)
49+
movl %edi, -0x8(%rbp)
50+
movq %rsi, -0x10(%rbp)
51+
callq puts@PLT
52+
## Target is a call continuation
53+
# PREAGG: B X:0 #Ltmp1# 2 0
54+
# CHECK: callq puts@PLT
55+
# CHECK-NEXT: count: 2
56+
57+
Ltmp1:
58+
movq -0x10(%rbp), %rax
59+
movq 0x8(%rax), %rdi
60+
movl %eax, -0x14(%rbp)
61+
62+
Ltmp4:
63+
cmpl $0x0, -0x14(%rbp)
64+
je Ltmp0
65+
# CHECK2: je .Ltmp0
66+
# CHECK2-NEXT: count: 3
67+
68+
movl $0xa, -0x18(%rbp)
69+
callq foo
70+
## Target is a call continuation
71+
# PREAGG: B #Lfoo_ret# #Ltmp3# 1 0
72+
# CHECK: callq foo
73+
# CHECK-NEXT: count: 1
74+
75+
## PLT call continuation fallthrough spanning the call
76+
# PREAGG2: F #Ltmp1# #Ltmp3_br# 3
77+
# CHECK2: callq foo
78+
# CHECK2-NEXT: count: 3
79+
80+
## Target is a secondary entry point
81+
# PREAGG3: B X:0 #Ltmp3# 2 0
82+
# CHECK3: callq foo
83+
# CHECK3-NEXT: count: 0
84+
85+
## Target is a landing pad
86+
# PREAGG4: B X:0 #Ltmp3# 2 0
87+
# CHECK4: callq puts@PLT
88+
# CHECK4-NEXT: count: 0
89+
90+
Ltmp3:
91+
cmpl $0x0, -0x18(%rbp)
92+
Ltmp3_br:
93+
jmp Ltmp2
94+
95+
Ltmp2:
96+
movl -0x18(%rbp), %eax
97+
addl $-0x1, %eax
98+
movl %eax, -0x18(%rbp)
99+
jmp Ltmp3
100+
jmp Ltmp4
101+
jmp Ltmp1
102+
103+
Ltmp0:
104+
xorl %eax, %eax
105+
addq $0x20, %rsp
106+
popq %rbp
107+
retq
108+
.Lfunc_end0:
109+
.cfi_endproc
110+
.size main, .-main
111+
112+
.section .gcc_except_table,"a",@progbits
113+
.p2align 2, 0x0
114+
GCC_except_table0:
115+
.Lexception0:
116+
.byte 255 # @LPStart Encoding = omit
117+
.byte 255 # @TType Encoding = omit
118+
.byte 1 # Call site Encoding = uleb128
119+
.uleb128 .Lcst_end0-.Lcst_begin0
120+
.Lcst_begin0:
121+
.uleb128 .Lfunc_begin0-.Lfunc_begin0 # >> Call Site 1 <<
122+
.uleb128 .Lfunc_end0-.Lfunc_begin0 # Call between .Lfunc_begin0 and .Lfunc_end0
123+
.uleb128 Ltmp3-.Lfunc_begin0 # jumps to Ltmp3
124+
.byte 0 # has no landing pad
125+
.byte 0 # On action: cleanup
126+
.Lcst_end0:
127+
.p2align 2, 0x0
128+
.hidden DW.ref.__gxx_personality_v0
129+
.weak DW.ref.__gxx_personality_v0
130+
.section .data.DW.ref.__gxx_personality_v0,"awG",@progbits,DW.ref.__gxx_personality_v0,comdat
131+
.p2align 3, 0x0
132+
.type DW.ref.__gxx_personality_v0,@object

0 commit comments

Comments
 (0)