Skip to content

Commit 4483cf2

Browse files
authored
[BOLT] CDSplit main logic part 2/2 (#74032)
This diff implements the main splitting logic of CDSplit. CDSplit processes functions in a binary in parallel. For each function BF, it assumes that all other functions are hot-cold split. For each possible hot-warm split point of BF, it computes its corresponding SplitScore, and chooses the split point with the best SplitScore. The SplitScore of each split point is computed in the following way: each call edge or jump edge has an edge score that is proportional to its execution count, and inversely proportional to its distance. The SplitScore of a split point is a sum of edge scores over a fixed set of edges whose distance can change due to hot-warm splitting BF. This set contains all cover calls in the form of X->Y or Y->X given function order [... X ... BF ... Y ...]; we refer to the sum of edge scores over the set of cover calls as CoverCallScore. This set also contains all jump edges (branches) within BF as well as all call edges originated from BF; we refer to the sum of edge scores over this set of edges as LocalScore. CDSplit finds the split index maximizing CoverCallScore + LocalScore.
1 parent a37c69e commit 4483cf2

File tree

3 files changed

+484
-3
lines changed

3 files changed

+484
-3
lines changed

bolt/lib/Passes/SplitFunctions.cpp

Lines changed: 200 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,16 @@ static cl::opt<double> CallScale(
114114
"call-scale",
115115
cl::desc("Call score scale coefficient (when --split-strategy=cdsplit)"),
116116
cl::init(0.95), cl::ReallyHidden, cl::cat(BoltOptCategory));
117+
118+
// Hidden command-line option "call-power" (default 0.05). computeCallScore
// divides the scaled call count by (call length + 1) raised to this value.
static cl::opt<double>
119+
CallPower("call-power",
120+
cl::desc("Call score power (when --split-strategy=cdsplit)"),
121+
cl::init(0.05), cl::ReallyHidden, cl::cat(BoltOptCategory));
122+
123+
// Hidden command-line option "jump-power" (default 0.15). computeJumpScore
// divides the jump count by (jump length + 1) raised to this value.
static cl::opt<double>
124+
JumpPower("jump-power",
125+
cl::desc("Jump score power (when --split-strategy=cdsplit)"),
126+
cl::init(0.15), cl::ReallyHidden, cl::cat(BoltOptCategory));
117127
} // namespace opts
118128

119129
namespace {
@@ -195,6 +205,13 @@ struct SplitCacheDirected final : public SplitStrategy {
195205
size_t Count;
196206
};
197207

208+
/// Outcome of evaluating one candidate hot-warm split point.
struct SplitScore {
  /// Index of the last hot block for this candidate. Initialized to 0 so that
  /// a default-constructed score never carries an indeterminate index; 0 is
  /// an in-bounds index for any non-empty block order.
  size_t SplitIndex = 0;
  /// Reduction in hot fragment size obtained by splitting at SplitIndex.
  size_t HotSizeReduction = 0;
  /// Sum of edge scores over jumps within the function and over calls
  /// originating in it.
  double LocalScore = 0;
  /// Sum of edge scores over the function's cover calls.
  double CoverCallScore = 0;
};
214+
198215
// Auxiliary variables used by the algorithm.
199216
size_t TotalNumBlocks{0};
200217
size_t OrigHotSectionSize{0};
@@ -340,8 +357,9 @@ struct SplitCacheDirected final : public SplitStrategy {
340357
// We only care about new addresses of blocks in hot/warm.
341358
if (BB->getFragmentNum() == FragmentNum::cold())
342359
break;
360+
const size_t NewSize = BB->getOutputSize();
343361
BB->setOutputStartAddress(CurrentAddr);
344-
CurrentAddr += BB->getOutputSize();
362+
CurrentAddr += NewSize;
345363
BB->setOutputEndAddress(CurrentAddr);
346364
if (BB->getLayoutIndex() == SplitIndex) {
347365
NewHotEndAddr = CurrentAddr;
@@ -402,13 +420,192 @@ struct SplitCacheDirected final : public SplitStrategy {
402420
return CoverCalls;
403421
}
404422

423+
/// Compute the edge score of a call edge.
424+
double computeCallScore(uint64_t CallCount, size_t CallLength) {
425+
// Increase call lengths by 1 to avoid raising 0 to a negative power.
426+
return opts::CallScale * static_cast<double>(CallCount) /
427+
std::pow(static_cast<double>(CallLength + 1), opts::CallPower);
428+
}
429+
430+
/// Compute the edge score of a jump (branch) edge.
431+
double computeJumpScore(uint64_t JumpCount, size_t JumpLength) {
432+
// Increase jump lengths by 1 to avoid raising 0 to a negative power.
433+
return static_cast<double>(JumpCount) /
434+
std::pow(static_cast<double>(JumpLength + 1), opts::JumpPower);
435+
}
436+
437+
/// Compute sum of scores over jumps within \p BlockOrder given \p SplitIndex.
438+
/// Increament Score.LocalScore in place by the sum.
439+
void computeJumpScore(const BasicBlockOrder &BlockOrder,
440+
const size_t SplitIndex, SplitScore &Score) {
441+
442+
for (const BinaryBasicBlock *SrcBB : BlockOrder) {
443+
if (SrcBB->getKnownExecutionCount() == 0)
444+
continue;
445+
446+
const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
447+
448+
for (const auto Pair : zip(SrcBB->successors(), SrcBB->branch_info())) {
449+
const BinaryBasicBlock *DstBB = std::get<0>(Pair);
450+
const BinaryBasicBlock::BinaryBranchInfo &Branch = std::get<1>(Pair);
451+
const size_t JumpCount = Branch.Count;
452+
453+
if (JumpCount == 0)
454+
continue;
455+
456+
const size_t DstBBStartAddr = DstBB->getOutputAddressRange().first;
457+
const size_t NewJumpLength =
458+
AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
459+
Score.LocalScore += computeJumpScore(JumpCount, NewJumpLength);
460+
}
461+
}
462+
}
463+
464+
/// Compute sum of scores over calls originated in the current function
465+
/// given \p SplitIndex. Increament Score.LocalScore in place by the sum.
466+
void computeLocalCallScore(const BasicBlockOrder &BlockOrder,
467+
const size_t SplitIndex, SplitScore &Score) {
468+
if (opts::CallScale == 0)
469+
return;
470+
471+
// Global index of the last block in the current function.
472+
// This is later used to determine whether a call originated in the current
473+
// function is to a function that comes after the current function.
474+
const size_t LastGlobalIndex = GlobalIndices[BlockOrder.back()];
475+
476+
// The length of calls originated in the input function can increase /
477+
// decrease depending on the splitting decision.
478+
for (const BinaryBasicBlock *SrcBB : BlockOrder) {
479+
const size_t CallCount = SrcBB->getKnownExecutionCount();
480+
// If SrcBB does not call any functions, skip it.
481+
if (CallCount == 0)
482+
continue;
483+
484+
// Obtain an estimate on the end address of the src basic block
485+
// after splitting at SplitIndex.
486+
const size_t SrcBBEndAddr = SrcBB->getOutputAddressRange().second;
487+
488+
for (const BinaryBasicBlock *DstBB : Callees[GlobalIndices[SrcBB]]) {
489+
// Obtain an estimate on the start address of the dst basic block
490+
// after splitting at SplitIndex. If DstBB is in a function before
491+
// the current function, then its start address remains unchanged.
492+
size_t DstBBStartAddr = BBOffsets[DstBB];
493+
// If DstBB is in a function after the current function, then its
494+
// start address should be adjusted based on the reduction in hot size.
495+
if (GlobalIndices[DstBB] > LastGlobalIndex) {
496+
assert(DstBBStartAddr >= Score.HotSizeReduction);
497+
DstBBStartAddr -= Score.HotSizeReduction;
498+
}
499+
const size_t NewCallLength =
500+
AbsoluteDifference(SrcBBEndAddr, DstBBStartAddr);
501+
Score.LocalScore += computeCallScore(CallCount, NewCallLength);
502+
}
503+
}
504+
}
505+
506+
/// Compute sum of splitting scores for cover calls of the input function.
507+
/// Increament Score.CoverCallScore in place by the sum.
508+
void computeCoverCallScore(const BasicBlockOrder &BlockOrder,
509+
const size_t SplitIndex,
510+
const std::vector<CallInfo> &CoverCalls,
511+
SplitScore &Score) {
512+
if (opts::CallScale == 0)
513+
return;
514+
515+
for (const CallInfo CI : CoverCalls) {
516+
assert(CI.Length >= Score.HotSizeReduction &&
517+
"Length of cover calls must exceed reduced size of hot fragment.");
518+
// Compute the new length of the call, which is shorter than the original
519+
// one by the size of the splitted fragment minus the total size increase.
520+
const size_t NewCallLength = CI.Length - Score.HotSizeReduction;
521+
Score.CoverCallScore += computeCallScore(CI.Count, NewCallLength);
522+
}
523+
}
524+
525+
/// Compute the split score of splitting a function at a given index.
526+
/// The split score consists of local score and cover score. Cover call score
527+
/// is expensive to compute. As a result, we pass in a \p ReferenceScore and
528+
/// compute cover score only when the local score exceeds that in the
529+
/// ReferenceScore or that the size reduction of the hot fragment is larger
530+
/// than that achieved by the split index of the ReferenceScore. This function
531+
/// returns \p Score of SplitScore type. It contains the local score and cover
532+
/// score (if computed) of the current splitting index. For easier book
533+
/// keeping and comparison, it also stores the split index and the resulting
534+
/// reduction in hot fragment size.
535+
SplitScore computeSplitScore(const BinaryFunction &BF,
536+
const BasicBlockOrder &BlockOrder,
537+
const size_t SplitIndex,
538+
const std::vector<CallInfo> &CoverCalls,
539+
const SplitScore &ReferenceScore) {
540+
// Populate BinaryBasicBlock::OutputAddressRange with estimated
541+
// new start and end addresses after hot-warm splitting at SplitIndex.
542+
size_t OldHotEnd;
543+
size_t NewHotEnd;
544+
std::tie(OldHotEnd, NewHotEnd) =
545+
estimatePostSplitBBAddress(BlockOrder, SplitIndex);
546+
547+
SplitScore Score;
548+
Score.SplitIndex = SplitIndex;
549+
550+
// It's not worth splitting if OldHotEnd < NewHotEnd.
551+
if (OldHotEnd < NewHotEnd)
552+
return Score;
553+
554+
// Hot fragment size reduction due to splitting.
555+
Score.HotSizeReduction = OldHotEnd - NewHotEnd;
556+
557+
// First part of LocalScore is the sum over call edges originated in the
558+
// input function. These edges can get shorter or longer depending on
559+
// SplitIndex. Score.LocalScore is increamented in place.
560+
computeLocalCallScore(BlockOrder, SplitIndex, Score);
561+
562+
// Second part of LocalScore is the sum over jump edges with src basic block
563+
// and dst basic block in the current function. Score.LocalScore is
564+
// increamented in place.
565+
computeJumpScore(BlockOrder, SplitIndex, Score);
566+
567+
// There is no need to compute CoverCallScore if we have already found
568+
// another split index with a bigger LocalScore and bigger HotSizeReduction.
569+
if (Score.LocalScore <= ReferenceScore.LocalScore &&
570+
Score.HotSizeReduction <= ReferenceScore.HotSizeReduction)
571+
return Score;
572+
573+
// Compute CoverCallScore and store in Score in place.
574+
computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
575+
return Score;
576+
}
577+
405578
/// Find the best index for splitting. The returned value is the index of the
406579
/// last hot basic block. Hence, "no splitting" is equivalent to returning the
407580
/// value which is one less than the size of the function.
408581
size_t findSplitIndex(const BinaryFunction &BF,
409582
const BasicBlockOrder &BlockOrder) {
410-
// Placeholder: hot-warm split after entry block.
411-
return 0;
583+
// Find all function calls that can be shortened if we move blocks of the
584+
// current function to warm/cold
585+
const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);
586+
587+
// Try all possible split indices (blocks with Index <= SplitIndex are in
588+
// hot) and find the one maximizing the splitting score.
589+
SplitScore BestScore;
590+
double BestScoreSum = -1.0;
591+
SplitScore ReferenceScore;
592+
for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
593+
const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
594+
// No need to keep cold blocks in the hot section.
595+
if (LastHotBB->getFragmentNum() == FragmentNum::cold())
596+
break;
597+
const SplitScore Score =
598+
computeSplitScore(BF, BlockOrder, Index, CoverCalls, ReferenceScore);
599+
double ScoreSum = Score.LocalScore + Score.CoverCallScore;
600+
if (ScoreSum > BestScoreSum) {
601+
BestScoreSum = ScoreSum;
602+
BestScore = Score;
603+
}
604+
if (Score.LocalScore > ReferenceScore.LocalScore)
605+
ReferenceScore = Score;
606+
}
607+
608+
return BestScore.SplitIndex;
412609
}
413610
};
414611

bolt/test/X86/cdsplit-call-scale.s

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
# Test the control of aggressiveness of 3-way splitting by -call-scale.
2+
# When -call-scale=0.0, the tested function is split 2-way.
3+
# When -call-scale=1.0, the tested function is split 3-way with 5 blocks
4+
# in warm because of the increased benefit of shortening the call edges.
5+
# When -call-scale=1000.0, the tested function is split 3-way with 7 blocks
6+
# in warm because of the strong benefit of shortening the call edges.
7+
8+
# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o
9+
# RUN: link_fdata %s %t.o %t.fdata
10+
# RUN: llvm-strip --strip-unneeded %t.o
11+
# RUN: %clang %cflags %t.o -o %t.exe -Wl,-q
12+
# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=cdsplit \
13+
# RUN: --call-scale=0.0 --print-split --print-only=chain \
14+
# RUN: --data=%t.fdata --reorder-blocks=ext-tsp \
15+
# RUN: 2>&1 | FileCheck --check-prefix=LOWINCENTIVE %s
16+
# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=cdsplit \
17+
# RUN: --call-scale=1.0 --print-split --print-only=chain \
18+
# RUN: --data=%t.fdata --reorder-blocks=ext-tsp \
19+
# RUN: 2>&1 | FileCheck --check-prefix=MEDINCENTIVE %s
20+
# RUN: llvm-bolt %t.exe -o %t.bolt --split-functions --split-strategy=cdsplit \
21+
# RUN: --call-scale=1000.0 --print-split --print-only=chain \
22+
# RUN: --data=%t.fdata --reorder-blocks=ext-tsp \
23+
# RUN: 2>&1 | FileCheck --check-prefix=HIGHINCENTIVE %s
24+
25+
# LOWINCENTIVE: Binary Function "chain" after split-functions
26+
# LOWINCENTIVE: {{^\.Ltmp5}}
27+
# LOWINCENTIVE: ------- HOT-COLD SPLIT POINT -------
28+
# LOWINCENTIVE: {{^\.LFT1}}
29+
30+
# MEDINCENTIVE: Binary Function "chain" after split-functions
31+
# MEDINCENTIVE: {{^\.Ltmp1}}
32+
# MEDINCENTIVE: ------- HOT-COLD SPLIT POINT -------
33+
# MEDINCENTIVE: {{^\.LFT1}}
34+
# MEDINCENTIVE: ------- HOT-COLD SPLIT POINT -------
35+
# MEDINCENTIVE: {{^\.Ltmp0}}
36+
# MEDINCENTIVE: {{^\.Ltmp2}}
37+
# MEDINCENTIVE: {{^\.Ltmp3}}
38+
# MEDINCENTIVE: {{^\.Ltmp4}}
39+
# MEDINCENTIVE: {{^\.Ltmp5}}
40+
41+
# HIGHINCENTIVE: Binary Function "chain" after split-functions
42+
# HIGHINCENTIVE: {{^\.LBB00}}
43+
# HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT -------
44+
# HIGHINCENTIVE: {{^\.LFT1}}
45+
# HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT -------
46+
# HIGHINCENTIVE: {{^\.LFT0}}
47+
# HIGHINCENTIVE: {{^\.Ltmp1}}
48+
# HIGHINCENTIVE: {{^\.Ltmp0}}
49+
# HIGHINCENTIVE: {{^\.Ltmp2}}
50+
# HIGHINCENTIVE: {{^\.Ltmp3}}
51+
# HIGHINCENTIVE: {{^\.Ltmp4}}
52+
# HIGHINCENTIVE: {{^\.Ltmp5}}
53+
54+
55+
56+
.text
57+
.globl chain
58+
.type chain, @function
59+
# chain: compares %edi against 2. The jge at LLentry_LLchain_start goes to
# LLchain_start; otherwise execution falls through to LLfast. Both paths end
# at LLexit.
chain:
60+
pushq %rbp
61+
movq %rsp, %rbp
62+
cmpl $2, %edi
63+
LLentry_LLchain_start:
64+
jge LLchain_start
65+
# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLchain_start# 0 10
66+
# FDATA: 1 chain #LLentry_LLchain_start# 1 chain #LLfast# 0 500
67+
# Fall-through path: load 5 into %eax and jump straight to LLexit.
LLfast:
68+
movl $5, %eax
69+
LLfast_LLexit:
70+
jmp LLexit
71+
# FDATA: 1 chain #LLfast_LLexit# 1 chain #LLexit# 0 500
72+
# Start of the chain path: load 10 into %eax, then branch to LLchain1.
# NOTE(review): no new compare precedes this jge; presumably it reuses the
# flags from the cmpl at function entry - confirm.
LLchain_start:
73+
movl $10, %eax
74+
LLchain_start_LLchain1:
75+
jge LLchain1
76+
# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLchain1# 0 10
77+
# FDATA: 1 chain #LLchain_start_LLchain1# 1 chain #LLcold# 0 0
78+
# Reached only by falling through the jge above.
LLcold:
79+
addl $1, %eax
80+
addl $1, %eax
81+
addl $1, %eax
82+
addl $1, %eax
83+
addl $1, %eax
84+
addl $1, %eax
85+
# LLchain1 through LLchain4 each increment %eax and jump to the next label;
# LLchain4 jumps to LLexit.
LLchain1:
86+
addl $1, %eax
87+
LLchain1_LLchain2:
88+
jmp LLchain2
89+
# FDATA: 1 chain #LLchain1_LLchain2# 1 chain #LLchain2# 0 10
90+
LLchain2:
91+
addl $1, %eax
92+
LLchain2_LLchain3:
93+
jmp LLchain3
94+
# FDATA: 1 chain #LLchain2_LLchain3# 1 chain #LLchain3# 0 10
95+
LLchain3:
96+
addl $1, %eax
97+
addl $1, %eax
98+
addl $1, %eax
99+
addl $1, %eax
100+
addl $1, %eax
101+
LLchain3_LLchain4:
102+
jmp LLchain4
103+
# FDATA: 1 chain #LLchain3_LLchain4# 1 chain #LLchain4# 0 10
104+
LLchain4:
105+
addl $1, %eax
106+
addl $1, %eax
107+
addl $1, %eax
108+
addl $1, %eax
109+
addl $1, %eax
110+
LLchain4_LLexit:
111+
jmp LLexit
112+
# FDATA: 1 chain #LLchain4_LLexit# 1 chain #LLexit# 0 10
113+
# Common exit: restore %rbp and return.
LLexit:
114+
popq %rbp
115+
ret
116+
LLchain_end:
117+
.size chain, LLchain_end-chain
118+
119+
120+
.globl main
121+
.type main, @function
122+
# main: calls chain twice, first with %edi = 1 and then with %edi = 4, and
# returns with %eax cleared to 0.
main:
123+
pushq %rbp
124+
movq %rsp, %rbp
125+
movl $1, %edi
126+
LLmain_chain1:
127+
call chain
128+
# FDATA: 1 main #LLmain_chain1# 1 chain 0 0 500
129+
movl $4, %edi
130+
LLmain_chain2:
131+
call chain
132+
# FDATA: 1 main #LLmain_chain2# 1 chain 0 0 10
133+
# Clear %eax before returning.
xorl %eax, %eax
134+
popq %rbp
135+
retq
136+
.Lmain_end:
137+
.size main, .Lmain_end-main

0 commit comments

Comments
 (0)