Skip to content

Commit 1577483

Browse files
ShatianWang and spupyrev authored
[BOLT] Don't split likely fallthrough in CDSplit (#76164)
This diff speeds up CDSplit by not considering any hot-warm splitting point that could break a fall-through branch from a basic block to its most likely successor. Co-authored-by: spupyrev <[email protected]>
1 parent 0cf3af0 commit 1577483

File tree

2 files changed

+68
-41
lines changed

2 files changed

+68
-41
lines changed

bolt/lib/Passes/SplitFunctions.cpp

Lines changed: 64 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -175,8 +175,12 @@ struct SplitCacheDirected final : public SplitStrategy {
175175
void fragment(const BlockIt Start, const BlockIt End) override {
176176
BasicBlockOrder BlockOrder(Start, End);
177177
BinaryFunction &BF = *BlockOrder.front()->getFunction();
178+
// No need to re-split small functions.
179+
if (BlockOrder.size() <= 2)
180+
return;
178181

179182
size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
183+
assert(BestSplitIndex < BlockOrder.size());
180184

181185
// Assign fragments based on the computed best split index.
182186
// All basic blocks with index up to the best split index become hot.
@@ -200,10 +204,12 @@ struct SplitCacheDirected final : public SplitStrategy {
200204
};
201205

202206
struct SplitScore {
203-
size_t SplitIndex;
207+
size_t SplitIndex = size_t(-1);
204208
size_t HotSizeReduction = 0;
205209
double LocalScore = 0;
206210
double CoverCallScore = 0;
211+
212+
double sum() const { return LocalScore + CoverCallScore; }
207213
};
208214

209215
// Auxiliary variables used by the algorithm.
@@ -303,7 +309,7 @@ struct SplitCacheDirected final : public SplitStrategy {
303309
const size_t SplitIndex) {
304310
assert(SplitIndex < BlockOrder.size() && "Invalid split index");
305311

306-
// Update function layout assuming hot-warm splitting at SplitIndex
312+
// Update function layout assuming hot-warm splitting at SplitIndex.
307313
for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
308314
BinaryBasicBlock *BB = BlockOrder[Index];
309315
if (BB->getFragmentNum() == FragmentNum::cold())
@@ -319,8 +325,8 @@ struct SplitCacheDirected final : public SplitStrategy {
319325
// Populate BB.OutputAddressRange with estimated new start and end addresses
320326
// and compute the old end address of the hot section and the new end
321327
// address of the hot section.
322-
size_t OldHotEndAddr;
323-
size_t NewHotEndAddr;
328+
size_t OldHotEndAddr{0};
329+
size_t NewHotEndAddr{0};
324330
size_t CurrentAddr = BBOffsets[BlockOrder[0]];
325331
for (BinaryBasicBlock *BB : BlockOrder) {
326332
// We only care about new addresses of blocks in hot/warm.
@@ -492,20 +498,15 @@ struct SplitCacheDirected final : public SplitStrategy {
492498
}
493499

494500
/// Compute the split score of splitting a function at a given index.
495-
/// The split score consists of local score and cover score. Cover call score
496-
/// is expensive to compute. As a result, we pass in a \p ReferenceScore and
497-
/// compute cover score only when the local score exceeds that in the
498-
/// ReferenceScore or that the size reduction of the hot fragment is larger
499-
/// than that achieved by the split index of the ReferenceScore. This function
500-
/// returns \p Score of SplitScore type. It contains the local score and cover
501-
/// score (if computed) of the current splitting index. For easier book
502-
/// keeping and comparison, it also stores the split index and the resulting
503-
/// reduction in hot fragment size.
501+
/// The split score consists of local score and cover score. This function
502+
/// returns \p Score of SplitScore type. It contains the local score and
503+
/// cover score of the current splitting index. For easier book keeping and
504+
/// comparison, it also stores the split index and the resulting reduction
505+
/// in hot fragment size.
504506
SplitScore computeSplitScore(const BinaryFunction &BF,
505507
const BasicBlockOrder &BlockOrder,
506508
const size_t SplitIndex,
507-
const std::vector<CallInfo> &CoverCalls,
508-
const SplitScore &ReferenceScore) {
509+
const std::vector<CallInfo> &CoverCalls) {
509510
// Populate BinaryBasicBlock::OutputAddressRange with estimated
510511
// new start and end addresses after hot-warm splitting at SplitIndex.
511512
size_t OldHotEnd;
@@ -533,47 +534,74 @@ struct SplitCacheDirected final : public SplitStrategy {
533534
// increamented in place.
534535
computeJumpScore(BlockOrder, SplitIndex, Score);
535536

536-
// There is no need to compute CoverCallScore if we have already found
537-
// another split index with a bigger LocalScore and bigger HotSizeReduction.
538-
if (Score.LocalScore <= ReferenceScore.LocalScore &&
539-
Score.HotSizeReduction <= ReferenceScore.HotSizeReduction)
540-
return Score;
541-
542537
// Compute CoverCallScore and store in Score in place.
543538
computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
544539
return Score;
545540
}
546541

542+
/// Find the most likely successor of a basic block when it has one or two
543+
/// successors. Return nullptr otherwise.
544+
const BinaryBasicBlock *getMostLikelySuccessor(const BinaryBasicBlock *BB) {
545+
if (BB->succ_size() == 1)
546+
return BB->getSuccessor();
547+
if (BB->succ_size() == 2) {
548+
uint64_t TakenCount = BB->getTakenBranchInfo().Count;
549+
assert(TakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
550+
uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
551+
assert(NonTakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
552+
if (TakenCount > NonTakenCount)
553+
return BB->getConditionalSuccessor(true);
554+
else if (TakenCount < NonTakenCount)
555+
return BB->getConditionalSuccessor(false);
556+
}
557+
return nullptr;
558+
}
559+
547560
/// Find the best index for splitting. The returned value is the index of the
548561
/// last hot basic block. Hence, "no splitting" is equivalent to returning the
549562
/// value which is one less than the size of the function.
550563
size_t findSplitIndex(const BinaryFunction &BF,
551564
const BasicBlockOrder &BlockOrder) {
565+
assert(BlockOrder.size() > 2);
552566
// Find all function calls that can be shortened if we move blocks of the
553567
// current function to warm/cold
554568
const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);
555569

556-
// Try all possible split indices (blocks with Index <= SplitIndex are in
557-
// hot) and find the one maximizing the splitting score.
570+
// Find the existing hot-cold splitting index.
571+
size_t HotColdIndex = 0;
572+
while (HotColdIndex + 1 < BlockOrder.size()) {
573+
if (BlockOrder[HotColdIndex + 1]->getFragmentNum() == FragmentNum::cold())
574+
break;
575+
HotColdIndex++;
576+
}
577+
assert(HotColdIndex + 1 == BlockOrder.size() ||
578+
(BlockOrder[HotColdIndex]->getFragmentNum() == FragmentNum::main() &&
579+
BlockOrder[HotColdIndex + 1]->getFragmentNum() ==
580+
FragmentNum::cold()));
581+
582+
// Try all possible split indices up to HotColdIndex (blocks that have
583+
// Index <= SplitIndex are in hot) and find the one maximizing the
584+
// splitting score.
558585
SplitScore BestScore;
559-
double BestScoreSum = -1.0;
560-
SplitScore ReferenceScore;
561-
for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
586+
for (size_t Index = 0; Index <= HotColdIndex; Index++) {
562587
const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
563-
// No need to keep cold blocks in the hot section.
564-
if (LastHotBB->getFragmentNum() == FragmentNum::cold())
565-
break;
588+
assert(LastHotBB->getFragmentNum() != FragmentNum::cold());
589+
590+
// Do not break jump to the most likely successor.
591+
if (Index + 1 < BlockOrder.size() &&
592+
BlockOrder[Index + 1] == getMostLikelySuccessor(LastHotBB))
593+
continue;
594+
566595
const SplitScore Score =
567-
computeSplitScore(BF, BlockOrder, Index, CoverCalls, ReferenceScore);
568-
double ScoreSum = Score.LocalScore + Score.CoverCallScore;
569-
if (ScoreSum > BestScoreSum) {
570-
BestScoreSum = ScoreSum;
596+
computeSplitScore(BF, BlockOrder, Index, CoverCalls);
597+
if (Score.sum() > BestScore.sum())
571598
BestScore = Score;
572-
}
573-
if (Score.LocalScore > ReferenceScore.LocalScore)
574-
ReferenceScore = Score;
575599
}
576600

601+
// If we don't find a good splitting point, fallback to the original one.
602+
if (BestScore.SplitIndex == size_t(-1))
603+
return HotColdIndex;
604+
577605
return BestScore.SplitIndex;
578606
}
579607
};

bolt/test/X86/cdsplit-call-scale.s

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,9 @@
22
# When -call-scale=0.0, the tested function is 2-way splitted.
33
# When -call-scale=1.0, the tested function is 3-way splitted with 5 blocks
44
# in warm because of the increased benefit of shortening the call edges.
5-
# When -call-scale=1000.0, the tested function is 3-way splitted with 7 blocks
6-
# in warm because of the strong benefit of shortening the call edges.
5+
# When -call-scale=1000.0, the tested function is still 3-way splitted with
6+
# 5 blocks in warm because cdsplit does not allow hot-warm splitting to break
7+
# a fall through branch from a basic block to its most likely successor.
78

89
# RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o
910
# RUN: link_fdata %s %t.o %t.fdata
@@ -39,12 +40,10 @@
3940
# MEDINCENTIVE: {{^\.Ltmp5}}
4041

4142
# HIGHINCENTIVE: Binary Function "chain" after split-functions
42-
# HIGHINCENTIVE: {{^\.LBB00}}
43+
# HIGHINCENTIVE: {{^\.Ltmp1}}
4344
# HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT -------
4445
# HIGHINCENTIVE: {{^\.LFT1}}
4546
# HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT -------
46-
# HIGHINCENTIVE: {{^\.LFT0}}
47-
# HIGHINCENTIVE: {{^\.Ltmp1}}
4847
# HIGHINCENTIVE: {{^\.Ltmp0}}
4948
# HIGHINCENTIVE: {{^\.Ltmp2}}
5049
# HIGHINCENTIVE: {{^\.Ltmp3}}

0 commit comments

Comments
 (0)