[BOLT] Don't split likely fallthrough in CDSplit #76164

Merged · 1 commit · Dec 21, 2023
bolt/lib/Passes/SplitFunctions.cpp (64 additions, 36 deletions)

@@ -175,8 +175,12 @@ struct SplitCacheDirected final : public SplitStrategy {
   void fragment(const BlockIt Start, const BlockIt End) override {
     BasicBlockOrder BlockOrder(Start, End);
     BinaryFunction &BF = *BlockOrder.front()->getFunction();
+    // No need to re-split small functions.
+    if (BlockOrder.size() <= 2)
+      return;
+
     size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);
     assert(BestSplitIndex < BlockOrder.size());

     // Assign fragments based on the computed best split index.
     // All basic blocks with index up to the best split index become hot.
@@ -200,10 +204,12 @@ struct SplitCacheDirected final : public SplitStrategy {
   };

   struct SplitScore {
-    size_t SplitIndex;
+    size_t SplitIndex = size_t(-1);
     size_t HotSizeReduction = 0;
     double LocalScore = 0;
     double CoverCallScore = 0;
+
+    double sum() const { return LocalScore + CoverCallScore; }
   };

   // Auxiliary variables used by the algorithm.
@@ -303,7 +309,7 @@ struct SplitCacheDirected final : public SplitStrategy {
                                   const size_t SplitIndex) {
     assert(SplitIndex < BlockOrder.size() && "Invalid split index");

-    // Update function layout assuming hot-warm splitting at SplitIndex
+    // Update function layout assuming hot-warm splitting at SplitIndex.
     for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
       BinaryBasicBlock *BB = BlockOrder[Index];
       if (BB->getFragmentNum() == FragmentNum::cold())
@@ -319,8 +325,8 @@ struct SplitCacheDirected final : public SplitStrategy {
     // Populate BB.OutputAddressRange with estimated new start and end addresses
     // and compute the old end address of the hot section and the new end
     // address of the hot section.
-    size_t OldHotEndAddr;
-    size_t NewHotEndAddr;
+    size_t OldHotEndAddr{0};
+    size_t NewHotEndAddr{0};
     size_t CurrentAddr = BBOffsets[BlockOrder[0]];
     for (BinaryBasicBlock *BB : BlockOrder) {
       // We only care about new addresses of blocks in hot/warm.
@@ -492,20 +498,15 @@ struct SplitCacheDirected final : public SplitStrategy {
   }

   /// Compute the split score of splitting a function at a given index.
-  /// The split score consists of local score and cover score. Cover call score
-  /// is expensive to compute. As a result, we pass in a \p ReferenceScore and
-  /// compute cover score only when the local score exceeds that in the
-  /// ReferenceScore or that the size reduction of the hot fragment is larger
-  /// than that achieved by the split index of the ReferenceScore. This function
-  /// returns \p Score of SplitScore type. It contains the local score and cover
-  /// score (if computed) of the current splitting index. For easier book
-  /// keeping and comparison, it also stores the split index and the resulting
-  /// reduction in hot fragment size.
+  /// The split score consists of local score and cover score. This function
+  /// returns \p Score of SplitScore type. It contains the local score and
+  /// cover score of the current splitting index. For easier book keeping and
+  /// comparison, it also stores the split index and the resulting reduction
+  /// in hot fragment size.
   SplitScore computeSplitScore(const BinaryFunction &BF,
                                const BasicBlockOrder &BlockOrder,
                                const size_t SplitIndex,
-                               const std::vector<CallInfo> &CoverCalls,
-                               const SplitScore &ReferenceScore) {
+                               const std::vector<CallInfo> &CoverCalls) {
     // Populate BinaryBasicBlock::OutputAddressRange with estimated
     // new start and end addresses after hot-warm splitting at SplitIndex.
     size_t OldHotEnd;
@@ -533,47 +534,74 @@ struct SplitCacheDirected final : public SplitStrategy {
     // increamented in place.
     computeJumpScore(BlockOrder, SplitIndex, Score);

-    // There is no need to compute CoverCallScore if we have already found
-    // another split index with a bigger LocalScore and bigger HotSizeReduction.
-    if (Score.LocalScore <= ReferenceScore.LocalScore &&
-        Score.HotSizeReduction <= ReferenceScore.HotSizeReduction)
-      return Score;
-
     // Compute CoverCallScore and store in Score in place.
     computeCoverCallScore(BlockOrder, SplitIndex, CoverCalls, Score);
     return Score;
   }

+  /// Find the most likely successor of a basic block when it has one or two
+  /// successors. Return nullptr otherwise.
+  const BinaryBasicBlock *getMostLikelySuccessor(const BinaryBasicBlock *BB) {
+    if (BB->succ_size() == 1)
+      return BB->getSuccessor();
+    if (BB->succ_size() == 2) {
+      uint64_t TakenCount = BB->getTakenBranchInfo().Count;
+      assert(TakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
+      uint64_t NonTakenCount = BB->getFallthroughBranchInfo().Count;
+      assert(NonTakenCount != BinaryBasicBlock::COUNT_NO_PROFILE);
+      if (TakenCount > NonTakenCount)
+        return BB->getConditionalSuccessor(true);
+      else if (TakenCount < NonTakenCount)
+        return BB->getConditionalSuccessor(false);
+    }
+    return nullptr;
+  }
+
   /// Find the best index for splitting. The returned value is the index of the
   /// last hot basic block. Hence, "no splitting" is equivalent to returning the
   /// value which is one less than the size of the function.
   size_t findSplitIndex(const BinaryFunction &BF,
                         const BasicBlockOrder &BlockOrder) {
+    assert(BlockOrder.size() > 2);
     // Find all function calls that can be shortened if we move blocks of the
     // current function to warm/cold
     const std::vector<CallInfo> CoverCalls = extractCoverCalls(BF);

-    // Try all possible split indices (blocks with Index <= SplitIndex are in
-    // hot) and find the one maximizing the splitting score.
+    // Find the existing hot-cold splitting index.
+    size_t HotColdIndex = 0;
+    while (HotColdIndex + 1 < BlockOrder.size()) {
+      if (BlockOrder[HotColdIndex + 1]->getFragmentNum() == FragmentNum::cold())
+        break;
+      HotColdIndex++;
+    }
+    assert(HotColdIndex + 1 == BlockOrder.size() ||
+           (BlockOrder[HotColdIndex]->getFragmentNum() == FragmentNum::main() &&
+            BlockOrder[HotColdIndex + 1]->getFragmentNum() ==
+                FragmentNum::cold()));
+
+    // Try all possible split indices up to HotColdIndex (blocks that have
+    // Index <= SplitIndex are in hot) and find the one maximizing the
+    // splitting score.
     SplitScore BestScore;
-    double BestScoreSum = -1.0;
-    SplitScore ReferenceScore;
-    for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
+    for (size_t Index = 0; Index <= HotColdIndex; Index++) {
       const BinaryBasicBlock *LastHotBB = BlockOrder[Index];
-      // No need to keep cold blocks in the hot section.
-      if (LastHotBB->getFragmentNum() == FragmentNum::cold())
-        break;
+      assert(LastHotBB->getFragmentNum() != FragmentNum::cold());

+      // Do not break jump to the most likely successor.
+      if (Index + 1 < BlockOrder.size() &&
+          BlockOrder[Index + 1] == getMostLikelySuccessor(LastHotBB))
+        continue;
+
       const SplitScore Score =
-          computeSplitScore(BF, BlockOrder, Index, CoverCalls, ReferenceScore);
-      double ScoreSum = Score.LocalScore + Score.CoverCallScore;
-      if (ScoreSum > BestScoreSum) {
-        BestScoreSum = ScoreSum;
+          computeSplitScore(BF, BlockOrder, Index, CoverCalls);
+      if (Score.sum() > BestScore.sum())
         BestScore = Score;
-      }
-      if (Score.LocalScore > ReferenceScore.LocalScore)
-        ReferenceScore = Score;
     }

+    // If we don't find a good splitting point, fallback to the original one.
+    if (BestScore.SplitIndex == size_t(-1))
+      return HotColdIndex;
+
     return BestScore.SplitIndex;
   }
 };
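In short, CDSplit now refuses to place the hot-warm split point between a block and its most likely successor, and it only searches split indices up to the existing hot-cold boundary. Below is a minimal standalone sketch of that selection rule; the toy types, the per-index scores, and the assumption that the fallthrough successor is always the next block in layout are illustrative simplifications, not BOLT's actual API.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

// Toy stand-ins for BOLT's profile data: each block knows its successor
// count and the taken/fallthrough edge counts (all names are made up).
struct ToyBlock {
  int NumSuccs = 0;
  uint64_t TakenCount = 0;    // frequency of the conditional (taken) edge
  uint64_t NonTakenCount = 0; // frequency of the fallthrough edge
  double Score = 0;           // made-up split score for this index
};

// Mirrors the getMostLikelySuccessor rule: -1 = fallthrough wins,
// +1 = taken edge wins, 0 = no clear winner (tie or >2 successors).
int likelyEdge(const ToyBlock &B) {
  if (B.NumSuccs == 1)
    return -1;
  if (B.NumSuccs == 2 && B.TakenCount != B.NonTakenCount)
    return B.TakenCount > B.NonTakenCount ? 1 : -1;
  return 0;
}

// Try every candidate split index up to the hot-cold boundary, skipping
// any index whose layout successor is also its most likely successor.
int pickSplitIndex(const std::vector<ToyBlock> &Blocks, int HotColdIndex) {
  int Best = -1;
  double BestScore = -1.0;
  for (int Index = 0; Index <= HotColdIndex; ++Index) {
    if (likelyEdge(Blocks[Index]) == -1) // would break a likely fallthrough
      continue;
    if (Blocks[Index].Score > BestScore) {
      BestScore = Blocks[Index].Score;
      Best = Index;
    }
  }
  return Best == -1 ? HotColdIndex : Best; // fall back to original boundary
}

int main() {
  // Index 1 scores best but falls through to index 2 with 99% probability,
  // so the search skips it and settles for index 0.
  std::vector<ToyBlock> Blocks = {
      {2, 800, 200, 0.4}, {2, 10, 990, 0.9}, {1, 0, 50, 0.1}};
  std::printf("chosen split index: %d\n", pickSplitIndex(Blocks, 2));
}
```

Running this prints index 0 even though index 1 has the higher raw score, which mirrors how the pass now rejects split points that would separate a block from a heavily biased fallthrough edge.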
bolt/test/X86/cdsplit-call-scale.s (4 additions, 5 deletions)

@@ -2,8 +2,9 @@
 # When -call-scale=0.0, the tested function is 2-way splitted.
 # When -call-scale=1.0, the tested function is 3-way splitted with 5 blocks
 # in warm because of the increased benefit of shortening the call edges.
-# When -call-scale=1000.0, the tested function is 3-way splitted with 7 blocks
-# in warm because of the strong benefit of shortening the call edges.
+# When -call-scale=1000.0, the tested function is still 3-way splitted with
+# 5 blocks in warm because cdsplit does not allow hot-warm splitting to break
+# a fall through branch from a basic block to its most likely successor.

 # RUN: llvm-mc --filetype=obj --triple x86_64-unknown-unknown %s -o %t.o
 # RUN: link_fdata %s %t.o %t.fdata
@@ -39,12 +40,10 @@
 # MEDINCENTIVE: {{^\.Ltmp5}}

 # HIGHINCENTIVE: Binary Function "chain" after split-functions
 # HIGHINCENTIVE: {{^\.LBB00}}
-# HIGHINCENTIVE: {{^\.Ltmp1}}
 # HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT -------
 # HIGHINCENTIVE: {{^\.LFT1}}
 # HIGHINCENTIVE: ------- HOT-COLD SPLIT POINT -------
 # HIGHINCENTIVE: {{^\.LFT0}}
+# HIGHINCENTIVE: {{^\.Ltmp1}}
 # HIGHINCENTIVE: {{^\.Ltmp0}}
 # HIGHINCENTIVE: {{^\.Ltmp2}}
 # HIGHINCENTIVE: {{^\.Ltmp3}}
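For context on the three RUN configurations in this test: the -call-scale knob weights how much shortening call edges contributes to the overall split score. The snippet below is a toy illustration of that weighting with made-up numbers; the real computation lives in computeCoverCallScore in SplitFunctions.cpp.

```cpp
#include <cstdio>

// Illustrative only: a call-scale factor shifts the balance between the
// local (jump) score and the raw cover-call score in a combined total.
double combinedScore(double LocalScore, double RawCoverCallScore,
                     double CallScale) {
  return LocalScore + CallScale * RawCoverCallScore;
}

int main() {
  const double Local = 100.0, RawCoverCall = 2.5;
  // The 0.0 / 1.0 / 1000.0 values mirror the test's three configurations.
  for (double Scale : {0.0, 1.0, 1000.0})
    std::printf("call-scale=%.1f -> score=%.1f\n", Scale,
                combinedScore(Local, RawCoverCall, Scale));
}
```

With the fallthrough constraint in place, even the extreme 1000.0 weighting can no longer pull the split point across a heavily biased fallthrough edge, which is why the HIGHINCENTIVE checks now expect the same layout as MEDINCENTIVE.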