Skip to content

[BOLT] Setup CDSplit Pass Structure #73079

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions bolt/include/bolt/Core/BinaryContext.h
Original file line number Diff line number Diff line change
Expand Up @@ -1230,6 +1230,9 @@ class BinaryContext {
///
/// Return the pair where the first size is for the main part, and the second
/// size is for the cold one.
/// As a side effect, update BinaryBasicBlock::OutputAddressRange of every
/// basic block in the function in place, so that
/// BB.OutputAddressRange.second - BB.OutputAddressRange.first gives the
/// emitted size of BB.
std::pair<size_t, size_t> calculateEmittedSize(BinaryFunction &BF,
bool FixBranches = true);

Expand Down
14 changes: 14 additions & 0 deletions bolt/include/bolt/Core/BinaryFunction.h
Original file line number Diff line number Diff line change
Expand Up @@ -1272,6 +1272,20 @@ class BinaryFunction {
/// otherwise processed.
bool isPseudo() const { return IsPseudo; }

/// Return true if every block in the function has a valid execution count.
bool hasFullProfile() const {
  // A block counts as profiled when its execution count is anything other
  // than the COUNT_NO_PROFILE sentinel. With no blocks the result is true.
  const auto HasKnownCount = [](const BinaryBasicBlock &Block) {
    return Block.getExecutionCount() != BinaryBasicBlock::COUNT_NO_PROFILE;
  };
  return llvm::all_of(blocks(), HasKnownCount);
}

/// Return true if every block in the function has a zero execution count.
bool allBlocksCold() const {
  // "Cold" here means an execution count of exactly zero. With no blocks the
  // result is true.
  const auto NeverExecuted = [](const BinaryBasicBlock &Block) {
    return Block.getExecutionCount() == 0;
  };
  return llvm::all_of(blocks(), NeverExecuted);
}

/// Return true if the function contains explicit or implicit indirect branch
/// to its split fragments, e.g., split jump table, landing pad in split
/// fragment.
Expand Down
63 changes: 63 additions & 0 deletions bolt/include/bolt/Passes/CDSplit.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
//===- bolt/Passes/CDSplit.h - Split functions hot/warm/cold ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file declares the CDSplit pass, which splits functions into
// hot/warm/cold fragments after the function reordering pass.
//
//===----------------------------------------------------------------------===//

#ifndef BOLT_PASSES_CDSPLIT
#define BOLT_PASSES_CDSPLIT

#include "bolt/Passes/SplitFunctions.h"
#include <atomic>

namespace llvm {
namespace bolt {

using BasicBlockOrder = BinaryFunction::BasicBlockOrderType;

class CDSplit : public BinaryFunctionPass {
public:
  explicit CDSplit(const cl::opt<bool> &PrintPass)
      : BinaryFunctionPass(PrintPass) {}

  /// Return the fixed name string of this pass.
  const char *getName() const override { return "cdsplit"; }

  /// Return true if \p BF is a candidate for splitting by this pass.
  bool shouldOptimize(const BinaryFunction &BF) const override;

  /// Pass entry point: process the functions of \p BC.
  void runOnFunctions(BinaryContext &BC) override;

private:
  /// Overall stats: byte totals accumulated over the functions that were
  /// split. Atomic because they are declared for concurrent updates.
  std::atomic<uint64_t> SplitBytesHot{0ull};
  std::atomic<uint64_t> SplitBytesCold{0ull};

  /// Functions to be considered. Every function in this list takes part in
  /// call graph construction; only a subset of them is considered for
  /// splitting.
  std::vector<BinaryFunction *> FunctionsToConsider;

  /// Set up the pass-wide state before any function is processed.
  void initialize(BinaryContext &BC);

  /// Split the body of \p BF into 3 fragments: hot / warm / cold.
  void runOnFunction(BinaryFunction &BF);

  /// Using the CDSplit algorithm, place every basic block of \p BlockOrder
  /// into the hot, warm, or cold fragment.
  void assignFragmentThreeWay(const BinaryFunction &BF,
                              const BasicBlockOrder &BlockOrder);

  /// Find the best index separating hot from warm. The basic block whose
  /// index equals the returned value is the last hot block.
  size_t findSplitIndex(const BinaryFunction &BF,
                        const BasicBlockOrder &BlockOrder);
};

} // namespace bolt
} // namespace llvm

#endif
32 changes: 16 additions & 16 deletions bolt/include/bolt/Passes/SplitFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,19 @@ class SplitFunctions : public BinaryFunctionPass {
/// Split function body into fragments.
void splitFunction(BinaryFunction &Function, SplitStrategy &Strategy);

std::atomic<uint64_t> SplitBytesHot{0ull};
std::atomic<uint64_t> SplitBytesCold{0ull};

public:
explicit SplitFunctions(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) {}

bool shouldOptimize(const BinaryFunction &BF) const override;

const char *getName() const override { return "split-functions"; }

void runOnFunctions(BinaryContext &BC) override;

struct TrampolineKey {
FragmentNum SourceFN = FragmentNum::main();
const MCSymbol *Target = nullptr;
Expand Down Expand Up @@ -81,27 +94,14 @@ class SplitFunctions : public BinaryFunctionPass {
/// corresponding thrower block. The trampoline landing pad, when created,
/// will redirect the execution to the real landing pad in a different
/// fragment.
TrampolineSetType createEHTrampolines(BinaryFunction &Function) const;
static TrampolineSetType createEHTrampolines(BinaryFunction &Function);

/// Merge trampolines into \p Layout without trampolines. The merge will place
/// a trampoline immediately before its destination. Used to revert the effect
/// of trampolines after createEHTrampolines().
BasicBlockOrderType
static BasicBlockOrderType
mergeEHTrampolines(BinaryFunction &BF, BasicBlockOrderType &Layout,
const TrampolineSetType &Trampolines) const;

std::atomic<uint64_t> SplitBytesHot{0ull};
std::atomic<uint64_t> SplitBytesCold{0ull};

public:
explicit SplitFunctions(const cl::opt<bool> &PrintPass)
: BinaryFunctionPass(PrintPass) {}

bool shouldOptimize(const BinaryFunction &BF) const override;

const char *getName() const override { return "split-functions"; }

void runOnFunctions(BinaryContext &BC) override;
const TrampolineSetType &Trampolines);
};

} // namespace bolt
Expand Down
35 changes: 29 additions & 6 deletions bolt/lib/Core/BinaryContext.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2331,14 +2331,37 @@ BinaryContext::calculateEmittedSize(BinaryFunction &BF, bool FixBranches) {
MCAsmLayout Layout(Assembler);
Assembler.layout(Layout);

// Obtain fragment sizes.
std::vector<uint64_t> FragmentSizes;
// Main fragment size.
const uint64_t HotSize =
Layout.getSymbolOffset(*EndLabel) - Layout.getSymbolOffset(*StartLabel);
const uint64_t ColdSize =
std::accumulate(SplitLabels.begin(), SplitLabels.end(), 0ULL,
[&](const uint64_t Accu, const LabelRange &Labels) {
return Accu + Layout.getSymbolOffset(*Labels.second) -
Layout.getSymbolOffset(*Labels.first);
});
FragmentSizes.push_back(HotSize);
// Split fragment sizes.
uint64_t ColdSize = 0;
for (const auto &Labels : SplitLabels) {
uint64_t Size = Layout.getSymbolOffset(*Labels.second) -
Layout.getSymbolOffset(*Labels.first);
FragmentSizes.push_back(Size);
ColdSize += Size;
}

// Populate new start and end offsets of each basic block.
BinaryBasicBlock *PrevBB = nullptr;
uint64_t FragmentIndex = 0;
for (FunctionFragment &FF : BF.getLayout().fragments()) {
for (BinaryBasicBlock *BB : FF) {
const uint64_t BBStartOffset = Layout.getSymbolOffset(*(BB->getLabel()));
BB->setOutputStartAddress(BBStartOffset);
if (PrevBB)
PrevBB->setOutputEndAddress(BBStartOffset);
PrevBB = BB;
}
if (PrevBB)
PrevBB->setOutputEndAddress(FragmentSizes[FragmentIndex]);
FragmentIndex++;
PrevBB = nullptr;
}

// Clean-up the effect of the code emission.
for (const MCSymbol &Symbol : Assembler.symbols()) {
Expand Down
208 changes: 208 additions & 0 deletions bolt/lib/Passes/CDSplit.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
//===- bolt/Passes/CDSplit.cpp - Pass for splitting function code 3-way ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the CDSplit pass.
//
//===----------------------------------------------------------------------===//

#include "bolt/Passes/CDSplit.h"
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/MC/MCInst.h"
#include "llvm/Support/MathExtras.h"

#define DEBUG_TYPE "bolt-opts"

using namespace llvm;
using namespace bolt;

namespace opts {

// Command-line options that are only declared here; their definitions are
// not part of this file.

// NOTE(review): BoltOptCategory is not referenced anywhere else in this file
// as shown - confirm whether this declaration is still needed.
extern cl::OptionCategory BoltOptCategory;

// Gates the whole pass: CDSplit::runOnFunctions returns immediately when this
// is false.
extern cl::opt<bool> UseCDSplit;
// When false, landing pads and blocks containing invokes are kept in the main
// fragment for functions that have EH ranges (see CDSplit::runOnFunction).
extern cl::opt<bool> SplitEH;
// Functions whose known execution count is below this value are not split
// (see CDSplit::shouldOptimize).
extern cl::opt<unsigned> ExecutionCountThreshold;
} // namespace opts

namespace llvm {
namespace bolt {

namespace {
/// Decide whether \p BF takes part in call graph construction: it must have a
/// valid index, a valid profile, and must not be empty.
bool shouldConsider(const BinaryFunction &BF) {
  if (!BF.hasValidIndex())
    return false;
  if (!BF.hasValidProfile())
    return false;
  return !BF.empty();
}
} // anonymous namespace

bool CDSplit::shouldOptimize(const BinaryFunction &BF) const {
  // Functions executed fewer times than the configured threshold are left
  // alone.
  const bool ExecutedEnough =
      BF.getKnownExecutionCount() >= opts::ExecutionCountThreshold;
  if (!ExecutedEnough)
    return false;

  // Splitting needs a known count on every block (otherwise the information
  // is incomplete) and at least one block with a non-zero count (otherwise
  // there is not enough variation between blocks to justify splitting).
  const bool ProfileIsUsable = BF.hasFullProfile() && !BF.allBlocksCold();

  // The generic pass-level filter has the final say.
  return ProfileIsUsable && BinaryFunctionPass::shouldOptimize(BF);
}

/// Populate FunctionsToConsider, the list of functions used for building the
/// call graph. Only the members of this list that also satisfy
/// shouldOptimize are candidates for 3-way splitting.
void CDSplit::initialize(BinaryContext &BC) {
  std::vector<BinaryFunction *> AllFunctions = BC.getSortedFunctions();

  // Reserve for the worst case in which no function is filtered out.
  FunctionsToConsider.reserve(AllFunctions.size());

  for (BinaryFunction *Candidate : AllFunctions) {
    if (!shouldConsider(*Candidate))
      continue;
    FunctionsToConsider.push_back(Candidate);
  }
}

/// Compute the index of the last hot basic block. Returning one less than the
/// number of blocks in the function therefore means "no splitting".
size_t CDSplit::findSplitIndex(const BinaryFunction &BF,
                               const BasicBlockOrder &BlockOrder) {
  // Placeholder: hot-cold splitting. Everything currently in the main
  // fragment stays hot; BlockOrder is not consulted yet.
  const auto MainFragmentBlocks = BF.getLayout().getMainFragment().size();
  return MainFragmentBlocks - 1;
}

/// Assign each basic block in the given function to either hot, cold,
/// or warm fragment using the CDSplit algorithm.
void CDSplit::assignFragmentThreeWay(const BinaryFunction &BF,
const BasicBlockOrder &BlockOrder) {
size_t BestSplitIndex = findSplitIndex(BF, BlockOrder);

// Assign fragments based on the computed best split index.
// All basic blocks with index up to the best split index become hot.
// All remaining blocks are warm / cold depending on if count is
// greater than 0 or not.
FragmentNum Main(0);
FragmentNum Warm(1);
FragmentNum Cold(2);
for (size_t Index = 0; Index < BlockOrder.size(); Index++) {
BinaryBasicBlock *BB = BlockOrder[Index];
if (Index <= BestSplitIndex)
BB->setFragmentNum(Main);
else
BB->setFragmentNum(BB->getKnownExecutionCount() > 0 ? Warm : Cold);
}
}

/// Split \p BF: pin the blocks that must stay in the main fragment, assign
/// every block to a fragment, rewrite the function layout accordingly, and
/// record the emitted hot/cold sizes in the pass statistics.
void CDSplit::runOnFunction(BinaryFunction &BF) {
  assert(!BF.empty() && "splitting an empty function");

  BinaryContext &BC = BF.getBinaryContext();
  FunctionLayout &Layout = BF.getLayout();

  // Work on a copy of the current block order; it is written back below.
  BasicBlockOrder NewLayout(Layout.block_begin(), Layout.block_end());

  // Never outline the first basic block.
  NewLayout.front()->setCanOutline(false);

  const bool IsAArch64 = BC.isAArch64();
  const bool KeepEHBlocksInMain = BF.hasEHRanges() && !opts::SplitEH;

  // Return true if the block may not be moved out of the main fragment.
  const auto MustStayInMain = [&](BinaryBasicBlock &Block) {
    // Do not split extra entry points in aarch64. They can be referred by
    // using ADRs and when this happens, these blocks cannot be placed far
    // away due to the limited range in ADR instruction.
    if (IsAArch64 && Block.isEntryPoint())
      return true;

    if (!KeepEHBlocksInMain)
      return false;

    // We cannot move landing pads (or rather entry points for landing pads).
    if (Block.isLandingPad())
      return true;

    // We cannot move a block that can throw since exception-handling runtime
    // cannot deal with split functions. A block without invokes never throws,
    // so it is safe to move it to decrease the size of the function.
    for (const MCInst &Inst : Block) {
      if (BC.MIB->isInvoke(Inst))
        return true;
    }
    return false;
  };

  // Blocks that are already non-outlineable are left as they are.
  for (BinaryBasicBlock *Block : NewLayout) {
    if (Block->canOutline() && MustStayInMain(*Block))
      Block->setCanOutline(false);
  }

  // Assign each basic block in NewLayout to either hot, warm, or cold fragment.
  assignFragmentThreeWay(BF, NewLayout);

  // Whatever the assignment decided, non-outlineable blocks go to the main
  // fragment.
  for (BinaryBasicBlock *Block : NewLayout) {
    if (Block->canOutline())
      continue;
    Block->setFragmentNum(FragmentNum::main());
  }

  // A pinned block that was sitting among warm or cold blocks is now tagged
  // main; the stable sort by fragment number moves it to the end of main
  // while keeping the relative order inside each fragment.
  llvm::stable_sort(NewLayout, [](const BinaryBasicBlock *LHS,
                                  const BinaryBasicBlock *RHS) {
    return LHS->getFragmentNum() < RHS->getFragmentNum();
  });

  Layout.update(NewLayout);

  // For shared objects, invoke instructions and corresponding landing pads
  // have to be placed in the same fragment. When we split them, create
  // trampoline landing pads that will redirect the execution to real LPs.
  // NOTE(review): the returned trampoline set is not read again in this
  // function as shown.
  SplitFunctions::TrampolineSetType Trampolines;
  const bool NeedsEHTrampolines =
      !BC.HasFixedLoadAddress && BF.hasEHRanges() && BF.isSplit();
  if (NeedsEHTrampolines)
    Trampolines = SplitFunctions::createEHTrampolines(BF);

  // Size statistics are collected only on x86 and only for functions that
  // actually ended up split.
  if (!BC.isX86() || !BF.isSplit())
    return;

  const std::pair<size_t, size_t> EmittedSizes = BC.calculateEmittedSize(BF);
  SplitBytesHot += EmittedSizes.first;
  SplitBytesCold += EmittedSizes.second;
}

/// Pass entry point. Does nothing unless the UseCDSplit option is set;
/// otherwise splits every eligible function and prints a one-line summary.
void CDSplit::runOnFunctions(BinaryContext &BC) {
  if (!opts::UseCDSplit)
    return;

  // Initialize global variables.
  initialize(BC);

  // A function is skipped unless it satisfies both shouldConsider and
  // shouldOptimize.
  ParallelUtilities::PredicateTy SkipFunc = [&](const BinaryFunction &BF) {
    return !shouldConsider(BF) || !shouldOptimize(BF);
  };

  // Make function splitting decisions in parallel.
  ParallelUtilities::runOnEachFunction(
      BC, ParallelUtilities::SchedulingPolicy::SP_BB_LINEAR,
      [&](BinaryFunction &BF) { runOnFunction(BF); }, SkipFunc, "CDSplit",
      /*ForceSequential=*/false);

  // Snapshot the counters once the per-function work has been dispatched and
  // report from the snapshot.
  const uint64_t HotBytes = SplitBytesHot;
  const uint64_t ColdBytes = SplitBytesCold;
  const uint64_t TotalBytes = HotBytes + ColdBytes;

  if (TotalBytes == 0) {
    outs() << "BOLT-INFO: cdsplit didn't split any functions\n";
    return;
  }

  outs() << "BOLT-INFO: cdsplit separates " << HotBytes << " hot bytes from "
         << ColdBytes << " cold bytes "
         << format("(%.2lf%% of split functions is in the main fragment)\n",
                   100.0 * HotBytes / TotalBytes);
}

} // namespace bolt
} // namespace llvm
Loading