Skip to content

[BOLT] Add profile density computation #101094

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Oct 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions bolt/include/bolt/Core/BinaryFunction.h
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,9 @@ class BinaryFunction {
/// Raw branch count for this function in the profile.
uint64_t RawBranchCount{0};

/// Dynamically executed function bytes, used for density computation.
uint64_t SampleCountInBytes{0};

/// Indicates the type of profile the function is using.
uint16_t ProfileFlags{PF_NONE};

Expand Down Expand Up @@ -1844,6 +1847,9 @@ class BinaryFunction {
/// to this function.
void setRawBranchCount(uint64_t Count) { RawBranchCount = Count; }

/// Return the number of dynamically executed bytes, from raw perf data.
uint64_t getSampleCountInBytes() const { return SampleCountInBytes; }

/// Return the execution count for functions with known profile.
/// Return 0 if the function has no profile.
uint64_t getKnownExecutionCount() const {
Expand Down
1 change: 1 addition & 0 deletions bolt/include/bolt/Utils/CommandLineOpts.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ extern llvm::cl::opt<bool> PrintSections;
enum ProfileFormatKind { PF_Fdata, PF_YAML };

extern llvm::cl::opt<ProfileFormatKind> ProfileFormat;
extern llvm::cl::opt<bool> ShowDensity;
extern llvm::cl::opt<bool> SplitEH;
extern llvm::cl::opt<bool> StrictMode;
extern llvm::cl::opt<bool> TimeOpts;
Expand Down
74 changes: 74 additions & 0 deletions bolt/lib/Passes/BinaryPasses.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "bolt/Core/ParallelUtilities.h"
#include "bolt/Passes/ReorderAlgorithm.h"
#include "bolt/Passes/ReorderFunctions.h"
#include "bolt/Utils/CommandLineOpts.h"
#include "llvm/Support/CommandLine.h"
#include <atomic>
#include <mutex>
Expand Down Expand Up @@ -223,6 +224,18 @@ static cl::opt<unsigned> TopCalledLimit(
"functions section"),
cl::init(100), cl::Hidden, cl::cat(BoltCategory));

// Profile density options, synced with llvm-profgen/ProfileGenerator.cpp
static cl::opt<int> ProfileDensityCutOffHot(
"profile-density-cutoff-hot", cl::init(990000),
cl::desc("Total samples cutoff for functions used to calculate "
"profile density."));

static cl::opt<double> ProfileDensityThreshold(
"profile-density-threshold", cl::init(60),
cl::desc("If the profile density is below the given threshold, it "
"will be suggested to increase the sampling rate."),
cl::Optional);

} // namespace opts

namespace llvm {
Expand Down Expand Up @@ -1383,6 +1396,7 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
uint64_t StaleSampleCount = 0;
uint64_t InferredSampleCount = 0;
std::vector<const BinaryFunction *> ProfiledFunctions;
std::vector<std::pair<double, uint64_t>> FuncDensityList;
const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
for (auto &BFI : BC.getBinaryFunctions()) {
const BinaryFunction &Function = BFI.second;
Expand Down Expand Up @@ -1441,6 +1455,22 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
StaleSampleCount += SampleCount;
++NumAllStaleFunctions;
}

if (opts::ShowDensity) {
uint64_t Size = Function.getSize();
// In case of BOLT split functions registered in BAT, executed traces are
// automatically attributed to the main fragment. Add up function sizes
// for all fragments.
if (IsHotParentOfBOLTSplitFunction)
for (const BinaryFunction *Fragment : Function.getFragments())
Size += Fragment->getSize();
double Density = (double)1.0 * Function.getSampleCountInBytes() / Size;
FuncDensityList.emplace_back(Density, SampleCount);
LLVM_DEBUG(BC.outs() << Function << ": executed bytes "
<< Function.getSampleCountInBytes() << ", size (b) "
<< Size << ", density " << Density
<< ", sample count " << SampleCount << '\n');
}
}
BC.NumProfiledFuncs = ProfiledFunctions.size();
BC.NumStaleProfileFuncs = NumStaleProfileFunctions;
Expand Down Expand Up @@ -1684,6 +1714,50 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
BC.outs() << ". Use -print-unknown to see the list.";
BC.outs() << '\n';
}

if (opts::ShowDensity) {
double Density = 0.0;
// Sorted by the density in descending order.
llvm::stable_sort(FuncDensityList,
[&](const std::pair<double, uint64_t> &A,
const std::pair<double, uint64_t> &B) {
if (A.first != B.first)
return A.first > B.first;
return A.second < B.second;
});

uint64_t AccumulatedSamples = 0;
uint32_t I = 0;
assert(opts::ProfileDensityCutOffHot <= 1000000 &&
"The cutoff value is greater than 1000000(100%)");
while (AccumulatedSamples <
TotalSampleCount *
static_cast<float>(opts::ProfileDensityCutOffHot) /
1000000 &&
I < FuncDensityList.size()) {
AccumulatedSamples += FuncDensityList[I].second;
Density = FuncDensityList[I].first;
I++;
}
if (Density == 0.0) {
BC.errs() << "BOLT-WARNING: the output profile is empty or the "
"--profile-density-cutoff-hot option is "
"set too low. Please check your command.\n";
} else if (Density < opts::ProfileDensityThreshold) {
BC.errs()
<< "BOLT-WARNING: BOLT is estimated to optimize better with "
<< format("%.1f", opts::ProfileDensityThreshold / Density)
<< "x more samples. Please consider increasing sampling rate or "
"profiling for longer duration to get more samples.\n";
}

BC.outs() << "BOLT-INFO: Functions with density >= "
<< format("%.1f", Density) << " account for "
<< format("%.2f",
static_cast<double>(opts::ProfileDensityCutOffHot) /
10000)
<< "% total sample counts.\n";
}
return Error::success();
}

Expand Down
9 changes: 7 additions & 2 deletions bolt/lib/Profile/DataAggregator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -849,6 +849,12 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
return false;
}

// Set ParentFunc to BAT parent function or FromFunc itself.
BinaryFunction *ParentFunc = getBATParentFunction(*FromFunc);
if (!ParentFunc)
ParentFunc = FromFunc;
ParentFunc->SampleCountInBytes += Count * (Second.From - First.To);

std::optional<BoltAddressTranslation::FallthroughListTy> FTs =
BAT ? BAT->getFallthroughsInTrace(FromFunc->getAddress(), First.To,
Second.From)
Expand All @@ -868,13 +874,12 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
<< FromFunc->getPrintName() << ":"
<< Twine::utohexstr(First.To) << " to "
<< Twine::utohexstr(Second.From) << ".\n");
BinaryFunction *ParentFunc = getBATParentFunction(*FromFunc);
for (auto [From, To] : *FTs) {
if (BAT) {
From = BAT->translate(FromFunc->getAddress(), From, /*IsBranchSrc=*/true);
To = BAT->translate(FromFunc->getAddress(), To, /*IsBranchSrc=*/false);
}
doIntraBranch(ParentFunc ? *ParentFunc : *FromFunc, From, To, Count, false);
doIntraBranch(*ParentFunc, From, To, Count, false);
}

return true;
Expand Down
4 changes: 4 additions & 0 deletions bolt/lib/Utils/CommandLineOpts.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,10 @@ cl::opt<std::string> SaveProfile("w",
cl::desc("save recorded profile to a file"),
cl::cat(BoltOutputCategory));

cl::opt<bool> ShowDensity("show-density",
cl::desc("show profile density details"),
cl::Optional, cl::cat(AggregatorCategory));

cl::opt<bool> SplitEH("split-eh", cl::desc("split C++ exception handling code"),
cl::Hidden, cl::cat(BoltOptCategory));

Expand Down
14 changes: 13 additions & 1 deletion bolt/test/X86/pre-aggregated-perf.test
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,19 @@ REQUIRES: system-linux

RUN: yaml2obj %p/Inputs/blarge.yaml &> %t.exe
RUN: perf2bolt %t.exe -o %t --pa -p %p/Inputs/pre-aggregated.txt -w %t.new \
RUN: --profile-use-dfs | FileCheck %s
RUN: --profile-density-threshold=9 --profile-density-cutoff-hot=970000 \
RUN: --profile-use-dfs | FileCheck %s --check-prefix=CHECK-P2B

CHECK-P2B: BOLT-INFO: 4 out of 7 functions in the binary (57.1%) have non-empty execution profile
CHECK-P2B: BOLT-INFO: Functions with density >= 21.7 account for 97.00% total sample counts.

RUN: perf2bolt %t.exe -o %t --pa -p %p/Inputs/pre-aggregated.txt -w %t.new \
RUN: --profile-density-cutoff-hot=970000 \
RUN: --profile-use-dfs 2>&1 | FileCheck %s --check-prefix=CHECK-WARNING

CHECK-WARNING: BOLT-INFO: 4 out of 7 functions in the binary (57.1%) have non-empty execution profile
CHECK-WARNING: BOLT-WARNING: BOLT is estimated to optimize better with 2.8x more samples.
CHECK-WARNING: BOLT-INFO: Functions with density >= 21.7 account for 97.00% total sample counts.

RUN: llvm-bolt %t.exe -data %t -o %t.null | FileCheck %s
RUN: llvm-bolt %t.exe -data %t.new -o %t.null | FileCheck %s
Expand Down
1 change: 1 addition & 0 deletions bolt/tools/driver/llvm-bolt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ void perf2boltMode(int argc, char **argv) {
exit(1);
}
opts::AggregateOnly = true;
opts::ShowDensity = true;
}

void boltDiffMode(int argc, char **argv) {
Expand Down
Loading