Skip to content

Commit 6ee5ff9

Browse files
authored
[BOLT] Add profile density computation
Reuse the definition of profile density from llvm-profgen (#92144):
- the density is computed in perf2bolt using raw samples (perf.data or pre-aggregated data),
- function density is the ratio of dynamically executed function bytes to the static function size in bytes,
- profile density:
  - functions are sorted by density in decreasing order, accumulating their respective sample counts,
  - profile density is the smallest density covering 99% of total sample count.

In other words, BOLT binary profile density is the minimum amount of profile information per function (excluding functions in the tail 1% of the sample count) which is sufficient to optimize the binary well.

The density threshold of 60 was determined through experiments with large binaries by reducing the sample count and checking the resulting profile density and performance. The threshold is conservative.

perf2bolt prints a warning if the density is below the threshold and suggests increasing the sampling duration and/or frequency to reach a given density, e.g.:
```
BOLT-WARNING: BOLT is estimated to optimize better with 2.8x more samples.
```

Test Plan: updated pre-aggregated-perf.test

Reviewers: maksfb, wlei-llvm, rafaelauler, ayermolo, dcci, WenleiHe

Reviewed By: WenleiHe, wlei-llvm

Pull Request: #101094
1 parent 08916ce commit 6ee5ff9

File tree

7 files changed

+106
-3
lines changed

7 files changed

+106
-3
lines changed

bolt/include/bolt/Core/BinaryFunction.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -386,6 +386,9 @@ class BinaryFunction {
386386
/// Raw branch count for this function in the profile.
387387
uint64_t RawBranchCount{0};
388388

389+
/// Dynamically executed function bytes, used for density computation.
390+
uint64_t SampleCountInBytes{0};
391+
389392
/// Indicates the type of profile the function is using.
390393
uint16_t ProfileFlags{PF_NONE};
391394

@@ -1844,6 +1847,9 @@ class BinaryFunction {
18441847
/// to this function.
18451848
void setRawBranchCount(uint64_t Count) { RawBranchCount = Count; }
18461849

1850+
/// Return the number of dynamically executed bytes, from raw perf data.
1851+
uint64_t getSampleCountInBytes() const { return SampleCountInBytes; }
1852+
18471853
/// Return the execution count for functions with known profile.
18481854
/// Return 0 if the function has no profile.
18491855
uint64_t getKnownExecutionCount() const {

bolt/include/bolt/Utils/CommandLineOpts.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ extern llvm::cl::opt<bool> PrintSections;
5555
enum ProfileFormatKind { PF_Fdata, PF_YAML };
5656

5757
extern llvm::cl::opt<ProfileFormatKind> ProfileFormat;
58+
extern llvm::cl::opt<bool> ShowDensity;
5859
extern llvm::cl::opt<bool> SplitEH;
5960
extern llvm::cl::opt<bool> StrictMode;
6061
extern llvm::cl::opt<bool> TimeOpts;

bolt/lib/Passes/BinaryPasses.cpp

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "bolt/Core/ParallelUtilities.h"
1616
#include "bolt/Passes/ReorderAlgorithm.h"
1717
#include "bolt/Passes/ReorderFunctions.h"
18+
#include "bolt/Utils/CommandLineOpts.h"
1819
#include "llvm/Support/CommandLine.h"
1920
#include <atomic>
2021
#include <mutex>
@@ -223,6 +224,18 @@ static cl::opt<unsigned> TopCalledLimit(
223224
"functions section"),
224225
cl::init(100), cl::Hidden, cl::cat(BoltCategory));
225226

227+
// Profile density options, synced with llvm-profgen/ProfileGenerator.cpp
228+
static cl::opt<int> ProfileDensityCutOffHot(
229+
"profile-density-cutoff-hot", cl::init(990000),
230+
cl::desc("Total samples cutoff for functions used to calculate "
231+
"profile density."));
232+
233+
static cl::opt<double> ProfileDensityThreshold(
234+
"profile-density-threshold", cl::init(60),
235+
cl::desc("If the profile density is below the given threshold, it "
236+
"will be suggested to increase the sampling rate."),
237+
cl::Optional);
238+
226239
} // namespace opts
227240

228241
namespace llvm {
@@ -1383,6 +1396,7 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
13831396
uint64_t StaleSampleCount = 0;
13841397
uint64_t InferredSampleCount = 0;
13851398
std::vector<const BinaryFunction *> ProfiledFunctions;
1399+
std::vector<std::pair<double, uint64_t>> FuncDensityList;
13861400
const char *StaleFuncsHeader = "BOLT-INFO: Functions with stale profile:\n";
13871401
for (auto &BFI : BC.getBinaryFunctions()) {
13881402
const BinaryFunction &Function = BFI.second;
@@ -1441,6 +1455,22 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
14411455
StaleSampleCount += SampleCount;
14421456
++NumAllStaleFunctions;
14431457
}
1458+
1459+
if (opts::ShowDensity) {
1460+
uint64_t Size = Function.getSize();
1461+
// In case of BOLT split functions registered in BAT, executed traces are
1462+
// automatically attributed to the main fragment. Add up function sizes
1463+
// for all fragments.
1464+
if (IsHotParentOfBOLTSplitFunction)
1465+
for (const BinaryFunction *Fragment : Function.getFragments())
1466+
Size += Fragment->getSize();
1467+
double Density = (double)1.0 * Function.getSampleCountInBytes() / Size;
1468+
FuncDensityList.emplace_back(Density, SampleCount);
1469+
LLVM_DEBUG(BC.outs() << Function << ": executed bytes "
1470+
<< Function.getSampleCountInBytes() << ", size (b) "
1471+
<< Size << ", density " << Density
1472+
<< ", sample count " << SampleCount << '\n');
1473+
}
14441474
}
14451475
BC.NumProfiledFuncs = ProfiledFunctions.size();
14461476
BC.NumStaleProfileFuncs = NumStaleProfileFunctions;
@@ -1684,6 +1714,50 @@ Error PrintProgramStats::runOnFunctions(BinaryContext &BC) {
16841714
BC.outs() << ". Use -print-unknown to see the list.";
16851715
BC.outs() << '\n';
16861716
}
1717+
1718+
if (opts::ShowDensity) {
1719+
double Density = 0.0;
1720+
// Sorted by the density in descending order.
1721+
llvm::stable_sort(FuncDensityList,
1722+
[&](const std::pair<double, uint64_t> &A,
1723+
const std::pair<double, uint64_t> &B) {
1724+
if (A.first != B.first)
1725+
return A.first > B.first;
1726+
return A.second < B.second;
1727+
});
1728+
1729+
uint64_t AccumulatedSamples = 0;
1730+
uint32_t I = 0;
1731+
assert(opts::ProfileDensityCutOffHot <= 1000000 &&
1732+
"The cutoff value is greater than 1000000(100%)");
1733+
while (AccumulatedSamples <
1734+
TotalSampleCount *
1735+
static_cast<float>(opts::ProfileDensityCutOffHot) /
1736+
1000000 &&
1737+
I < FuncDensityList.size()) {
1738+
AccumulatedSamples += FuncDensityList[I].second;
1739+
Density = FuncDensityList[I].first;
1740+
I++;
1741+
}
1742+
if (Density == 0.0) {
1743+
BC.errs() << "BOLT-WARNING: the output profile is empty or the "
1744+
"--profile-density-cutoff-hot option is "
1745+
"set too low. Please check your command.\n";
1746+
} else if (Density < opts::ProfileDensityThreshold) {
1747+
BC.errs()
1748+
<< "BOLT-WARNING: BOLT is estimated to optimize better with "
1749+
<< format("%.1f", opts::ProfileDensityThreshold / Density)
1750+
<< "x more samples. Please consider increasing sampling rate or "
1751+
"profiling for longer duration to get more samples.\n";
1752+
}
1753+
1754+
BC.outs() << "BOLT-INFO: Functions with density >= "
1755+
<< format("%.1f", Density) << " account for "
1756+
<< format("%.2f",
1757+
static_cast<double>(opts::ProfileDensityCutOffHot) /
1758+
10000)
1759+
<< "% total sample counts.\n";
1760+
}
16871761
return Error::success();
16881762
}
16891763

bolt/lib/Profile/DataAggregator.cpp

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -849,6 +849,12 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
849849
return false;
850850
}
851851

852+
// Set ParentFunc to BAT parent function or FromFunc itself.
853+
BinaryFunction *ParentFunc = getBATParentFunction(*FromFunc);
854+
if (!ParentFunc)
855+
ParentFunc = FromFunc;
856+
ParentFunc->SampleCountInBytes += Count * (Second.From - First.To);
857+
852858
std::optional<BoltAddressTranslation::FallthroughListTy> FTs =
853859
BAT ? BAT->getFallthroughsInTrace(FromFunc->getAddress(), First.To,
854860
Second.From)
@@ -868,13 +874,12 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second,
868874
<< FromFunc->getPrintName() << ":"
869875
<< Twine::utohexstr(First.To) << " to "
870876
<< Twine::utohexstr(Second.From) << ".\n");
871-
BinaryFunction *ParentFunc = getBATParentFunction(*FromFunc);
872877
for (auto [From, To] : *FTs) {
873878
if (BAT) {
874879
From = BAT->translate(FromFunc->getAddress(), From, /*IsBranchSrc=*/true);
875880
To = BAT->translate(FromFunc->getAddress(), To, /*IsBranchSrc=*/false);
876881
}
877-
doIntraBranch(ParentFunc ? *ParentFunc : *FromFunc, From, To, Count, false);
882+
doIntraBranch(*ParentFunc, From, To, Count, false);
878883
}
879884

880885
return true;

bolt/lib/Utils/CommandLineOpts.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,10 @@ cl::opt<std::string> SaveProfile("w",
175175
cl::desc("save recorded profile to a file"),
176176
cl::cat(BoltOutputCategory));
177177

178+
cl::opt<bool> ShowDensity("show-density",
179+
cl::desc("show profile density details"),
180+
cl::Optional, cl::cat(AggregatorCategory));
181+
178182
cl::opt<bool> SplitEH("split-eh", cl::desc("split C++ exception handling code"),
179183
cl::Hidden, cl::cat(BoltOptCategory));
180184

bolt/test/X86/pre-aggregated-perf.test

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,19 @@ REQUIRES: system-linux
1111

1212
RUN: yaml2obj %p/Inputs/blarge.yaml &> %t.exe
1313
RUN: perf2bolt %t.exe -o %t --pa -p %p/Inputs/pre-aggregated.txt -w %t.new \
14-
RUN: --profile-use-dfs | FileCheck %s
14+
RUN: --profile-density-threshold=9 --profile-density-cutoff-hot=970000 \
15+
RUN: --profile-use-dfs | FileCheck %s --check-prefix=CHECK-P2B
16+
17+
CHECK-P2B: BOLT-INFO: 4 out of 7 functions in the binary (57.1%) have non-empty execution profile
18+
CHECK-P2B: BOLT-INFO: Functions with density >= 21.7 account for 97.00% total sample counts.
19+
20+
RUN: perf2bolt %t.exe -o %t --pa -p %p/Inputs/pre-aggregated.txt -w %t.new \
21+
RUN: --profile-density-cutoff-hot=970000 \
22+
RUN: --profile-use-dfs 2>&1 | FileCheck %s --check-prefix=CHECK-WARNING
23+
24+
CHECK-WARNING: BOLT-INFO: 4 out of 7 functions in the binary (57.1%) have non-empty execution profile
25+
CHECK-WARNING: BOLT-WARNING: BOLT is estimated to optimize better with 2.8x more samples.
26+
CHECK-WARNING: BOLT-INFO: Functions with density >= 21.7 account for 97.00% total sample counts.
1527

1628
RUN: llvm-bolt %t.exe -data %t -o %t.null | FileCheck %s
1729
RUN: llvm-bolt %t.exe -data %t.new -o %t.null | FileCheck %s

bolt/tools/driver/llvm-bolt.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ void perf2boltMode(int argc, char **argv) {
129129
exit(1);
130130
}
131131
opts::AggregateOnly = true;
132+
opts::ShowDensity = true;
132133
}
133134

134135
void boltDiffMode(int argc, char **argv) {

0 commit comments

Comments
 (0)