-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[BOLT] Drop parsing sample PC when processing LBR perf data #123420
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
aaupov
merged 4 commits into
main
from
users/aaupov/spr/bolt-drop-parsing-sample-pc-when-processing-perf-data-with-lbr
Jan 21, 2025
Merged
[BOLT] Drop parsing sample PC when processing LBR perf data #123420
aaupov
merged 4 commits into
main
from
users/aaupov/spr/bolt-drop-parsing-sample-pc-when-processing-perf-data-with-lbr
Jan 21, 2025
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Created using spr 1.3.4
✅ With the latest revision this PR passed the C/C++ code formatter. |
@llvm/pr-subscribers-bolt Author: Amir Ayupov (aaupov) Changes: Remove options to generate autofdo data (unused) and `use-event-pc` (not beneficial). Cuts down perf2bolt time for 11GB perf.data by 40s (11:10->10:30). Full diff: https://github.com/llvm/llvm-project/pull/123420.diff 2 Files Affected:
diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h
index 320623cfa15af1..aa83d7f9b13ab5 100644
--- a/bolt/include/bolt/Profile/DataAggregator.h
+++ b/bolt/include/bolt/Profile/DataAggregator.h
@@ -80,7 +80,6 @@ class DataAggregator : public DataReader {
private:
struct PerfBranchSample {
SmallVector<LBREntry, 32> LBR;
- uint64_t PC;
};
struct PerfBasicSample {
@@ -334,9 +333,6 @@ class DataAggregator : public DataReader {
/// Process all branch events.
void processBranchEvents();
- /// This member function supports generating data for AutoFDO LLVM tools.
- std::error_code writeAutoFDOData(StringRef OutputFilename);
-
/// Parse the full output generated by perf script to report non-LBR samples.
std::error_code parseBasicEvents();
diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp
index 2b02086e3e0c99..024f6cf6dcb75e 100644
--- a/bolt/lib/Profile/DataAggregator.cpp
+++ b/bolt/lib/Profile/DataAggregator.cpp
@@ -108,15 +108,6 @@ TimeAggregator("time-aggr",
cl::ZeroOrMore,
cl::cat(AggregatorCategory));
-static cl::opt<bool>
- UseEventPC("use-event-pc",
- cl::desc("use event PC in combination with LBR sampling"),
- cl::cat(AggregatorCategory));
-
-static cl::opt<bool> WriteAutoFDOData(
- "autofdo", cl::desc("generate autofdo textual data instead of bolt data"),
- cl::cat(AggregatorCategory));
-
} // namespace opts
namespace {
@@ -187,15 +178,13 @@ void DataAggregator::start() {
/*Wait = */false);
} else if (!opts::ITraceAggregation.empty()) {
std::string ItracePerfScriptArgs = llvm::formatv(
- "script -F pid,ip,brstack --itrace={0}", opts::ITraceAggregation);
+ "script -F pid,brstack --itrace={0}", opts::ITraceAggregation);
launchPerfProcess("branch events with itrace", MainEventsPPI,
ItracePerfScriptArgs.c_str(),
/*Wait = */ false);
} else {
- launchPerfProcess("branch events",
- MainEventsPPI,
- "script -F pid,ip,brstack",
- /*Wait = */false);
+ launchPerfProcess("branch events", MainEventsPPI, "script -F pid,brstack",
+ /*Wait = */ false);
}
// Note: we launch script for mem events regardless of the option, as the
@@ -381,67 +370,6 @@ void DataAggregator::parsePreAggregated() {
}
}
-std::error_code DataAggregator::writeAutoFDOData(StringRef OutputFilename) {
- outs() << "PERF2BOLT: writing data for autofdo tools...\n";
- NamedRegionTimer T("writeAutoFDO", "Processing branch events", TimerGroupName,
- TimerGroupDesc, opts::TimeAggregator);
-
- std::error_code EC;
- raw_fd_ostream OutFile(OutputFilename, EC, sys::fs::OpenFlags::OF_None);
- if (EC)
- return EC;
-
- // Format:
- // number of unique traces
- // from_1-to_1:count_1
- // from_2-to_2:count_2
- // ......
- // from_n-to_n:count_n
- // number of unique sample addresses
- // addr_1:count_1
- // addr_2:count_2
- // ......
- // addr_n:count_n
- // number of unique LBR entries
- // src_1->dst_1:count_1
- // src_2->dst_2:count_2
- // ......
- // src_n->dst_n:count_n
-
- const uint64_t FirstAllocAddress = this->BC->FirstAllocAddress;
-
- // AutoFDO addresses are relative to the first allocated loadable program
- // segment
- auto filterAddress = [&FirstAllocAddress](uint64_t Address) -> uint64_t {
- if (Address < FirstAllocAddress)
- return 0;
- return Address - FirstAllocAddress;
- };
-
- OutFile << FallthroughLBRs.size() << "\n";
- for (const auto &[Trace, Info] : FallthroughLBRs) {
- OutFile << formatv("{0:x-}-{1:x-}:{2}\n", filterAddress(Trace.From),
- filterAddress(Trace.To),
- Info.InternCount + Info.ExternCount);
- }
-
- OutFile << BasicSamples.size() << "\n";
- for (const auto [PC, HitCount] : BasicSamples)
- OutFile << formatv("{0:x-}:{1}\n", filterAddress(PC), HitCount);
-
- OutFile << BranchLBRs.size() << "\n";
- for (const auto &[Trace, Info] : BranchLBRs) {
- OutFile << formatv("{0:x-}->{1:x-}:{2}\n", filterAddress(Trace.From),
- filterAddress(Trace.To), Info.TakenCount);
- }
-
- outs() << "PERF2BOLT: wrote " << FallthroughLBRs.size() << " unique traces, "
- << BasicSamples.size() << " sample addresses and " << BranchLBRs.size()
- << " unique branches to " << OutputFilename << "\n";
-
- return std::error_code();
-}
-
void DataAggregator::filterBinaryMMapInfo() {
if (opts::FilterPID) {
auto MMapInfoIter = BinaryMMapInfo.find(opts::FilterPID);
@@ -583,15 +511,6 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
(opts::BasicAggregation && parseBasicEvents()))
errs() << "PERF2BOLT: failed to parse samples\n";
- // We can finish early if the goal is just to generate data for autofdo
- if (opts::WriteAutoFDOData) {
- if (std::error_code EC = writeAutoFDOData(opts::OutputFilename))
- errs() << "Error writing autofdo data to file: " << EC.message() << "\n";
-
- deleteTempFiles();
- exit(0);
- }
-
// Special handling for memory events
if (prepareToParse("mem events", MemEventsPPI, MemEventsErrorCallback))
return Error::success();
@@ -1158,14 +1077,6 @@ ErrorOr<DataAggregator::PerfBranchSample> DataAggregator::parseBranchSample() {
return make_error_code(errc::no_such_process);
}
- while (checkAndConsumeFS()) {
- }
-
- ErrorOr<uint64_t> PCRes = parseHexField(FieldSeparator, true);
- if (std::error_code EC = PCRes.getError())
- return EC;
- Res.PC = PCRes.get();
-
if (checkAndConsumeNewLine())
return Res;
@@ -1472,9 +1383,9 @@ std::error_code DataAggregator::printLBRHeatMap() {
uint64_t DataAggregator::parseLBRSample(const PerfBranchSample &Sample,
bool NeedsSkylakeFix) {
uint64_t NumTraces{0};
- // LBRs are stored in reverse execution order. NextPC refers to the next
- // recorded executed PC.
- uint64_t NextPC = opts::UseEventPC ? Sample.PC : 0;
+ // LBRs are stored in reverse execution order. NextLBR refers to the next
+ // executed branch record.
+ const LBREntry *NextLBR{nullptr};
uint32_t NumEntry = 0;
for (const LBREntry &LBR : Sample.LBR) {
++NumEntry;
@@ -1486,10 +1397,10 @@ uint64_t DataAggregator::parseLBRSample(const PerfBranchSample &Sample,
// chronological order)
if (NeedsSkylakeFix && NumEntry <= 2)
continue;
- if (NextPC) {
+ if (NextLBR) {
// Record fall-through trace.
const uint64_t TraceFrom = LBR.To;
- const uint64_t TraceTo = NextPC;
+ const uint64_t TraceTo = NextLBR->From;
const BinaryFunction *TraceBF =
getBinaryFunctionContainingAddress(TraceFrom);
if (TraceBF && TraceBF->containsAddress(TraceTo)) {
@@ -1524,7 +1435,7 @@ uint64_t DataAggregator::parseLBRSample(const PerfBranchSample &Sample,
}
++NumTraces;
}
- NextPC = LBR.From;
+ NextLBR = &LBR;
uint64_t From = getBinaryFunctionContainingAddress(LBR.From) ? LBR.From : 0;
uint64_t To = getBinaryFunctionContainingAddress(LBR.To) ? LBR.To : 0;
@@ -1561,8 +1472,6 @@ std::error_code DataAggregator::parseBranchEvents() {
++NumSamples;
PerfBranchSample &Sample = SampleRes.get();
- if (opts::WriteAutoFDOData)
- ++BasicSamples[Sample.PC];
if (Sample.LBR.empty()) {
++NumSamplesNoLBR;
|
ayermolo
reviewed
Jan 21, 2025
maksfb
approved these changes
Jan 21, 2025
aaupov
added a commit
to rafaelauler/bolt-tests
that referenced
this pull request
Jan 27, 2025
llvm/llvm-project#123420 dropped support for AutoFDO profile generation in BOLT.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Remove options to generate autofdo data (unused) and `use-event-pc` (not beneficial).
Cuts down perf2bolt time for 11GB perf.data by 40s (11:10->10:30).