Skip to content

[BOLT][AArch64] Introduce SPE mode in BasicAggregation #120741

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions bolt/include/bolt/Profile/DataAggregator.h
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,8 @@ class DataAggregator : public DataReader {
static bool checkPerfDataMagic(StringRef FileName);

private:
friend struct PerfSpeEventsTestHelper;

struct PerfBranchSample {
SmallVector<LBREntry, 32> LBR;
};
Expand Down Expand Up @@ -296,6 +298,15 @@ class DataAggregator : public DataReader {
/// and a PC
ErrorOr<PerfBasicSample> parseBasicSample();

/// Parse an Arm SPE entry into the non-lbr format by generating two basic
/// samples. The format of an input SPE entry is:
/// ```
/// PID EVENT-TYPE ADDR IP
/// ```
/// SPE branch events will have 'ADDR' set to a branch target address while
/// other perf or SPE events will have it set to zero.
///
/// \returns on success, a pair of samples: the first is built from 'IP' and
/// the second from 'ADDR'. A sample whose address is zero is returned empty
/// (empty event name, PC of zero); both samples are empty when the PID of the
/// entry is not present in BinaryMMapInfo. An error code is returned when a
/// field fails to parse or the entry is not terminated by a newline.
ErrorOr<std::pair<PerfBasicSample, PerfBasicSample>> parseSpeAsBasicSamples();

/// Parse a single perf sample containing a PID associated with an IP and
/// address.
ErrorOr<PerfMemSample> parseMemSample();
Expand Down Expand Up @@ -342,6 +353,9 @@ class DataAggregator : public DataReader {
/// Process non-LBR events.
void processBasicEvents();

/// Parse Arm SPE events into the non-LBR format. Every non-empty sample
/// produced by parseSpeAsBasicSamples() is counted in BasicSamples and its
/// event name is added to EventNames.
///
/// \returns the first parsing error encountered, or a default-constructed
/// (success) error code once all available data has been consumed.
std::error_code parseSpeAsBasicEvents();

/// Parse the full output generated by perf script to report memory events.
std::error_code parseMemEvents();

Expand Down
1 change: 1 addition & 0 deletions bolt/include/bolt/Utils/CommandLineOpts.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ extern llvm::cl::OptionCategory BinaryAnalysisCategory;
extern llvm::cl::opt<unsigned> AlignText;
extern llvm::cl::opt<unsigned> AlignFunctions;
extern llvm::cl::opt<bool> AggregateOnly;
extern llvm::cl::opt<bool> ArmSPE;
extern llvm::cl::opt<unsigned> BucketsPerLine;
extern llvm::cl::opt<bool> DiffOnly;
extern llvm::cl::opt<bool> EnableBAT;
Expand Down
140 changes: 132 additions & 8 deletions bolt/lib/Profile/DataAggregator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,13 @@ static cl::opt<bool>
cl::desc("aggregate basic samples (without LBR info)"),
cl::cat(AggregatorCategory));

/// Command-line switch (`--spe`) selecting Arm SPE aggregation. Unlike the
/// neighbouring options it is not `static`: it is also declared `extern` in
/// bolt/Utils/CommandLineOpts.h so that other translation units (e.g. the
/// llvm-bolt driver) can query it.
cl::opt<bool> ArmSPE(
    "spe",
    cl::desc(
        "Enable Arm SPE mode. Used in conjunction with no-lbr mode, i.e. "
        "`--spe --nl`"),
    cl::cat(AggregatorCategory));

static cl::opt<std::string>
ITraceAggregation("itrace",
cl::desc("Generate LBR info with perf itrace argument"),
Expand Down Expand Up @@ -171,11 +178,19 @@ void DataAggregator::start() {

findPerfExecutable();

if (opts::BasicAggregation) {
launchPerfProcess("events without LBR",
MainEventsPPI,
if (opts::ArmSPE) {
if (!opts::BasicAggregation) {
errs() << "PERF2BOLT-ERROR: Arm SPE mode is combined only with "
"BasicAggregation.\n";
exit(1);
}
launchPerfProcess("branch events with SPE", MainEventsPPI,
"script -F pid,event,ip,addr --itrace=i1i",
/*Wait = */ false);
} else if (opts::BasicAggregation) {
launchPerfProcess("events without LBR", MainEventsPPI,
"script -F pid,event,ip",
/*Wait = */false);
/*Wait = */ false);
} else if (!opts::ITraceAggregation.empty()) {
std::string ItracePerfScriptArgs = llvm::formatv(
"script -F pid,brstack --itrace={0}", opts::ITraceAggregation);
Expand Down Expand Up @@ -459,14 +474,20 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
"not read one from input binary\n";
}

auto ErrorCallback = [](int ReturnCode, StringRef ErrBuf) {
const Regex NoData("Samples for '.*' event do not have ADDR attribute set. "
"Cannot print 'addr' field.");

auto ErrorCallback = [&NoData](int ReturnCode, StringRef ErrBuf) {
if (opts::ArmSPE && NoData.match(ErrBuf)) {
errs() << "PERF2BOLT-ERROR: perf data are incompatible for Arm SPE mode "
"consumption. ADDR attribute is unset.\n";
exit(1);
}
errs() << "PERF-ERROR: return code " << ReturnCode << "\n" << ErrBuf;
exit(1);
};

auto MemEventsErrorCallback = [&](int ReturnCode, StringRef ErrBuf) {
Regex NoData("Samples for '.*' event do not have ADDR attribute set. "
"Cannot print 'addr' field.");
if (!NoData.match(ErrBuf))
ErrorCallback(ReturnCode, ErrBuf);
};
Expand Down Expand Up @@ -507,7 +528,8 @@ Error DataAggregator::preprocessProfile(BinaryContext &BC) {
exit(0);
}

if ((!opts::BasicAggregation && parseBranchEvents()) ||
if (((!opts::BasicAggregation && !opts::ArmSPE) && parseBranchEvents()) ||
(opts::BasicAggregation && opts::ArmSPE && parseSpeAsBasicEvents()) ||
(opts::BasicAggregation && parseBasicEvents()))
errs() << "PERF2BOLT: failed to parse samples\n";

Expand Down Expand Up @@ -1138,6 +1160,68 @@ ErrorOr<DataAggregator::PerfBasicSample> DataAggregator::parseBasicSample() {
return PerfBasicSample{Event.get(), Address};
}

/// Parse a single Arm SPE entry from the perf script output and convert it
/// into a pair of basic samples.
///
/// The entry is expected to be laid out as `PID EVENT-TYPE ADDR IP`, so the
/// branch target ('ADDR') is read before the sampled instruction ('IP').
///
/// \returns on success, a pair whose first element is the sample for 'IP' and
/// whose second element is the sample for 'ADDR'. A sample built from a zero
/// address is empty (empty event name, PC of zero). Both samples are empty,
/// and the rest of the line is discarded, when the PID is not present in
/// BinaryMMapInfo. An error code is returned when any field fails to parse or
/// when the entry is not terminated by a newline.
ErrorOr<
    std::pair<DataAggregator::PerfBasicSample, DataAggregator::PerfBasicSample>>
DataAggregator::parseSpeAsBasicSamples() {
  while (checkAndConsumeFS()) {
  }

  ErrorOr<int64_t> PIDRes = parseNumberField(FieldSeparator, true);
  if (std::error_code EC = PIDRes.getError())
    return EC;

  constexpr PerfBasicSample EmptySample = PerfBasicSample{StringRef(), 0};

  // Entries from processes we have no mapping information for are skipped by
  // reporting a pair of empty samples.
  auto MMapInfoIter = BinaryMMapInfo.find(*PIDRes);
  if (MMapInfoIter == BinaryMMapInfo.end()) {
    consumeRestOfLine();
    return std::make_pair(EmptySample, EmptySample);
  }

  while (checkAndConsumeFS()) {
  }

  ErrorOr<StringRef> Event = parseString(FieldSeparator);
  if (std::error_code EC = Event.getError())
    return EC;

  while (checkAndConsumeFS()) {
  }

  // 'ADDR' (the branch target, or zero for non-branch events) comes first...
  ErrorOr<uint64_t> AddrResTo = parseHexField(FieldSeparator);
  if (std::error_code EC = AddrResTo.getError())
    return EC;

  consumeAllRemainingFS();

  // ...followed by 'IP', the address of the sampled instruction.
  ErrorOr<uint64_t> AddrResFrom = parseHexField(FieldSeparator, true);
  if (std::error_code EC = AddrResFrom.getError())
    return EC;

  if (!checkAndConsumeNewLine()) {
    reportError("expected end of line");
    return make_error_code(llvm::errc::io_error);
  }

  // Event is captured by reference, so the renaming done below is visible to
  // the samples generated afterwards.
  auto genBasicSample = [&](uint64_t Address) {
    // When fed with non SPE branch events the target address will be null.
    // This is expected and ignored.
    if (Address == 0x0)
      return EmptySample;

    if (!BC->HasFixedLoadAddress)
      adjustAddress(Address, MMapInfoIter->second);

    return PerfBasicSample{Event.get(), Address};
  };

  // Show more meaningful event names on boltdata. Compare the StringRef
  // directly: materializing a std::string for every parsed sample only to
  // compare it is needless work.
  if (*Event == "instructions:")
    Event = *AddrResTo != 0x0 ? "branches-spe:" : "instructions-spe:";

  return std::make_pair(genBasicSample(*AddrResFrom),
                        genBasicSample(*AddrResTo));
}

ErrorOr<DataAggregator::PerfMemSample> DataAggregator::parseMemSample() {
PerfMemSample Res{0, 0};

Expand Down Expand Up @@ -1643,6 +1727,46 @@ std::error_code DataAggregator::parseBasicEvents() {
return std::error_code();
}

std::error_code DataAggregator::parseSpeAsBasicEvents() {
outs() << "PERF2BOLT: parsing SPE data as basic events (no LBR)...\n";
NamedRegionTimer T("parseSPEBasic", "Parsing SPE as basic events",
TimerGroupName, TimerGroupDesc, opts::TimeAggregator);
uint64_t NumSpeBranchSamples = 0;

// Convert entries to one or two basic samples, depending on whether there is
// branch target information.
while (hasData()) {
auto SamplePair = parseSpeAsBasicSamples();
if (std::error_code EC = SamplePair.getError())
return EC;

// Account for a single basic sample. A sample with a zero PC carries no
// address information and is ignored.
auto registerSample = [this](const PerfBasicSample *Sample) {
  const auto SamplePC = Sample->PC;
  if (SamplePC != 0) {
    // Flag the enclosing function, if any, as having profile data.
    BinaryFunction *const Func = getBinaryFunctionContainingAddress(SamplePC);
    if (Func != nullptr)
      Func->setHasProfileAvailable();

    ++BasicSamples[SamplePC];
    EventNames.insert(Sample->EventName);
  }
};

if (SamplePair->first.PC != 0x0 && SamplePair->second.PC != 0x0)
++NumSpeBranchSamples;

registerSample(&SamplePair->first);
registerSample(&SamplePair->second);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Am I correct in understanding that this is the case where we have a sample for a branch SRC -> TGT which may or may not have been taken? And that we nevertheless increase the hotness of both the SRC and TGT nodes in every case, always registering samples for both nodes and not taking into account the ratio of samples with this branch taken versus not taken?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hey Pavel,

Reading this back, you are concerned whether storing samples on TGT branches that are not NOT-TAKEN might increase hotness in a block that it shouldn't have. Correct?

That should not be a concern: regardless of whether a branch is taken or not, the reported TGT is what was architecturally executed. In other words, NOT-TAKEN (or its absence) characterizes what happened in the source branch (PC), while TGT will always point to the path we ended up taking.

So, for fall-through SPE packets, the TGT address would always be the next address from PC (ie, 0xA00 + 4, which is the instruction size in AArch64):

PC 0xA00
B COND
EV RETIRED NOT-TAKEN
TGT 0xA04

For taken branches, the TGT can be at a distance further than just 4 :

PC 0xA00
B COND
EV RETIRED
TGT 0xBBB

In my previous examples I was using mock addresses for PC/TGT, so I've updated any relevant examples to avoid confusion.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, thank you @paschalis-mpeis for clarifying about taken/not taken information and updating examples. @aaupov @maksfb would you like any additional explanations regarding SPE packets? Generally speaking SPE is providing event based sampling for branches and doesn't have enough information to create trace of N>1 branches and inferring fall throughs. We are aiming to add BRBE (Branch Record Buffer Extension) support for this in BOLT and provide branch stack trace like LBR with it.

Copy link
Contributor

@kaadam kaadam Feb 17, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi, thanks Paschalis for your example.
Maybe it's worth highlighting that the not-taken event is only related to conditional instructions (conditional branch or compare-and-branch): it tells us that the instruction failed its condition code check, that's it. Since TGT, as you mentioned, "will always point to the path we end up taking", the presence of the not-taken event type is not relevant to us in this case; accordingly, we will always get the 'taken paths'. Theoretically, this branch information supports our optimization, and BOLT will be able to rely on it.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Correct, thanks Adam. This is irrelevant to any unconditional branching (including call/ret).
Skipping 'non-taken' conditional branches is the optimization LBR/BRBE can do, as that can be inferred in post-processing.

}

if (NumSpeBranchSamples == 0)
errs() << "PERF2BOLT-WARNING: no SPE branches found\n";
else
outs() << "PERF2BOLT: found " << NumSpeBranchSamples
<< " SPE branch sample pairs.\n";

return std::error_code();
}

void DataAggregator::processBasicEvents() {
outs() << "PERF2BOLT: processing basic events (without LBR)...\n";
NamedRegionTimer T("processBasic", "Processing basic events", TimerGroupName,
Expand Down
14 changes: 14 additions & 0 deletions bolt/test/perf2bolt/AArch64/perf2bolt-spe.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
## Check that Arm SPE mode is available on AArch64 with BasicAggregation.

REQUIRES: system-linux,perf,target=aarch64{{.*}}

RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
RUN: touch %t.empty.perf.data
RUN: perf2bolt -p %t.empty.perf.data -o %t.perf.boltdata --nl --spe --pa %t.exe 2>&1 | FileCheck %s --check-prefix=CHECK-SPE-NO-LBR

CHECK-SPE-NO-LBR: PERF2BOLT: Starting data aggregation job

RUN: perf record -e cycles -q -o %t.perf.data -- %t.exe
RUN: not perf2bolt -p %t.perf.data -o %t.perf.boltdata --spe %t.exe 2>&1 | FileCheck %s --check-prefix=CHECK-SPE-LBR

CHECK-SPE-LBR: PERF2BOLT-ERROR: Arm SPE mode is combined only with BasicAggregation.
9 changes: 9 additions & 0 deletions bolt/test/perf2bolt/X86/perf2bolt-spe.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
## Check that Arm SPE mode is unavailable on X86.

REQUIRES: system-linux,x86_64-linux

RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
RUN: touch %t.empty.perf.data
RUN: not perf2bolt -p %t.empty.perf.data -o %t.perf.boltdata --nl --spe --pa %t.exe 2>&1 | FileCheck %s

CHECK: perf2bolt: -spe is available only on AArch64.
7 changes: 7 additions & 0 deletions bolt/tools/driver/llvm-bolt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,13 @@ int main(int argc, char **argv) {
if (Error E = RIOrErr.takeError())
report_error(opts::InputFilename, std::move(E));
RewriteInstance &RI = *RIOrErr.get();

if (opts::AggregateOnly && !RI.getBinaryContext().isAArch64() &&
opts::ArmSPE) {
errs() << ToolName << ": -spe is available only on AArch64.\n";
exit(1);
}

if (!opts::PerfData.empty()) {
if (!opts::AggregateOnly) {
errs() << ToolName
Expand Down
14 changes: 14 additions & 0 deletions bolt/unittests/Profile/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,25 @@
set(LLVM_LINK_COMPONENTS
DebugInfoDWARF
Object
${LLVM_TARGETS_TO_BUILD}
)

add_bolt_unittest(ProfileTests
DataAggregator.cpp
PerfSpeEvents.cpp

DISABLE_LLVM_LINK_LLVM_DYLIB
)

target_link_libraries(ProfileTests
PRIVATE
LLVMBOLTCore
LLVMBOLTProfile
LLVMTargetParser
LLVMTestingSupport
)

foreach (tgt ${BOLT_TARGETS_TO_BUILD})
string(TOUPPER "${tgt}" upper)
target_compile_definitions(ProfileTests PRIVATE "${upper}_AVAILABLE")
endforeach()
Loading