Skip to content

Add initial support for SPE brstack format #129231

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 18 commits into from
Jun 20, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions bolt/include/bolt/Profile/DataAggregator.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,8 @@ class DataAggregator : public DataReader {
};
friend raw_ostream &operator<<(raw_ostream &OS, const LBREntry &);

friend struct PerfSpeEventsTestHelper;

struct PerfBranchSample {
SmallVector<LBREntry, 32> LBR;
};
Expand Down
1 change: 1 addition & 0 deletions bolt/include/bolt/Utils/CommandLineOpts.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ extern llvm::cl::OptionCategory BinaryAnalysisCategory;
extern llvm::cl::opt<unsigned> AlignText;
extern llvm::cl::opt<unsigned> AlignFunctions;
extern llvm::cl::opt<bool> AggregateOnly;
extern llvm::cl::opt<bool> ArmSPE;
extern llvm::cl::opt<unsigned> BucketsPerLine;
extern llvm::cl::opt<bool> CompactCodeModel;
extern llvm::cl::opt<bool> DiffOnly;
Expand Down
61 changes: 49 additions & 12 deletions bolt/lib/Profile/DataAggregator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@ static cl::opt<bool>
cl::desc("aggregate basic samples (without LBR info)"),
cl::cat(AggregatorCategory));

cl::opt<bool> ArmSPE("spe", cl::desc("Enable Arm SPE mode."),
cl::cat(AggregatorCategory));

static cl::opt<std::string>
ITraceAggregation("itrace",
cl::desc("Generate LBR info with perf itrace argument"),
Expand Down Expand Up @@ -181,11 +184,21 @@ void DataAggregator::start() {

findPerfExecutable();

if (opts::ArmSPE) {
// pid from_ip to_ip flags
// where flags could be:
// P/M: whether branch was Predicted or Mispredicted.
// N: optionally appears when the branch was Not-Taken (i.e. fall-through)
// 12345 0x123/0x456/PN/-/-/8/RET/-
opts::ITraceAggregation = "bl";
opts::ParseMemProfile = true;
opts::BasicAggregation = false;
}

if (opts::BasicAggregation) {
launchPerfProcess("events without LBR",
MainEventsPPI,
launchPerfProcess("events without LBR", MainEventsPPI,
"script -F pid,event,ip",
/*Wait = */false);
/*Wait = */ false);
} else if (!opts::ITraceAggregation.empty()) {
// Disable parsing memory profile from trace data, unless requested by user.
if (!opts::ParseMemProfile.getNumOccurrences())
Expand Down Expand Up @@ -994,9 +1007,22 @@ ErrorOr<DataAggregator::LBREntry> DataAggregator::parseLBREntry() {
if (std::error_code EC = MispredStrRes.getError())
return EC;
StringRef MispredStr = MispredStrRes.get();
if (MispredStr.size() != 1 ||
(MispredStr[0] != 'P' && MispredStr[0] != 'M' && MispredStr[0] != '-')) {
reportError("expected single char for mispred bit");
// SPE brstack mispredicted flags might be up to two characters long:
// 'PN' or 'MN'. Where 'N' optionally appears.
bool ValidStrSize = opts::ArmSPE
? MispredStr.size() >= 1 && MispredStr.size() <= 2
: MispredStr.size() == 1;
bool SpeTakenBitErr =
(opts::ArmSPE && MispredStr.size() == 2 && MispredStr[1] != 'N');
bool PredictionBitErr =
!ValidStrSize ||
(MispredStr[0] != 'P' && MispredStr[0] != 'M' && MispredStr[0] != '-');
if (SpeTakenBitErr)
reportError("expected 'N' as SPE prediction bit for a not-taken branch");
if (PredictionBitErr)
reportError("expected 'P', 'M' or '-' char as a prediction bit");

if (SpeTakenBitErr || PredictionBitErr) {
Diag << "Found: " << MispredStr << "\n";
return make_error_code(llvm::errc::io_error);
}
Expand Down Expand Up @@ -1497,7 +1523,9 @@ void DataAggregator::printBranchStacksDiagnostics(
}

std::error_code DataAggregator::parseBranchEvents() {
outs() << "PERF2BOLT: parse branch events...\n";
std::string BranchEventTypeStr =
opts::ArmSPE ? "SPE branch events in LBR-format" : "branch events";
outs() << "PERF2BOLT: parse " << BranchEventTypeStr << "...\n";
NamedRegionTimer T("parseBranch", "Parsing branch events", TimerGroupName,
TimerGroupDesc, opts::TimeAggregator);

Expand Down Expand Up @@ -1525,7 +1553,8 @@ std::error_code DataAggregator::parseBranchEvents() {
}

NumEntries += Sample.LBR.size();
if (BAT && Sample.LBR.size() == 32 && !NeedsSkylakeFix) {
if (this->BC->isX86() && BAT && Sample.LBR.size() == 32 &&
!NeedsSkylakeFix) {
errs() << "PERF2BOLT-WARNING: using Intel Skylake bug workaround\n";
NeedsSkylakeFix = true;
}
Expand All @@ -1548,10 +1577,18 @@ std::error_code DataAggregator::parseBranchEvents() {
if (NumSamples && NumSamplesNoLBR == NumSamples) {
// Note: we don't know if perf2bolt is being used to parse memory samples
// at this point. In this case, it is OK to parse zero LBRs.
errs() << "PERF2BOLT-WARNING: all recorded samples for this binary lack "
"LBR. Record profile with perf record -j any or run perf2bolt "
"in no-LBR mode with -nl (the performance improvement in -nl "
"mode may be limited)\n";
if (!opts::ArmSPE)
errs()
<< "PERF2BOLT-WARNING: all recorded samples for this binary lack "
"LBR. Record profile with perf record -j any or run perf2bolt "
"in no-LBR mode with -nl (the performance improvement in -nl "
"mode may be limited)\n";
else
errs()
<< "PERF2BOLT-WARNING: All recorded samples for this binary lack "
"SPE brstack entries. Make sure you are running Linux perf 6.14 "
"or later, otherwise you get zero samples. Record the profile "
"with: perf record -e 'arm_spe_0/branch_filter=1/'.";
} else {
printBranchStacksDiagnostics(NumTotalSamples - NumSamples);
}
Expand Down
12 changes: 12 additions & 0 deletions bolt/test/perf2bolt/AArch64/perf2bolt-spe.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
## Check that Arm SPE mode is available on AArch64.

REQUIRES: system-linux,perf,target=aarch64{{.*}}

RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe

RUN: perf record -e cycles -q -o %t.perf.data -- %t.exe 2> /dev/null

RUN: (perf2bolt -p %t.perf.data -o %t.perf.boltdata --spe %t.exe 2> /dev/null; exit 0) | FileCheck %s --check-prefix=CHECK-SPE-LBR

CHECK-SPE-LBR: PERF2BOLT: parse SPE branch events in LBR-format

9 changes: 9 additions & 0 deletions bolt/test/perf2bolt/X86/perf2bolt-spe.test
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
## Check that Arm SPE mode is unavailable on X86.

REQUIRES: system-linux,x86_64-linux

RUN: %clang %cflags %p/../../Inputs/asm_foo.s %p/../../Inputs/asm_main.c -o %t.exe
RUN: touch %t.empty.perf.data
RUN: not perf2bolt -p %t.empty.perf.data -o %t.perf.boltdata --spe --pa %t.exe 2>&1 | FileCheck %s

CHECK: perf2bolt: -spe is available only on AArch64.
7 changes: 7 additions & 0 deletions bolt/tools/driver/llvm-bolt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,13 @@ int main(int argc, char **argv) {
if (Error E = RIOrErr.takeError())
report_error(opts::InputFilename, std::move(E));
RewriteInstance &RI = *RIOrErr.get();

if (opts::AggregateOnly && !RI.getBinaryContext().isAArch64() &&
opts::ArmSPE) {
errs() << ToolName << ": -spe is available only on AArch64.\n";
exit(1);
}

if (!opts::PerfData.empty()) {
if (!opts::AggregateOnly) {
errs() << ToolName
Expand Down
14 changes: 14 additions & 0 deletions bolt/unittests/Profile/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,25 @@
# LLVM components the Profile unit tests link against. Every target in
# LLVM_TARGETS_TO_BUILD is listed as a link component as well.
set(LLVM_LINK_COMPONENTS
DebugInfoDWARF
Object
${LLVM_TARGETS_TO_BUILD}
)

# The ProfileTests unit-test binary and the sources it is built from.
add_bolt_unittest(ProfileTests
DataAggregator.cpp
PerfSpeEvents.cpp

DISABLE_LLVM_LINK_LLVM_DYLIB
)

# Libraries linked privately into ProfileTests.
target_link_libraries(ProfileTests
PRIVATE
LLVMBOLTCore
LLVMBOLTProfile
LLVMTargetParser
LLVMTestingSupport
)

# For each target in BOLT_TARGETS_TO_BUILD, define <UPPERCASE_TARGET>_AVAILABLE
# when compiling ProfileTests (PerfSpeEvents.cpp is guarded by
# AARCH64_AVAILABLE).
foreach (tgt ${BOLT_TARGETS_TO_BUILD})
string(TOUPPER "${tgt}" upper)
target_compile_definitions(ProfileTests PRIVATE "${upper}_AVAILABLE")
endforeach()
164 changes: 164 additions & 0 deletions bolt/unittests/Profile/PerfSpeEvents.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
//===- bolt/unittests/Profile/PerfSpeEvents.cpp ---------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifdef AARCH64_AVAILABLE

#include "bolt/Core/BinaryContext.h"
#include "bolt/Profile/DataAggregator.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/TargetSelect.h"
#include "gtest/gtest.h"

using namespace llvm;
using namespace llvm::bolt;
using namespace llvm::object;
using namespace llvm::ELF;

namespace opts {
extern cl::opt<std::string> ReadPerfEvents;
extern cl::opt<bool> ArmSPE;
} // namespace opts

namespace llvm {
namespace bolt {

/// Test fixture that performs checks on perf SPE branch events.
///
/// SetUp() initializes the LLVM targets, fabricates a header-only AArch64 ELF
/// image in memory and creates a BinaryContext from it. Tests then call
/// parseAndCheckBrstackEvents() to run the DataAggregator branch-event parser
/// over the text stored in opts::ReadPerfEvents.
struct PerfSpeEventsTestHelper : public testing::Test {
  void SetUp() override {
    initalizeLLVM();
    prepareElf();
    initializeBOLT();
  }

protected:
  using Trace = DataAggregator::Trace;
  using TakenBranchInfo = DataAggregator::TakenBranchInfo;

  /// Initialize target infos, MCs, asm parsers, disassemblers, targets and
  /// asm printers for all targets.
  void initalizeLLVM() {
    llvm::InitializeAllTargetInfos();
    llvm::InitializeAllTargetMCs();
    llvm::InitializeAllAsmParsers();
    llvm::InitializeAllDisassemblers();
    llvm::InitializeAllTargets();
    llvm::InitializeAllAsmPrinters();
  }

  /// Write a minimal ELF64, little-endian, AArch64 header into ElfBuf and
  /// create ObjFile on top of that buffer.
  void prepareElf() {
    memcpy(ElfBuf, "\177ELF", 4);
    ELF64LE::Ehdr *EHdr = reinterpret_cast<typename ELF64LE::Ehdr *>(ElfBuf);
    EHdr->e_ident[llvm::ELF::EI_CLASS] = llvm::ELF::ELFCLASS64;
    EHdr->e_ident[llvm::ELF::EI_DATA] = llvm::ELF::ELFDATA2LSB;
    EHdr->e_machine = llvm::ELF::EM_AARCH64;
    MemoryBufferRef Source(StringRef(ElfBuf, sizeof(ElfBuf)), "ELF");
    ObjFile = cantFail(ObjectFile::createObjectFile(Source));
  }

  /// Create the BinaryContext for the object file built by prepareElf().
  void initializeBOLT() {
    Relocation::Arch = ObjFile->makeTriple().getArch();
    BC = cantFail(BinaryContext::createBinaryContext(
        ObjFile->makeTriple(), std::make_shared<orc::SymbolStringPool>(),
        ObjFile->getFileName(), nullptr, /*IsPIC*/ false,
        DWARFContext::create(*ObjFile.get()), {llvm::outs(), llvm::errs()}));
    ASSERT_FALSE(!BC);
  }

  // Backing storage for the fabricated ELF header; ObjFile refers to this
  // buffer, so it is declared first.
  char ElfBuf[sizeof(typename ELF64LE::Ehdr)] = {};
  std::unique_ptr<ObjectFile> ObjFile;
  std::unique_ptr<BinaryContext> BC;

  /// Helper function to export lists to show the mismatch.
  ///
  /// Prints every entry of \p Traces and then every entry of
  /// \p ExpectedSamples to llvm::errs().
  void reportBrStackEventMismatch(
      const std::vector<std::pair<Trace, TakenBranchInfo>> &Traces,
      const std::vector<std::pair<Trace, TakenBranchInfo>> &ExpectedSamples) {
    llvm::errs() << "Traces items: \n";
    for (const auto &[Tr, BI] : Traces)
      llvm::errs() << "{" << Tr.Branch << ", " << Tr.From << ","
                   << Tr.To << ", " << BI.TakenCount << ", "
                   << BI.MispredCount << "}" << "\n";

    llvm::errs() << "Expected items: \n";
    for (const auto &[Tr, BI] : ExpectedSamples)
      llvm::errs() << "{" << Tr.Branch << ", " << Tr.From << ", "
                   << Tr.To << ", " << BI.TakenCount << ", "
                   << BI.MispredCount << "}" << "\n";
  }

  /// Parse and check SPE brstack as LBR.
  ///
  /// Runs DataAggregator::parseBranchEvents() over the text held in
  /// opts::ReadPerfEvents, with \p PID registered in BinaryMMapInfo, and then
  /// checks the resulting traces against \p ExpectedSamples: the number of
  /// entries must match, and every expected trace must be present with the
  /// same MispredCount and TakenCount.
  void parseAndCheckBrstackEvents(
      uint64_t PID,
      const std::vector<std::pair<Trace, TakenBranchInfo>> &ExpectedSamples) {
    DataAggregator DA("<pseudo input>");
    DA.ParsingBuf = opts::ReadPerfEvents;
    DA.BC = BC.get();
    DataAggregator::MMapInfo MMap;
    DA.BinaryMMapInfo.insert(std::make_pair(PID, MMap));

    DA.parseBranchEvents();

    EXPECT_EQ(DA.Traces.size(), ExpectedSamples.size());
    if (DA.Traces.size() != ExpectedSamples.size())
      reportBrStackEventMismatch(DA.Traces, ExpectedSamples);

    const auto TracesBegin = DA.Traces.begin();
    const auto TracesEnd = DA.Traces.end();
    for (const auto &BI : ExpectedSamples) {
      auto It = find_if(TracesBegin, TracesEnd,
                        [&BI](const auto &Tr) { return Tr.first == BI.first; });

      EXPECT_NE(It, TracesEnd)
          << "expected trace {" << BI.first.Branch << ", " << BI.first.From
          << ", " << BI.first.To << "} was not found";
      // EXPECT_NE is non-fatal: a missing trace must not reach the field
      // checks below, because dereferencing the end iterator is undefined
      // behaviour.
      if (It == TracesEnd)
        continue;

      EXPECT_EQ(It->second.MispredCount, BI.second.MispredCount);
      EXPECT_EQ(It->second.TakenCount, BI.second.TakenCount);
    }
  }
};

} // namespace bolt
} // namespace llvm

TEST_F(PerfSpeEventsTestHelper, SpeBranchesWithBrstack) {
  // Feed perf output that carries SPE branch events in brstack format.
  //
  // Such a profile can be collected with, for example:
  //   perf record -e 'arm_spe_0/branch_filter=1/u' -- BINARY
  // and BOLT extracts the branch events from it with:
  //   perf script -F pid,brstack --itrace=bl
  using SampleList = std::vector<std::pair<Trace, TakenBranchInfo>>;
  constexpr uint64_t PID = 1234;

  opts::ArmSPE = true;
  opts::ReadPerfEvents = " 1234 0xa001/0xa002/PN/-/-/10/COND/-\n"
                         " 1234 0xb001/0xb002/P/-/-/4/RET/-\n"
                         " 1234 0xc456/0xc789/P/-/-/13/-/-\n"
                         " 1234 0xd123/0xd456/M/-/-/7/RET/-\n"
                         " 1234 0xe001/0xe002/P/-/-/14/RET/-\n"
                         " 1234 0xd123/0xd456/M/-/-/7/RET/-\n"
                         " 1234 0xf001/0xf002/MN/-/-/8/COND/-\n"
                         " 1234 0xc456/0xc789/M/-/-/13/-/-\n";

  // Each entry is the aggregated view of one branch:
  //   {{Branch From, To}, {TakenCount, MispredCount}}.
  // Take {{0xd123, 0xd456, Trace::BR_ONLY}, {2, 2}} as an example: the input
  // has two samples for (0xd123, 0xd456), hence TakenCount = 2, and both of
  // them carry the 'M' misprediction flag, hence MispredCount = 2. BR_ONLY
  // means the trace only contains branch data.
  const SampleList ExpectedSamples = {
      {{0xa001, 0xa002, Trace::BR_ONLY}, {1, 0}},
      {{0xb001, 0xb002, Trace::BR_ONLY}, {1, 0}},
      {{0xc456, 0xc789, Trace::BR_ONLY}, {2, 1}},
      {{0xd123, 0xd456, Trace::BR_ONLY}, {2, 2}},
      {{0xe001, 0xe002, Trace::BR_ONLY}, {1, 0}},
      {{0xf001, 0xf002, Trace::BR_ONLY}, {1, 1}}};

  parseAndCheckBrstackEvents(PID, ExpectedSamples);
}

#endif
Loading