-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[BOLT][AArch64] Implement PLTCall optimization #93584
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[BOLT][AArch64] Implement PLTCall optimization #93584
Conversation
@llvm/pr-subscribers-bolt Author: Paschalis Mpeis (paschalis-mpeis) Changes
Tests:
Full diff: https://github.com/llvm/llvm-project/pull/93584.diff 6 Files Affected:
diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h
index f7614cf9ac977..01be123894869 100644
--- a/bolt/include/bolt/Core/MCPlusBuilder.h
+++ b/bolt/include/bolt/Core/MCPlusBuilder.h
@@ -14,6 +14,7 @@
#ifndef BOLT_CORE_MCPLUSBUILDER_H
#define BOLT_CORE_MCPLUSBUILDER_H
+#include "bolt/Core/BinaryBasicBlock.h"
#include "bolt/Core/MCPlus.h"
#include "bolt/Core/Relocation.h"
#include "llvm/ADT/ArrayRef.h"
@@ -1412,9 +1413,15 @@ class MCPlusBuilder {
return false;
}
- /// Modify a direct call instruction \p Inst with an indirect call taking
- /// a destination from a memory location pointed by \p TargetLocation symbol.
- virtual bool convertCallToIndirectCall(MCInst &Inst,
+ /// Modify a direct call instruction pointed by the iterator \p It, with an
+ /// indirect call taking a destination from a memory location pointed by \p
+ /// TargetLocation symbol. If additional instructions need to be prepended
+ /// before \p It, then the iterator must be updated to point to the indirect
+ /// call instruction.
+ ///
+ /// \return true on success
+ virtual bool convertCallToIndirectCall(BinaryBasicBlock &BB,
+ BinaryBasicBlock::iterator &It,
const MCSymbol *TargetLocation,
MCContext *Ctx) {
llvm_unreachable("not implemented");
diff --git a/bolt/lib/Passes/PLTCall.cpp b/bolt/lib/Passes/PLTCall.cpp
index d0276f22e14ef..00e47ea2b25e7 100644
--- a/bolt/lib/Passes/PLTCall.cpp
+++ b/bolt/lib/Passes/PLTCall.cpp
@@ -61,19 +61,23 @@ Error PLTCall::runOnFunctions(BinaryContext &BC) {
if (opts::PLT == OT_HOT && !BB.getKnownExecutionCount())
continue;
- for (MCInst &Instr : BB) {
- if (!BC.MIB->isCall(Instr))
+ for (auto It = BB.begin(); It != BB.end(); It++) {
+ if (!BC.MIB->isCall(*It))
continue;
- const MCSymbol *CallSymbol = BC.MIB->getTargetSymbol(Instr);
+ const MCSymbol *CallSymbol = BC.MIB->getTargetSymbol(*It);
if (!CallSymbol)
continue;
const BinaryFunction *CalleeBF = BC.getFunctionForSymbol(CallSymbol);
if (!CalleeBF || !CalleeBF->isPLTFunction())
continue;
- BC.MIB->convertCallToIndirectCall(Instr, CalleeBF->getPLTSymbol(),
- BC.Ctx.get());
- BC.MIB->addAnnotation(Instr, "PLTCall", true);
- ++NumCallsOptimized;
+ if (BC.MIB->convertCallToIndirectCall(BB, It, CalleeBF->getPLTSymbol(),
+ BC.Ctx.get())) {
+ assert(BC.MIB->isCall(*It) &&
+ "Iterator must point to the optimized call");
+
+ BC.MIB->addAnnotation(*It, "PLTCall", true);
+ ++NumCallsOptimized;
+ }
}
}
}
diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
index 0ae9d3668b93b..116fe482aa2b9 100644
--- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
+++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp
@@ -1055,6 +1055,52 @@ class AArch64MCPlusBuilder : public MCPlusBuilder {
return true;
}
+ bool convertCallToIndirectCall(BinaryBasicBlock &BB,
+ BinaryBasicBlock::iterator &It,
+ const MCSymbol *TargetLocation,
+ MCContext *Ctx) override {
+ // Generated code:
+ // adrp x16, 0x11000 <Symbol>
+ // ldr x17, [x16, <offset>]
+ // bl <label> -> blr x17 (or 'b -> br' for tail calls)
+
+ MCInst &InstCall = *It;
+ bool IsTailCall = isTailCall(InstCall);
+ assert((InstCall.getOpcode() == AArch64::BL ||
+ (InstCall.getOpcode() == AArch64::B && IsTailCall)) &&
+ "64-bit direct (tail) call instruction expected");
+
+ // Convert the call to an indicrect one by modifying the instruction.
+ InstCall.clear();
+ InstCall.setOpcode(IsTailCall ? AArch64::BR : AArch64::BLR);
+ InstCall.addOperand(MCOperand::createReg(AArch64::X17));
+ if (IsTailCall)
+ setTailCall(*It);
+
+ // Prepend instructions to load PLT call address from the input symbol.
+
+ MCInst InstLoad;
+ InstLoad.setOpcode(AArch64::LDRXui);
+ InstLoad.addOperand(MCOperand::createReg(AArch64::X17));
+ InstLoad.addOperand(MCOperand::createReg(AArch64::X16));
+ InstLoad.addOperand(MCOperand::createImm(0));
+ setOperandToSymbolRef(InstLoad, /* OpNum */ 2, TargetLocation,
+ /* Addend */ 0, Ctx, ELF::R_AARCH64_LD64_GOT_LO12_NC);
+ It = BB.insertInstruction(It, InstLoad);
+
+ MCInst InstAdrp;
+ InstAdrp.setOpcode(AArch64::ADRP);
+ InstAdrp.clear();
+ InstAdrp.addOperand(MCOperand::createReg(AArch64::X16));
+ InstAdrp.addOperand(MCOperand::createImm(0));
+ setOperandToSymbolRef(InstAdrp, /* OpNum */ 1, TargetLocation,
+ /* Addend */ 0, Ctx, ELF::R_AARCH64_ADR_GOT_PAGE);
+ It = BB.insertInstruction(It, InstAdrp);
+
+ It = It + 2;
+ return true;
+ }
+
bool lowerTailCall(MCInst &Inst) override {
removeAnnotation(Inst, MCPlus::MCAnnotation::kTailCall);
if (getConditionalTailCall(Inst))
diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
index 8b1894953f375..ca85600ea4363 100644
--- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
+++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp
@@ -1639,8 +1639,11 @@ class X86MCPlusBuilder : public MCPlusBuilder {
return true;
}
- bool convertCallToIndirectCall(MCInst &Inst, const MCSymbol *TargetLocation,
+ bool convertCallToIndirectCall(BinaryBasicBlock &BB,
+ BinaryBasicBlock::iterator &It,
+ const MCSymbol *TargetLocation,
MCContext *Ctx) override {
+ MCInst &Inst = (*It);
assert((Inst.getOpcode() == X86::CALL64pcrel32 ||
(Inst.getOpcode() == X86::JMP_4 && isTailCall(Inst))) &&
"64-bit direct (tail) call instruction expected");
diff --git a/bolt/test/AArch64/plt-call.test b/bolt/test/AArch64/plt-call.test
new file mode 100644
index 0000000000000..5525367c5efe7
--- /dev/null
+++ b/bolt/test/AArch64/plt-call.test
@@ -0,0 +1,16 @@
+// Verify that PLTCall optimization works, including when PLT calls were
+// tail-call optimized.
+
+RUN: %clang %cflags %p/../Inputs/plt-tailcall.c \
+RUN: -o %t -Wl,-q
+RUN: llvm-bolt %t -o %t.bolt --plt=all --print-plt --print-only=foo | FileCheck %s
+
+// Call to printf
+CHECK: adrp x16, printf@GOT
+CHECK: ldr x17, [x16, :lo12:printf@GOT]
+CHECK: blr x17 # PLTCall: 1
+
+// Call to puts, that was tail-call optimized
+CHECK: adrp x16, puts@GOT
+CHECK: ldr x17, [x16, :lo12:puts@GOT]
+CHECK: br x17 # TAILCALL # PLTCall: 1
\ No newline at end of file
diff --git a/bolt/test/Inputs/plt-tailcall.c b/bolt/test/Inputs/plt-tailcall.c
new file mode 100644
index 0000000000000..ebc04b2041861
--- /dev/null
+++ b/bolt/test/Inputs/plt-tailcall.c
@@ -0,0 +1,10 @@
+#include "stub.h"
+
+int foo(char *c) {
+ printf("");
+ __attribute__((musttail)) return puts(c);
+}
+
+int main() {
+ return foo("a");
+}
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
f8681fd
to
8e95c34
Compare
8e95c34
to
206b4c9
Compare
/// call instruction. | ||
/// | ||
/// \return true on success | ||
virtual bool convertCallToIndirectCall(BinaryBasicBlock &BB, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi Paschalis,
Thanks a lot for submitting this contribution to the project, I appreciate it!
One thing that might not be obvious in the code base is that the MCPlusBuilder layer (or the BOLT targets lib, for that matter) do not depend on BOLT IR classes, such as BinaryBasicBlock. They are limited to use MCInst and the class MCPlusBuilder::InstructionIterator whenever the API needs to communicate a walkable MCInst pointer to target-specific code.
If you add this dependency of bolt target libs on bolt IR classes, this will create a cyclic dependency that will likely break our shared lib build (if you build BOLT with enable_shared you might see this). I would suggest you taking a look at other methods in X86MCPlusBuilder to see how we implement this kind of optimization -- usually by moving code that depends on BB knowledge to the pass itself, instead of doing that logic in target libs.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I checked and apparently my BUILD_SHARED_LIBS build succeeds, but I think we shouldn't rely on this. If you check the CMakeLists.txt file in the AArch64 folder, you will see it does not depend on BOLTCore lib, which defines BinaryBasicBlock.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@paschalis-mpeis There is a polymorphic iterator InstructionIterator
, which is defined in MCPlusBuilder.h
and there is a pattern to pass every required instruction sequence to MCPlusBuilder
's methods by this iterator (but, as I see, it is used to pass a chain of instructions with no modifications of the iterator itself). Also there is the InstructionListType
and, please have a look at the createLoadImmediate
for example, the generated in MCPlusBuilder
instruction list should be returned by value.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks @rafaelauler and @samolisov for the suggestions. I think the way to go is using InstructionListType
.
I will rename convertCallToIndirectCall
to createIndirectPltCall
, addressing my previous concern here, but feel free to suggest a better name.
The function will be adjusted to return an InstructionListType
, and anything that relates to Binary BB logic will be moved to the pass itself.
I think the closest existing example of this is with the InlineMemcpy
pass and createInlineMemcpy
.
BC.MIB->addAnnotation(Instr, "PLTCall", true); | ||
const InstructionListType NewCode = BC.MIB->createIndirectPltCall( | ||
*II, CalleeBF->getPLTSymbol(), BC.Ctx.get()); | ||
II = BB.replaceInstruction(II, NewCode); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
std::advance()
after this line uses NewCode.size() - 1
as the second argument. I think it may make sense to ensure that the size is not zero adding a corresponding assert
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the suggestion, added.
|
||
return Code; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This function looks good for me now.
`convertCallToIndirectCall` applies the PLTCall optimization and returns an (updated if needed) iterator to the converted call instruction. Since AArch64 requires to inject additional instructions to implement this pass, the relevant BasicBlock and an iterator was passed to the `convertCallToIndirectCall`. `NumCallsOptimized` is updated only on successful application of the pass. Tests: - Inputs/plt-tailcall.c: an example of a tail call optimized PLT call. - AArch64/plt-call.test: it is the actual A64 test, that runs the PLTCall optimization on the above input file and verifies the application of the pass to the calls: 'printf' and 'puts'.
createIndirectPltCall now returns a list of instructions. All BinaryBasicBlock-related logic is now on PLTCall pass. Added the same test for x86, to ensure nothing breaks on that backend.
44fa7e1
to
c5cfed0
Compare
Rebased to latest main and addressed latest review. NOTE: some unrelated commits on main were causing failures on unrelated BOLT tests. Those commits were:
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM Thank you! Give 1-2 days for other reviewers to evaluate change please
Recommit 435ea21. As the comment added by a077271 suggests, these `*Triples` lists should shrink over time. https://reviews.llvm.org/D158183 allows *-unknown-linux-gnu to detect *-linux-gnu. If we additionally allow x86_64-unknown-linux-gnu -m32/-mx32 to detect x86_64-linux-gnu, we can mostly remove these *-linux-gnu elements. Retain x86_64-linux-gnu for now to work around #93609. (In addition, Debian /usr/bin/clang --version uses x86_64-pc-linux-gnu). Retain i586-linux-gnu for now to work around #93502.
Hi all, if there are no further comments, I will be merging this PR by EOD. |
X86/plt-call.test and AArch64/plt-call.test test PLT handling . They should link against a dummy DSO to not rely on an unguaranteed linker behavior. Sent #125625 |
convertCallToIndirectCall
applies the PLTCall optimization and returns an (updated if needed) iterator to the converted call instruction. Since AArch64 requires to inject additional instructions to implement this pass, the relevant BasicBlock and an iterator was passed to theconvertCallToIndirectCall
.NumCallsOptimized
is updated only on successful application of the pass.Tests: