Skip to content

Commit 082251b

Browse files
authored
[AArch64] fix trampoline implementation: use X15 (#126743)
AAPCS64 reserves any of X9-X15 for a compiler to choose to use for this purpose, and says not to use X16 or X18, which GCC (and the previous implementation) chose to use. The X18 register may need to be used by the kernel in some circumstances, as specified by the platform ABI, so it is generally an unwise choice. Simply choosing a different register fixes the problem of this being broken on any platform that actually follows the platform ABI — which is all of them except EABI, if I am reading this Linux kernel bug report correctly: https://lkml2.uits.iu.edu/hypermail/linux/kernel/2001.2/01502.html

As a side benefit, this also generates slightly better code and avoids needing compiler-rt to be present. I did that by following the XCore implementation instead of PPC (although in hindsight, following RISCV might have been slightly more readable).

That X18 is wrong to use for this purpose has been known for many years (e.g. https://www.mail-archive.com/[email protected]/msg76934.html), and it is also known that fixing this to use one of the correct registers is not an ABI break, since the trampoline only appears inside a translation unit. Some of the other temporary registers (e.g. X9) are already reserved inside LLVM for internal use as a generic temporary register in the prologue before saving registers, while X15 was already used in rare cases as a scratch register in the prologue as well, so it seemed the most logical choice here.
1 parent 52360d1 commit 082251b

File tree

14 files changed

+421
-167
lines changed

14 files changed

+421
-167
lines changed

compiler-rt/lib/builtins/README.txt

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -272,11 +272,6 @@ switch32
272272
switch8
273273
switchu8
274274

275-
// This function generates a custom trampoline function with the specific
276-
// realFunc and localsPtr values.
277-
void __trampoline_setup(uint32_t* trampOnStack, int trampSizeAllocated,
278-
const void* realFunc, void* localsPtr);
279-
280275
// There is no C interface to the *_vfp_d8_d15_regs functions. There are
281276
// called in the prolog and epilog of Thumb1 functions. When the C++ ABI use
282277
// SJLJ for exceptions, each function with a catch clause or destructors needs

compiler-rt/lib/builtins/trampoline_setup.c

Lines changed: 0 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -41,45 +41,3 @@ COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
4141
__clear_cache(trampOnStack, &trampOnStack[10]);
4242
}
4343
#endif // __powerpc__ && !defined(__powerpc64__)
44-
45-
// The AArch64 compiler generates calls to __trampoline_setup() when creating
46-
// trampoline functions on the stack for use with nested functions.
47-
// This function creates a custom 36-byte trampoline function on the stack
48-
// which loads x18 with a pointer to the outer function's locals
49-
// and then jumps to the target nested function.
50-
// Note: x18 is a reserved platform register on Windows and macOS.
51-
52-
#if defined(__aarch64__) && defined(__ELF__)
53-
COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack,
54-
int trampSizeAllocated,
55-
const void *realFunc, void *localsPtr) {
56-
// This should never happen, but if compiler did not allocate
57-
// enough space on stack for the trampoline, abort.
58-
if (trampSizeAllocated < 36)
59-
compilerrt_abort();
60-
61-
// create trampoline
62-
// Load realFunc into x17. mov/movk 16 bits at a time.
63-
trampOnStack[0] =
64-
0xd2800000u | ((((uint64_t)realFunc >> 0) & 0xffffu) << 5) | 0x11;
65-
trampOnStack[1] =
66-
0xf2a00000u | ((((uint64_t)realFunc >> 16) & 0xffffu) << 5) | 0x11;
67-
trampOnStack[2] =
68-
0xf2c00000u | ((((uint64_t)realFunc >> 32) & 0xffffu) << 5) | 0x11;
69-
trampOnStack[3] =
70-
0xf2e00000u | ((((uint64_t)realFunc >> 48) & 0xffffu) << 5) | 0x11;
71-
// Load localsPtr into x18
72-
trampOnStack[4] =
73-
0xd2800000u | ((((uint64_t)localsPtr >> 0) & 0xffffu) << 5) | 0x12;
74-
trampOnStack[5] =
75-
0xf2a00000u | ((((uint64_t)localsPtr >> 16) & 0xffffu) << 5) | 0x12;
76-
trampOnStack[6] =
77-
0xf2c00000u | ((((uint64_t)localsPtr >> 32) & 0xffffu) << 5) | 0x12;
78-
trampOnStack[7] =
79-
0xf2e00000u | ((((uint64_t)localsPtr >> 48) & 0xffffu) << 5) | 0x12;
80-
trampOnStack[8] = 0xd61f0220; // br x17
81-
82-
// Clear instruction cache.
83-
__clear_cache(trampOnStack, &trampOnStack[9]);
84-
}
85-
#endif // defined(__aarch64__) && !defined(__APPLE__) && !defined(_WIN64)

compiler-rt/test/builtins/Unit/trampoline_setup_test.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
/*
99
* Tests nested functions
10-
* The ppc and aarch64 compilers generates a call to __trampoline_setup
10+
* The ppc compiler generates a call to __trampoline_setup
1111
* The i386 and x86_64 compilers generate a call to ___enable_execute_stack
1212
*/
1313

flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -274,12 +274,12 @@ class BoxedProcedurePass
274274
auto loc = embox.getLoc();
275275
mlir::Type i8Ty = builder.getI8Type();
276276
mlir::Type i8Ptr = builder.getRefType(i8Ty);
277-
// For AArch64, PPC32 and PPC64, the thunk is populated by a call to
277+
// For PPC32 and PPC64, the thunk is populated by a call to
278278
// __trampoline_setup, which is defined in
279279
// compiler-rt/lib/builtins/trampoline_setup.c and requires the
280-
// thunk size greater than 32 bytes. For RISCV and x86_64, the
281-
// thunk setup doesn't go through __trampoline_setup and fits in 32
282-
// bytes.
280+
// thunk size greater than 32 bytes. For AArch64, RISCV and x86_64,
281+
// the thunk setup doesn't go through __trampoline_setup and fits in
282+
// 32 bytes.
283283
fir::SequenceType::Extent thunkSize = triple.getTrampolineSize();
284284
mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty);
285285
auto buffer = builder.create<AllocaOp>(loc, buffTy);

flang/test/Fir/boxproc.fir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
// RUN: %if powerpc-registered-target %{tco --target=powerpc64le-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-PPC %}
44

55
// CHECK-LABEL: define void @_QPtest_proc_dummy()
6-
// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [36 x i8], i64 1, align 1
6+
// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
77
// CHECK-X86: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1
88
// CHECK-PPC: %[[VAL_3:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
99
// CHECK: %[[VAL_1:.*]] = alloca { ptr }, i64 1, align 8
@@ -63,7 +63,7 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) {
6363
}
6464

6565
// CHECK-LABEL: define void @_QPtest_proc_dummy_char()
66-
// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [36 x i8], i64 1, align 1
66+
// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
6767
// CHECK-X86: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1
6868
// CHECK-PPC: %[[VAL_20:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1
6969
// CHECK: %[[VAL_2:.*]] = alloca { { ptr, i64 } }, i64 1, align 8

llvm/lib/Target/AArch64/AArch64CallingConvention.td

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@ class CCIfSubtarget<string F, CCAction A>
2828
//===----------------------------------------------------------------------===//
2929

3030
defvar AArch64_Common = [
31+
// The 'nest' parameter, if any, is passed in X15.
32+
// The previous register used here (X18) is also defined to be unavailable
33+
// for this purpose, while all of X9-X15 were defined to be free for LLVM to
34+
// use for this, so use X15 (which LLVM often already clobbers anyways).
35+
CCIfNest<CCAssignToReg<[X15]>>,
36+
3137
CCIfType<[iPTR], CCBitConvertToType<i64>>,
3238
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
3339
CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
@@ -117,13 +123,7 @@ defvar AArch64_Common = [
117123
];
118124

119125
let Entry = 1 in
120-
def CC_AArch64_AAPCS : CallingConv<!listconcat(
121-
// The 'nest' parameter, if any, is passed in X18.
122-
// Darwin and Windows use X18 as the platform register and hence 'nest' isn't
123-
// currently supported there.
124-
[CCIfNest<CCAssignToReg<[X18]>>],
125-
AArch64_Common
126-
)>;
126+
def CC_AArch64_AAPCS : CallingConv<AArch64_Common>;
127127

128128
let Entry = 1 in
129129
def RetCC_AArch64_AAPCS : CallingConv<[
@@ -177,6 +177,8 @@ def CC_AArch64_Win64_VarArg : CallingConv<[
177177
// a stack layout compatible with the x64 calling convention.
178178
let Entry = 1 in
179179
def CC_AArch64_Arm64EC_VarArg : CallingConv<[
180+
CCIfNest<CCAssignToReg<[X15]>>,
181+
180182
// Convert small floating-point values to integer.
181183
CCIfType<[f16, bf16], CCBitConvertToType<i16>>,
182184
CCIfType<[f32], CCBitConvertToType<i32>>,
@@ -353,6 +355,8 @@ def RetCC_AArch64_Arm64EC_CFGuard_Check : CallingConv<[
353355
// + Stack slots are sized as needed rather than being at least 64-bit.
354356
let Entry = 1 in
355357
def CC_AArch64_DarwinPCS : CallingConv<[
358+
CCIfNest<CCAssignToReg<[X15]>>,
359+
356360
CCIfType<[iPTR], CCBitConvertToType<i64>>,
357361
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
358362
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
@@ -427,6 +431,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[
427431

428432
let Entry = 1 in
429433
def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
434+
CCIfNest<CCAssignToReg<[X15]>>,
435+
430436
CCIfType<[iPTR], CCBitConvertToType<i64>>,
431437
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
432438
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
@@ -450,6 +456,8 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
450456
// same as the normal Darwin VarArgs handling.
451457
let Entry = 1 in
452458
def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
459+
CCIfNest<CCAssignToReg<[X15]>>,
460+
453461
CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
454462
CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
455463

@@ -494,6 +502,8 @@ def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
494502

495503
let Entry = 1 in
496504
def CC_AArch64_GHC : CallingConv<[
505+
CCIfNest<CCAssignToReg<[X15]>>,
506+
497507
CCIfType<[iPTR], CCBitConvertToType<i64>>,
498508

499509
// Handle all vector types as either f64 or v2f64.
@@ -522,6 +532,7 @@ def CC_AArch64_Preserve_None : CallingConv<[
522532

523533
// We can pass arguments in all general registers, except:
524534
// - X8, used for sret
535+
// - X15 (on Windows), used as a temporary register in the prologue when allocating call frames
525536
// - X16/X17, used by the linker as IP0/IP1
526537
// - X18, the platform register
527538
// - X19, the base pointer

llvm/lib/Target/AArch64/AArch64FrameLowering.cpp

Lines changed: 64 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -331,7 +331,9 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF,
331331
static bool produceCompactUnwindFrame(MachineFunction &MF);
332332
static bool needsWinCFI(const MachineFunction &MF);
333333
static StackOffset getSVEStackSize(const MachineFunction &MF);
334-
static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB);
334+
static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
335+
bool HasCall = false);
336+
static bool requiresSaveVG(const MachineFunction &MF);
335337

336338
/// Returns true if a homogeneous prolog or epilog code can be emitted
337339
/// for the size optimization. If possible, a frame helper call is injected.
@@ -1006,6 +1008,16 @@ void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero,
10061008
}
10071009
}
10081010

1011+
static bool windowsRequiresStackProbe(const MachineFunction &MF,
1012+
uint64_t StackSizeInBytes) {
1013+
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1014+
const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
1015+
// TODO: When implementing stack protectors, take that into account
1016+
// for the probe threshold.
1017+
return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
1018+
StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
1019+
}
1020+
10091021
static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs,
10101022
const MachineBasicBlock &MBB) {
10111023
const MachineFunction *MF = MBB.getParent();
@@ -1027,7 +1039,8 @@ static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs,
10271039
// but we would then have to make sure that we were in fact saving at least one
10281040
// callee-save register in the prologue, which is additional complexity that
10291041
// doesn't seem worth the benefit.
1030-
static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
1042+
static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB,
1043+
bool HasCall) {
10311044
MachineFunction *MF = MBB->getParent();
10321045

10331046
// If MBB is an entry block, use X9 as the scratch register
@@ -1041,6 +1054,11 @@ static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
10411054
const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo();
10421055
LivePhysRegs LiveRegs(TRI);
10431056
getLiveRegsForEntryMBB(LiveRegs, *MBB);
1057+
if (HasCall) {
1058+
LiveRegs.addReg(AArch64::X16);
1059+
LiveRegs.addReg(AArch64::X17);
1060+
LiveRegs.addReg(AArch64::X18);
1061+
}
10441062

10451063
// Prefer X9 since it was historically used for the prologue scratch reg.
10461064
const MachineRegisterInfo &MRI = MF->getRegInfo();
@@ -1081,23 +1099,18 @@ bool AArch64FrameLowering::canUseAsPrologue(
10811099
MBB.isLiveIn(AArch64::NZCV))
10821100
return false;
10831101

1084-
// Don't need a scratch register if we're not going to re-align the stack or
1085-
// emit stack probes.
1086-
if (!RegInfo->hasStackRealignment(*MF) && !TLI->hasInlineStackProbe(*MF))
1087-
return true;
1088-
// Otherwise, we can use any block as long as it has a scratch register
1089-
// available.
1090-
return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister;
1091-
}
1102+
if (RegInfo->hasStackRealignment(*MF) || TLI->hasInlineStackProbe(*MF))
1103+
if (findScratchNonCalleeSaveRegister(TmpMBB) == AArch64::NoRegister)
1104+
return false;
10921105

1093-
static bool windowsRequiresStackProbe(MachineFunction &MF,
1094-
uint64_t StackSizeInBytes) {
1095-
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
1096-
const AArch64FunctionInfo &MFI = *MF.getInfo<AArch64FunctionInfo>();
1097-
// TODO: When implementing stack protectors, take that into account
1098-
// for the probe threshold.
1099-
return Subtarget.isTargetWindows() && MFI.hasStackProbing() &&
1100-
StackSizeInBytes >= uint64_t(MFI.getStackProbeSize());
1106+
// May need a scratch register (for return value) if require making a special
1107+
// call
1108+
if (requiresSaveVG(*MF) ||
1109+
windowsRequiresStackProbe(*MF, std::numeric_limits<uint64_t>::max()))
1110+
if (findScratchNonCalleeSaveRegister(TmpMBB, true) == AArch64::NoRegister)
1111+
return false;
1112+
1113+
return true;
11011114
}
11021115

11031116
static bool needsWinCFI(const MachineFunction &MF) {
@@ -1378,8 +1391,8 @@ bool requiresGetVGCall(MachineFunction &MF) {
13781391
!MF.getSubtarget<AArch64Subtarget>().hasSVE();
13791392
}
13801393

1381-
static bool requiresSaveVG(MachineFunction &MF) {
1382-
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
1394+
static bool requiresSaveVG(const MachineFunction &MF) {
1395+
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
13831396
// For Darwin platforms we don't save VG for non-SVE functions, even if SME
13841397
// is enabled with streaming mode changes.
13851398
if (!AFI->hasStreamingModeChanges())
@@ -2049,6 +2062,29 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
20492062
if (AFI->getSVECalleeSavedStackSize())
20502063
report_fatal_error(
20512064
"SVE callee saves not yet supported with stack probing");
2065+
2066+
// Find an available register to spill the value of X15 to, if X15 is being
2067+
// used already for nest.
2068+
unsigned X15Scratch = AArch64::NoRegister;
2069+
const AArch64Subtarget &STI = MF.getSubtarget<AArch64Subtarget>();
2070+
if (llvm::any_of(MBB.liveins(),
2071+
[&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) {
2072+
return STI.getRegisterInfo()->isSuperOrSubRegisterEq(
2073+
AArch64::X15, LiveIn.PhysReg);
2074+
})) {
2075+
X15Scratch = findScratchNonCalleeSaveRegister(&MBB, true);
2076+
assert(X15Scratch != AArch64::NoRegister &&
2077+
(X15Scratch < AArch64::X15 || X15Scratch > AArch64::X17));
2078+
#ifndef NDEBUG
2079+
LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it
2080+
#endif
2081+
BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), X15Scratch)
2082+
.addReg(AArch64::XZR)
2083+
.addReg(AArch64::X15, RegState::Undef)
2084+
.addReg(AArch64::X15, RegState::Implicit)
2085+
.setMIFlag(MachineInstr::FrameSetup);
2086+
}
2087+
20522088
uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4;
20532089
if (NeedsWinCFI) {
20542090
HasWinCFI = true;
@@ -2171,6 +2207,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
21712207
// we've set a frame pointer and already finished the SEH prologue.
21722208
assert(!NeedsWinCFI);
21732209
}
2210+
if (X15Scratch != AArch64::NoRegister) {
2211+
BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), AArch64::X15)
2212+
.addReg(AArch64::XZR)
2213+
.addReg(X15Scratch, RegState::Undef)
2214+
.addReg(X15Scratch, RegState::Implicit)
2215+
.setMIFlag(MachineInstr::FrameSetup);
2216+
}
21742217
}
21752218

21762219
StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize;
@@ -3355,7 +3398,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
33553398
unsigned X0Scratch = AArch64::NoRegister;
33563399
if (Reg1 == AArch64::VG) {
33573400
// Find an available register to store value of VG to.
3358-
Reg1 = findScratchNonCalleeSaveRegister(&MBB);
3401+
Reg1 = findScratchNonCalleeSaveRegister(&MBB, true);
33593402
assert(Reg1 != AArch64::NoRegister);
33603403
SMEAttrs Attrs = AFI->getSMEFnAttrs();
33613404

0 commit comments

Comments
 (0)