Commit 52307ab

[AArch64] Lower @llvm.ret.popless in swiftcorocc functions.
On AArch64, swiftcorocc functions are so far the only functions that can support popless returns. In the backend, that's done by recognizing the musttail call to llvm.ret.popless preceding a ret instruction, and asking the target to adjust that ret to be popless. Throughout most of the backend, that's not an interesting difference. In frame lowering, these popless rets now induce several special behaviors in their (never shrink-wrapped) epilogues, all consequences of not restoring SP:

- they of course don't do the SP adjustment or restore itself.
- most importantly, they force the epilogue callee-save restores to be FP-based rather than SP-based.
- they restore FP/LR last, as we still need the old FP, pointing at the frame being destroyed, to do the CSR restoring.
- with ptrauth-returns, they first derive the entry SP from FP, into X16, to use as a discriminator for a standalone AUTIB.

rdar://135984630
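For context, the IR pattern the backend recognizes is a musttail call to the intrinsic immediately preceding the ret it annotates, inside a swiftcorocc function. A minimal sketch (the void signature of @llvm.ret.popless and the function body are illustrative assumptions, not taken from this commit's tests):

    ; Sketch only: assumes a void, zero-argument @llvm.ret.popless and a void
    ; return; the backend keys off the musttail call right before the ret.
    declare void @llvm.ret.popless()

    define swiftcorocc void @resume(ptr %frame) {
    entry:
      ; ... body whose stack allocations must stay live after returning ...
      musttail call void @llvm.ret.popless()
      ret void
    }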
1 parent f563fd5 commit 52307ab

File tree

12 files changed: +396 −3 lines changed


llvm/include/llvm/CodeGen/GlobalISel/CallLowering.h

Lines changed: 9 additions & 0 deletions
@@ -537,6 +537,15 @@ class CallLowering {
     return false;
   }
 
+  /// This hook must be implemented to lower @llvm.ret.popless intrinsics,
+  /// which are required to be musttail, and are effectively annotating a
+  /// return instruction to mark it "popless", i.e., not restoring SP.
+  /// This "adjustment" step runs after lowerReturn, and is only meant to make
+  /// it a little less painful to maintain before we add this to the main hook.
+  virtual bool adjustReturnToPopless(MachineIRBuilder &MIRBuilder) const {
+    report_fatal_error("Popless returns not implemented for this target");
+  }
+
   virtual bool fallBackToDAGISel(const MachineFunction &MF) const {
     return false;
   }

llvm/include/llvm/CodeGen/TargetLowering.h

Lines changed: 4 additions & 0 deletions
@@ -4787,6 +4787,10 @@ class TargetLowering : public TargetLoweringBase {
     llvm_unreachable("Not Implemented");
   }
 
+  virtual SDValue adjustReturnPopless(SDValue Chain, SelectionDAG &DAG) const {
+    report_fatal_error("Popless returns not implemented for this target");
+  }
+
   /// Return true if result of the specified node is used by a return node
   /// only. It also compute and return the input chain for the tail call.
   ///

llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp

Lines changed: 8 additions & 1 deletion
@@ -377,7 +377,14 @@ bool IRTranslator::translateRet(const User &U, MachineIRBuilder &MIRBuilder) {
   // The target may mess up with the insertion point, but
   // this is not important as a return is the last instruction
   // of the block anyway.
-  return CLI->lowerReturn(MIRBuilder, Ret, VRegs, FuncInfo, SwiftErrorVReg);
+  bool Success =
+      CLI->lowerReturn(MIRBuilder, Ret, VRegs, FuncInfo, SwiftErrorVReg);
+
+  if (auto *MustTailCI = RI.getParent()->getTerminatingMustTailCall())
+    if (MustTailCI->getIntrinsicID() == Intrinsic::ret_popless)
+      Success &= CLI->adjustReturnToPopless(MIRBuilder);
+
+  return Success;
 }
 
 void IRTranslator::emitBranchForMergedCondition(

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

Lines changed: 24 additions & 0 deletions
@@ -2198,6 +2198,13 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
     return;
   }
 
+  // Musttail calls to @llvm.ret.popless are used to annotate the ret as
+  // "popless". Keep track of it here, and ask the target to do so later.
+  bool IsPoplessReturn = false;
+  if (auto *MustTailCI = I.getParent()->getTerminatingMustTailCall())
+    if (MustTailCI->getIntrinsicID() == Intrinsic::ret_popless)
+      IsPoplessReturn = true;
+
   if (!FuncInfo.CanLowerReturn) {
     unsigned DemoteReg = FuncInfo.DemoteRegister;
     const Function *F = I.getParent()->getParent();

@@ -2336,6 +2343,18 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
   Chain = DAG.getTargetLoweringInfo().LowerReturn(
       Chain, CallConv, isVarArg, Outs, OutVals, getCurSDLoc(), DAG);
 
+  // If we did find this return instruction to be popless, make it so now.
+  // It's still a normal return in almost all regards, we just need to remember
+  // it's popless, for when we lower the return and emit the epilogue later.
+  // Ideally we'd ask LowerReturn to do that, but the API is enough of a pain
+  // as it is, and all targets would have to learn about that.
+  if (IsPoplessReturn) {
+    SDValue NewChain =
+        DAG.getTargetLoweringInfo().adjustReturnPopless(Chain, DAG);
+    DAG.RemoveDeadNode(Chain.getNode());
+    Chain = NewChain;
+  }
+
   // Verify that the target's LowerReturn behaved as expected.
   assert(Chain.getNode() && Chain.getValueType() == MVT::Other &&
          "LowerReturn didn't return a valid chain!");

@@ -7937,6 +7956,11 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     setValue(&I, DAG.getNode(ISD::AND, sdl, PtrVT, Ptr, Mask));
     return;
   }
+
+  case Intrinsic::ret_popless:
+    // We're handling this on the associated ret itself.
+    return;
+
   case Intrinsic::threadlocal_address: {
     setValue(&I, getValue(I.getOperand(0)));
     return;

llvm/lib/Target/AArch64/AArch64FrameLowering.cpp

Lines changed: 143 additions & 2 deletions
@@ -504,6 +504,11 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
       MFI.hasStackMap() || MFI.hasPatchPoint() ||
       RegInfo->hasStackRealignment(MF))
     return true;
+
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  if (AFI->hasPoplessEpilogue())
+    return true;
+
   // With large callframes around we may need to use FP to access the scavenging
   // emergency spillslot.
   //

@@ -1119,6 +1124,12 @@ bool AArch64FrameLowering::canUseAsPrologue(
     return false;
   }
 
+  // If we have some return path that's popless, it needs its own very-special
+  // epilogue, so we can't shrink-wrap it away.
+  // FIXME: this and some of the below checks belong in enableShrinkWrapping.
+  if (AFI->hasPoplessEpilogue())
+    return false;
+
   // Certain stack probing sequences might clobber flags, then we can't use
   // the block as a prologue if the flags register is a live-in.
   if (MF->getInfo<AArch64FunctionInfo>()->hasStackProbing() &&

@@ -1204,6 +1215,12 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
 
 bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
     MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
+
+  MachineFunction &MF = *MBB.getParent();
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  if (AFI->hasPoplessEpilogue())
+    return false;
+
   if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
     return false;
 
@@ -1560,6 +1577,47 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
   return std::prev(MBB.erase(MBBI));
 }
 
+static void fixupCalleeSaveRestoreToFPBased(MachineInstr &MI,
+                                            uint64_t FPSPOffset) {
+  assert(!AArch64InstrInfo::isSEHInstruction(MI));
+
+  unsigned Opc = MI.getOpcode();
+  unsigned Scale;
+  switch (Opc) {
+  case AArch64::STPXi:
+  case AArch64::STRXui:
+  case AArch64::STPDi:
+  case AArch64::STRDui:
+  case AArch64::LDPXi:
+  case AArch64::LDRXui:
+  case AArch64::LDPDi:
+  case AArch64::LDRDui:
+    Scale = 8;
+    break;
+  case AArch64::STPQi:
+  case AArch64::STRQui:
+  case AArch64::LDPQi:
+  case AArch64::LDRQui:
+    Scale = 16;
+    break;
+  default:
+    llvm_unreachable("Unexpected callee-save save/restore opcode!");
+  }
+
+  unsigned OffsetIdx = MI.getNumExplicitOperands() - 1;
+
+  MachineOperand &BaseRegOpnd = MI.getOperand(OffsetIdx - 1);
+  assert(BaseRegOpnd.getReg() == AArch64::SP &&
+         "Unexpected base register in callee-save save/restore instruction!");
+  BaseRegOpnd.setReg(AArch64::FP); // XXX TRI
+
+  // Last operand is immediate offset that needs fixing.
+  MachineOperand &OffsetOpnd = MI.getOperand(OffsetIdx);
+  // All generated opcodes have scaled offsets.
+  assert(FPSPOffset % Scale == 0);
+  OffsetOpnd.setImm(OffsetOpnd.getImm() - FPSPOffset / Scale);
+}
+
 // Fixup callee-save register save/restore instructions to take into account
 // combined SP bump by adding the local stack size to the stack offsets.
 static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,

@@ -2298,10 +2356,22 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   bool EmitCFI = AFI->needsAsyncDwarfUnwindInfo(MF);
   bool HasWinCFI = false;
   bool IsFunclet = false;
+  bool IsSwiftCoroPartialReturn = false;
 
   if (MBB.end() != MBBI) {
     DL = MBBI->getDebugLoc();
     IsFunclet = isFuncletReturnInstr(*MBBI);
+    IsSwiftCoroPartialReturn = MBBI->getOpcode() == AArch64::RET_POPLESS;
+  }
+
+  if (IsSwiftCoroPartialReturn) {
+    // The partial-return intrin/instr requires the swiftcoro cc
+    if (MF.getFunction().getCallingConv() != CallingConv::SwiftCoro)
+      report_fatal_error("llvm.ret.popless requires swiftcorocc");
+    assert(MBBI->getOpcode() == AArch64::RET_POPLESS);
+    BuildMI(MBB, MBBI, DL, TII->get(AArch64::RET_ReallyLR))
+        .setMIFlag(MachineInstr::FrameDestroy);
+    MBB.erase(MBBI);
   }
 
   MachineBasicBlock::iterator EpilogStartI = MBB.end();

@@ -2350,6 +2420,39 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
       if (Info.getReg() != AArch64::LR)
         continue;
       MachineBasicBlock::iterator TI = MBB.getFirstTerminator();
+
+      // When we're doing a popless ret (i.e., that doesn't restore SP), we
+      // can't rely on the exit SP being the same as the entry, but they need
+      // to match for the LR auth to succeed. Instead, derive the entry SP
+      // from our FP (using a -16 static offset for the size of the frame
+      // record itself), save that into X16, and use that as the discriminator
+      // in an AUTIB.
+      if (IsSwiftCoroPartialReturn) {
+        const auto *TRI = Subtarget.getRegisterInfo();
+
+        MachineBasicBlock::iterator EpilogStartI = MBB.getFirstTerminator();
+        MachineBasicBlock::iterator Begin = MBB.begin();
+        while (EpilogStartI != Begin) {
+          --EpilogStartI;
+          if (!EpilogStartI->getFlag(MachineInstr::FrameDestroy)) {
+            ++EpilogStartI;
+            break;
+          }
+          if (EpilogStartI->readsRegister(AArch64::X16, TRI) ||
+              EpilogStartI->modifiesRegister(AArch64::X16, TRI))
+            report_fatal_error("unable to use x16 for popless ret LR auth");
+        }
+
+        emitFrameOffset(MBB, EpilogStartI, DL, AArch64::X16, AArch64::FP,
+                        StackOffset::getFixed(16), TII,
+                        MachineInstr::FrameDestroy);
+        BuildMI(MBB, TI, DL, TII->get(AArch64::AUTIB), AArch64::LR)
+            .addUse(AArch64::LR)
+            .addUse(AArch64::X16)
+            .setMIFlag(MachineInstr::FrameDestroy);
+        return;
+      }
+
       if (TI != MBB.end() && TI->getOpcode() == AArch64::RET_ReallyLR) {
         // If there is a terminator and it's a RET, we can fold AUTH into it.
         // Be careful to keep the implicitly returned registers.

@@ -2383,6 +2486,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
   if (homogeneousPrologEpilog(MF, &MBB)) {
     assert(!NeedsWinCFI);
+    assert(!IsSwiftCoroPartialReturn);
     auto LastPopI = MBB.getFirstTerminator();
     if (LastPopI != MBB.begin()) {
       auto HomogeneousEpilog = std::prev(LastPopI);

@@ -2404,7 +2508,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   // Assume we can't combine the last pop with the sp restore.
 
   bool CombineAfterCSRBump = false;
-  if (!CombineSPBump && PrologueSaveSize != 0) {
+  if (!CombineSPBump && PrologueSaveSize != 0 && !IsSwiftCoroPartialReturn) {
     MachineBasicBlock::iterator Pop = std::prev(MBB.getFirstTerminator());
     while (Pop->getOpcode() == TargetOpcode::CFI_INSTRUCTION ||
            AArch64InstrInfo::isSEHInstruction(*Pop))

@@ -2440,6 +2544,14 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
         IsSVECalleeSave(LastPopI)) {
       ++LastPopI;
       break;
+    } else if (IsSwiftCoroPartialReturn) {
+      assert(!EmitCFI);
+      assert(hasFP(MF));
+      fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
+                                        NeedsWinCFI, &HasWinCFI);
+      // if FP-based addressing, rewrite CSR restores from SP to FP
+      fixupCalleeSaveRestoreToFPBased(
+          *LastPopI, AFI->getCalleeSaveBaseToFrameRecordOffset());
    } else if (CombineSPBump)
       fixupCalleeSaveRestoreStackOffset(*LastPopI, AFI->getLocalStackSize(),
                                         NeedsWinCFI, &HasWinCFI);

@@ -2459,6 +2571,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   }
 
   if (hasFP(MF) && AFI->hasSwiftAsyncContext()) {
+    assert(!IsSwiftCoroPartialReturn);
     switch (MF.getTarget().Options.SwiftAsyncFramePointer) {
     case SwiftAsyncFramePointerMode::DeploymentBased:
       // Avoid the reload as it is GOT relative, and instead fall back to the

@@ -2492,6 +2605,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   // If there is a single SP update, insert it before the ret and we're done.
   if (CombineSPBump) {
     assert(!SVEStackSize && "Cannot combine SP bump with SVE");
+    assert(!IsSwiftCoroPartialReturn);
 
     // When we are about to restore the CSRs, the CFA register is SP again.
     if (EmitCFI && hasFP(MF)) {

@@ -2577,6 +2691,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
   }
 
   if (!hasFP(MF)) {
+    assert(!IsSwiftCoroPartialReturn);
     bool RedZone = canUseRedZone(MF);
     // If this was a redzone leaf function, we don't need to restore the
     // stack pointer (but we may need to pop stack args for fastcc).

@@ -2607,6 +2722,9 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     NumBytes = 0;
   }
 
+  if (IsSwiftCoroPartialReturn)
+    return;
+
   // Restore the original stack pointer.
   // FIXME: Rather than doing the math here, we should instead just use
   // non-post-indexed loads for the restores if we aren't actually going to

@@ -3449,9 +3567,17 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
   DebugLoc DL;
   SmallVector<RegPairInfo, 8> RegPairs;
   bool NeedsWinCFI = needsWinCFI(MF);
+  bool IsSwiftCoroPartialReturn = false;
 
-  if (MBBI != MBB.end())
+  if (MBBI != MBB.end()) {
     DL = MBBI->getDebugLoc();
+    IsSwiftCoroPartialReturn = MBBI->getOpcode() == AArch64::RET_POPLESS;
+  }
+
+  // The partial-return intrin/instr requires the swiftcoro cc
+  if (IsSwiftCoroPartialReturn &&
+      MF.getFunction().getCallingConv() != CallingConv::SwiftCoro)
+    report_fatal_error("llvm.ret.popless requires swiftcorocc");
 
   computeCalleeSaveRegisterPairs(MF, CSI, TRI, RegPairs, hasFP(MF));
   if (homogeneousPrologEpilog(MF, &MBB)) {

@@ -3464,6 +3590,17 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
     return true;
   }
 
+  // If doing a partial/popless return, CSR restores are from FP, so do it last.
+  if (IsSwiftCoroPartialReturn) {
+    auto IsFPLR = [](const RegPairInfo &c) {
+      return c.Reg1 == AArch64::LR && c.Reg2 == AArch64::FP;
+    };
+    auto FPLRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsFPLR);
+    const RegPairInfo FPLRRPI = *FPLRBegin;
+    FPLRBegin = std::remove_if(RegPairs.begin(), RegPairs.end(), IsFPLR);
+    *FPLRBegin = FPLRRPI;
+  }
+
   // For performance reasons restore SVE register in increasing order
   auto IsPPR = [](const RegPairInfo &c) { return c.Type == RegPairInfo::PPR; };
   auto PPRBegin = std::find_if(RegPairs.begin(), RegPairs.end(), IsPPR);

@@ -4796,6 +4933,10 @@ void AArch64FrameLowering::orderFrameObjects(
 
   const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
   const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  if (AFI.hasPoplessEpilogue())
+    return;
+
   std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
   for (auto &Obj : ObjectsToAllocate) {
     FrameObjects[Obj].IsValid = true;

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 13 additions & 0 deletions
@@ -2560,6 +2560,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::AUTH_CALL_RVMARKER)
     MAKE_CASE(AArch64ISD::LOADgot)
     MAKE_CASE(AArch64ISD::RET_GLUE)
+    MAKE_CASE(AArch64ISD::RET_POPLESS)
     MAKE_CASE(AArch64ISD::BRCOND)
     MAKE_CASE(AArch64ISD::CSEL)
     MAKE_CASE(AArch64ISD::CSINV)

@@ -8156,6 +8157,18 @@ bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
          CallCC == CallingConv::Tail || CallCC == CallingConv::SwiftTail;
 }
 
+SDValue AArch64TargetLowering::adjustReturnPopless(SDValue RetChain,
+                                                   SelectionDAG &DAG) const {
+  if (RetChain.getOpcode() != AArch64ISD::RET_GLUE)
+    report_fatal_error("Unsupported aarch64 return for popless ret lowering");
+
+  auto *AFI = DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
+  AFI->setHasPoplessEpilogue();
+
+  return DAG.getNode(AArch64ISD::RET_POPLESS, SDLoc(RetChain),
+                     MVT::Other, RetChain->ops());
+}
+
 // Check if the value is zero-extended from i1 to i8
 static bool checkZExtBool(SDValue Arg, const SelectionDAG &DAG) {
   unsigned SizeInBits = Arg.getValueType().getSizeInBits();

llvm/lib/Target/AArch64/AArch64ISelLowering.h

Lines changed: 4 additions & 0 deletions
@@ -91,6 +91,7 @@ enum NodeType : unsigned {
   LOADgot,  // Load from automatically generated descriptor (e.g. Global
             // Offset Table, TLS record).
   RET_GLUE, // Return with a glue operand. Operand 0 is the chain operand.
+  RET_POPLESS, // Same as RET_GLUE, though "popless", = doesn't clean the stack.
   BRCOND,   // Conditional branch instruction; "b.cond".
   CSEL,
   CSINV, // Conditional select invert.

@@ -1094,6 +1095,9 @@ class AArch64TargetLowering : public TargetLowering {
   void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, const SDLoc &DL,
                            SDValue &Chain) const;
 
+  SDValue adjustReturnPopless(SDValue RetChain,
+                              SelectionDAG &DAG) const override;
+
   bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
                       bool isVarArg,
                       const SmallVectorImpl<ISD::OutputArg> &Outs,
