Skip to content

Commit c1eb790

Browse files
committed
[ARM] Tail-calls do not require caller and callee arguments to match
The ARM backend was checking that the outgoing values for a tail-call matched the incoming argument values of the caller. This isn't necessary, because the caller can change the values in both registers and the stack before doing the tail-call. The actual limitation is that the callee can't need more stack space for its arguments than the caller does. This is needed for code using the musttail attribute, as well as enabling tail calls as an optimisation in more cases.
1 parent 246baeb commit c1eb790

File tree

4 files changed

+134
-143
lines changed

4 files changed

+134
-143
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 9 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -2962,50 +2962,6 @@ void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
29622962
Size = std::max<int>(Size - Excess, 0);
29632963
}
29642964

2965-
/// MatchingStackOffset - Return true if the given stack call argument is
2966-
/// already available in the same position (relatively) of the caller's
2967-
/// incoming argument stack.
2968-
static
2969-
bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
2970-
MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
2971-
const TargetInstrInfo *TII) {
2972-
unsigned Bytes = Arg.getValueSizeInBits() / 8;
2973-
int FI = std::numeric_limits<int>::max();
2974-
if (Arg.getOpcode() == ISD::CopyFromReg) {
2975-
Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
2976-
if (!VR.isVirtual())
2977-
return false;
2978-
MachineInstr *Def = MRI->getVRegDef(VR);
2979-
if (!Def)
2980-
return false;
2981-
if (!Flags.isByVal()) {
2982-
if (!TII->isLoadFromStackSlot(*Def, FI))
2983-
return false;
2984-
} else {
2985-
return false;
2986-
}
2987-
} else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
2988-
if (Flags.isByVal())
2989-
// ByVal argument is passed in as a pointer but it's now being
2990-
// dereferenced. e.g.
2991-
// define @foo(%struct.X* %A) {
2992-
// tail call @bar(%struct.X* byval %A)
2993-
// }
2994-
return false;
2995-
SDValue Ptr = Ld->getBasePtr();
2996-
FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
2997-
if (!FINode)
2998-
return false;
2999-
FI = FINode->getIndex();
3000-
} else
3001-
return false;
3002-
3003-
assert(FI != std::numeric_limits<int>::max());
3004-
if (!MFI.isFixedObjectIndex(FI))
3005-
return false;
3006-
return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
3007-
}
3008-
30092965
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
30102966
/// for tail call optimization. Targets which want to do tail call
30112967
/// optimization should implement this function. Note that this function also
@@ -3130,64 +3086,17 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
31303086

31313087
// If the callee takes no arguments then go on to check the results of the
31323088
// call.
3133-
if (!Outs.empty()) {
3134-
if (CCInfo.getStackSize()) {
3135-
// Check if the arguments are already laid out in the right way as
3136-
// the caller's fixed stack objects.
3137-
MachineFrameInfo &MFI = MF.getFrameInfo();
3138-
const MachineRegisterInfo *MRI = &MF.getRegInfo();
3139-
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3140-
for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
3141-
i != e;
3142-
++i, ++realArgIdx) {
3143-
CCValAssign &VA = ArgLocs[i];
3144-
EVT RegVT = VA.getLocVT();
3145-
SDValue Arg = OutVals[realArgIdx];
3146-
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
3147-
if (VA.getLocInfo() == CCValAssign::Indirect) {
3148-
LLVM_DEBUG(dbgs() << "false (indirect arg)\n");
3149-
return false;
3150-
}
3151-
if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
3152-
// f64 and vector types are split into multiple registers or
3153-
// register/stack-slot combinations. The types will not match
3154-
// the registers; give up on memory f64 refs until we figure
3155-
// out what to do about this.
3156-
if (!VA.isRegLoc()) {
3157-
LLVM_DEBUG(dbgs() << "false (f64 not in register)\n");
3158-
return false;
3159-
}
3160-
if (!ArgLocs[++i].isRegLoc()) {
3161-
LLVM_DEBUG(dbgs() << "false (f64 not in register, second half)\n");
3162-
return false;
3163-
}
3164-
if (RegVT == MVT::v2f64) {
3165-
if (!ArgLocs[++i].isRegLoc()) {
3166-
LLVM_DEBUG(dbgs() << "false (v2f64 not in register)\n");
3167-
return false;
3168-
}
3169-
if (!ArgLocs[++i].isRegLoc()) {
3170-
LLVM_DEBUG(dbgs() << "false (v2f64 not in register, second half)\n");
3171-
return false;
3172-
}
3173-
}
3174-
} else if (!VA.isRegLoc()) {
3175-
if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
3176-
MFI, MRI, TII)) {
3177-
LLVM_DEBUG(dbgs() << "false (non-matching stack offset)\n");
3178-
return false;
3179-
}
3180-
}
3181-
}
3182-
}
3183-
3184-
const MachineRegisterInfo &MRI = MF.getRegInfo();
3185-
if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
3186-
LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
3187-
return false;
3188-
}
3089+
const MachineRegisterInfo &MRI = MF.getRegInfo();
3090+
if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
3091+
LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
3092+
return false;
31893093
}
31903094

3095+
// If the stack arguments for this call do not fit into our own save area then
3096+
// the call cannot be made tail.
3097+
if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
3098+
return false;
3099+
31913100
LLVM_DEBUG(dbgs() << "true\n");
31923101
return true;
31933102
}

llvm/test/CodeGen/ARM/fp-arg-shuffle.ll

Lines changed: 14 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,31 +2,29 @@
22
; RUN: llc -mtriple=arm-eabi -mattr=+neon -float-abi=soft %s -o - | FileCheck %s
33

44
; CHECK: function1
5-
; CHECK-NOT: vmov
65
define double @function1(double %a, double %b, double %c, double %d, double %e, double %f) nounwind noinline ssp {
76
; CHECK-LABEL: function1:
87
; CHECK: @ %bb.0: @ %entry
98
; CHECK-NEXT: .save {r4, r5, r11, lr}
109
; CHECK-NEXT: push {r4, r5, r11, lr}
11-
; CHECK-NEXT: .pad #32
12-
; CHECK-NEXT: sub sp, sp, #32
13-
; CHECK-NEXT: add lr, sp, #64
14-
; CHECK-NEXT: vldr d16, [sp, #56]
15-
; CHECK-NEXT: str r2, [sp, #16]
16-
; CHECK-NEXT: ldm lr, {r4, r5, r12, lr}
17-
; CHECK-NEXT: str r3, [sp, #20]
18-
; CHECK-NEXT: mov r3, r5
19-
; CHECK-NEXT: str r0, [sp, #24]
10+
; CHECK-NEXT: vldr d16, [sp, #40]
11+
; CHECK-NEXT: vldr d17, [sp, #32]
12+
; CHECK-NEXT: vmov r12, lr, d16
13+
; CHECK-NEXT: vldr d16, [sp, #16]
14+
; CHECK-NEXT: vmov r4, r5, d17
15+
; CHECK-NEXT: vldr d17, [sp, #24]
16+
; CHECK-NEXT: str r3, [sp, #36]
17+
; CHECK-NEXT: str r2, [sp, #32]
18+
; CHECK-NEXT: str r1, [sp, #44]
19+
; CHECK-NEXT: str r0, [sp, #40]
20+
; CHECK-NEXT: vstr d17, [sp, #16]
21+
; CHECK-NEXT: vstr d16, [sp, #24]
2022
; CHECK-NEXT: mov r0, r12
21-
; CHECK-NEXT: str r1, [sp, #28]
2223
; CHECK-NEXT: mov r1, lr
2324
; CHECK-NEXT: mov r2, r4
24-
; CHECK-NEXT: vldr d17, [sp, #48]
25-
; CHECK-NEXT: vstmia sp, {d16, d17}
26-
; CHECK-NEXT: bl function2
27-
; CHECK-NEXT: add sp, sp, #32
25+
; CHECK-NEXT: mov r3, r5
2826
; CHECK-NEXT: pop {r4, r5, r11, lr}
29-
; CHECK-NEXT: mov pc, lr
27+
; CHECK-NEXT: b function2
3028
entry:
3129
%call = tail call double @function2(double %f, double %e, double %d, double %c, double %b, double %a) nounwind
3230
ret double %call

llvm/test/CodeGen/ARM/fp16-vector-argument.ll

Lines changed: 14 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -145,26 +145,21 @@ entry:
145145
define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x half>, <8 x half>) {
146146
; SOFT-LABEL: many_args_test:
147147
; SOFT: @ %bb.0: @ %entry
148-
; SOFT-NEXT: push {r11, lr}
149-
; SOFT-NEXT: sub sp, sp, #32
150-
; SOFT-NEXT: add r12, sp, #80
148+
; SOFT-NEXT: add r12, sp, #40
151149
; SOFT-NEXT: vld1.64 {d16, d17}, [r12]
152-
; SOFT-NEXT: add r12, sp, #48
150+
; SOFT-NEXT: add r12, sp, #8
153151
; SOFT-NEXT: vabs.f16 q8, q8
154152
; SOFT-NEXT: vld1.64 {d18, d19}, [r12]
155-
; SOFT-NEXT: add r12, sp, #64
153+
; SOFT-NEXT: add r12, sp, #24
156154
; SOFT-NEXT: vadd.f16 q8, q8, q9
157155
; SOFT-NEXT: vld1.64 {d18, d19}, [r12]
158156
; SOFT-NEXT: add r12, sp, #16
159157
; SOFT-NEXT: vmul.f16 q8, q9, q8
160158
; SOFT-NEXT: vst1.64 {d16, d17}, [r12]
161-
; SOFT-NEXT: mov r12, sp
162-
; SOFT-NEXT: vldr d16, [sp, #40]
163-
; SOFT-NEXT: vst1.16 {d16}, [r12:64]!
164-
; SOFT-NEXT: str r3, [r12]
165-
; SOFT-NEXT: bl use
166-
; SOFT-NEXT: add sp, sp, #32
167-
; SOFT-NEXT: pop {r11, pc}
159+
; SOFT-NEXT: vldr d16, [sp]
160+
; SOFT-NEXT: vstr d16, [sp]
161+
; SOFT-NEXT: str r3, [sp, #8]
162+
; SOFT-NEXT: b use
168163
;
169164
; HARD-LABEL: many_args_test:
170165
; HARD: @ %bb.0: @ %entry
@@ -177,33 +172,25 @@ define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x hal
177172
;
178173
; SOFTEB-LABEL: many_args_test:
179174
; SOFTEB: @ %bb.0: @ %entry
180-
; SOFTEB-NEXT: .save {r11, lr}
181-
; SOFTEB-NEXT: push {r11, lr}
182-
; SOFTEB-NEXT: .pad #32
183-
; SOFTEB-NEXT: sub sp, sp, #32
184-
; SOFTEB-NEXT: add r12, sp, #80
185-
; SOFTEB-NEXT: mov lr, sp
175+
; SOFTEB-NEXT: add r12, sp, #40
186176
; SOFTEB-NEXT: vld1.64 {d16, d17}, [r12]
187-
; SOFTEB-NEXT: add r12, sp, #48
177+
; SOFTEB-NEXT: add r12, sp, #8
188178
; SOFTEB-NEXT: vrev64.16 q8, q8
189179
; SOFTEB-NEXT: vabs.f16 q8, q8
190180
; SOFTEB-NEXT: vld1.64 {d18, d19}, [r12]
191-
; SOFTEB-NEXT: add r12, sp, #64
181+
; SOFTEB-NEXT: add r12, sp, #24
192182
; SOFTEB-NEXT: vrev64.16 q9, q9
193183
; SOFTEB-NEXT: vadd.f16 q8, q8, q9
194184
; SOFTEB-NEXT: vld1.64 {d18, d19}, [r12]
195185
; SOFTEB-NEXT: add r12, sp, #16
196186
; SOFTEB-NEXT: vrev64.16 q9, q9
197187
; SOFTEB-NEXT: vmul.f16 q8, q9, q8
198-
; SOFTEB-NEXT: vldr d18, [sp, #40]
199-
; SOFTEB-NEXT: vrev64.16 d18, d18
200-
; SOFTEB-NEXT: vst1.16 {d18}, [lr:64]!
201-
; SOFTEB-NEXT: str r3, [lr]
188+
; SOFTEB-NEXT: vldr d18, [sp]
202189
; SOFTEB-NEXT: vrev64.16 q8, q8
203190
; SOFTEB-NEXT: vst1.64 {d16, d17}, [r12]
204-
; SOFTEB-NEXT: bl use
205-
; SOFTEB-NEXT: add sp, sp, #32
206-
; SOFTEB-NEXT: pop {r11, pc}
191+
; SOFTEB-NEXT: vstr d18, [sp]
192+
; SOFTEB-NEXT: str r3, [sp, #8]
193+
; SOFTEB-NEXT: b use
207194
;
208195
; HARDEB-LABEL: many_args_test:
209196
; HARDEB: @ %bb.0: @ %entry

llvm/test/CodeGen/ARM/musttail.ll

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,97 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=armv7a-none-eabi %s -o - | FileCheck %s
3+
4+
declare i32 @many_args_callee(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5)
5+
6+
define i32 @many_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) {
7+
; CHECK-LABEL: many_args_tail:
8+
; CHECK: @ %bb.0:
9+
; CHECK-NEXT: mov r0, #5
10+
; CHECK-NEXT: mov r1, #2
11+
; CHECK-NEXT: str r0, [sp]
12+
; CHECK-NEXT: mov r0, #6
13+
; CHECK-NEXT: str r0, [sp, #4]
14+
; CHECK-NEXT: mov r0, #1
15+
; CHECK-NEXT: mov r2, #3
16+
; CHECK-NEXT: mov r3, #4
17+
; CHECK-NEXT: b many_args_callee
18+
%ret = tail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
19+
ret i32 %ret
20+
}
21+
22+
define i32 @many_args_musttail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) {
23+
; CHECK-LABEL: many_args_musttail:
24+
; CHECK: @ %bb.0:
25+
; CHECK-NEXT: mov r0, #5
26+
; CHECK-NEXT: mov r1, #2
27+
; CHECK-NEXT: str r0, [sp]
28+
; CHECK-NEXT: mov r0, #6
29+
; CHECK-NEXT: str r0, [sp, #4]
30+
; CHECK-NEXT: mov r0, #1
31+
; CHECK-NEXT: mov r2, #3
32+
; CHECK-NEXT: mov r3, #4
33+
; CHECK-NEXT: b many_args_callee
34+
%ret = musttail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
35+
ret i32 %ret
36+
}
37+
38+
; This function has more arguments than it's tail-callee. This isn't valid for
39+
; the musttail attribute, but can still be tail-called as a non-guaranteed
40+
; optimisation, because the outgoing arguments to @many_args_callee fit in the
41+
; stack space allocated by the caller of @more_args_tail.
42+
define i32 @more_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6) {
43+
; CHECK-LABEL: more_args_tail:
44+
; CHECK: @ %bb.0:
45+
; CHECK-NEXT: mov r0, #5
46+
; CHECK-NEXT: mov r1, #2
47+
; CHECK-NEXT: str r0, [sp]
48+
; CHECK-NEXT: mov r0, #6
49+
; CHECK-NEXT: str r0, [sp, #4]
50+
; CHECK-NEXT: mov r0, #1
51+
; CHECK-NEXT: mov r2, #3
52+
; CHECK-NEXT: mov r3, #4
53+
; CHECK-NEXT: b many_args_callee
54+
%ret = tail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
55+
ret i32 %ret
56+
}
57+
58+
; Again, this isn't valid for musttail, but can be tail-called in practice
59+
; because the stack size is the same.
60+
define i32 @different_args_tail(i64 %0, i64 %1, i64 %2) {
61+
; CHECK-LABEL: different_args_tail:
62+
; CHECK: @ %bb.0:
63+
; CHECK-NEXT: mov r0, #5
64+
; CHECK-NEXT: mov r1, #2
65+
; CHECK-NEXT: str r0, [sp]
66+
; CHECK-NEXT: mov r0, #6
67+
; CHECK-NEXT: str r0, [sp, #4]
68+
; CHECK-NEXT: mov r0, #1
69+
; CHECK-NEXT: mov r2, #3
70+
; CHECK-NEXT: mov r3, #4
71+
; CHECK-NEXT: b many_args_callee
72+
%ret = tail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
73+
ret i32 %ret
74+
}
75+
76+
; Here, the caller requires less stack space for it's arguments than the
77+
; callee, so it would not ba valid to do a tail-call.
78+
define i32 @fewer_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4) {
79+
; CHECK-LABEL: fewer_args_tail:
80+
; CHECK: @ %bb.0:
81+
; CHECK-NEXT: .save {r11, lr}
82+
; CHECK-NEXT: push {r11, lr}
83+
; CHECK-NEXT: .pad #8
84+
; CHECK-NEXT: sub sp, sp, #8
85+
; CHECK-NEXT: mov r1, #6
86+
; CHECK-NEXT: mov r0, #5
87+
; CHECK-NEXT: strd r0, r1, [sp]
88+
; CHECK-NEXT: mov r0, #1
89+
; CHECK-NEXT: mov r1, #2
90+
; CHECK-NEXT: mov r2, #3
91+
; CHECK-NEXT: mov r3, #4
92+
; CHECK-NEXT: bl many_args_callee
93+
; CHECK-NEXT: add sp, sp, #8
94+
; CHECK-NEXT: pop {r11, pc}
95+
%ret = tail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
96+
ret i32 %ret
97+
}

0 commit comments

Comments
 (0)