Skip to content

Commit 78ec2e2

Browse files
committed
[ARM] Allow tail calls with byval args
Byval arguments which are passed partially in registers get stored into the local stack frame, but it is still valid to perform a tail call when such arguments are present, because the part which gets spilled is always re-loaded into registers before doing the tail call, so it's OK for the spill area to be deallocated.
1 parent 82e6472 commit 78ec2e2

File tree

4 files changed

+126
-23
lines changed

4 files changed

+126
-23
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3075,11 +3075,11 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
30753075
}
30763076
}
30773077

3078-
// If Caller's vararg or byval argument has been split between registers and
3079-
// stack, do not perform tail call, since part of the argument is in caller's
3080-
// local frame.
3078+
// If Caller's vararg argument has been split between registers and stack, do
3079+
// not perform tail call, since part of the argument is in caller's local
3080+
// frame.
30813081
const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
3082-
if (AFI_Caller->getArgRegsSaveSize()) {
3082+
if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
30833083
LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
30843084
return false;
30853085
}

llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -12,17 +12,11 @@ define void @check227(
1212
; arg1 --> SP+188
1313

1414
entry:
15-
16-
;CHECK: sub sp, sp, #12
17-
;CHECK: push {r11, lr}
18-
;CHECK: sub sp, sp, #4
19-
;CHECK: add r0, sp, #12
20-
;CHECK: stm r0, {r1, r2, r3}
21-
;CHECK: ldr r0, [sp, #212]
22-
;CHECK: bl useInt
23-
;CHECK: add sp, sp, #4
24-
;CHECK: pop {r11, lr}
25-
;CHECK: add sp, sp, #12
15+
; CHECK: sub sp, sp, #12
16+
; CHECK: stm sp, {r1, r2, r3}
17+
; CHECK: ldr r0, [sp, #200]
18+
; CHECK: add sp, sp, #12
19+
; CHECK: b useInt
2620

2721
%0 = ptrtoint ptr %arg1 to i32
2822
tail call void @useInt(i32 %0)

llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,11 @@
77
define void @foo(ptr byval(%struct4bytes) %p0, ; --> R0
88
ptr byval(%struct20bytes) %p1 ; --> R1,R2,R3, [SP+0 .. SP+8)
99
) {
10-
;CHECK: sub sp, sp, #16
11-
;CHECK: push {r11, lr}
12-
;CHECK: add r12, sp, #8
13-
;CHECK: stm r12, {r0, r1, r2, r3}
14-
;CHECK: add r0, sp, #12
15-
;CHECK: bl useInt
16-
;CHECK: pop {r11, lr}
17-
;CHECK: add sp, sp, #16
10+
;CHECK: sub sp, sp, #16
11+
;CHECK: stm sp, {r0, r1, r2, r3}
12+
;CHECK: add r0, sp, #4
13+
;CHECK: add sp, sp, #16
14+
;CHECK: b useInt
1815

1916
%1 = ptrtoint ptr %p1 to i32
2017
tail call void @useInt(i32 %1)

llvm/test/CodeGen/ARM/musttail.ll

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,3 +117,115 @@ entry:
117117
musttail call void @sret_callee(ptr sret({ double, double }) align 8 %result)
118118
ret void
119119
}
120+
121+
; Clang only uses byval for arguments of 65 bytes or larger, but we test with a
122+
; 20 byte struct to keep the tests more readable. This size was chosen to still
123+
; make sure that it will be split between registers and the stack, to test all
124+
; of the interesting code paths in the backend.
125+
%twenty_bytes = type { [5 x i32] }
126+
declare void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4)
127+
128+
; Functions with byval parameters can be tail-called, because the value is
129+
; actually passed in registers and the stack in the same way for the caller and
130+
; callee. Within @large_caller the first 16 bytes of the argument are spilled
131+
; to the local stack frame, but for the tail-call they are passed in r0-r3, so
132+
; it's safe to de-allocate that memory before the call. Most of the code
133+
; generated for this isn't needed, but that's a missed optimisation, not a
134+
; correctness issue.
135+
define void @large_caller(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
136+
; CHECK-LABEL: large_caller:
137+
; CHECK: @ %bb.0: @ %entry
138+
; CHECK-NEXT: .pad #16
139+
; CHECK-NEXT: sub sp, sp, #16
140+
; CHECK-NEXT: .save {r4, lr}
141+
; CHECK-NEXT: push {r4, lr}
142+
; CHECK-NEXT: add r12, sp, #8
143+
; CHECK-NEXT: add lr, sp, #24
144+
; CHECK-NEXT: stm r12, {r0, r1, r2, r3}
145+
; CHECK-NEXT: add r12, sp, #8
146+
; CHECK-NEXT: add r12, r12, #16
147+
; CHECK-NEXT: ldr r4, [r12], #4
148+
; CHECK-NEXT: str r4, [lr], #4
149+
; CHECK-NEXT: pop {r4, lr}
150+
; CHECK-NEXT: add sp, sp, #16
151+
; CHECK-NEXT: b large_callee
152+
entry:
153+
musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a)
154+
ret void
155+
}
156+
157+
; As above, but with some inline asm to test that the arguments in r0-r3 are
158+
; re-loaded before the call.
159+
define void @large_caller_check_regs(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
160+
; CHECK-LABEL: large_caller_check_regs:
161+
; CHECK: @ %bb.0: @ %entry
162+
; CHECK-NEXT: .pad #16
163+
; CHECK-NEXT: sub sp, sp, #16
164+
; CHECK-NEXT: .save {r4, lr}
165+
; CHECK-NEXT: push {r4, lr}
166+
; CHECK-NEXT: add r12, sp, #8
167+
; CHECK-NEXT: add lr, sp, #24
168+
; CHECK-NEXT: stm r12, {r0, r1, r2, r3}
169+
; CHECK-NEXT: @APP
170+
; CHECK-NEXT: @NO_APP
171+
; CHECK-NEXT: add r3, sp, #8
172+
; CHECK-NEXT: add r0, sp, #8
173+
; CHECK-NEXT: add r12, r0, #16
174+
; CHECK-NEXT: ldm r3, {r0, r1, r2, r3}
175+
; CHECK-NEXT: ldr r4, [r12], #4
176+
; CHECK-NEXT: str r4, [lr], #4
177+
; CHECK-NEXT: pop {r4, lr}
178+
; CHECK-NEXT: add sp, sp, #16
179+
; CHECK-NEXT: b large_callee
180+
entry:
181+
tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
182+
musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %a)
183+
ret void
184+
}
185+
186+
; The IR for this one looks dodgy, because it has an alloca passed to a
187+
; musttail function, but it is passed as a byval argument, so will be copied
188+
; into the stack space allocated by @large_caller_new_value's caller, so is
189+
; valid.
190+
define void @large_caller_new_value(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
191+
; CHECK-LABEL: large_caller_new_value:
192+
; CHECK: @ %bb.0: @ %entry
193+
; CHECK-NEXT: .pad #36
194+
; CHECK-NEXT: sub sp, sp, #36
195+
; CHECK-NEXT: add r12, sp, #20
196+
; CHECK-NEXT: stm r12, {r0, r1, r2, r3}
197+
; CHECK-NEXT: mov r0, #4
198+
; CHECK-NEXT: add r1, sp, #36
199+
; CHECK-NEXT: str r0, [sp, #16]
200+
; CHECK-NEXT: mov r0, #3
201+
; CHECK-NEXT: str r0, [sp, #12]
202+
; CHECK-NEXT: mov r0, #2
203+
; CHECK-NEXT: str r0, [sp, #8]
204+
; CHECK-NEXT: mov r0, #1
205+
; CHECK-NEXT: str r0, [sp, #4]
206+
; CHECK-NEXT: mov r0, #0
207+
; CHECK-NEXT: str r0, [sp]
208+
; CHECK-NEXT: mov r0, sp
209+
; CHECK-NEXT: add r0, r0, #16
210+
; CHECK-NEXT: mov r3, #3
211+
; CHECK-NEXT: ldr r2, [r0], #4
212+
; CHECK-NEXT: str r2, [r1], #4
213+
; CHECK-NEXT: mov r0, #0
214+
; CHECK-NEXT: mov r1, #1
215+
; CHECK-NEXT: mov r2, #2
216+
; CHECK-NEXT: add sp, sp, #36
217+
; CHECK-NEXT: b large_callee
218+
entry:
219+
%y = alloca %twenty_bytes, align 4
220+
store i32 0, ptr %y, align 4
221+
%0 = getelementptr inbounds i8, ptr %y, i32 4
222+
store i32 1, ptr %0, align 4
223+
%1 = getelementptr inbounds i8, ptr %y, i32 8
224+
store i32 2, ptr %1, align 4
225+
%2 = getelementptr inbounds i8, ptr %y, i32 12
226+
store i32 3, ptr %2, align 4
227+
%3 = getelementptr inbounds i8, ptr %y, i32 16
228+
store i32 4, ptr %3, align 4
229+
musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %y)
230+
ret void
231+
}

0 commit comments

Comments
 (0)