Skip to content

Commit 914a399

Browse files
committed
[ARM] Avoid clobbering byval arguments when passing to tail-calls
When passing byval arguments to tail-calls, we need to store them into the stack memory in which this the caller received it's arguments. If any of the outgoing arguments are forwarded from incoming byval arguments, then the source of the copy is from the same stack memory. This can result in the copy corrupting a value which is still to be read. The fix is to first make a copy of the outgoing byval arguments in local stack space, and then copy them to their final location. This fixes the correctness issue, but results in extra copying, which could be optimised.
1 parent a96c14e commit 914a399

File tree

2 files changed

+183
-28
lines changed

2 files changed

+183
-28
lines changed

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2380,6 +2380,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
23802380

23812381
MachineFunction &MF = DAG.getMachineFunction();
23822382
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2383+
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
23832384
MachineFunction::CallSiteInfo CSInfo;
23842385
bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
23852386
bool isThisReturn = false;
@@ -2492,6 +2493,45 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
24922493
RegsToPassVector RegsToPass;
24932494
SmallVector<SDValue, 8> MemOpChains;
24942495

2496+
// If we are doing a tail-call, any byval arguments will be written to stack
2497+
// space which was used for incoming arguments. If any the values being used
2498+
// are incoming byval arguments to this function, then they might be
2499+
// overwritten by the stores of the outgoing arguments. To avoid this, we
2500+
// need to make a temporary copy of them in local stack space, then copy back
2501+
// to the argument area.
2502+
// TODO This could be optimised to skip byvals which are already being copied
2503+
// from local stack space, or which are copied from the incoming stack at the
2504+
// exact same location.
2505+
DenseMap<unsigned, SDValue> ByValTemporaries;
2506+
SDValue ByValTempChain;
2507+
if (isTailCall) {
2508+
for (unsigned ArgIdx = 0, e = OutVals.size(); ArgIdx != e; ++ArgIdx) {
2509+
SDValue Arg = OutVals[ArgIdx];
2510+
ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
2511+
2512+
if (Flags.isByVal()) {
2513+
int FrameIdx = MFI.CreateStackObject(
2514+
Flags.getByValSize(), Flags.getNonZeroByValAlign(), false);
2515+
SDValue Dst =
2516+
DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
2517+
2518+
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2519+
SDValue AlignNode =
2520+
DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2521+
2522+
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2523+
SDValue Ops[] = { Chain, Dst, Arg, SizeNode, AlignNode};
2524+
MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2525+
Ops));
2526+
ByValTemporaries[ArgIdx] = Dst;
2527+
}
2528+
}
2529+
if (!MemOpChains.empty()) {
2530+
ByValTempChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2531+
MemOpChains.clear();
2532+
}
2533+
}
2534+
24952535
// During a tail call, stores to the argument area must happen after all of
24962536
// the function's incoming arguments have been loaded because they may alias.
24972537
// This is done by folding in a TokenFactor from LowerFormalArguments, but
@@ -2529,6 +2569,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
25292569

25302570
if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
25312571
Chain = DAG.getStackArgumentTokenFactor(Chain);
2572+
if (ByValTempChain)
2573+
Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
2574+
ByValTempChain);
25322575
AfterFormalArgLoads = true;
25332576
}
25342577

@@ -2600,6 +2643,12 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
26002643
unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
26012644
unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
26022645

2646+
SDValue ByValSrc;
2647+
if (ByValTemporaries.contains(realArgIdx))
2648+
ByValSrc = ByValTemporaries[realArgIdx];
2649+
else
2650+
ByValSrc = Arg;
2651+
26032652
if (CurByValIdx < ByValArgsCount) {
26042653

26052654
unsigned RegBegin, RegEnd;
@@ -2610,7 +2659,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
26102659
unsigned int i, j;
26112660
for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
26122661
SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2613-
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
2662+
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, Const);
26142663
SDValue Load =
26152664
DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
26162665
DAG.InferPtrAlign(AddArg));
@@ -2632,7 +2681,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
26322681
std::tie(Dst, DstInfo) =
26332682
computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
26342683
SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2635-
SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
2684+
SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, SrcOffset);
26362685
SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
26372686
MVT::i32);
26382687
SDValue AlignNode =

llvm/test/CodeGen/ARM/musttail.ll

Lines changed: 132 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -139,13 +139,28 @@ define void @large_caller(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
139139
; CHECK-NEXT: sub sp, sp, #16
140140
; CHECK-NEXT: .save {r4, lr}
141141
; CHECK-NEXT: push {r4, lr}
142-
; CHECK-NEXT: add r12, sp, #8
143-
; CHECK-NEXT: add lr, sp, #24
142+
; CHECK-NEXT: .pad #20
143+
; CHECK-NEXT: sub sp, sp, #20
144+
; CHECK-NEXT: add r12, sp, #28
145+
; CHECK-NEXT: add lr, sp, #44
144146
; CHECK-NEXT: stm r12, {r0, r1, r2, r3}
145-
; CHECK-NEXT: add r12, sp, #8
146-
; CHECK-NEXT: add r12, r12, #16
147+
; CHECK-NEXT: add r0, sp, #28
148+
; CHECK-NEXT: mov r1, sp
149+
; CHECK-NEXT: ldr r2, [r0], #4
150+
; CHECK-NEXT: add r12, r1, #16
151+
; CHECK-NEXT: str r2, [r1], #4
152+
; CHECK-NEXT: ldr r2, [r0], #4
153+
; CHECK-NEXT: str r2, [r1], #4
154+
; CHECK-NEXT: ldr r2, [r0], #4
155+
; CHECK-NEXT: str r2, [r1], #4
156+
; CHECK-NEXT: ldr r2, [r0], #4
157+
; CHECK-NEXT: str r2, [r1], #4
158+
; CHECK-NEXT: ldr r2, [r0], #4
159+
; CHECK-NEXT: str r2, [r1], #4
160+
; CHECK-NEXT: ldm sp, {r0, r1, r2, r3}
147161
; CHECK-NEXT: ldr r4, [r12], #4
148162
; CHECK-NEXT: str r4, [lr], #4
163+
; CHECK-NEXT: add sp, sp, #20
149164
; CHECK-NEXT: pop {r4, lr}
150165
; CHECK-NEXT: add sp, sp, #16
151166
; CHECK-NEXT: b large_callee
@@ -163,17 +178,30 @@ define void @large_caller_check_regs(%twenty_bytes* byval(%twenty_bytes) align 4
163178
; CHECK-NEXT: sub sp, sp, #16
164179
; CHECK-NEXT: .save {r4, lr}
165180
; CHECK-NEXT: push {r4, lr}
166-
; CHECK-NEXT: add r12, sp, #8
167-
; CHECK-NEXT: add lr, sp, #24
181+
; CHECK-NEXT: .pad #20
182+
; CHECK-NEXT: sub sp, sp, #20
183+
; CHECK-NEXT: add r12, sp, #28
184+
; CHECK-NEXT: add lr, sp, #44
168185
; CHECK-NEXT: stm r12, {r0, r1, r2, r3}
169186
; CHECK-NEXT: @APP
170187
; CHECK-NEXT: @NO_APP
171-
; CHECK-NEXT: add r3, sp, #8
172-
; CHECK-NEXT: add r0, sp, #8
173-
; CHECK-NEXT: add r12, r0, #16
174-
; CHECK-NEXT: ldm r3, {r0, r1, r2, r3}
188+
; CHECK-NEXT: add r0, sp, #28
189+
; CHECK-NEXT: mov r1, sp
190+
; CHECK-NEXT: add r12, r1, #16
191+
; CHECK-NEXT: ldr r2, [r0], #4
192+
; CHECK-NEXT: str r2, [r1], #4
193+
; CHECK-NEXT: ldr r2, [r0], #4
194+
; CHECK-NEXT: str r2, [r1], #4
195+
; CHECK-NEXT: ldr r2, [r0], #4
196+
; CHECK-NEXT: str r2, [r1], #4
197+
; CHECK-NEXT: ldr r2, [r0], #4
198+
; CHECK-NEXT: str r2, [r1], #4
199+
; CHECK-NEXT: ldr r2, [r0], #4
200+
; CHECK-NEXT: str r2, [r1], #4
201+
; CHECK-NEXT: ldm sp, {r0, r1, r2, r3}
175202
; CHECK-NEXT: ldr r4, [r12], #4
176203
; CHECK-NEXT: str r4, [lr], #4
204+
; CHECK-NEXT: add sp, sp, #20
177205
; CHECK-NEXT: pop {r4, lr}
178206
; CHECK-NEXT: add sp, sp, #16
179207
; CHECK-NEXT: b large_callee
@@ -190,30 +218,44 @@ entry:
190218
define void @large_caller_new_value(%twenty_bytes* byval(%twenty_bytes) align 4 %a) {
191219
; CHECK-LABEL: large_caller_new_value:
192220
; CHECK: @ %bb.0: @ %entry
193-
; CHECK-NEXT: .pad #36
194-
; CHECK-NEXT: sub sp, sp, #36
195-
; CHECK-NEXT: add r12, sp, #20
221+
; CHECK-NEXT: .pad #16
222+
; CHECK-NEXT: sub sp, sp, #16
223+
; CHECK-NEXT: .save {r4, lr}
224+
; CHECK-NEXT: push {r4, lr}
225+
; CHECK-NEXT: .pad #40
226+
; CHECK-NEXT: sub sp, sp, #40
227+
; CHECK-NEXT: add r12, sp, #48
228+
; CHECK-NEXT: add lr, sp, #64
196229
; CHECK-NEXT: stm r12, {r0, r1, r2, r3}
197230
; CHECK-NEXT: mov r0, #4
198-
; CHECK-NEXT: add r1, sp, #36
199-
; CHECK-NEXT: str r0, [sp, #16]
231+
; CHECK-NEXT: mov r1, sp
232+
; CHECK-NEXT: str r0, [sp, #36]
200233
; CHECK-NEXT: mov r0, #3
201-
; CHECK-NEXT: str r0, [sp, #12]
234+
; CHECK-NEXT: str r0, [sp, #32]
202235
; CHECK-NEXT: mov r0, #2
203-
; CHECK-NEXT: str r0, [sp, #8]
236+
; CHECK-NEXT: str r0, [sp, #28]
204237
; CHECK-NEXT: mov r0, #1
205-
; CHECK-NEXT: str r0, [sp, #4]
238+
; CHECK-NEXT: str r0, [sp, #24]
206239
; CHECK-NEXT: mov r0, #0
207-
; CHECK-NEXT: str r0, [sp]
208-
; CHECK-NEXT: mov r0, sp
209-
; CHECK-NEXT: add r0, r0, #16
210-
; CHECK-NEXT: mov r3, #3
240+
; CHECK-NEXT: str r0, [sp, #20]
241+
; CHECK-NEXT: add r0, sp, #20
242+
; CHECK-NEXT: add r12, r1, #16
211243
; CHECK-NEXT: ldr r2, [r0], #4
212244
; CHECK-NEXT: str r2, [r1], #4
213-
; CHECK-NEXT: mov r0, #0
214-
; CHECK-NEXT: mov r1, #1
215-
; CHECK-NEXT: mov r2, #2
216-
; CHECK-NEXT: add sp, sp, #36
245+
; CHECK-NEXT: ldr r2, [r0], #4
246+
; CHECK-NEXT: str r2, [r1], #4
247+
; CHECK-NEXT: ldr r2, [r0], #4
248+
; CHECK-NEXT: str r2, [r1], #4
249+
; CHECK-NEXT: ldr r2, [r0], #4
250+
; CHECK-NEXT: str r2, [r1], #4
251+
; CHECK-NEXT: ldr r2, [r0], #4
252+
; CHECK-NEXT: str r2, [r1], #4
253+
; CHECK-NEXT: ldm sp, {r0, r1, r2, r3}
254+
; CHECK-NEXT: ldr r4, [r12], #4
255+
; CHECK-NEXT: str r4, [lr], #4
256+
; CHECK-NEXT: add sp, sp, #40
257+
; CHECK-NEXT: pop {r4, lr}
258+
; CHECK-NEXT: add sp, sp, #16
217259
; CHECK-NEXT: b large_callee
218260
entry:
219261
%y = alloca %twenty_bytes, align 4
@@ -229,3 +271,67 @@ entry:
229271
musttail call void @large_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %y)
230272
ret void
231273
}
274+
275+
declare void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4, %twenty_bytes* byval(%twenty_bytes) align 4)
276+
define void @swap_byvals(%twenty_bytes* byval(%twenty_bytes) align 4 %a, %twenty_bytes* byval(%twenty_bytes) align 4 %b) {
277+
; CHECK-LABEL: swap_byvals:
278+
; CHECK: @ %bb.0: @ %entry
279+
; CHECK-NEXT: .pad #16
280+
; CHECK-NEXT: sub sp, sp, #16
281+
; CHECK-NEXT: .save {r4, r5, r11, lr}
282+
; CHECK-NEXT: push {r4, r5, r11, lr}
283+
; CHECK-NEXT: .pad #40
284+
; CHECK-NEXT: sub sp, sp, #40
285+
; CHECK-NEXT: add r12, sp, #56
286+
; CHECK-NEXT: add lr, sp, #20
287+
; CHECK-NEXT: stm r12, {r0, r1, r2, r3}
288+
; CHECK-NEXT: add r0, sp, #56
289+
; CHECK-NEXT: mov r12, sp
290+
; CHECK-NEXT: ldr r1, [r0], #4
291+
; CHECK-NEXT: mov r2, r12
292+
; CHECK-NEXT: str r1, [r2], #4
293+
; CHECK-NEXT: add r3, sp, #20
294+
; CHECK-NEXT: ldr r1, [r0], #4
295+
; CHECK-NEXT: add r4, sp, #76
296+
; CHECK-NEXT: str r1, [r2], #4
297+
; CHECK-NEXT: ldr r1, [r0], #4
298+
; CHECK-NEXT: str r1, [r2], #4
299+
; CHECK-NEXT: ldr r1, [r0], #4
300+
; CHECK-NEXT: str r1, [r2], #4
301+
; CHECK-NEXT: ldr r1, [r0], #4
302+
; CHECK-NEXT: add r0, sp, #76
303+
; CHECK-NEXT: str r1, [r2], #4
304+
; CHECK-NEXT: mov r2, lr
305+
; CHECK-NEXT: ldr r1, [r0], #4
306+
; CHECK-NEXT: str r1, [r2], #4
307+
; CHECK-NEXT: ldr r1, [r0], #4
308+
; CHECK-NEXT: str r1, [r2], #4
309+
; CHECK-NEXT: ldr r1, [r0], #4
310+
; CHECK-NEXT: str r1, [r2], #4
311+
; CHECK-NEXT: ldr r1, [r0], #4
312+
; CHECK-NEXT: str r1, [r2], #4
313+
; CHECK-NEXT: ldr r1, [r0], #4
314+
; CHECK-NEXT: str r1, [r2], #4
315+
; CHECK-NEXT: ldm r3, {r0, r1, r2, r3}
316+
; CHECK-NEXT: ldr r5, [r12], #4
317+
; CHECK-NEXT: str r5, [r4], #4
318+
; CHECK-NEXT: ldr r5, [r12], #4
319+
; CHECK-NEXT: str r5, [r4], #4
320+
; CHECK-NEXT: ldr r5, [r12], #4
321+
; CHECK-NEXT: str r5, [r4], #4
322+
; CHECK-NEXT: ldr r5, [r12], #4
323+
; CHECK-NEXT: str r5, [r4], #4
324+
; CHECK-NEXT: ldr r5, [r12], #4
325+
; CHECK-NEXT: str r5, [r4], #4
326+
; CHECK-NEXT: add r5, lr, #16
327+
; CHECK-NEXT: add r12, sp, #72
328+
; CHECK-NEXT: ldr r4, [r5], #4
329+
; CHECK-NEXT: str r4, [r12], #4
330+
; CHECK-NEXT: add sp, sp, #40
331+
; CHECK-NEXT: pop {r4, r5, r11, lr}
332+
; CHECK-NEXT: add sp, sp, #16
333+
; CHECK-NEXT: b two_byvals_callee
334+
entry:
335+
musttail call void @two_byvals_callee(%twenty_bytes* byval(%twenty_bytes) align 4 %b, %twenty_bytes* byval(%twenty_bytes) align 4 %a)
336+
ret void
337+
}

0 commit comments

Comments
 (0)