Skip to content

Commit 308ce8d

Browse files
authored
[ARM] Fix calling convention for __fp16 with big-endian (#126741)
AAPCS32 defines the fp16 and bf16 types as being passed as if they were extended to 32 bits, with the high 16 bits unspecified. The extension is specified as happening as if it were done in a register, which means that on big-endian targets the actual value is passed in the higher-addressed half of the stack slot, rather than in the lower-addressed half as on little-endian targets. Previously, for targets with the fp16 extension, we passed these types in a 16-bit stack slot. That happened to work for little-endian targets, because every later stack slot would be 4-byte aligned, leaving the required 2-byte gap, but it was incorrect for big-endian targets.
1 parent 298caeb commit 308ce8d

File tree

5 files changed

+787
-3
lines changed

5 files changed

+787
-3
lines changed

llvm/lib/Target/ARM/ARMCallingConv.cpp

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -298,7 +298,8 @@ static bool CustomAssignInRegList(unsigned ValNo, MVT ValVT, MVT LocVT,
298298
static bool CC_ARM_AAPCS_Custom_f16(unsigned ValNo, MVT ValVT, MVT LocVT,
299299
CCValAssign::LocInfo LocInfo,
300300
ISD::ArgFlagsTy ArgFlags, CCState &State) {
301-
// f16 arguments are extended to i32 and assigned to a register in [r0, r3]
301+
// f16 and bf16 arguments are extended to i32 and assigned to a register in
302+
// [r0, r3].
302303
return CustomAssignInRegList(ValNo, ValVT, MVT::i32, LocInfo, State,
303304
RRegList);
304305
}
@@ -307,10 +308,25 @@ static bool CC_ARM_AAPCS_VFP_Custom_f16(unsigned ValNo, MVT ValVT, MVT LocVT,
307308
CCValAssign::LocInfo LocInfo,
308309
ISD::ArgFlagsTy ArgFlags,
309310
CCState &State) {
310-
// f16 arguments are extended to f32 and assigned to a register in [s0, s15]
311+
// f16 and bf16 arguments are extended to f32 and assigned to a register in
312+
// [s0, s15].
311313
return CustomAssignInRegList(ValNo, ValVT, MVT::f32, LocInfo, State,
312314
SRegList);
313315
}
314316

317+
static bool CC_ARM_AAPCS_Common_Custom_f16_Stack(unsigned ValNo, MVT ValVT,
318+
MVT LocVT,
319+
CCValAssign::LocInfo LocInfo,
320+
ISD::ArgFlagsTy ArgFlags,
321+
CCState &State) {
322+
// f16 and bf16 (if not passed in a register) are assigned to a 32-bit stack
323+
// slot, with the most-significant 16 bits unspecified. The 32-bit slot is
324+
// important to make sure that the byte ordering is correct for big endian
325+
// targets.
326+
State.addLoc(CCValAssign::getCustomMem(
327+
ValNo, ValVT, State.AllocateStack(4, Align(4)), MVT::i32, LocInfo));
328+
return true;
329+
}
330+
315331
// Include the table generated calling convention implementations.
316332
#include "ARMGenCallingConv.inc"

llvm/lib/Target/ARM/ARMCallingConv.td

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,8 @@ def CC_ARM_AAPCS_Common : CallingConv<[
139139

140140
CCIfType<[i32], CCIfAlign<"8", CCAssignToStackWithShadow<4, 8, [R0, R1, R2, R3]>>>,
141141
CCIfType<[i32], CCAssignToStackWithShadow<4, 4, [R0, R1, R2, R3]>>,
142-
CCIfType<[f16, bf16, f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
142+
CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
143+
CCIfType<[f16, bf16], CCCustom<"CC_ARM_AAPCS_Common_Custom_f16_Stack">>,
143144
CCIfType<[f64], CCAssignToStackWithShadow<8, 8, [Q0, Q1, Q2, Q3]>>,
144145
CCIfType<[v2f64], CCIfAlign<"16",
145146
CCAssignToStackWithShadow<16, 16, [Q0, Q1, Q2, Q3]>>>,

llvm/lib/Target/ARM/ARMISelLowering.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4759,6 +4759,25 @@ SDValue ARMTargetLowering::LowerFormalArguments(
47594759
VA.getLocMemOffset(), Flags.getByValSize());
47604760
InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
47614761
CCInfo.nextInRegsParam();
4762+
} else if (VA.needsCustom() && (VA.getValVT() == MVT::f16 ||
4763+
VA.getValVT() == MVT::bf16)) {
4764+
// f16 and bf16 values are passed in the least-significant half of
4765+
// a 4 byte stack slot. This is done as-if the extension was done
4766+
// in a 32-bit register, so the actual bytes used for the value
4767+
// differ between little and big endian.
4768+
assert(VA.getLocVT().getSizeInBits() == 32);
4769+
unsigned FIOffset = VA.getLocMemOffset();
4770+
int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits() / 8,
4771+
FIOffset, true);
4772+
4773+
SDValue Addr = DAG.getFrameIndex(FI, PtrVT);
4774+
if (DAG.getDataLayout().isBigEndian())
4775+
Addr = DAG.getObjectPtrOffset(dl, Addr, TypeSize::getFixed(2));
4776+
4777+
InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, Addr,
4778+
MachinePointerInfo::getFixedStack(
4779+
DAG.getMachineFunction(), FI)));
4780+
47624781
} else {
47634782
unsigned FIOffset = VA.getLocMemOffset();
47644783
int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,

0 commit comments

Comments
 (0)