-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[PowerPC][AIX] Support ByVals with greater alignment than pointer size #93341
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-backend-powerpc Author: Zarko Todorovski (ZarkoT) Changes: Implementation is NOT compatible with XL. Instead it handles all ByVals This patch was originally written by Sean Fertile @mandlebug. Previously on Phabricator https://reviews.llvm.org/D105659 Full diff: https://github.com/llvm/llvm-project/pull/93341.diff 4 Files Affected:
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 8450ce9e0e3b3..0b628ea2b3f6a 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -6855,7 +6855,8 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
State.getMachineFunction().getSubtarget());
const bool IsPPC64 = Subtarget.isPPC64();
- const Align PtrAlign = IsPPC64 ? Align(8) : Align(4);
+ const Align PtrAlign(PtrSize);
+ const Align StackAlign(16);
const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
if (ValVT == MVT::f128)
@@ -6876,12 +6877,16 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
PPC::V6, PPC::V7, PPC::V8, PPC::V9,
PPC::V10, PPC::V11, PPC::V12, PPC::V13};
+ const ArrayRef<MCPhysReg> GPRs = IsPPC64 ? GPR_64 : GPR_32;
+
if (ArgFlags.isByVal()) {
+ const Align ByValAlign(ArgFlags.getNonZeroByValAlign());
if (ArgFlags.getNonZeroByValAlign() > PtrAlign)
report_fatal_error("Pass-by-value arguments with alignment greater than "
- "register width are not supported.");
+ "16 are not supported.");
const unsigned ByValSize = ArgFlags.getByValSize();
+ const Align ObjAlign = ByValAlign > PtrAlign ? ByValAlign : PtrAlign;
// An empty aggregate parameter takes up no storage and no registers,
// but needs a MemLoc for a stack slot for the formal arguments side.
@@ -6891,11 +6896,23 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
return false;
}
- const unsigned StackSize = alignTo(ByValSize, PtrAlign);
- unsigned Offset = State.AllocateStack(StackSize, PtrAlign);
- for (const unsigned E = Offset + StackSize; Offset < E;
- Offset += PtrAlign.value()) {
- if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32))
+ // Shadow allocate any registers that are not properly aligned.
+ unsigned NextReg = State.getFirstUnallocated(GPRs);
+ while (NextReg != GPRs.size() &&
+ !isGPRShadowAligned(GPRs[NextReg], ObjAlign)) {
+ // Shadow allocate next registers since its aligment is not strict enough.
+ unsigned Reg = State.AllocateReg(GPRs);
+ // Allocate the stack space shadowed by said register.
+ State.AllocateStack(PtrSize, PtrAlign);
+ assert(Reg && "Alocating register unexpectedly failed.");
+ (void)Reg;
+ NextReg = State.getFirstUnallocated(GPRs);
+ }
+
+ const unsigned StackSize = alignTo(ByValSize, ObjAlign);
+ unsigned Offset = State.AllocateStack(StackSize, ObjAlign);
+ for (const unsigned E = Offset + StackSize; Offset < E; Offset += PtrSize) {
+ if (unsigned Reg = State.AllocateReg(GPRs))
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
else {
State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE,
@@ -6917,12 +6934,12 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
[[fallthrough]];
case MVT::i1:
case MVT::i32: {
- const unsigned Offset = State.AllocateStack(PtrAlign.value(), PtrAlign);
+ const unsigned Offset = State.AllocateStack(PtrSize, PtrAlign);
// AIX integer arguments are always passed in register width.
if (ValVT.getFixedSizeInBits() < RegVT.getFixedSizeInBits())
LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
: CCValAssign::LocInfo::ZExt;
- if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32))
+ if (unsigned Reg = State.AllocateReg(GPRs))
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
else
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, RegVT, LocInfo));
@@ -6942,8 +6959,8 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
State.addLoc(CCValAssign::getReg(ValNo, ValVT, FReg, LocVT, LocInfo));
// Reserve and initialize GPRs or initialize the PSA as required.
- for (unsigned I = 0; I < StoreSize; I += PtrAlign.value()) {
- if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) {
+ for (unsigned I = 0; I < StoreSize; I += PtrSize) {
+ if (unsigned Reg = State.AllocateReg(GPRs)) {
assert(FReg && "An FPR should be available when a GPR is reserved.");
if (State.isVarArg()) {
// Successfully reserved GPRs are only initialized for vararg calls.
@@ -6995,9 +7012,6 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
return false;
}
- const unsigned PtrSize = IsPPC64 ? 8 : 4;
- ArrayRef<MCPhysReg> GPRs = IsPPC64 ? GPR_64 : GPR_32;
-
unsigned NextRegIndex = State.getFirstUnallocated(GPRs);
// Burn any underaligned registers and their shadowed stack space until
// we reach the required alignment.
@@ -7347,9 +7361,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
const MCPhysReg ArgReg = VA.getLocReg();
const PPCFrameLowering *FL = Subtarget.getFrameLowering();
- if (Flags.getNonZeroByValAlign() > PtrByteSize)
- report_fatal_error("Over aligned byvals not supported yet.");
-
const unsigned StackSize = alignTo(Flags.getByValSize(), PtrByteSize);
const int FI = MF.getFrameInfo().CreateFixedObject(
StackSize, mapArgRegToOffsetAIX(ArgReg, FL), /* IsImmutable */ false,
diff --git a/llvm/test/CodeGen/PowerPC/aix-cc-byval-limitation3.ll b/llvm/test/CodeGen/PowerPC/aix-cc-byval-limitation3.ll
index bb0231dbf5417..d4d9d67061fcb 100644
--- a/llvm/test/CodeGen/PowerPC/aix-cc-byval-limitation3.ll
+++ b/llvm/test/CodeGen/PowerPC/aix-cc-byval-limitation3.ll
@@ -13,4 +13,4 @@ entry:
declare void @foo(i32, i32, i32, i32, i32, i32, i32, i32, ptr byval(%struct.S) align 32)
-; CHECK: LLVM ERROR: Pass-by-value arguments with alignment greater than register width are not supported.
+; CHECK: LLVM ERROR: Pass-by-value arguments with alignment greater than 16 are not supported.
diff --git a/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll b/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll
new file mode 100644
index 0000000000000..3491779b1002c
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-vector-byval-callee.ll
@@ -0,0 +1,50 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr7 \
+; RUN: -mattr=-altivec -verify-machineinstrs < %s | \
+; RUN: FileCheck --check-prefix=32BIT %s
+
+; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr7 \
+; RUN: -mattr=-altivec -verify-machineinstrs < %s | \
+; RUN: FileCheck --check-prefix=64BIT %s
+
+%struct.vec_struct = type { <4 x i32> }
+
+; Function Attrs: norecurse nounwind readonly
+define i32 @vec_struct_test(i32 %i, %struct.vec_struct* nocapture readonly byval(%struct.vec_struct) align 16 %vs) {
+ ; 32BIT-LABEL: name: vec_struct_test
+
+ ; 32BIT: fixedStack:
+ ; 32BIT: - { id: 0, type: default, offset: 32, size: 16, alignment: 16, stack-id: default,
+ ; 32BIT: isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
+ ; 32BIT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+
+ ; 32BIT: bb.0.entry:
+ ; 32BIT: liveins: $r3, $r5, $r6, $r7, $r8
+ ; 32BIT: renamable $r3 = nsw ADD4 renamable $r5, killed renamable $r3
+ ; 32BIT: STW killed renamable $r7, 8, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 8, align 8)
+ ; 32BIT: STW killed renamable $r6, 4, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 4)
+ ; 32BIT: STW killed renamable $r5, 0, %fixed-stack.0 :: (store (s32) into %fixed-stack.0, align 16)
+ ; 32BIT: STW killed renamable $r8, 12, %fixed-stack.0 :: (store (s32) into %fixed-stack.0 + 12)
+ ; 32BIT: BLR implicit $lr, implicit $rm, implicit $r3
+
+
+ ; 64BIT-LABEL: name: vec_struct_test
+ ; 64BIT: fixedStack:
+ ; 64BIT: - { id: 0, type: default, offset: 64, size: 16, alignment: 16, stack-id: default,
+ ; 64BIT: isImmutable: false, isAliased: true, callee-saved-register: '', callee-saved-restored: true,
+ ; 64BIT: debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+
+ ; 64BIT: bb.0.entry:
+ ; 64BIT: liveins: $x3, $x5, $x6
+ ; 64BIT: STD killed renamable $x5, 0, %fixed-stack.0 :: (store (s64) into %fixed-stack.0, align 16)
+ ; 64BIT: renamable $r4 = LWZ 0, %fixed-stack.0 :: (dereferenceable load (s32) from %ir.vsi1, align 16)
+ ; 64BIT: renamable $r3 = nsw ADD4 killed renamable $r4, renamable $r3, implicit killed $x3, implicit-def $x3
+ ; 64BIT: STD killed renamable $x6, 8, %fixed-stack.0 :: (store (s64) into %fixed-stack.0 + 8)
+ ; 64BIT: BLR8 implicit $lr8, implicit $rm, implicit $x3
+entry:
+ %vsi = getelementptr inbounds %struct.vec_struct, %struct.vec_struct* %vs, i32 0, i32 0
+ %0 = load <4 x i32>, <4 x i32>* %vsi, align 16
+ %vecext = extractelement <4 x i32> %0, i32 0
+ %add = add nsw i32 %vecext, %i
+ ret i32 %add
+}
diff --git a/llvm/test/CodeGen/PowerPC/aix-vector-byval.ll b/llvm/test/CodeGen/PowerPC/aix-vector-byval.ll
new file mode 100644
index 0000000000000..9e4139cdf0c6c
--- /dev/null
+++ b/llvm/test/CodeGen/PowerPC/aix-vector-byval.ll
@@ -0,0 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+; RUN: llc -mtriple powerpc-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr7 \
+; RUN: -mattr=-altivec -verify-machineinstrs < %s | \
+; RUN: FileCheck --check-prefix=32BIT %s
+
+; RUN: llc -mtriple powerpc64-ibm-aix-xcoff -stop-after=machine-cp -mcpu=pwr7 \
+; RUN: -mattr=-altivec -verify-machineinstrs < %s | \
+; RUN: FileCheck --check-prefix=64BIT %s
+
+%struct.B = type { <8 x i16>, i32, i32, [8 x i8] }
+
+; Function Attrs: nounwind
+define i32 @caller() {
+ ; 32BIT-LABEL: name: caller
+ ; 32BIT: bb.0.entry:
+ ; 32BIT: renamable $r3 = LWZ 28, %stack.0.vs :: (load (s32) from unknown-address + 4)
+ ; 32BIT: STW killed renamable $r3, 60, $r1 :: (store (s32) into unknown-address + 4, basealign 16)
+ ; 32BIT: renamable $r3 = LWZ 24, %stack.0.vs :: (load (s32) from %stack.0.vs + 24, align 8, basealign 16)
+ ; 32BIT: STW killed renamable $r3, 56, $r1 :: (store (s32), align 16)
+ ; 32BIT: ADJCALLSTACKDOWN 64, 0, implicit-def dead $r1, implicit $r1
+ ; 32BIT: renamable $r10 = LWZ 20, %stack.0.vs :: (load (s32) from %stack.0.vs + 20)
+ ; 32BIT: renamable $r9 = LWZ 16, %stack.0.vs :: (load (s32) from %stack.0.vs + 16, align 16)
+ ; 32BIT: renamable $r8 = LWZ 12, %stack.0.vs :: (load (s32) from %stack.0.vs + 12)
+ ; 32BIT: renamable $r7 = LWZ 8, %stack.0.vs :: (load (s32) from %stack.0.vs + 8, align 8)
+ ; 32BIT: renamable $r6 = LWZ 4, %stack.0.vs :: (load (s32) from %stack.0.vs + 4)
+ ; 32BIT: renamable $r5 = LWZ 0, %stack.0.vs :: (load (s32) from %stack.0.vs, align 16)
+ ; 32BIT: $r3 = LI 0
+ ; 32BIT: BL_NOP <mcsymbol .vec_struct_test[PR]>, csr_aix32, implicit-def dead $lr, implicit $rm, implicit $r3, implicit $r5, implicit $r6, implicit $r7, implicit $r8, implicit $r9, implicit $r10, implicit $r2, implicit-def $r1, implicit-def $r3
+ ; 32BIT: ADJCALLSTACKUP 64, 0, implicit-def dead $r1, implicit $r1
+ ; 32BIT: BLR implicit $lr, implicit $rm, implicit $r3
+
+ ; 64BIT-LABEL: name: caller
+ ; 64BIT: bb.0.entry:
+ ; 64BIT: ADJCALLSTACKDOWN 112, 0, implicit-def dead $r1, implicit $r1
+ ; 64BIT: renamable $x8 = LD 24, %stack.0.vs :: (load (s64) from %stack.0.vs + 24)
+ ; 64BIT: renamable $x7 = LD 16, %stack.0.vs :: (load (s64) from %stack.0.vs + 16, align 16)
+ ; 64BIT: renamable $x6 = LD 8, %stack.0.vs :: (load (s64) from %stack.0.vs + 8)
+ ; 64BIT: renamable $x5 = LD 0, %stack.0.vs :: (load (s64) from %stack.0.vs, align 16)
+ ; 64BIT: $x3 = LI8 0
+ ; 64BIT: BL8_NOP <mcsymbol .vec_struct_test[PR]>, csr_ppc64, implicit-def dead $lr8, implicit $rm, implicit $x3, implicit $x5, implicit $x6, implicit $x7, implicit $x8, implicit $x2, implicit-def $r1, implicit-def $x3
+ ; 64BIT: ADJCALLSTACKUP 112, 0, implicit-def dead $r1, implicit $r1
+ ; 64BIT: BLR8 implicit $lr8, implicit $rm, implicit $x3
+ entry:
+ %vs = alloca %struct.B, align 16
+ %call = tail call i32 @vec_struct_test(i32 0, %struct.B* nonnull byval(%struct.B) align 16 %vs)
+ ret i32 %call
+}
+
+declare i32 @vec_struct_test(i32, %struct.B* byval(%struct.B) align 16)
|
%struct.vec_struct = type { <4 x i32> } | ||
|
||
; Function Attrs: norecurse nounwind readonly | ||
define i32 @vec_struct_test(i32 %i, %struct.vec_struct* nocapture readonly byval(%struct.vec_struct) align 16 %vs) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Shouldn't the type be ptr
instead of %struct.vec_struct*
?
; 64BIT-NEXT: renamable $r3 = nsw ADD4 renamable $r4, renamable $r3, implicit killed $x3, implicit killed $x4, implicit-def $x3 | ||
; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3 | ||
entry: | ||
%vsi = getelementptr inbounds %struct.vec_struct, %struct.vec_struct* %vs, i32 0, i32 0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think if we want this test to look more like the IR we produce now then we want to use opaque ptrs and i8 geps:
%vsi = getelementptr inbounds %struct.vec_struct, %struct.vec_struct* %vs, i32 0, i32 0 | |
%vsi = getelementptr inbounds i8, ptr %vs, i32 0 |
; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3 | ||
entry: | ||
%vsi = getelementptr inbounds %struct.vec_struct, %struct.vec_struct* %vs, i32 0, i32 0 | ||
%0 = load <4 x i32>, <4 x i32>* %vsi, align 16 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
%0 = load <4 x i32>, <4 x i32>* %vsi, align 16 | |
%0 = load <4 x i32>, ptr %vsi, align 16 |
; 64BIT-NEXT: BLR8 implicit $lr8, implicit $rm, implicit $x3 | ||
entry: | ||
%vs = alloca %struct.B, align 16 | ||
%call = tail call i32 @vec_struct_test(i32 0, %struct.B* nonnull byval(%struct.B) align 16 %vs) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
%call = tail call i32 @vec_struct_test(i32 0, %struct.B* nonnull byval(%struct.B) align 16 %vs) | |
%call = tail call i32 @vec_struct_test(i32 0, ptr nonnull byval(%struct.B) align 16 %vs) |
ret i32 %call | ||
} | ||
|
||
declare i32 @vec_struct_test(i32, %struct.B* byval(%struct.B) align 16) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
declare i32 @vec_struct_test(i32, %struct.B* byval(%struct.B) align 16) | |
declare i32 @vec_struct_test(i32, ptr byval(%struct.B) align 16) |
Worth noting this approach is compatible with gcc on AIX (as well as OpenXL) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
Implementation is NOT compatible with IBM XL C 16.1 and earlier but is compatible with GCC.
It handles all ByVals with greater alignment than pointer width the same way IBM XL C handles ByVals
that have vector members. For overaligned objects that do not contain
vectors IBM XL C does not align them properly if they are passed in the GPR
argument registers.
This patch was originally written by Sean Fertile @mandlebug.
Previously on Phabricator https://reviews.llvm.org/D105659