[AArch64][GlobalISel] Implement selectVaStartAAPCS #106979
Conversation
This commit adds the missing support for varargs in the instruction selection pass for AAPCS; previously this was implemented only for Darwin. The implementation follows the AAPCS and SelectionDAG's LowerAAPCS_VASTART. It resolves all VA_START fallbacks in RAJAperf, llvm-test-suite, and SPEC CPU2017; these benchmarks now compile and pass without varargs-related fallbacks.
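For context, here is a minimal sketch (a hypothetical example, not part of the patch) of the kind of variadic function whose va_start previously triggered a GlobalISel fallback on AAPCS targets:

#include <cstdarg>

// Any use of va_start in a variadic function lowers to G_VASTART, which
// selectVaStartAAPCS now selects on non-Darwin, non-Win64 AArch64 targets.
int sum(int count, ...) {
  va_list ap;
  va_start(ap, count);
  int total = 0;
  for (int i = 0; i < count; ++i)
    total += va_arg(ap, int);
  va_end(ap);
  return total;
}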
@llvm/pr-subscribers-backend-aarch64, @llvm/pr-subscribers-llvm-globalisel — Author: Him188 (Him188). Patch is 42.26 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/106979.diff. 3 Files Affected:
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index e9e6b6cb68d0d1..29d1392b51090c 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -1994,7 +1994,108 @@ bool AArch64InstructionSelector::selectVectorAshrLshr(
bool AArch64InstructionSelector::selectVaStartAAPCS(
MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
- return false;
+
+ if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64(
+ MF.getFunction().getCallingConv(), MF.getFunction().isVarArg()))
+ return false;
+
+ // The layout of the va_list struct is specified in the AArch64 Procedure Call
+ // Standard, section 10.1.5.
+
+ const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ const unsigned PtrSize = STI.isTargetILP32() ? 4 : 8;
+ const auto *PtrRegClass =
+ STI.isTargetILP32() ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass;
+
+ const MCInstrDesc &MCIDAddAddr =
+ TII.get(STI.isTargetILP32() ? AArch64::ADDWri : AArch64::ADDXri);
+ const MCInstrDesc &MCIDStoreAddr =
+ TII.get(STI.isTargetILP32() ? AArch64::STRWui : AArch64::STRXui);
+
+ /*
+ * typedef struct va_list {
+ * void * stack; // next stack param
+ * void * gr_top; // end of GP arg reg save area
+ * void * vr_top; // end of FP/SIMD arg reg save area
+ * int gr_offs; // offset from gr_top to next GP register arg
+ * int vr_offs; // offset from vr_top to next FP/SIMD register arg
+ * } va_list;
+ */
+ const auto VAList = I.getOperand(0).getReg();
+
+ // Our current offset in bytes from the va_list struct (VAList).
+ unsigned OffsetBytes = 0;
+
+ // Helper function to store (FrameIndex + Imm) to VAList at offset OffsetBytes
+ // and increment OffsetBytes by PtrSize.
+ const auto PushAddress = [&](const int FrameIndex, const int64_t Imm) {
+ const Register Top = MRI.createVirtualRegister(PtrRegClass);
+ auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), MCIDAddAddr)
+ .addDef(Top)
+ .addFrameIndex(FrameIndex)
+ .addImm(Imm)
+ .addImm(0);
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+
+ MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), MCIDStoreAddr)
+ .addUse(Top)
+ .addUse(VAList)
+ .addImm(OffsetBytes / PtrSize)
+ .addMemOperand(MF.getMachineMemOperand(
+ (*I.memoperands_begin())
+ ->getPointerInfo()
+ .getWithOffset(OffsetBytes),
+ MachineMemOperand::MOStore, PtrSize, Align(PtrSize)));
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+
+ OffsetBytes += PtrSize;
+ };
+
+ // void* stack at offset 0
+ PushAddress(FuncInfo->getVarArgsStackIndex(), 0);
+
+ // void* gr_top at offset 8 (4 on ILP32)
+ const unsigned GPRSize = FuncInfo->getVarArgsGPRSize();
+ PushAddress(FuncInfo->getVarArgsGPRIndex(), GPRSize);
+
+ // void* vr_top at offset 16 (8 on ILP32)
+ const unsigned FPRSize = FuncInfo->getVarArgsFPRSize();
+ PushAddress(FuncInfo->getVarArgsFPRIndex(), FPRSize);
+
+ // Helper function to store a 4-byte integer constant to VAList at offset
+ // OffsetBytes, and increment OffsetBytes by 4.
+ const auto PushIntConstant = [&](const int32_t Value) {
+ constexpr int IntSize = 4;
+ const Register Temp = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+ auto MIB =
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::MOVi32imm))
+ .addDef(Temp)
+ .addImm(Value);
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+
+ MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRWui))
+ .addUse(Temp)
+ .addUse(VAList)
+ .addImm(OffsetBytes / IntSize)
+ .addMemOperand(MF.getMachineMemOperand(
+ (*I.memoperands_begin())
+ ->getPointerInfo()
+ .getWithOffset(OffsetBytes),
+ MachineMemOperand::MOStore, IntSize, Align(IntSize)));
+ constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ OffsetBytes += IntSize;
+ };
+
+ // int gr_offs at offset 24 (12 on ILP32)
+ PushIntConstant(-static_cast<int32_t>(GPRSize));
+
+ // int vr_offs at offset 28 (16 on ILP32)
+ PushIntConstant(-static_cast<int32_t>(FPRSize));
+
+ assert(OffsetBytes == (STI.isTargetILP32() ? 20 : 32) && "Unexpected offset");
+
+ I.eraseFromParent();
+ return true;
}
bool AArch64InstructionSelector::selectVaStartDarwin(
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/vararg.mir b/llvm/test/CodeGen/AArch64/GlobalISel/vararg.mir
new file mode 100644
index 00000000000000..50911582eef0bc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/vararg.mir
@@ -0,0 +1,325 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -O0 -mtriple=aarch64-unknown-linux -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK
+
+--- |
+ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+ target triple = "aarch64-unknown-linux-gnu"
+
+ %struct.__va_list = type { ptr, ptr, ptr, i32, i32 }
+
+ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
+
+ declare void @llvm.va_start.p0(ptr) #1
+
+ declare void @llvm.va_end.p0(ptr) #1
+
+ declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
+
+ define float @vararg(ptr %a, ...) #2 {
+ entry:
+ %ap = alloca %struct.__va_list, align 8
+ call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %ap)
+ call void @llvm.va_start.p0(ptr nonnull %ap)
+ %vr_offs_p = getelementptr inbounds i8, ptr %ap, i64 28
+ %vr_offs = load i32, ptr %vr_offs_p, align 4
+ %0 = sext i32 %vr_offs to i64
+ %1 = icmp sgt i32 %vr_offs, -1
+ br i1 %1, label %vaarg.on_stack, label %vaarg.maybe_reg
+
+ vaarg.maybe_reg: ; preds = %entry
+ %2 = trunc i64 %0 to i32
+ %3 = trunc i64 %0 to i32
+ %new_reg_offs = add nsw i32 %2, 16
+ %sunkaddr = getelementptr inbounds i8, ptr %ap, i64 28
+ store i32 %new_reg_offs, ptr %sunkaddr, align 4
+ %inreg = icmp ult i32 %3, -15
+ br i1 %inreg, label %vaarg.in_reg, label %vaarg.on_stack
+
+ vaarg.in_reg: ; preds = %vaarg.maybe_reg
+ %reg_top_p = getelementptr inbounds i8, ptr %ap, i64 16
+ %reg_top = load ptr, ptr %reg_top_p, align 8
+ %4 = getelementptr inbounds i8, ptr %reg_top, i64 %0
+ br label %vaarg.end
+
+ vaarg.on_stack: ; preds = %vaarg.maybe_reg, %entry
+ %stack = load ptr, ptr %ap, align 8
+ %new_stack = getelementptr inbounds i8, ptr %stack, i64 8
+ store ptr %new_stack, ptr %ap, align 8
+ br label %vaarg.end
+
+ vaarg.end: ; preds = %vaarg.on_stack, %vaarg.in_reg
+ %vaargs.addr = phi ptr [ %4, %vaarg.in_reg ], [ %stack, %vaarg.on_stack ]
+ %5 = load double, ptr %vaargs.addr, align 8
+ %conv = fptrunc double %5 to float
+ call void @llvm.va_end.p0(ptr nonnull %ap)
+ call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %ap)
+ ret float %conv
+ }
+
+ attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) "target-cpu"="neoverse-v2" }
+ attributes #1 = { nocallback nofree nosync nounwind willreturn "target-cpu"="neoverse-v2" }
+ attributes #2 = { uwtable "frame-pointer"="all" "target-cpu"="neoverse-v2" }
+
+...
+---
+name: vararg
+alignment: 16
+exposesReturnsTwice: false
+legalized: true
+regBankSelected: true
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+callsEHReturn: false
+callsUnwindInit: false
+hasEHCatchret: false
+hasEHScopes: false
+hasEHFunclets: false
+isOutlined: false
+debugInstrRef: false
+failsVerification: false
+tracksDebugUserValues: false
+registers:
+liveins:
+frameInfo:
+ isFrameAddressTaken: false
+ isReturnAddressTaken: false
+ hasStackMap: false
+ hasPatchPoint: false
+ stackSize: 0
+ offsetAdjustment: 0
+ maxAlignment: 16
+ adjustsStack: false
+ hasCalls: false
+ stackProtector: ''
+ functionContext: ''
+ maxCallFrameSize: 4294967295
+ cvBytesOfCalleeSavedRegisters: 0
+ hasOpaqueSPAdjustment: false
+ hasVAStart: false
+ hasMustTailInVarArgFunc: false
+ hasTailCall: false
+ isCalleeSavedInfoValid: false
+ localFrameSize: 0
+ savePoint: ''
+ restorePoint: ''
+fixedStack:
+ - { id: 0, type: default, offset: 0, size: 4, alignment: 16, stack-id: default,
+ isImmutable: true, isAliased: false, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+stack:
+ - { id: 0, name: '', type: default, offset: 0, size: 56, alignment: 8,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 1, name: '', type: default, offset: 0, size: 128, alignment: 16,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+ - { id: 2, name: ap, type: default, offset: 0, size: 32, alignment: 8,
+ stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+ debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+entry_values: []
+callSites: []
+debugValueSubstitutions: []
+constants: []
+machineFunctionInfo: {}
+body: |
+ ; CHECK-LABEL: name: vararg
+ ; CHECK: bb.0.entry:
+ ; CHECK-NEXT: successors: %bb.3(0x50000000), %bb.1(0x30000000)
+ ; CHECK-NEXT: liveins: $q0, $q1, $q2, $q3, $q4, $q5, $q6, $q7, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr64 = COPY $x1
+ ; CHECK-NEXT: STRXui [[COPY]], %stack.0, 0 :: (store (s64) into stack + 8, align 1)
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gpr64 = COPY $x2
+ ; CHECK-NEXT: STRXui [[COPY1]], %stack.0, 1 :: (store (s64) into stack + 16, align 1)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gpr64 = COPY $x3
+ ; CHECK-NEXT: STRXui [[COPY2]], %stack.0, 2 :: (store (s64) into stack + 24, align 1)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:gpr64 = COPY $x4
+ ; CHECK-NEXT: STRXui [[COPY3]], %stack.0, 3 :: (store (s64) into stack + 32, align 1)
+ ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gpr64 = COPY $x5
+ ; CHECK-NEXT: STRXui [[COPY4]], %stack.0, 4 :: (store (s64) into stack + 40, align 1)
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:gpr64 = COPY $x6
+ ; CHECK-NEXT: STRXui [[COPY5]], %stack.0, 5 :: (store (s64) into stack + 48, align 1)
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gpr64 = COPY $x7
+ ; CHECK-NEXT: STRXui [[COPY6]], %stack.0, 6 :: (store (s64) into stack + 56, align 1)
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:fpr128 = COPY $q0
+ ; CHECK-NEXT: STRQui [[COPY7]], %stack.1, 0 :: (store (s128) into stack, align 1)
+ ; CHECK-NEXT: [[COPY8:%[0-9]+]]:fpr128 = COPY $q1
+ ; CHECK-NEXT: STRQui [[COPY8]], %stack.1, 1 :: (store (s128) into stack + 16, align 1)
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:fpr128 = COPY $q2
+ ; CHECK-NEXT: STRQui [[COPY9]], %stack.1, 2 :: (store (s128) into stack + 32, align 1)
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:fpr128 = COPY $q3
+ ; CHECK-NEXT: STRQui [[COPY10]], %stack.1, 3 :: (store (s128) into stack + 48, align 1)
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:fpr128 = COPY $q4
+ ; CHECK-NEXT: STRQui [[COPY11]], %stack.1, 4 :: (store (s128) into stack + 64, align 1)
+ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:fpr128 = COPY $q5
+ ; CHECK-NEXT: STRQui [[COPY12]], %stack.1, 5 :: (store (s128) into stack + 80, align 1)
+ ; CHECK-NEXT: [[COPY13:%[0-9]+]]:fpr128 = COPY $q6
+ ; CHECK-NEXT: STRQui [[COPY13]], %stack.1, 6 :: (store (s128) into stack + 96, align 1)
+ ; CHECK-NEXT: [[COPY14:%[0-9]+]]:fpr128 = COPY $q7
+ ; CHECK-NEXT: STRQui [[COPY14]], %stack.1, 7 :: (store (s128) into stack + 112, align 1)
+ ; CHECK-NEXT: LIFETIME_START %stack.2.ap
+ ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.2.ap, 0, 0
+ ; CHECK-NEXT: [[ADDXri1:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0
+ ; CHECK-NEXT: STRXui [[ADDXri1]], [[ADDXri]], 0 :: (store (s64) into %ir.ap)
+ ; CHECK-NEXT: [[ADDXri2:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0
+ ; CHECK-NEXT: STRXui [[ADDXri2]], [[ADDXri]], 1 :: (store (s64) into %ir.ap + 8)
+ ; CHECK-NEXT: [[ADDXri3:%[0-9]+]]:gpr64common = ADDXri %stack.0, 0, 0
+ ; CHECK-NEXT: STRXui [[ADDXri3]], [[ADDXri]], 2 :: (store (s64) into %ir.ap + 16)
+ ; CHECK-NEXT: [[MOVi32imm:%[0-9]+]]:gpr32 = MOVi32imm 0
+ ; CHECK-NEXT: STRWui [[MOVi32imm]], [[ADDXri]], 6 :: (store (s32) into %ir.ap + 24)
+ ; CHECK-NEXT: [[MOVi32imm1:%[0-9]+]]:gpr32 = MOVi32imm 0
+ ; CHECK-NEXT: STRWui [[MOVi32imm1]], [[ADDXri]], 7 :: (store (s32) into %ir.ap + 28)
+ ; CHECK-NEXT: [[LDRSWui:%[0-9]+]]:gpr64common = LDRSWui %stack.2.ap, 7 :: (dereferenceable load (s32) from %ir.vr_offs_p)
+ ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gpr32common = COPY [[LDRSWui]].sub_32
+ ; CHECK-NEXT: TBZW [[COPY15]], 31, %bb.3
+ ; CHECK-NEXT: B %bb.1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1.vaarg.maybe_reg:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gpr32sp = COPY [[LDRSWui]].sub_32
+ ; CHECK-NEXT: [[ADDWri:%[0-9]+]]:gpr32common = nsw ADDWri [[COPY16]], 16, 0
+ ; CHECK-NEXT: STRWui [[ADDWri]], %stack.2.ap, 7 :: (store (s32) into %ir.sunkaddr)
+ ; CHECK-NEXT: [[ADDSWri:%[0-9]+]]:gpr32 = ADDSWri [[COPY16]], 15, 0, implicit-def $nzcv
+ ; CHECK-NEXT: Bcc 2, %bb.3, implicit $nzcv
+ ; CHECK-NEXT: B %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2.vaarg.in_reg:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[LDRXui:%[0-9]+]]:gpr64 = LDRXui %stack.2.ap, 2 :: (dereferenceable load (p0) from %ir.reg_top_p)
+ ; CHECK-NEXT: [[ADDXrr:%[0-9]+]]:gpr64 = ADDXrr [[LDRXui]], [[LDRSWui]]
+ ; CHECK-NEXT: B %bb.4
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.3.vaarg.on_stack:
+ ; CHECK-NEXT: successors: %bb.4(0x80000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[LDRXui1:%[0-9]+]]:gpr64 = LDRXui %stack.2.ap, 0 :: (dereferenceable load (p0) from %ir.ap)
+ ; CHECK-NEXT: [[COPY17:%[0-9]+]]:gpr64common = COPY [[LDRXui1]]
+ ; CHECK-NEXT: [[ADDXri4:%[0-9]+]]:gpr64sp = ADDXri [[COPY17]], 8, 0
+ ; CHECK-NEXT: [[COPY18:%[0-9]+]]:gpr64 = COPY [[ADDXri4]]
+ ; CHECK-NEXT: STRXui [[COPY18]], %stack.2.ap, 0 :: (store (p0) into %ir.ap)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.4.vaarg.end:
+ ; CHECK-NEXT: [[PHI:%[0-9]+]]:gpr64sp = PHI [[ADDXrr]], %bb.2, [[LDRXui1]], %bb.3
+ ; CHECK-NEXT: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui [[PHI]], 0 :: (load (s64) from %ir.vaargs.addr)
+ ; CHECK-NEXT: [[FCVTSDr:%[0-9]+]]:fpr32 = nofpexcept FCVTSDr [[LDRDui]], implicit $fpcr
+ ; CHECK-NEXT: LIFETIME_END %stack.2.ap
+ ; CHECK-NEXT: $s0 = COPY [[FCVTSDr]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $s0
+ bb.1.entry:
+ successors: %bb.4(0x50000000), %bb.2(0x30000000)
+ liveins: $q0, $q1, $q2, $q3, $q4, $q5, $q6, $q7, $x0, $x1, $x2, $x3, $x4, $x5, $x6, $x7
+
+ %3:gpr(s64) = COPY $x1
+ %1:gpr(p0) = G_FRAME_INDEX %stack.0
+ G_STORE %3(s64), %1(p0) :: (store (s64) into stack + 8, align 1)
+ %5:gpr(s64) = COPY $x2
+ %2:gpr(s64) = G_CONSTANT i64 8
+ %4:gpr(p0) = G_PTR_ADD %1, %2(s64)
+ G_STORE %5(s64), %4(p0) :: (store (s64) into stack + 16, align 1)
+ %7:gpr(s64) = COPY $x3
+ %18:gpr(s64) = G_CONSTANT i64 16
+ %6:gpr(p0) = G_PTR_ADD %1, %18(s64)
+ G_STORE %7(s64), %6(p0) :: (store (s64) into stack + 24, align 1)
+ %9:gpr(s64) = COPY $x4
+ %61:gpr(s64) = G_CONSTANT i64 24
+ %8:gpr(p0) = G_PTR_ADD %1, %61(s64)
+ G_STORE %9(s64), %8(p0) :: (store (s64) into stack + 32, align 1)
+ %11:gpr(s64) = COPY $x5
+ %62:gpr(s64) = G_CONSTANT i64 32
+ %10:gpr(p0) = G_PTR_ADD %1, %62(s64)
+ G_STORE %11(s64), %10(p0) :: (store (s64) into stack + 40, align 1)
+ %13:gpr(s64) = COPY $x6
+ %63:gpr(s64) = G_CONSTANT i64 40
+ %12:gpr(p0) = G_PTR_ADD %1, %63(s64)
+ G_STORE %13(s64), %12(p0) :: (store (s64) into stack + 48, align 1)
+ %15:gpr(s64) = COPY $x7
+ %64:gpr(s64) = G_CONSTANT i64 48
+ %14:gpr(p0) = G_PTR_ADD %1, %64(s64)
+ G_STORE %15(s64), %14(p0) :: (store (s64) into stack + 56, align 1)
+ %19:fpr(s128) = COPY $q0
+ %17:gpr(p0) = G_FRAME_INDEX %stack.1
+ G_STORE %19(s128), %17(p0) :: (store (s128) into stack, align 1)
+ %21:fpr(s128) = COPY $q1
+ %20:gpr(p0) = G_PTR_ADD %17, %18(s64)
+ G_STORE %21(s128), %20(p0) :: (store (s128) into stack + 16, align 1)
+ %23:fpr(s128) = COPY $q2
+ %22:gpr(p0) = G_PTR_ADD %17, %62(s64)
+ G_STORE %23(s128), %22(p0) :: (store (s128) into stack + 32, align 1)
+ %25:fpr(s128) = COPY $q3
+ %24:gpr(p0) = G_PTR_ADD %17, %64(s64)
+ G_STORE %25(s128), %24(p0) :: (store (s128) into stack + 48, align 1)
+ %27:fpr(s128) = COPY $q4
+ %65:gpr(s64) = G_CONSTANT i64 64
+ %26:gpr(p0) = G_PTR_ADD %17, %65(s64)
+ G_STORE %27(s128), %26(p0) :: (store (s128) into stack + 64, align 1)
+ %29:fpr(s128) = COPY $q5
+ %66:gpr(s64) = G_CONSTANT i64 80
+ %28:gpr(p0) = G_PTR_ADD %17, %66(s64)
+ G_STORE %29(s128), %28(p0) :: (store (s128) into stack + 80, align 1)
+ %31:fpr(s128) = COPY $q6
+ %67:gpr(s64) = G_CONSTANT i64 96
+ %30:gpr(p0) = G_PTR_ADD %17, %67(s64)
+ G_STORE %31(s128), %30(p0) :: (store (s128) into stack + 96, align 1)
+ %33:fpr(s128) = COPY $q7
+ %68:gpr(s64) = G_CONSTANT i64 112
+ %32:gpr(p0) = G_PTR_ADD %17, %68(s64)
+ G_STORE %33(s128), %32(p0) :: (store (s128) into stack + 112, align 1)
+ LIFETIME_START %stack.2.ap
+ %35:gpr(p0) = G_FRAME_INDEX %stack.2.ap
+ G_VASTART %35(p0) :: (store (s256) into %ir.ap, align 8)
+ %36:gpr(s64) = G_CONSTANT i64 28
+ %37:gpr(p0) = G_PTR_ADD %35, %36(s64)
+ %39:gpr(s64) = G_SEXTLOAD %37(p0) :: (dereferenceable load (s32) from %ir.vr_offs_p)
+ %69:gpr(s32) = G_TRUNC %39(s64)
+ %81:gpr(s32) = G_CONSTANT i32 0
+ %80:gpr(s32) = G_ICMP intpred(sge), %69(s32), %81
+ G_BRCOND %80(s32), %bb.4
+ G_BR %bb.2
+
+ bb.2.vaarg.maybe_reg:
+ successors: %bb.3(0x40000000), %bb.4(0x40000000)
+
+ %42:gpr(s32) = G_TRUNC %39(s64)
+ %76:gpr(s32) = G_CONSTANT i32 16
+ %45:gpr(s32) = nsw G_ADD %42, %76
+ %46:gpr(s64) = G_CONSTANT i64 28
+ %72:gpr(p0) = G_FRAME_INDEX %stack.2.ap
+ %47:gpr(p0) = G_PTR_ADD %72, %46(s64)
+ G_STORE %45(s32), %47(p0) :: (store (s32) into %ir.sunkaddr)
+ %75:gpr(s32) = G_CONSTANT i32 -15
+ %78:gpr(s32) = G_ICMP intpred(uge), %42(s32), %75
+ G_BRCOND %78(s32), %bb.4
+ G_BR %bb.3
+
+ bb.3.vaarg.in_reg:
+ successors: %bb.5(0x80000000)
+
+ %50:gpr(s64) = G_CONSTANT i64 16
+ %73:gpr(p0) = G_FRAME_INDEX %stack.2.ap
+ %51:gpr(p0) = G_PTR_ADD %73, %50(s64)
+ %52:gpr(p0) = G_LOAD %51(p0) :: (dereferenceable load (p0) from %ir.reg_top_p)
+ %53:gpr(p0) = G_PTR_ADD %52, %39(s64)
+ G_BR %bb.5
+
+ bb.4.vaarg.on_stack:
+ successors: %bb.5(0x80000000)
+
+ %74:gpr(p0) = G_FRAME_INDEX %stack.2.ap
+ %55:gpr(p0) = G_LOAD %74(p0) :: (dereferenceable load (p0) from %ir.ap)
+ %56:gpr(s64) = G_CONSTANT i64 8
+ %57:gpr(p0) = G_PTR_ADD %55, %56(s64)
+ G_STORE %57(p0), %74(p0) :: (store (p0) into %ir.ap)
+
+ bb.5.vaarg.end:
+ %58:gpr(p0) = G_PHI %53(p0), %bb.3, %55(p0), %bb.4
+ %59:fpr(s64) = G_LOAD %58(p0) :: (load (s64) from %ir.vaargs.addr)
+ %60:fpr(s32) = G_FPTRUNC %59(s64)
+ LIFETIME_END %stack.2.ap
+ $s0 = COPY %60(s32)
+ RET_ReallyLR implicit $s0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/vararg.ll b/llvm/test/CodeGen/AArch64/vararg.ll
new file mode 100644
index 00000000000000..fa9cdf4fc1bcaa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/vararg.ll
@@ -0,0 +1,569 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-unknown-linux-gnu -O0 -gl...
[truncated]
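To summarize what the new selection code emits, here is a C++-style sketch of the va_list initialization (field names from the AAPCS comment in the patch; the pointer parameters are stand-ins for the frame indices used above):

// Sketch only: mirrors the five stores selectVaStartAAPCS performs.
struct va_list_aapcs {
  void *stack;   // next stacked argument
  void *gr_top;  // one past the end of the GP register save area
  void *vr_top;  // one past the end of the FP/SIMD register save area
  int gr_offs;   // negative offset from gr_top to the next GP reg arg
  int vr_offs;   // negative offset from vr_top to the next FP/SIMD reg arg
};

void init_va_list(va_list_aapcs *ap, char *StackArea, char *GPRArea,
                  unsigned GPRSize, char *FPRArea, unsigned FPRSize) {
  ap->stack = StackArea;           // offset 0 (PushAddress with Imm = 0)
  ap->gr_top = GPRArea + GPRSize;  // offset 8 (4 on ILP32)
  ap->vr_top = FPRArea + FPRSize;  // offset 16 (8 on ILP32)
  ap->gr_offs = -(int)GPRSize;     // offset 24 (12 on ILP32)
  ap->vr_offs = -(int)FPRSize;     // offset 28 (16 on ILP32)
}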
Can we put a status page (AArch64.rst) under https://github.com/llvm/llvm-project/tree/main/llvm/docs/GlobalISel?
    return false;

    if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64(
            MF.getFunction().getCallingConv(), MF.getFunction().isVarArg()))
Is the test over-specified? isVarArg() is implied by the function name. What about isWindowsArm64EC()? What about isTargetWindows()? Or is it under-specified?
Sorry. Could you instead copy the SDAG and do the Windows test here:

    case TargetOpcode::G_VASTART:

and add an empty selectWin64VASTART function, which returns false.
In C, __attribute__((ms_abi)) can be used to enable the Win64 ABI; selectVaStartDarwin handles this property. See commit 1c10d5b. This if was copied from the Darwin selector. I don't know the exact details of what this attribute implies, or of the Win64 ABI, so I'm leaving it unsupported.

I think we don't need to change case TargetOpcode::G_VASTART: in this PR, as that might cause side effects and would require more testing on Win64.
              (*I.memoperands_begin())
                  ->getPointerInfo()
                  .getWithOffset(OffsetBytes),
              MachineMemOperand::MOStore, IntSize, Align(IntSize)));
Should do commonAlignment of the original alignments and the offset
Thanks. Did you mean commonAlignment(Align(IntSize), OffsetBytes)? Should we use MF.getDataLayout().getPrefTypeAlign(Type::getInt32Ty(MF.getFunction().getContext())) to get a more appropriate alignment for IntSize?
You shouldn't be inspecting the datalayout. All the alignment information is already directly present in the MIR
Replaced with commonAlignment(Align(IntSize), OffsetBytes)
              (*I.memoperands_begin())
                  ->getPointerInfo()
                  .getWithOffset(OffsetBytes),
              MachineMemOperand::MOStore, PtrSize, Align(PtrSize)));
Should do commonAlignment of the original alignments and the offset
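For reference, commonAlignment yields the largest alignment still guaranteed once an offset is applied to an aligned base; a small sketch of its behavior (values illustrative):

#include "llvm/Support/Alignment.h"
#include <cassert>

// commonAlignment(A, Offset) is the largest power of two dividing both
// A and Offset, i.e. the alignment known to hold at (base + Offset)
// when the base itself is A-aligned.
void commonAlignmentExamples() {
  assert(llvm::commonAlignment(llvm::Align(8), 0) == llvm::Align(8));
  assert(llvm::commonAlignment(llvm::Align(8), 4) == llvm::Align(4));
  assert(llvm::commonAlignment(llvm::Align(16), 24) == llvm::Align(8));
}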
@@ -1994,7 +1994,108 @@ bool AArch64InstructionSelector::selectVectorAshrLshr(
 bool AArch64InstructionSelector::selectVaStartAAPCS(
     MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
-  return false;
+
+  if (MF.getSubtarget<AArch64Subtarget>().isCallingConvWin64(
I assume the subtarget is already available as a member in AArch64InstructionSelector
Replaced with STI
Ping @davemgreen and @aemerson?

✅ With the latest revision this PR passed the C/C++ code formatter.
@@ -2045,7 +2045,8 @@ bool AArch64InstructionSelector::selectVaStartAAPCS(
               (*I.memoperands_begin())
                   ->getPointerInfo()
                   .getWithOffset(OffsetBytes),
-              MachineMemOperand::MOStore, PtrSize, Align(PtrSize)));
+              MachineMemOperand::MOStore, PtrSize,
+              commonAlignment(Align(PtrSize), OffsetBytes)));
Actually I was wrong: the MMO alignment is a function of commonAlignment(BaseAlign, Offset), so this should be passing through the same base alignment from MMO->getBaseAlign().
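In other words, a sketch of the suggested form (names taken from the diff above):

  // Pass the base alignment through unchanged; the offset recorded in the
  // MachinePointerInfo lets MachineMemOperand::getAlign() derive
  // commonAlignment(BaseAlign, Offset) itself.
  MIB.addMemOperand(MF.getMachineMemOperand(
      MMO->getPointerInfo().getWithOffset(OffsetBytes),
      MachineMemOperand::MOStore, PtrSize, MMO->getBaseAlign()));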
llvm/test/CodeGen/AArch64/vararg.ll (outdated)
}

; A real program case
define float @vararg_program(ptr %a, ...) #0 {
I think it would be a good idea to add a comment briefly describing what the test does in more detail given its length.
Maybe we should remove this case instead, because it is rather long, includes a lot of things outside the scope of va_start, and it is hard to know whether it actually works. We can craft simpler ones if needed.
This function should be broken up to test one case per function, with no control flow
This function should be broken up to test one case per function, with no control flow

Then it's essentially duplicating define i64 @vararg(...) #0. This PR only modifies va_start; the other blocks do not use va_start. We could still include them to show that other parts of VA handling also work, but to make this PR smaller I prefer to remove them and keep only the things related to va_start.
llvm/test/CodeGen/AArch64/vararg.ll (outdated)
declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)

; To make the outputs more readable
attributes #0 = { uwtable "frame-pointer"="all" }
Attribute groups go at the end
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -O0 -mtriple=aarch64-unknown-linux -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s -check-prefixes=CHECK

--- |
Don't need IR section, and there's too much going on here
; CHECK-NEXT: [[LDRWui:%[0-9]+]]:gpr32 = LDRWui %stack.2.ap, 7 :: (dereferenceable load (s32) from %ir.vr_offs_p)
; CHECK-NEXT: $w0 = COPY [[LDRWui]]
LIFETIME_START %stack.2.ap
%35:gpr(p0) = G_FRAME_INDEX %stack.2.ap
Compact the register numbers; run-pass=none will do that.
Thanks, done.
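For anyone repeating that cleanup, a plain llc round-trip with no passes re-emits the MIR with compacted virtual register numbers (file names hypothetical), mirroring the RUN lines above:

    llc -mtriple=aarch64-unknown-linux -run-pass=none vararg.mir -o vararg.compacted.mir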
- { id: 2, name: ap, type: default, offset: 0, size: 32, alignment: 8,
    stack-id: default, callee-saved-register: '', callee-saved-restored: true,
    debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
Can lose the names and most of these fields
          .addMemOperand(MF.getMachineMemOperand(
              MMO->getPointerInfo().getWithOffset(OffsetBytes),
              MachineMemOperand::MOStore, PtrSize,
              commonAlignment(MMO->getBaseAlign(), OffsetBytes)));
I think this should be passing through the base alignment and let the PointerInfo offset take care of the commonAlignment
          .addMemOperand(MF.getMachineMemOperand(
              MMO->getPointerInfo().getWithOffset(OffsetBytes),
              MachineMemOperand::MOStore, IntSize,
              commonAlignment(MMO->getBaseAlign(), OffsetBytes)));
Ditto
@arsenm Thanks for your reviews! Since it's already the end of my internship, I may not be able to push commits. @madhur13490 would you like to continue working on this?
lgtm with the alignment issue fixed
@arsenm Thanks. I made the changes. Hopefully fine to land!

Looks like formatting is broken https://github.com/llvm/llvm-project/actions/runs/10924413927/job/30323397651?pr=106979
Force-pushed from fefb4d5 to 785d26e.
This commit adds the missing support for varargs in the instruction selection pass for AAPCS. Previously we only implemented this for Darwin. The implementation was according to AAPCS and SelectionDAG's LowerAAPCS_VASTART. It resolves all VA_START fallbacks in RAJAperf, llvm-test-suite, and SPEC CPU2017. These benchmarks now compile and pass without fallbacks due to varargs.

Co-authored-by: Madhur Amilkanthwar <[email protected]>