[AArch64] Fix frame-pointer offset with hazard padding #118091

Conversation
The `-aarch64-stack-hazard-size=<val>` option disables register pairing (as the hazard padding may mean the offset is too large for STP/LDP). This broke setting the frame-pointer offset, as the code to find the frame record looked for a (FP, LR) register pair. This patch resolves this by looking for FP and LR as two unpaired registers when hazard padding is enabled.
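To make the new check concrete, here is a self-contained C++ sketch of the detection logic (a simplification of the patch below; the enum, `RegPairInfo` struct, and `isPaired` test are stand-ins for the real LLVM types, not the actual API):

#include <cassert>
#include <cstddef>
#include <vector>

// Stand-ins for the real AArch64 register enums and RegPairInfo.
enum Reg { FP, LR, NoReg };

struct RegPairInfo {
  Reg Reg1 = NoReg;
  Reg Reg2 = NoReg;
  bool isPaired() const { return Reg2 != NoReg; }
};

// Mirrors the lambda added by the patch: a frame record is either a
// paired (FP, LR) spill, or -- when hazard padding disables pairing --
// an unpaired FP spill immediately preceded by an unpaired LR spill.
bool isFrameRecord(const RegPairInfo &RPI, const std::vector<Reg> &CSI,
                   size_t i, bool IsWindows, bool HasHazardPadding) {
  if (RPI.isPaired())
    return IsWindows ? (RPI.Reg1 == FP && RPI.Reg2 == LR)
                     : (RPI.Reg1 == LR && RPI.Reg2 == FP);
  if (HasHazardPadding)
    return i > 0 && RPI.Reg1 == FP && CSI[i - 1] == LR;
  return false;
}

int main() {
  // With hazard padding: LR and FP are spilled as two unpaired registers.
  std::vector<Reg> CSI = {LR, FP};
  RegPairInfo UnpairedFP{FP, NoReg};
  assert(isFrameRecord(UnpairedFP, CSI, 1, /*IsWindows=*/false,
                       /*HasHazardPadding=*/true));
  // Without hazard padding, an unpaired FP alone is not a frame record.
  assert(!isFrameRecord(UnpairedFP, CSI, 1, /*IsWindows=*/false,
                        /*HasHazardPadding=*/false));
  return 0;
}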
@llvm/pr-subscribers-backend-aarch64

Author: Benjamin Maxwell (MacDue)

Changes

The `-aarch64-stack-hazard-size=<val>` option disables register pairing (as the hazard padding may mean the offset is too large for STP/LDP). This broke setting the frame-pointer offset, as the code to find the frame record looked for a (FP, LR) register pair. This patch resolves this by looking for FP and LR as two unpaired registers when hazard padding is enabled.

Full diff: https://github.com/llvm/llvm-project/pull/118091.diff

2 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index d6673969aa3056..d593738cf32414 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -3167,11 +3167,21 @@ static void computeCalleeSaveRegisterPairs(
(RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
"Offset out of bounds for LDP/STP immediate");
+ auto isFrameRecord = [&] {
+ if (RPI.isPaired())
+ return IsWindows ? RPI.Reg1 == AArch64::FP && RPI.Reg2 == AArch64::LR
+ : RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP;
+ // -aarch64-stack-hazard-size=<val> disables register pairing, so look
+ // for the frame record as two unpaired registers.
+ if (AFI->hasStackHazardSlotIndex())
+ return i > 0 && RPI.Reg1 == AArch64::FP &&
+ CSI[i - 1].getReg() == AArch64::LR;
+ return false;
+ };
+
// Save the offset to frame record so that the FP register can point to the
// innermost frame record (spilled FP and LR registers).
- if (NeedsFrameRecord &&
- ((!IsWindows && RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
- (IsWindows && RPI.Reg1 == AArch64::FP && RPI.Reg2 == AArch64::LR)))
+ if (NeedsFrameRecord && isFrameRecord())
AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);
RegPairs.push_back(RPI);
diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll
index 50a2e41f45756d..a4c2b30566a951 100644
--- a/llvm/test/CodeGen/AArch64/stack-hazard.ll
+++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll
@@ -337,19 +337,18 @@ define i32 @csr_d8_allocd_framepointer(double %d) "aarch64_pstate_sm_compatible"
; CHECK64-LABEL: csr_d8_allocd_framepointer:
; CHECK64: // %bb.0: // %entry
; CHECK64-NEXT: sub sp, sp, #176
-; CHECK64-NEXT: str d8, [sp, #80] // 8-byte Folded Spill
+; CHECK64-NEXT: stp d0, d8, [sp, #72] // 8-byte Folded Spill
; CHECK64-NEXT: stp x29, x30, [sp, #152] // 16-byte Folded Spill
-; CHECK64-NEXT: add x29, sp, #80
-; CHECK64-NEXT: .cfi_def_cfa w29, 96
+; CHECK64-NEXT: add x29, sp, #152
+; CHECK64-NEXT: .cfi_def_cfa w29, 24
; CHECK64-NEXT: .cfi_offset w30, -16
; CHECK64-NEXT: .cfi_offset w29, -24
; CHECK64-NEXT: .cfi_offset b8, -96
; CHECK64-NEXT: //APP
; CHECK64-NEXT: //NO_APP
-; CHECK64-NEXT: stur d0, [x29, #-8]
; CHECK64-NEXT: ldr x29, [sp, #152] // 8-byte Folded Reload
-; CHECK64-NEXT: ldr d8, [sp, #80] // 8-byte Folded Reload
; CHECK64-NEXT: mov w0, wzr
+; CHECK64-NEXT: ldr d8, [sp, #80] // 8-byte Folded Reload
; CHECK64-NEXT: add sp, sp, #176
; CHECK64-NEXT: ret
;
@@ -358,17 +357,17 @@ define i32 @csr_d8_allocd_framepointer(double %d) "aarch64_pstate_sm_compatible"
; CHECK1024-NEXT: sub sp, sp, #1056
; CHECK1024-NEXT: str d8, [sp] // 8-byte Folded Spill
; CHECK1024-NEXT: str x29, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT: mov x29, sp
+; CHECK1024-NEXT: add x29, sp, #1032
; CHECK1024-NEXT: str x30, [sp, #1040] // 8-byte Folded Spill
; CHECK1024-NEXT: sub sp, sp, #1040
-; CHECK1024-NEXT: .cfi_def_cfa w29, 1056
+; CHECK1024-NEXT: .cfi_def_cfa w29, 24
; CHECK1024-NEXT: .cfi_offset w30, -16
; CHECK1024-NEXT: .cfi_offset w29, -24
; CHECK1024-NEXT: .cfi_offset b8, -1056
; CHECK1024-NEXT: mov w0, wzr
; CHECK1024-NEXT: //APP
; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: stur d0, [x29, #-8]
+; CHECK1024-NEXT: str d0, [sp, #1032]
; CHECK1024-NEXT: add sp, sp, #1040
; CHECK1024-NEXT: ldr x30, [sp, #1040] // 8-byte Folded Reload
; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload
@@ -2893,8 +2892,8 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK64-NEXT: stp x29, x30, [sp, #128] // 16-byte Folded Spill
; CHECK64-NEXT: stp x9, x20, [sp, #144] // 16-byte Folded Spill
; CHECK64-NEXT: str x19, [sp, #160] // 8-byte Folded Spill
-; CHECK64-NEXT: mov x29, sp
-; CHECK64-NEXT: .cfi_def_cfa w29, 176
+; CHECK64-NEXT: add x29, sp, #128
+; CHECK64-NEXT: .cfi_def_cfa w29, 48
; CHECK64-NEXT: .cfi_offset w19, -16
; CHECK64-NEXT: .cfi_offset w20, -24
; CHECK64-NEXT: .cfi_offset w30, -40
@@ -2913,11 +2912,11 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK64-NEXT: mov w20, w0
; CHECK64-NEXT: msub x9, x8, x8, x9
; CHECK64-NEXT: mov sp, x9
-; CHECK64-NEXT: stur x9, [x29, #-80]
-; CHECK64-NEXT: sub x9, x29, #80
-; CHECK64-NEXT: sturh wzr, [x29, #-70]
-; CHECK64-NEXT: stur wzr, [x29, #-68]
-; CHECK64-NEXT: sturh w8, [x29, #-72]
+; CHECK64-NEXT: stur x9, [x29, #-208]
+; CHECK64-NEXT: sub x9, x29, #208
+; CHECK64-NEXT: sturh wzr, [x29, #-198]
+; CHECK64-NEXT: stur wzr, [x29, #-196]
+; CHECK64-NEXT: sturh w8, [x29, #-200]
; CHECK64-NEXT: msr TPIDR2_EL0, x9
; CHECK64-NEXT: .cfi_offset vg, -32
; CHECK64-NEXT: smstop sm
@@ -2926,14 +2925,14 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK64-NEXT: .cfi_restore vg
; CHECK64-NEXT: smstart za
; CHECK64-NEXT: mrs x8, TPIDR2_EL0
-; CHECK64-NEXT: sub x0, x29, #80
+; CHECK64-NEXT: sub x0, x29, #208
; CHECK64-NEXT: cbnz x8, .LBB33_2
; CHECK64-NEXT: // %bb.1: // %entry
; CHECK64-NEXT: bl __arm_tpidr2_restore
; CHECK64-NEXT: .LBB33_2: // %entry
; CHECK64-NEXT: mov w0, w20
; CHECK64-NEXT: msr TPIDR2_EL0, xzr
-; CHECK64-NEXT: mov sp, x29
+; CHECK64-NEXT: sub sp, x29, #128
; CHECK64-NEXT: .cfi_def_cfa wsp, 176
; CHECK64-NEXT: ldp x20, x19, [sp, #152] // 16-byte Folded Reload
; CHECK64-NEXT: ldr d14, [sp, #8] // 8-byte Folded Reload
@@ -2972,8 +2971,8 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK1024-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill
; CHECK1024-NEXT: str x20, [sp, #1120] // 8-byte Folded Spill
; CHECK1024-NEXT: str x19, [sp, #1128] // 8-byte Folded Spill
-; CHECK1024-NEXT: mov x29, sp
-; CHECK1024-NEXT: .cfi_def_cfa w29, 1136
+; CHECK1024-NEXT: add x29, sp, #1088
+; CHECK1024-NEXT: .cfi_def_cfa w29, 48
; CHECK1024-NEXT: .cfi_offset w19, -8
; CHECK1024-NEXT: .cfi_offset w20, -16
; CHECK1024-NEXT: .cfi_offset w28, -24
@@ -2993,14 +2992,14 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK1024-NEXT: mov w20, w0
; CHECK1024-NEXT: msub x9, x8, x8, x9
; CHECK1024-NEXT: mov sp, x9
-; CHECK1024-NEXT: sub x10, x29, #784
+; CHECK1024-NEXT: sub x10, x29, #1872
; CHECK1024-NEXT: stur x9, [x10, #-256]
-; CHECK1024-NEXT: sub x9, x29, #774
-; CHECK1024-NEXT: sub x10, x29, #772
+; CHECK1024-NEXT: sub x9, x29, #1862
+; CHECK1024-NEXT: sub x10, x29, #1860
; CHECK1024-NEXT: sturh wzr, [x9, #-256]
-; CHECK1024-NEXT: sub x9, x29, #1040
+; CHECK1024-NEXT: sub x9, x29, #2128
; CHECK1024-NEXT: stur wzr, [x10, #-256]
-; CHECK1024-NEXT: sub x10, x29, #776
+; CHECK1024-NEXT: sub x10, x29, #1864
; CHECK1024-NEXT: sturh w8, [x10, #-256]
; CHECK1024-NEXT: msr TPIDR2_EL0, x9
; CHECK1024-NEXT: .cfi_offset vg, -32
@@ -3010,14 +3009,14 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK1024-NEXT: .cfi_restore vg
; CHECK1024-NEXT: smstart za
; CHECK1024-NEXT: mrs x8, TPIDR2_EL0
-; CHECK1024-NEXT: sub x0, x29, #1040
+; CHECK1024-NEXT: sub x0, x29, #2128
; CHECK1024-NEXT: cbnz x8, .LBB33_2
; CHECK1024-NEXT: // %bb.1: // %entry
; CHECK1024-NEXT: bl __arm_tpidr2_restore
; CHECK1024-NEXT: .LBB33_2: // %entry
; CHECK1024-NEXT: mov w0, w20
; CHECK1024-NEXT: msr TPIDR2_EL0, xzr
-; CHECK1024-NEXT: mov sp, x29
+; CHECK1024-NEXT: sub sp, x29, #1088
; CHECK1024-NEXT: .cfi_def_cfa wsp, 1136
; CHECK1024-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK1024-NEXT: ldr x19, [sp, #1128] // 8-byte Folded Reload
@@ -3049,3 +3048,109 @@ entry:
ret i32 %x
}
declare void @other()
+
+declare void @bar(ptr noundef) "aarch64_pstate_sm_compatible"
+
+define i32 @sve_stack_object_and_vla(double %d, i64 %sz) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
+; CHECK0-LABEL: sve_stack_object_and_vla:
+; CHECK0: // %bb.0: // %entry
+; CHECK0-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK0-NEXT: stp x28, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK0-NEXT: mov x29, sp
+; CHECK0-NEXT: addvl sp, sp, #-1
+; CHECK0-NEXT: mov x19, sp
+; CHECK0-NEXT: .cfi_def_cfa w29, 32
+; CHECK0-NEXT: .cfi_offset w19, -8
+; CHECK0-NEXT: .cfi_offset w28, -16
+; CHECK0-NEXT: .cfi_offset w30, -24
+; CHECK0-NEXT: .cfi_offset w29, -32
+; CHECK0-NEXT: lsl x9, x0, #2
+; CHECK0-NEXT: mov x8, sp
+; CHECK0-NEXT: add x9, x9, #15
+; CHECK0-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK0-NEXT: sub x0, x8, x9
+; CHECK0-NEXT: mov sp, x0
+; CHECK0-NEXT: mov z0.s, #0 // =0x0
+; CHECK0-NEXT: ptrue p0.s
+; CHECK0-NEXT: st1w { z0.s }, p0, [x29, #-1, mul vl]
+; CHECK0-NEXT: bl bar
+; CHECK0-NEXT: mov w0, wzr
+; CHECK0-NEXT: mov sp, x29
+; CHECK0-NEXT: ldp x28, x19, [sp, #16] // 16-byte Folded Reload
+; CHECK0-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK0-NEXT: ret
+;
+; CHECK64-LABEL: sve_stack_object_and_vla:
+; CHECK64: // %bb.0: // %entry
+; CHECK64-NEXT: sub sp, sp, #96
+; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
+; CHECK64-NEXT: add x29, sp, #64
+; CHECK64-NEXT: stp x28, x19, [sp, #80] // 16-byte Folded Spill
+; CHECK64-NEXT: sub sp, sp, #64
+; CHECK64-NEXT: addvl sp, sp, #-1
+; CHECK64-NEXT: mov x19, sp
+; CHECK64-NEXT: .cfi_def_cfa w29, 32
+; CHECK64-NEXT: .cfi_offset w19, -8
+; CHECK64-NEXT: .cfi_offset w28, -16
+; CHECK64-NEXT: .cfi_offset w30, -24
+; CHECK64-NEXT: .cfi_offset w29, -32
+; CHECK64-NEXT: lsl x9, x0, #2
+; CHECK64-NEXT: mov x8, sp
+; CHECK64-NEXT: add x9, x9, #15
+; CHECK64-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK64-NEXT: sub x0, x8, x9
+; CHECK64-NEXT: mov sp, x0
+; CHECK64-NEXT: mov z0.s, #0 // =0x0
+; CHECK64-NEXT: ptrue p0.s
+; CHECK64-NEXT: sub x8, x29, #64
+; CHECK64-NEXT: st1w { z0.s }, p0, [x8, #-1, mul vl]
+; CHECK64-NEXT: bl bar
+; CHECK64-NEXT: mov w0, wzr
+; CHECK64-NEXT: sub sp, x29, #64
+; CHECK64-NEXT: ldp x28, x19, [sp, #80] // 16-byte Folded Reload
+; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
+; CHECK64-NEXT: add sp, sp, #96
+; CHECK64-NEXT: ret
+;
+; CHECK1024-LABEL: sve_stack_object_and_vla:
+; CHECK1024: // %bb.0: // %entry
+; CHECK1024-NEXT: sub sp, sp, #1056
+; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NEXT: add x29, sp, #1024
+; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NEXT: str x19, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NEXT: sub sp, sp, #1024
+; CHECK1024-NEXT: addvl sp, sp, #-1
+; CHECK1024-NEXT: mov x19, sp
+; CHECK1024-NEXT: .cfi_def_cfa w29, 32
+; CHECK1024-NEXT: .cfi_offset w19, -8
+; CHECK1024-NEXT: .cfi_offset w28, -16
+; CHECK1024-NEXT: .cfi_offset w30, -24
+; CHECK1024-NEXT: .cfi_offset w29, -32
+; CHECK1024-NEXT: lsl x9, x0, #2
+; CHECK1024-NEXT: mov x8, sp
+; CHECK1024-NEXT: add x9, x9, #15
+; CHECK1024-NEXT: and x9, x9, #0xfffffffffffffff0
+; CHECK1024-NEXT: sub x0, x8, x9
+; CHECK1024-NEXT: mov sp, x0
+; CHECK1024-NEXT: mov z0.s, #0 // =0x0
+; CHECK1024-NEXT: ptrue p0.s
+; CHECK1024-NEXT: sub x8, x29, #1024
+; CHECK1024-NEXT: st1w { z0.s }, p0, [x8, #-1, mul vl]
+; CHECK1024-NEXT: bl bar
+; CHECK1024-NEXT: mov w0, wzr
+; CHECK1024-NEXT: sub sp, x29, #1024
+; CHECK1024-NEXT: ldr x19, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NEXT: add sp, sp, #1056
+; CHECK1024-NEXT: ret
+entry:
+ %a = alloca <vscale x 4 x i32>
+ %b = alloca i32, i64 %sz, align 4
+ store <vscale x 4 x i32> zeroinitializer, ptr %a
+ call void @bar(ptr noundef nonnull %b)
+ ret i32 0
+}
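To see the effect numerically, here is a small worked check of the new CHECK1024 prologue of csr_d8_allocd_framepointer above (the constants are copied from the test; the assertion just restates the CFA arithmetic):

// Worked example from the CHECK1024 prologue:
//   sub sp, sp, #1056          ; d8 at [sp], then 1024 bytes of hazard padding,
//   str x29, [sp, #1032]       ; then the frame record: FP at sp+1032,
//   str x30, [sp, #1040]       ; LR at sp+1040 (two *unpaired* spills).
//   add x29, sp, #1032         ; FP now points at the frame record.
#include <cassert>

int main() {
  const int CalleeSaveArea = 1056; // total prologue SP adjustment
  const int FrameRecord = 1032;    // offset of the spilled FP within it
  // ".cfi_def_cfa w29, 24": CFA = x29 + 24 must equal the SP on entry.
  assert(CalleeSaveArea - FrameRecord == 24);
  return 0;
}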
(Following is not intended to block this patch.) Why are we generating code like this in the first place? It looks really silly when you subtract 1056 from sp, then split the frame stores because the offsets are too large, then subtract another 1024 from sp. Better to just generate…
I imagine it's an artifact from how the hazard padding has been integrated with the existing lowering logic. I agree that it probably could be improved (which may make this change unnecessary), but I was only focused on fixing the correctness issue here. |