Skip to content

[AArch64] Fix frame-pointer offset with hazard padding #118091

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 16 additions & 3 deletions llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3167,11 +3167,24 @@ static void computeCalleeSaveRegisterPairs(
(RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
"Offset out of bounds for LDP/STP immediate");

auto isFrameRecord = [&] {
if (RPI.isPaired())
return IsWindows ? RPI.Reg1 == AArch64::FP && RPI.Reg2 == AArch64::LR
: RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP;
// Otherwise, look for the frame record as two unpaired registers. This is
// needed for -aarch64-stack-hazard-size=<val>, which disables register
// pairing (as the padding may be too large for the LDP/STP offset). Note:
// On Windows, this check works out as current reg == FP, next reg == LR,
// and on other platforms current reg == FP, previous reg == LR. This
// works out as the correct pre-increment or post-increment offsets
// respectively.
return i > 0 && RPI.Reg1 == AArch64::FP &&
CSI[i - 1].getReg() == AArch64::LR;
};

// Save the offset to frame record so that the FP register can point to the
// innermost frame record (spilled FP and LR registers).
if (NeedsFrameRecord &&
((!IsWindows && RPI.Reg1 == AArch64::LR && RPI.Reg2 == AArch64::FP) ||
(IsWindows && RPI.Reg1 == AArch64::FP && RPI.Reg2 == AArch64::LR)))
if (NeedsFrameRecord && isFrameRecord())
AFI->setCalleeSaveBaseToFrameRecordOffset(Offset);

RegPairs.push_back(RPI);
Expand Down
118 changes: 118 additions & 0 deletions llvm/test/CodeGen/AArch64/stack-hazard-windows.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=aarch64-windows-pc-msvc -aarch64-stack-hazard-size=0 | FileCheck %s --check-prefixes=CHECK0
; RUN: llc < %s -mtriple=aarch64-windows-pc-msvc -aarch64-stack-hazard-size=64 | FileCheck %s --check-prefixes=CHECK64
; RUN: llc < %s -mtriple=aarch64-windows-pc-msvc -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK1024

; Verifies that the frame pointer (x29) is set to the spilled FP/LR pair even
; when hazard padding (-aarch64-stack-hazard-size) forces unpaired str/ldr
; spills on Windows: note the matching `add x29, sp, #N` / `.seh_add_fp N`
; against the x29/x30 save offsets in each prologue. The function clobbers a
; GPR CSR (x23) and FPR CSRs (d9, d10) and stores to a stack object so both
; register areas and a local exist. NOTE(review): CHECK lines are
; autogenerated by update_llc_test_checks.py — regenerate rather than
; hand-edit.
define i32 @fpr_csr_stackobj(double %x) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
; CHECK0-LABEL: fpr_csr_stackobj:
; CHECK0: .seh_proc fpr_csr_stackobj
; CHECK0-NEXT: // %bb.0: // %entry
; CHECK0-NEXT: str x23, [sp, #-48]! // 8-byte Folded Spill
; CHECK0-NEXT: .seh_save_reg_x x23, 48
; CHECK0-NEXT: stp x29, x30, [sp, #8] // 16-byte Folded Spill
; CHECK0-NEXT: .seh_save_fplr 8
; CHECK0-NEXT: stp d9, d10, [sp, #24] // 16-byte Folded Spill
; CHECK0-NEXT: .seh_save_fregp d9, 24
; CHECK0-NEXT: add x29, sp, #8
; CHECK0-NEXT: .seh_add_fp 8
; CHECK0-NEXT: .seh_endprologue
; CHECK0-NEXT: mov w0, wzr
; CHECK0-NEXT: //APP
; CHECK0-NEXT: //NO_APP
; CHECK0-NEXT: str d0, [x29, #32]
; CHECK0-NEXT: .seh_startepilogue
; CHECK0-NEXT: ldp d9, d10, [sp, #24] // 16-byte Folded Reload
; CHECK0-NEXT: .seh_save_fregp d9, 24
; CHECK0-NEXT: ldp x29, x30, [sp, #8] // 16-byte Folded Reload
; CHECK0-NEXT: .seh_save_fplr 8
; CHECK0-NEXT: ldr x23, [sp], #48 // 8-byte Folded Reload
; CHECK0-NEXT: .seh_save_reg_x x23, 48
; CHECK0-NEXT: .seh_endepilogue
; CHECK0-NEXT: ret
; CHECK0-NEXT: .seh_endfunclet
; CHECK0-NEXT: .seh_endproc
;
; CHECK64-LABEL: fpr_csr_stackobj:
; CHECK64: .seh_proc fpr_csr_stackobj
; CHECK64-NEXT: // %bb.0: // %entry
; CHECK64-NEXT: sub sp, sp, #192
; CHECK64-NEXT: .seh_stackalloc 192
; CHECK64-NEXT: str x23, [sp, #80] // 8-byte Folded Spill
; CHECK64-NEXT: .seh_save_reg x23, 80
; CHECK64-NEXT: str x29, [sp, #88] // 8-byte Folded Spill
; CHECK64-NEXT: .seh_save_reg x29, 88
; CHECK64-NEXT: str x30, [sp, #96] // 8-byte Folded Spill
; CHECK64-NEXT: .seh_save_reg x30, 96
; CHECK64-NEXT: str d9, [sp, #168] // 8-byte Folded Spill
; CHECK64-NEXT: .seh_save_freg d9, 168
; CHECK64-NEXT: str d10, [sp, #176] // 8-byte Folded Spill
; CHECK64-NEXT: .seh_save_freg d10, 176
; CHECK64-NEXT: add x29, sp, #88
; CHECK64-NEXT: .seh_add_fp 88
; CHECK64-NEXT: .seh_endprologue
; CHECK64-NEXT: mov w0, wzr
; CHECK64-NEXT: //APP
; CHECK64-NEXT: //NO_APP
; CHECK64-NEXT: stur d0, [x29, #-16]
; CHECK64-NEXT: .seh_startepilogue
; CHECK64-NEXT: ldr d10, [sp, #176] // 8-byte Folded Reload
; CHECK64-NEXT: .seh_save_freg d10, 176
; CHECK64-NEXT: ldr d9, [sp, #168] // 8-byte Folded Reload
; CHECK64-NEXT: .seh_save_freg d9, 168
; CHECK64-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload
; CHECK64-NEXT: .seh_save_reg x30, 96
; CHECK64-NEXT: ldr x29, [sp, #88] // 8-byte Folded Reload
; CHECK64-NEXT: .seh_save_reg x29, 88
; CHECK64-NEXT: ldr x23, [sp, #80] // 8-byte Folded Reload
; CHECK64-NEXT: .seh_save_reg x23, 80
; CHECK64-NEXT: add sp, sp, #192
; CHECK64-NEXT: .seh_stackalloc 192
; CHECK64-NEXT: .seh_endepilogue
; CHECK64-NEXT: ret
; CHECK64-NEXT: .seh_endfunclet
; CHECK64-NEXT: .seh_endproc
;
; CHECK1024-LABEL: fpr_csr_stackobj:
; CHECK1024: .seh_proc fpr_csr_stackobj
; CHECK1024-NEXT: // %bb.0: // %entry
; CHECK1024-NEXT: sub sp, sp, #1072
; CHECK1024-NEXT: str x23, [sp] // 8-byte Folded Spill
; CHECK1024-NEXT: str x29, [sp, #8] // 8-byte Folded Spill
; CHECK1024-NEXT: .seh_save_reg x29, 8
; CHECK1024-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
; CHECK1024-NEXT: .seh_save_reg x30, 16
; CHECK1024-NEXT: str d9, [sp, #1048] // 8-byte Folded Spill
; CHECK1024-NEXT: .seh_save_freg d9, 1048
; CHECK1024-NEXT: str d10, [sp, #1056] // 8-byte Folded Spill
; CHECK1024-NEXT: .seh_save_freg d10, 1056
; CHECK1024-NEXT: add x29, sp, #8
; CHECK1024-NEXT: .seh_add_fp 8
; CHECK1024-NEXT: .seh_endprologue
; CHECK1024-NEXT: sub sp, sp, #1040
; CHECK1024-NEXT: mov w0, wzr
; CHECK1024-NEXT: //APP
; CHECK1024-NEXT: //NO_APP
; CHECK1024-NEXT: stur d0, [x29, #-16]
; CHECK1024-NEXT: .seh_startepilogue
; CHECK1024-NEXT: add sp, sp, #1040
; CHECK1024-NEXT: .seh_stackalloc 1040
; CHECK1024-NEXT: ldr d10, [sp, #1056] // 8-byte Folded Reload
; CHECK1024-NEXT: .seh_save_freg d10, 1056
; CHECK1024-NEXT: ldr d9, [sp, #1048] // 8-byte Folded Reload
; CHECK1024-NEXT: .seh_save_freg d9, 1048
; CHECK1024-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
; CHECK1024-NEXT: .seh_save_reg x30, 16
; CHECK1024-NEXT: ldr x29, [sp, #8] // 8-byte Folded Reload
; CHECK1024-NEXT: .seh_save_reg x29, 8
; CHECK1024-NEXT: ldr x23, [sp] // 8-byte Folded Reload
; CHECK1024-NEXT: add sp, sp, #1072
; CHECK1024-NEXT: .seh_endepilogue
; CHECK1024-NEXT: ret
; CHECK1024-NEXT: .seh_endfunclet
; CHECK1024-NEXT: .seh_endproc
entry:
%a = alloca double
tail call void asm sideeffect "", "~{x23},~{d9},~{d10}"()
store double %x, ptr %a
ret i32 0
}
157 changes: 131 additions & 26 deletions llvm/test/CodeGen/AArch64/stack-hazard.ll
Original file line number Diff line number Diff line change
Expand Up @@ -337,19 +337,18 @@ define i32 @csr_d8_allocd_framepointer(double %d) "aarch64_pstate_sm_compatible"
; CHECK64-LABEL: csr_d8_allocd_framepointer:
; CHECK64: // %bb.0: // %entry
; CHECK64-NEXT: sub sp, sp, #176
; CHECK64-NEXT: str d8, [sp, #80] // 8-byte Folded Spill
; CHECK64-NEXT: stp d0, d8, [sp, #72] // 8-byte Folded Spill
; CHECK64-NEXT: stp x29, x30, [sp, #152] // 16-byte Folded Spill
; CHECK64-NEXT: add x29, sp, #80
; CHECK64-NEXT: .cfi_def_cfa w29, 96
; CHECK64-NEXT: add x29, sp, #152
; CHECK64-NEXT: .cfi_def_cfa w29, 24
; CHECK64-NEXT: .cfi_offset w30, -16
; CHECK64-NEXT: .cfi_offset w29, -24
; CHECK64-NEXT: .cfi_offset b8, -96
; CHECK64-NEXT: //APP
; CHECK64-NEXT: //NO_APP
; CHECK64-NEXT: stur d0, [x29, #-8]
; CHECK64-NEXT: ldr x29, [sp, #152] // 8-byte Folded Reload
; CHECK64-NEXT: ldr d8, [sp, #80] // 8-byte Folded Reload
; CHECK64-NEXT: mov w0, wzr
; CHECK64-NEXT: ldr d8, [sp, #80] // 8-byte Folded Reload
; CHECK64-NEXT: add sp, sp, #176
; CHECK64-NEXT: ret
;
Expand All @@ -358,17 +357,17 @@ define i32 @csr_d8_allocd_framepointer(double %d) "aarch64_pstate_sm_compatible"
; CHECK1024-NEXT: sub sp, sp, #1056
; CHECK1024-NEXT: str d8, [sp] // 8-byte Folded Spill
; CHECK1024-NEXT: str x29, [sp, #1032] // 8-byte Folded Spill
; CHECK1024-NEXT: mov x29, sp
; CHECK1024-NEXT: add x29, sp, #1032
; CHECK1024-NEXT: str x30, [sp, #1040] // 8-byte Folded Spill
; CHECK1024-NEXT: sub sp, sp, #1040
; CHECK1024-NEXT: .cfi_def_cfa w29, 1056
; CHECK1024-NEXT: .cfi_def_cfa w29, 24
; CHECK1024-NEXT: .cfi_offset w30, -16
; CHECK1024-NEXT: .cfi_offset w29, -24
; CHECK1024-NEXT: .cfi_offset b8, -1056
; CHECK1024-NEXT: mov w0, wzr
; CHECK1024-NEXT: //APP
; CHECK1024-NEXT: //NO_APP
; CHECK1024-NEXT: stur d0, [x29, #-8]
; CHECK1024-NEXT: str d0, [sp, #1032]
; CHECK1024-NEXT: add sp, sp, #1040
; CHECK1024-NEXT: ldr x30, [sp, #1040] // 8-byte Folded Reload
; CHECK1024-NEXT: ldr x29, [sp, #1032] // 8-byte Folded Reload
Expand Down Expand Up @@ -2893,8 +2892,8 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK64-NEXT: stp x29, x30, [sp, #128] // 16-byte Folded Spill
; CHECK64-NEXT: stp x9, x20, [sp, #144] // 16-byte Folded Spill
; CHECK64-NEXT: str x19, [sp, #160] // 8-byte Folded Spill
; CHECK64-NEXT: mov x29, sp
; CHECK64-NEXT: .cfi_def_cfa w29, 176
; CHECK64-NEXT: add x29, sp, #128
; CHECK64-NEXT: .cfi_def_cfa w29, 48
; CHECK64-NEXT: .cfi_offset w19, -16
; CHECK64-NEXT: .cfi_offset w20, -24
; CHECK64-NEXT: .cfi_offset w30, -40
Expand All @@ -2913,11 +2912,11 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK64-NEXT: mov w20, w0
; CHECK64-NEXT: msub x9, x8, x8, x9
; CHECK64-NEXT: mov sp, x9
; CHECK64-NEXT: stur x9, [x29, #-80]
; CHECK64-NEXT: sub x9, x29, #80
; CHECK64-NEXT: sturh wzr, [x29, #-70]
; CHECK64-NEXT: stur wzr, [x29, #-68]
; CHECK64-NEXT: sturh w8, [x29, #-72]
; CHECK64-NEXT: stur x9, [x29, #-208]
; CHECK64-NEXT: sub x9, x29, #208
; CHECK64-NEXT: sturh wzr, [x29, #-198]
; CHECK64-NEXT: stur wzr, [x29, #-196]
; CHECK64-NEXT: sturh w8, [x29, #-200]
; CHECK64-NEXT: msr TPIDR2_EL0, x9
; CHECK64-NEXT: .cfi_offset vg, -32
; CHECK64-NEXT: smstop sm
Expand All @@ -2926,14 +2925,14 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK64-NEXT: .cfi_restore vg
; CHECK64-NEXT: smstart za
; CHECK64-NEXT: mrs x8, TPIDR2_EL0
; CHECK64-NEXT: sub x0, x29, #80
; CHECK64-NEXT: sub x0, x29, #208
; CHECK64-NEXT: cbnz x8, .LBB33_2
; CHECK64-NEXT: // %bb.1: // %entry
; CHECK64-NEXT: bl __arm_tpidr2_restore
; CHECK64-NEXT: .LBB33_2: // %entry
; CHECK64-NEXT: mov w0, w20
; CHECK64-NEXT: msr TPIDR2_EL0, xzr
; CHECK64-NEXT: mov sp, x29
; CHECK64-NEXT: sub sp, x29, #128
; CHECK64-NEXT: .cfi_def_cfa wsp, 176
; CHECK64-NEXT: ldp x20, x19, [sp, #152] // 16-byte Folded Reload
; CHECK64-NEXT: ldr d14, [sp, #8] // 8-byte Folded Reload
Expand Down Expand Up @@ -2972,8 +2971,8 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK1024-NEXT: str x28, [sp, #1112] // 8-byte Folded Spill
; CHECK1024-NEXT: str x20, [sp, #1120] // 8-byte Folded Spill
; CHECK1024-NEXT: str x19, [sp, #1128] // 8-byte Folded Spill
; CHECK1024-NEXT: mov x29, sp
; CHECK1024-NEXT: .cfi_def_cfa w29, 1136
; CHECK1024-NEXT: add x29, sp, #1088
; CHECK1024-NEXT: .cfi_def_cfa w29, 48
; CHECK1024-NEXT: .cfi_offset w19, -8
; CHECK1024-NEXT: .cfi_offset w20, -16
; CHECK1024-NEXT: .cfi_offset w28, -24
Expand All @@ -2993,14 +2992,14 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK1024-NEXT: mov w20, w0
; CHECK1024-NEXT: msub x9, x8, x8, x9
; CHECK1024-NEXT: mov sp, x9
; CHECK1024-NEXT: sub x10, x29, #784
; CHECK1024-NEXT: sub x10, x29, #1872
; CHECK1024-NEXT: stur x9, [x10, #-256]
; CHECK1024-NEXT: sub x9, x29, #774
; CHECK1024-NEXT: sub x10, x29, #772
; CHECK1024-NEXT: sub x9, x29, #1862
; CHECK1024-NEXT: sub x10, x29, #1860
; CHECK1024-NEXT: sturh wzr, [x9, #-256]
; CHECK1024-NEXT: sub x9, x29, #1040
; CHECK1024-NEXT: sub x9, x29, #2128
; CHECK1024-NEXT: stur wzr, [x10, #-256]
; CHECK1024-NEXT: sub x10, x29, #776
; CHECK1024-NEXT: sub x10, x29, #1864
; CHECK1024-NEXT: sturh w8, [x10, #-256]
; CHECK1024-NEXT: msr TPIDR2_EL0, x9
; CHECK1024-NEXT: .cfi_offset vg, -32
Expand All @@ -3010,14 +3009,14 @@ define i32 @vastate(i32 %x) "aarch64_inout_za" "aarch64_pstate_sm_enabled" "targ
; CHECK1024-NEXT: .cfi_restore vg
; CHECK1024-NEXT: smstart za
; CHECK1024-NEXT: mrs x8, TPIDR2_EL0
; CHECK1024-NEXT: sub x0, x29, #1040
; CHECK1024-NEXT: sub x0, x29, #2128
; CHECK1024-NEXT: cbnz x8, .LBB33_2
; CHECK1024-NEXT: // %bb.1: // %entry
; CHECK1024-NEXT: bl __arm_tpidr2_restore
; CHECK1024-NEXT: .LBB33_2: // %entry
; CHECK1024-NEXT: mov w0, w20
; CHECK1024-NEXT: msr TPIDR2_EL0, xzr
; CHECK1024-NEXT: mov sp, x29
; CHECK1024-NEXT: sub sp, x29, #1088
; CHECK1024-NEXT: .cfi_def_cfa wsp, 1136
; CHECK1024-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK1024-NEXT: ldr x19, [sp, #1128] // 8-byte Folded Reload
Expand Down Expand Up @@ -3049,3 +3048,109 @@ entry:
ret i32 %x
}
declare void @other()

declare void @bar(ptr noundef) "aarch64_pstate_sm_compatible"

; Verifies frame-pointer-relative addressing of an SVE stack object when a VLA
; is also present (so SP is not a valid base) under hazard padding sizes
; 0/64/1024: x29 is set from the FP/LR save slot (`add x29, sp, #N`) and the
; SVE store addresses `[x29 - padding, #-1, mul vl]`, while the epilogue
; restores SP as `sub sp, x29, #N`. NOTE(review): CHECK lines are
; autogenerated by update_llc_test_checks.py — regenerate rather than
; hand-edit.
define i32 @sve_stack_object_and_vla(double %d, i64 %sz) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
; CHECK0-LABEL: sve_stack_object_and_vla:
; CHECK0: // %bb.0: // %entry
; CHECK0-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
; CHECK0-NEXT: stp x28, x19, [sp, #16] // 16-byte Folded Spill
; CHECK0-NEXT: mov x29, sp
; CHECK0-NEXT: addvl sp, sp, #-1
; CHECK0-NEXT: mov x19, sp
; CHECK0-NEXT: .cfi_def_cfa w29, 32
; CHECK0-NEXT: .cfi_offset w19, -8
; CHECK0-NEXT: .cfi_offset w28, -16
; CHECK0-NEXT: .cfi_offset w30, -24
; CHECK0-NEXT: .cfi_offset w29, -32
; CHECK0-NEXT: lsl x9, x0, #2
; CHECK0-NEXT: mov x8, sp
; CHECK0-NEXT: add x9, x9, #15
; CHECK0-NEXT: and x9, x9, #0xfffffffffffffff0
; CHECK0-NEXT: sub x0, x8, x9
; CHECK0-NEXT: mov sp, x0
; CHECK0-NEXT: mov z0.s, #0 // =0x0
; CHECK0-NEXT: ptrue p0.s
; CHECK0-NEXT: st1w { z0.s }, p0, [x29, #-1, mul vl]
; CHECK0-NEXT: bl bar
; CHECK0-NEXT: mov w0, wzr
; CHECK0-NEXT: mov sp, x29
; CHECK0-NEXT: ldp x28, x19, [sp, #16] // 16-byte Folded Reload
; CHECK0-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK0-NEXT: ret
;
; CHECK64-LABEL: sve_stack_object_and_vla:
; CHECK64: // %bb.0: // %entry
; CHECK64-NEXT: sub sp, sp, #96
; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK64-NEXT: add x29, sp, #64
; CHECK64-NEXT: stp x28, x19, [sp, #80] // 16-byte Folded Spill
; CHECK64-NEXT: sub sp, sp, #64
; CHECK64-NEXT: addvl sp, sp, #-1
; CHECK64-NEXT: mov x19, sp
; CHECK64-NEXT: .cfi_def_cfa w29, 32
; CHECK64-NEXT: .cfi_offset w19, -8
; CHECK64-NEXT: .cfi_offset w28, -16
; CHECK64-NEXT: .cfi_offset w30, -24
; CHECK64-NEXT: .cfi_offset w29, -32
; CHECK64-NEXT: lsl x9, x0, #2
; CHECK64-NEXT: mov x8, sp
; CHECK64-NEXT: add x9, x9, #15
; CHECK64-NEXT: and x9, x9, #0xfffffffffffffff0
; CHECK64-NEXT: sub x0, x8, x9
; CHECK64-NEXT: mov sp, x0
; CHECK64-NEXT: mov z0.s, #0 // =0x0
; CHECK64-NEXT: ptrue p0.s
; CHECK64-NEXT: sub x8, x29, #64
; CHECK64-NEXT: st1w { z0.s }, p0, [x8, #-1, mul vl]
; CHECK64-NEXT: bl bar
; CHECK64-NEXT: mov w0, wzr
; CHECK64-NEXT: sub sp, x29, #64
; CHECK64-NEXT: ldp x28, x19, [sp, #80] // 16-byte Folded Reload
; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK64-NEXT: add sp, sp, #96
; CHECK64-NEXT: ret
;
; CHECK1024-LABEL: sve_stack_object_and_vla:
; CHECK1024: // %bb.0: // %entry
; CHECK1024-NEXT: sub sp, sp, #1056
; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
; CHECK1024-NEXT: add x29, sp, #1024
; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
; CHECK1024-NEXT: str x28, [sp, #1040] // 8-byte Folded Spill
; CHECK1024-NEXT: str x19, [sp, #1048] // 8-byte Folded Spill
; CHECK1024-NEXT: sub sp, sp, #1024
; CHECK1024-NEXT: addvl sp, sp, #-1
; CHECK1024-NEXT: mov x19, sp
; CHECK1024-NEXT: .cfi_def_cfa w29, 32
; CHECK1024-NEXT: .cfi_offset w19, -8
; CHECK1024-NEXT: .cfi_offset w28, -16
; CHECK1024-NEXT: .cfi_offset w30, -24
; CHECK1024-NEXT: .cfi_offset w29, -32
; CHECK1024-NEXT: lsl x9, x0, #2
; CHECK1024-NEXT: mov x8, sp
; CHECK1024-NEXT: add x9, x9, #15
; CHECK1024-NEXT: and x9, x9, #0xfffffffffffffff0
; CHECK1024-NEXT: sub x0, x8, x9
; CHECK1024-NEXT: mov sp, x0
; CHECK1024-NEXT: mov z0.s, #0 // =0x0
; CHECK1024-NEXT: ptrue p0.s
; CHECK1024-NEXT: sub x8, x29, #1024
; CHECK1024-NEXT: st1w { z0.s }, p0, [x8, #-1, mul vl]
; CHECK1024-NEXT: bl bar
; CHECK1024-NEXT: mov w0, wzr
; CHECK1024-NEXT: sub sp, x29, #1024
; CHECK1024-NEXT: ldr x19, [sp, #1048] // 8-byte Folded Reload
; CHECK1024-NEXT: ldr x28, [sp, #1040] // 8-byte Folded Reload
; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
; CHECK1024-NEXT: add sp, sp, #1056
; CHECK1024-NEXT: ret
entry:
%a = alloca <vscale x 4 x i32>
%b = alloca i32, i64 %sz, align 4
store <vscale x 4 x i32> zeroinitializer, ptr %a
call void @bar(ptr noundef nonnull %b)
ret i32 0
}