[AArch64][SME] Fix frame lowering not using a base pointer for SME functions. #91643

Merged: aemerson merged 5 commits into llvm:main from aemerson:sme-bp on May 14, 2024

Conversation

@aemerson (Contributor) commented on May 9, 2024:

The existing code is checking for the presence of the +sve subtarget feature
when deciding to use a base pointer for the function, but this check doesn't
work when only +sme is used.

rdar://126878490
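
For illustration, the check in question before and after this change (a condensed sketch of the diff below, not the verbatim source):

// Before: keyed only on the +sve subtarget feature, so a function built
// with only +sme never took this path.
if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) { /* ... */ }

// After: consider SME as well.
auto &ST = MF.getSubtarget<AArch64Subtarget>();
if (ST.hasSVEorSME()) { /* ... */ }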

@llvmbot (Member) commented on May 9, 2024:

@llvm/pr-subscribers-backend-aarch64

Author: Amara Emerson (aemerson)

Changes

The existing code is checking for the presence of the +sve subtarget feature
when deciding to use a base pointer for the function, but this check doesn't
work when only +sme is used.

rdar://126878490


Patch is 20.49 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/91643.diff

5 Files Affected:

  • (modified) llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp (+2-1)
  • (modified) llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll (+16-8)
  • (modified) llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll (+26-20)
  • (modified) llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll (+8-4)
  • (modified) llvm/test/CodeGen/AArch64/sme-zt0-state.ll (+26-18)
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index ad29003f1e817..a192e01f69b20 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -552,7 +552,8 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
     if (hasStackRealignment(MF))
       return true;
 
-    if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
+    auto &ST = MF.getSubtarget<AArch64Subtarget>();
+    if (ST.hasSVEorSME()) {
       const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
       // Frames that have variable sized objects and scalable SVE objects,
       // should always use a basepointer.
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index 254e37e836cbb..50d04e39f3527 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -214,7 +214,8 @@ declare double @za_shared_callee(double) "aarch64_inout_za"
 define double  @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_new_za"{
 ; CHECK-COMMON-LABEL: za_new_caller_to_za_shared_callee:
 ; CHECK-COMMON:       // %bb.0: // %prelude
-; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    mov x29, sp
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
@@ -240,7 +241,8 @@ define double  @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o
 ; CHECK-COMMON-NEXT:    fadd d0, d0, d1
 ; CHECK-COMMON-NEXT:    smstop za
 ; CHECK-COMMON-NEXT:    mov sp, x29
-; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ret
 entry:
   %call = call double @za_shared_callee(double %x)
@@ -251,7 +253,8 @@ entry:
 define double  @za_shared_caller_to_za_none_callee(double %x) nounwind noinline optnone "aarch64_inout_za"{
 ; CHECK-COMMON-LABEL: za_shared_caller_to_za_none_callee:
 ; CHECK-COMMON:       // %bb.0: // %entry
-; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    mov x29, sp
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
@@ -279,7 +282,8 @@ define double  @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
 ; CHECK-COMMON-NEXT:    fmov d1, x8
 ; CHECK-COMMON-NEXT:    fadd d0, d0, d1
 ; CHECK-COMMON-NEXT:    mov sp, x29
-; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ret
 entry:
   %call = call double @normal_callee(double %x)
@@ -291,7 +295,8 @@ entry:
 define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-LABEL: f128_call_za:
 ; CHECK-COMMON:       // %bb.0:
-; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    mov x29, sp
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
@@ -314,7 +319,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:  .LBB8_2:
 ; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-COMMON-NEXT:    mov sp, x29
-; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ret
   %res = fadd fp128 %a, %b
   ret fp128 %res
@@ -353,7 +359,8 @@ define fp128 @f128_call_sm(fp128 %a, fp128 %b) "aarch64_pstate_sm_enabled" nounw
 define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-LABEL: frem_call_za:
 ; CHECK-COMMON:       // %bb.0:
-; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-COMMON-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
 ; CHECK-COMMON-NEXT:    mov x29, sp
 ; CHECK-COMMON-NEXT:    sub sp, sp, #16
 ; CHECK-COMMON-NEXT:    rdsvl x8, #1
@@ -376,7 +383,8 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind {
 ; CHECK-COMMON-NEXT:  .LBB10_2:
 ; CHECK-COMMON-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-COMMON-NEXT:    mov sp, x29
-; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-COMMON-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-COMMON-NEXT:    ret
   %res = frem double %a, %b
   ret double %res
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 9d635f0b88f19..92baf0d223a4e 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -8,7 +8,8 @@ declare float @llvm.cos.f32(float)
 define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
 ; CHECK-LABEL: test_lazy_save_1_callee:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
@@ -31,7 +32,8 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:  .LBB0_2:
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @private_za_callee()
   ret void
@@ -41,20 +43,21 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" {
 define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
 ; CHECK-LABEL: test_lazy_save_2_callees:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #-48]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x21, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    stp x20, x19, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
-; CHECK-NEXT:    rdsvl x19, #1
+; CHECK-NEXT:    rdsvl x20, #1
 ; CHECK-NEXT:    mov x8, sp
-; CHECK-NEXT:    msub x8, x19, x19, x8
+; CHECK-NEXT:    msub x8, x20, x20, x8
 ; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    sub x20, x29, #16
+; CHECK-NEXT:    sub x21, x29, #16
 ; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-NEXT:    stur x8, [x29, #-16]
-; CHECK-NEXT:    sturh w19, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x20
+; CHECK-NEXT:    sturh w20, [x29, #-8]
+; CHECK-NEXT:    msr TPIDR2_EL0, x21
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -64,8 +67,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:    bl __arm_tpidr2_restore
 ; CHECK-NEXT:  .LBB1_2:
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
-; CHECK-NEXT:    sturh w19, [x29, #-8]
-; CHECK-NEXT:    msr TPIDR2_EL0, x20
+; CHECK-NEXT:    sturh w20, [x29, #-8]
+; CHECK-NEXT:    msr TPIDR2_EL0, x21
 ; CHECK-NEXT:    bl private_za_callee
 ; CHECK-NEXT:    smstart za
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
@@ -76,8 +79,9 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
 ; CHECK-NEXT:  .LBB1_4:
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x21, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #48 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @private_za_callee()
   call void @private_za_callee()
@@ -88,7 +92,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" {
 define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inout_za" {
 ; CHECK-LABEL: test_lazy_save_expanded_intrinsic:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
@@ -111,7 +116,8 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou
 ; CHECK-NEXT:  .LBB2_2:
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = call float @llvm.cos.f32(float %a)
   ret float %res
@@ -127,7 +133,7 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
 ; CHECK-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    add x29, sp, #64
-; CHECK-NEXT:    str x19, [sp, #80] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
@@ -140,13 +146,13 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
 ; CHECK-NEXT:    sturh w8, [x29, #-72]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x10
 ; CHECK-NEXT:    bl __arm_sme_state
-; CHECK-NEXT:    and x19, x0, #0x1
-; CHECK-NEXT:    tbz w19, #0, .LBB3_2
+; CHECK-NEXT:    and x20, x0, #0x1
+; CHECK-NEXT:    tbz w20, #0, .LBB3_2
 ; CHECK-NEXT:  // %bb.1:
 ; CHECK-NEXT:    smstop sm
 ; CHECK-NEXT:  .LBB3_2:
 ; CHECK-NEXT:    bl private_za_callee
-; CHECK-NEXT:    tbz w19, #0, .LBB3_4
+; CHECK-NEXT:    tbz w20, #0, .LBB3_4
 ; CHECK-NEXT:  // %bb.3:
 ; CHECK-NEXT:    smstart sm
 ; CHECK-NEXT:  .LBB3_4:
@@ -159,8 +165,8 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za
 ; CHECK-NEXT:  .LBB3_6:
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEXT:    sub sp, x29, #64
+; CHECK-NEXT:    ldp x20, x19, [sp, #80] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT:    ldr x19, [sp, #80] // 8-byte Folded Reload
 ; CHECK-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
index cd7460b177c4b..095e84cda1085 100644
--- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
@@ -7,7 +7,8 @@ declare void @private_za_callee()
 define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
 ; CHECK-LABEL: disable_tailcallopt:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
@@ -30,7 +31,8 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:  .LBB0_2:
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   tail call void @private_za_callee()
   ret void
@@ -40,7 +42,8 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind {
 define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-LABEL: f128_call_za:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
@@ -63,7 +66,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind {
 ; CHECK-NEXT:  .LBB1_2:
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   %res = fadd fp128 %a, %b
   ret fp128 %res
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index 7f40b5e7e1344..884096743e034 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -34,7 +34,7 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_
 ; CHECK-LABEL: za_zt0_shared_caller_no_state_callee:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #80
 ; CHECK-NEXT:    rdsvl x8, #1
@@ -42,16 +42,16 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_
 ; CHECK-NEXT:    msub x9, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x9
 ; CHECK-NEXT:    sub x10, x29, #16
-; CHECK-NEXT:    sub x19, x29, #80
+; CHECK-NEXT:    sub x20, x29, #80
 ; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-NEXT:    stur x9, [x29, #-16]
 ; CHECK-NEXT:    sturh w8, [x29, #-8]
 ; CHECK-NEXT:    msr TPIDR2_EL0, x10
-; CHECK-NEXT:    str zt0, [x19]
+; CHECK-NEXT:    str zt0, [x20]
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstart za
-; CHECK-NEXT:    ldr zt0, [x19]
+; CHECK-NEXT:    ldr zt0, [x20]
 ; CHECK-NEXT:    mrs x8, TPIDR2_EL0
 ; CHECK-NEXT:    sub x0, x29, #16
 ; CHECK-NEXT:    cbnz x8, .LBB1_2
@@ -60,7 +60,7 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_
 ; CHECK-NEXT:  .LBB1_2:
 ; CHECK-NEXT:    msr TPIDR2_EL0, xzr
 ; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @callee();
@@ -88,22 +88,22 @@ define void @za_zt0_shared_caller_za_shared_callee() "aarch64_inout_za" "aarch64
 ; CHECK-LABEL: za_zt0_shared_caller_za_shared_callee:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
-; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT:    stp x20, x19, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #80
 ; CHECK-NEXT:    rdsvl x8, #1
 ; CHECK-NEXT:    mov x9, sp
 ; CHECK-NEXT:    msub x8, x8, x8, x9
 ; CHECK-NEXT:    mov sp, x8
-; CHECK-NEXT:    sub x19, x29, #80
+; CHECK-NEXT:    sub x20, x29, #80
 ; CHECK-NEXT:    stur wzr, [x29, #-4]
 ; CHECK-NEXT:    sturh wzr, [x29, #-6]
 ; CHECK-NEXT:    stur x8, [x29, #-16]
-; CHECK-NEXT:    str zt0, [x19]
+; CHECK-NEXT:    str zt0, [x20]
 ; CHECK-NEXT:    bl callee
-; CHECK-NEXT:    ldr zt0, [x19]
+; CHECK-NEXT:    ldr zt0, [x20]
 ; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x20, x19, [sp, #16] // 16-byte Folded Reload
 ; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @callee() "aarch64_inout_za";
@@ -114,7 +114,8 @@ define void @za_zt0_shared_caller_za_shared_callee() "aarch64_inout_za" "aarch64
 define void @za_zt0_shared_caller_za_zt0_shared_callee() "aarch64_inout_za" "aarch64_in_zt0" nounwind {
 ; CHECK-LABEL: za_zt0_shared_caller_za_zt0_shared_callee:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
@@ -126,7 +127,8 @@ define void @za_zt0_shared_caller_za_zt0_shared_callee() "aarch64_inout_za" "aar
 ; CHECK-NEXT:    stur x8, [x29, #-16]
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @callee() "aarch64_inout_za" "aarch64_in_zt0";
   ret void;
@@ -192,7 +194,8 @@ define void @zt0_new_caller() "aarch64_new_zt0" nounwind {
 define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
 ; CHECK-LABEL: new_za_zt0_caller:
 ; CHECK:       // %bb.0: // %prelude
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #80
 ; CHECK-NEXT:    rdsvl x8, #1
@@ -217,7 +220,8 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    smstop za
 ; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @callee() "aarch64_inout_za" "aarch64_in_zt0";
   ret void;
@@ -227,7 +231,8 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind {
 define void @new_za_shared_zt0_caller() "aarch64_new_za" "aarch64_in_zt0" nounwind {
 ; CHECK-LABEL: new_za_shared_zt0_caller:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHECK-NEXT:    rdsvl x8, #1
@@ -240,7 +245,8 @@ define void @new_za_shared_zt0_caller() "aarch64_new_za" "aarch64_in_zt0" nounwi
 ; CHECK-NEXT:    zero {za}
 ; CHECK-NEXT:    bl callee
 ; CHECK-NEXT:    mov sp, x29
-; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
 ; CHECK-NEXT:    ret
   call void @callee() "aarch64_inout_za" "aarch64_in_zt0";
   ret void;
@@ -250,7 +256,8 @@ define void @new_za_shared_zt0_caller() "aarch64_new_za" "aarch64_in_zt0" nounwi
 define void @shared_za_new_zt0() "aarch64_inout_za" "aarch64_new_zt0" nounwind {
 ; CHECK-LABEL: shared_za_new_zt0:
 ; CHECK:       // %bb.0:
-; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
 ; CHECK-NEXT:    mov x29, sp
 ; CHECK-NEXT:    sub sp, sp, #16
 ; CHE...
[truncated]
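
For orientation, a hedged sketch of the decision in AArch64RegisterInfo::hasBasePointer as patched (simplified; the spill-slot comment paraphrases the review discussion below rather than quoting the tree):

auto &ST = MF.getSubtarget<AArch64Subtarget>();
if (ST.hasSVEorSME()) {
  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
  // Frames that have both variable-sized objects and scalable (SVE/SME)
  // stack objects should always use a base pointer; among other things,
  // it keeps the emergency spill slot reachable.
  if (!AFI->hasCalculatedStackSizeSVE() || AFI->getStackSizeSVE())
    return true;
}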

@efriedma-quic (Collaborator) commented:

Please add a testcase where a base pointer actually improves the generated code. None of the testcases you're modifying even materialize the base pointer.

(I think some of the existing testcases are likely to be changed to add +sve; see the discussion on #83301.)

@aemerson (Contributor, Author) replied:

> Please add a testcase where a base pointer actually improves the generated code. None of the testcases you're modifying even materialize the base pointer.
>
> (I think some of the existing testcases are likely to be changed to add +sve; see the discussion on #83301.)

The test case I have is a pretty large one that causes a crash. I'll add it, but that was the only motivation for this, not to improve the generated code.

@efriedma-quic (Collaborator) replied:

Your testcase doesn't appear to crash on trunk? Or is that reduced?

If you just want to force the codepath in question, something smaller like the following works:

#include <arm_sve.h>   // needed for svfloat32_t

void g(int, void*, svfloat32_t, void*);
void f(int x, svfloat32_t y) {
  void *p = __builtin_alloca(x);
  __attribute__((aligned(16))) int a[32];
  asm("":::"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13","v14","v15","v16",
    "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31",
    "x0","x1","x2","x3","x4","x5","x6","x7","x8","x9","x10","x11","x12","x13","x14","x15","x16",
    "x17","x18","x19","x20","x21","x22","x23","x24","x25","x26","x27","x28","x30");
  g(x,p,y,a);
}

Note we'll probably end up messing with the fp/bp code at some point to address #80009.

@@ -552,7 +552,8 @@ bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
     if (hasStackRealignment(MF))
       return true;

-    if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
+    auto &ST = MF.getSubtarget<AArch64Subtarget>();
+    if (ST.hasSVEorSME()) {
@efriedma-quic (Collaborator) commented on this diff:

Really what you want is ST.hasSVE() || (ST.hasSME() && SMEAttrs(MF.getFunction()).hasStreamingInterfaceOrBody()). Maybe the subtarget should expose a helper for this.
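
A sketch of the helper the reviewer suggests (hypothetical name and placement on AArch64Subtarget; SMEAttrs is the existing utility from AArch64SMEAttributes.h):

// Hypothetical helper: true when scalable-vector stack objects are
// possible, either through SVE or through SME in a streaming function.
bool hasSVEorStreamingSME(const Function &F) const {
  return hasSVE() || (hasSME() && SMEAttrs(F).hasStreamingInterfaceOrBody());
}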

@aemerson (Contributor, Author) replied:

> Your testcase doesn't appear to crash on trunk? Or is that reduced?
>
> If you just want to force the codepath in question, something smaller like the following works:
>
>     #include <arm_sve.h>   // needed for svfloat32_t
>
>     void g(int, void*, svfloat32_t, void*);
>     void f(int x, svfloat32_t y) {
>       void *p = __builtin_alloca(x);
>       __attribute__((aligned(16))) int a[32];
>       asm("":::"v0","v1","v2","v3","v4","v5","v6","v7","v8","v9","v10","v11","v12","v13","v14","v15","v16",
>         "v17","v18","v19","v20","v21","v22","v23","v24","v25","v26","v27","v28","v29","v30","v31",
>         "x0","x1","x2","x3","x4","x5","x6","x7","x8","x9","x10","x11","x12","x13","x14","x15","x16",
>         "x17","x18","x19","x20","x21","x22","x23","x24","x25","x26","x27","x28","x30");
>       g(x,p,y,a);
>     }
>
> Note we'll probably end up messing with the fp/bp code at some point to address #80009.

Ah, I forgot to add -O0 to the RUN line.

Commit: [AArch64][SME] Fix frame lowering not using a base pointer for SME functions.

The existing code is checking for the presence of the +sve subtarget feature
when deciding to use a base pointer for the function, but this check doesn't
work when only +sme is used.

rdar://126878490
@efriedma-quic (Collaborator) commented:

Oh, I see, reproduced.

Probably there should be a comment in this code that the base pointer is necessary to ensure the emergency spill slot is reachable.

Probably you can write a reduced testcase to force an emergency spill (just need to ensure you have a store to a stack slot with no scavengeable register... should be able to force that with inline asm). I'd be okay leaving it out if it turns out to be too tricky to write, though. The unreduced case probably isn't that useful.
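
An untested sketch of what such a forcing testcase might look like (the array size and full GPR clobber are assumptions meant to leave no scavengeable register for a far-offset stack store):

#include <arm_sve.h>

void g(int, void *, svfloat32_t, int *);
void f(int x, svfloat32_t y) {
  void *p = __builtin_alloca(x);             // variable-sized object
  __attribute__((aligned(16))) int a[4096];  // push slots beyond immediate range
  asm volatile("" ::: "x0","x1","x2","x3","x4","x5","x6","x7","x8","x9",
               "x10","x11","x12","x13","x14","x15","x16","x17","x18","x19",
               "x20","x21","x22","x23","x24","x25","x26","x27","x28","x30");
  a[4095] = x;                               // store to a distant stack slot
  g(x, p, y, a);
}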

@aemerson (Contributor, Author) replied:

> Oh, I see, reproduced.
>
> Probably there should be a comment in this code that the base pointer is necessary to ensure the emergency spill slot is reachable.
>
> Probably you can write a reduced testcase to force an emergency spill (just need to ensure you have a store to a stack slot with no scavengeable register... should be able to force that with inline asm). I'd be okay leaving it out if it turns out to be too tricky to write, though. The unreduced case probably isn't that useful.

I wasn't able to write a smaller test for the crash (probably did something wrong), but there was an existing SVE test for the base pointer that I've replicated for SME.

-    if (MF.getSubtarget<AArch64Subtarget>().hasSVE()) {
+    auto &ST = MF.getSubtarget<AArch64Subtarget>();
+    if (ST.hasSVE() ||
+        (ST.hasSME() && ST.hasStreamingInterfaceOrBody(MF.getFunction()))) {
@efriedma-quic (Collaborator) commented on this diff:

Oops, didn't realize there was an existing bit for this... ST.isStreaming() is equivalent, I think.
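
In other words, the condition could read (a sketch using the existing subtarget query):

// ST.isStreaming() is the existing bit: per the discussion above, it covers
// SME functions with a streaming interface or streaming body.
if (ST.hasSVE() || ST.isStreaming()) {
  // ...
}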

@aemerson (Contributor, Author) replied:

Don't we need to check it on a per-function basis?

@efriedma-quic (Collaborator) replied:

The subtarget is per-function, sort of... see AArch64TargetMachine::getSubtargetImpl.
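
A simplified sketch of that mechanism (constructor arguments elided and the cache key abbreviated; the real AArch64TargetMachine::getSubtargetImpl keys on more than shown here): subtargets are cached per function-attribute key, which is what makes per-function feature checks work.

const AArch64Subtarget *
AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
  StringRef CPU = F.getFnAttribute("target-cpu").getValueAsString();
  StringRef FS = F.getFnAttribute("target-features").getValueAsString();
  std::string Key = CPU.str() + FS.str();   // simplified cache key
  auto &I = SubtargetMap[Key];
  if (!I)
    I = std::make_unique<AArch64Subtarget>(/* triple, CPU, FS, TM, ... */);
  return I.get();
}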

@aemerson (Contributor, Author) replied:

Oh right, of course.

@efriedma-quic (Collaborator) left a review comment:

LGTM

(excerpt from the new MIR test added in this PR)

---
# This test verifies that the basepointer is available in presence of SME stack objects.
# This is almost identical to the SVE version of this test but has to be a separate file
@efriedma-quic (Collaborator) commented on the new test:
Can you just mark the target attribute on the IR function, instead of using -mattr? Not a big deal either way.
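
(For reference, that would mean giving the test function an attribute along the lines of "target-features"="+sme" in the test source instead of passing -mattr=+sme on the RUN line; the exact spelling here is illustrative, not quoted from the final test.)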

@aemerson merged commit d4c86e7 into llvm:main on May 14, 2024 (3 of 4 checks passed).
@aemerson deleted the sme-bp branch on May 23, 2024.