Skip to content

[AArch64][SME] Remove combination of private-ZA and preserves_za. #78563

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 30 additions & 46 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7608,23 +7608,15 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,

bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
if (RequiresLazySave) {
SDValue NumZaSaveSlices;
if (!CalleeAttrs.preservesZA()) {
// Set up a lazy save mechanism by storing the runtime live slices
// (worst-case SVL) to the TPIDR2 stack object.
NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
DAG.getConstant(1, DL, MVT::i32));
} else if (CalleeAttrs.preservesZA()) {
NumZaSaveSlices = DAG.getConstant(0, DL, MVT::i64);
}

unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
SDValue NumZaSaveSlicesAddr =
DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType()));
SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
DAG.getConstant(1, DL, MVT::i32));
Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr,
MPI, MVT::i16);
Chain = DAG.getNode(
Expand All @@ -7637,14 +7629,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
CLI.CB)
: OptimizationRemarkAnalysis("sme", "SMELazySaveZA",
&MF.getFunction());
DescribeCallsite(R) << " sets up a lazy save for ZA";
if (CalleeAttrs.preservesZA())
R << ", but callee preserves ZA, so we request 0 slices to be saved";
else
R << ", and we request that all slices be saved";
R << ore::setExtraArgs()
<< ore::NV("CalleePreservesZA", CalleeAttrs.preservesZA());
return R;
return DescribeCallsite(R) << " sets up a lazy save for ZA";
});
}

Expand Down Expand Up @@ -8075,34 +8060,33 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
}

if (RequiresLazySave) {
if (!CalleeAttrs.preservesZA()) {
// Unconditionally resume ZA.
Result = DAG.getNode(
AArch64ISD::SMSTART, DL, MVT::Other, Result,
DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));

// Conditionally restore the lazy save using a pseudo node.
unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
SDValue RegMask = DAG.getRegisterMask(
TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
"__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
SDValue TPIDR2_EL0 = DAG.getNode(
ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));

// Copy the address of the TPIDR2 block into X0 before 'calling' the
// RESTORE_ZA pseudo.
SDValue Glue;
SDValue TPIDR2Block = DAG.getFrameIndex(
FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
{Result, TPIDR2_EL0,
DAG.getRegister(AArch64::X0, MVT::i64),
RestoreRoutine, RegMask, Result.getValue(1)});
}
// Unconditionally resume ZA.
Result = DAG.getNode(
AArch64ISD::SMSTART, DL, MVT::Other, Result,
DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32),
DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64));

// Conditionally restore the lazy save using a pseudo node.
unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
SDValue RegMask = DAG.getRegisterMask(
TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
"__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout()));
SDValue TPIDR2_EL0 = DAG.getNode(
ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result,
DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32));

// Copy the address of the TPIDR2 block into X0 before 'calling' the
// RESTORE_ZA pseudo.
SDValue Glue;
SDValue TPIDR2Block = DAG.getFrameIndex(
FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
Result =
DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
{Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64),
RestoreRoutine, RegMask, Result.getValue(1)});

// Finally reset the TPIDR2_EL0 register to 0.
Result = DAG.getNode(
ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
Expand Down
15 changes: 4 additions & 11 deletions llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,24 @@
; RUN: llc -mtriple=aarch64 -mattr=+sme --pass-remarks-analysis=sme -o /dev/null < %s 2>&1 | FileCheck %s

declare void @private_za_callee()
declare void @private_za_preserved_callee() "aarch64_pstate_za_preserved"
declare float @llvm.cos.f32(float)

; A caller marked "aarch64_pstate_za_shared" makes one call to
; @private_za_callee, which is declared without any ZA attribute. The
; expectations below match the "sets up a lazy save for ZA" analysis remark
; reported for that call.
; NOTE(review): two near-identical expectation lines follow; they look like the
; old and new wording of the same remark - confirm which one is current.
define void @test_lazy_save_1_callee() nounwind "aarch64_pstate_za_shared" {
; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_1_callee' to 'private_za_callee' sets up a lazy save for ZA, and we request that all slices be saved
; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_1_callee' to 'private_za_callee' sets up a lazy save for ZA
call void @private_za_callee()
ret void
}

; A caller marked "aarch64_pstate_za_shared" calls @private_za_callee twice.
; A lazy-save remark is expected for each of the two call sites.
; NOTE(review): each call is preceded by two near-identical expectation lines;
; they look like the old and new wording of the same remark - confirm which one
; is current.
define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" {
; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA, and we request that all slices be saved
; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA
call void @private_za_callee()
; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA, and we request that all slices be saved
; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA
call void @private_za_callee()
ret void
}

; A caller marked "aarch64_pstate_za_shared" calls
; @private_za_preserved_callee, which is declared with
; "aarch64_pstate_za_preserved". The expected remark still reports a lazy save,
; but states that 0 slices are requested because the callee preserves ZA.
define void @test_lazy_save_preserved_callee() nounwind "aarch64_pstate_za_shared" {
; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_preserved_callee' to 'private_za_preserved_callee' sets up a lazy save for ZA, but callee preserves ZA, so we request 0 slices to be saved
call void @private_za_preserved_callee()
ret void
}

; A caller marked "aarch64_pstate_za_shared" calls the @llvm.cos.f32 intrinsic.
; The expected remark names 'cosf' as the callee rather than the intrinsic, so
; the lazy save is reported against the call the intrinsic is lowered to.
; NOTE(review): two near-identical expectation lines follow; they look like the
; old and new wording of the same remark - confirm which one is current.
define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_pstate_za_shared" {
; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_expanded_intrinsic' to 'cosf' sets up a lazy save for ZA, and we request that all slices be saved
; CHECK: remark: <unknown>:0:0: call from 'test_lazy_save_expanded_intrinsic' to 'cosf' sets up a lazy save for ZA
%res = call float @llvm.cos.f32(float %a)
ret float %res
}
46 changes: 0 additions & 46 deletions llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
; RUN: llc -mtriple=aarch64 -mattr=+sme < %s | FileCheck %s

declare void @private_za_callee()
declare void @private_za_preserved_callee() "aarch64_pstate_za_preserved"
declare float @llvm.cos.f32(float)

; Test lazy-save mechanism for a single callee.
Expand Down Expand Up @@ -170,48 +169,3 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_pstate_z
call void @private_za_callee()
ret void
}


; Test lazy-save mechanism for an aarch64_pstate_za_shared caller
; calling a callee with aarch64_pstate_za_preserved.
;
; The caller is also "aarch64_pstate_sm_compatible". The expected assembly
; below, in order:
;   * reads the streaming vector length with rdsvl and carves a buffer of
;     SVL * SVL bytes off the stack (msub ... ; mov sp, x8);
;   * stores that buffer address together with a zero second word
;     (stp x8, xzr) and points TPIDR2_EL0 at the resulting block;
;   * calls __arm_sme_state and uses bit 0 of the result to wrap the callee in
;     a conditional smstop sm / smstart sm pair;
;   * clears TPIDR2_EL0 after the call.
; No "smstart za" and no call to __arm_tpidr2_restore appear in the expected
; output for this callee.
define void @za_shared_caller_za_preserved_callee() nounwind "aarch64_pstate_za_shared" "aarch64_pstate_sm_compatible" {
; CHECK-LABEL: za_shared_caller_za_preserved_callee:
; CHECK: // %bb.0:
; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill
; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT: add x29, sp, #64
; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
; CHECK-NEXT: msub x8, x8, x8, x9
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: sub x9, x29, #80
; CHECK-NEXT: stp x8, xzr, [x29, #-80]
; CHECK-NEXT: msr TPIDR2_EL0, x9
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB4_2
; CHECK-NEXT: // %bb.1:
; CHECK-NEXT: smstop sm
; CHECK-NEXT: .LBB4_2:
; CHECK-NEXT: bl private_za_preserved_callee
; CHECK-NEXT: tbz w19, #0, .LBB4_4
; CHECK-NEXT: // %bb.3:
; CHECK-NEXT: smstart sm
; CHECK-NEXT: .LBB4_4:
; CHECK-NEXT: msr TPIDR2_EL0, xzr
; CHECK-NEXT: sub sp, x29, #64
; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload
; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload
; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload
; CHECK-NEXT: ret
call void @private_za_preserved_callee()
ret void
}