@@ -2492,6 +2492,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
2492
2492
switch ((AArch64ISD::NodeType)Opcode) {
2493
2493
case AArch64ISD::FIRST_NUMBER:
2494
2494
break;
2495
+ MAKE_CASE(AArch64ISD::ALLOCATE_ZA_BUFFER)
2496
+ MAKE_CASE(AArch64ISD::INIT_TPIDR2OBJ)
2495
2497
MAKE_CASE(AArch64ISD::COALESCER_BARRIER)
2496
2498
MAKE_CASE(AArch64ISD::VG_SAVE)
2497
2499
MAKE_CASE(AArch64ISD::VG_RESTORE)
@@ -2991,6 +2993,80 @@ AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
2991
2993
return BB;
2992
2994
}
2993
2995
2996
+ MachineBasicBlock *
2997
+ AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI,
2998
+ MachineBasicBlock *BB) const {
2999
+ MachineFunction *MF = BB->getParent();
3000
+ MachineFrameInfo &MFI = MF->getFrameInfo();
3001
+ AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3002
+ TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3003
+ if (TPIDR2.Uses > 0) {
3004
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3005
+ // Store the buffer pointer to the TPIDR2 stack object.
3006
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
3007
+ .addReg(MI.getOperand(0).getReg())
3008
+ .addFrameIndex(TPIDR2.FrameIndex)
3009
+ .addImm(0);
3010
+ // Set the reserved bytes (10-15) to zero
3011
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
3012
+ .addReg(AArch64::WZR)
3013
+ .addFrameIndex(TPIDR2.FrameIndex)
3014
+ .addImm(5);
3015
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
3016
+ .addReg(AArch64::WZR)
3017
+ .addFrameIndex(TPIDR2.FrameIndex)
3018
+ .addImm(3);
3019
+ } else
3020
+ MFI.RemoveStackObject(TPIDR2.FrameIndex);
3021
+
3022
+ BB->remove_instr(&MI);
3023
+ return BB;
3024
+ }
3025
+
3026
+ MachineBasicBlock *
3027
+ AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI,
3028
+ MachineBasicBlock *BB) const {
3029
+ MachineFunction *MF = BB->getParent();
3030
+ MachineFrameInfo &MFI = MF->getFrameInfo();
3031
+ AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
3032
+ // TODO This function grows the stack with a subtraction, which doesn't work
3033
+ // on Windows. Some refactoring to share the functionality in
3034
+ // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI
3035
+ // supports SME
3036
+ assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() &&
3037
+ "Lazy ZA save is not yet supported on Windows");
3038
+
3039
+ TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
3040
+
3041
+ if (TPIDR2.Uses > 0) {
3042
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3043
+ MachineRegisterInfo &MRI = MF->getRegInfo();
3044
+
3045
+ // The SUBXrs below won't always be emitted in a form that accepts SP
3046
+ // directly
3047
+ Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
3048
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
3049
+ .addReg(AArch64::SP);
3050
+
3051
+ // Allocate a lazy-save buffer object of the size given, normally SVL * SVL
3052
+ auto Size = MI.getOperand(1).getReg();
3053
+ auto Dest = MI.getOperand(0).getReg();
3054
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest)
3055
+ .addReg(Size)
3056
+ .addReg(Size)
3057
+ .addReg(SP);
3058
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY),
3059
+ AArch64::SP)
3060
+ .addReg(Dest);
3061
+
3062
+ // We have just allocated a variable sized object, tell this to PEI.
3063
+ MFI.CreateVariableSizedObject(Align(16), nullptr);
3064
+ }
3065
+
3066
+ BB->remove_instr(&MI);
3067
+ return BB;
3068
+ }
3069
+
2994
3070
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
2995
3071
MachineInstr &MI, MachineBasicBlock *BB) const {
2996
3072
@@ -3021,7 +3097,10 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
3021
3097
MI.dump();
3022
3098
#endif
3023
3099
llvm_unreachable("Unexpected instruction for custom inserter!");
3024
-
3100
+ case AArch64::InitTPIDR2Obj:
3101
+ return EmitInitTPIDR2Object(MI, BB);
3102
+ case AArch64::AllocateZABuffer:
3103
+ return EmitAllocateZABuffer(MI, BB);
3025
3104
case AArch64::F128CSEL:
3026
3105
return EmitF128CSEL(MI, BB);
3027
3106
case TargetOpcode::STATEPOINT:
@@ -7029,47 +7108,6 @@ AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
7029
7108
}
7030
7109
}
7031
7110
7032
-
7033
- unsigned
7034
- AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL,
7035
- SelectionDAG &DAG) const {
7036
- MachineFunction &MF = DAG.getMachineFunction();
7037
- MachineFrameInfo &MFI = MF.getFrameInfo();
7038
-
7039
- // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case)
7040
- SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
7041
- DAG.getConstant(1, DL, MVT::i32));
7042
- SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N);
7043
- SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)};
7044
- SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
7045
- SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops);
7046
- Chain = Buffer.getValue(1);
7047
- MFI.CreateVariableSizedObject(Align(1), nullptr);
7048
-
7049
- // Allocate an additional TPIDR2 object on the stack (16 bytes)
7050
- unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false);
7051
-
7052
- // Store the buffer pointer to the TPIDR2 stack object.
7053
- MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
7054
- SDValue Ptr = DAG.getFrameIndex(
7055
- TPIDR2Obj,
7056
- DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
7057
- Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI);
7058
-
7059
- // Set the reserved bytes (10-15) to zero
7060
- EVT PtrTy = Ptr.getValueType();
7061
- SDValue ReservedPtr =
7062
- DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy));
7063
- Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr,
7064
- MPI);
7065
- ReservedPtr =
7066
- DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy));
7067
- Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr,
7068
- MPI);
7069
-
7070
- return TPIDR2Obj;
7071
- }
7072
-
7073
7111
static bool isPassedInFPR(EVT VT) {
7074
7112
return VT.isFixedLengthVector() ||
7075
7113
(VT.isFloatingPoint() && !VT.isScalableVector());
@@ -7485,10 +7523,28 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
7485
7523
if (Subtarget->hasCustomCallingConv())
7486
7524
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
7487
7525
7488
- // Conservatively assume the function requires the lazy-save mechanism.
7526
+ // Create a 16 Byte TPIDR2 object. The dynamic buffer
7527
+ // will be expanded and stored in the static object later using a pseudonode.
7489
7528
if (SMEAttrs(MF.getFunction()).hasZAState()) {
7490
- unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
7491
- FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
7529
+ TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
7530
+ TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false);
7531
+ SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64,
7532
+ DAG.getConstant(1, DL, MVT::i32));
7533
+
7534
+ SDValue Buffer;
7535
+ if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) {
7536
+ Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL,
7537
+ DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL});
7538
+ } else {
7539
+ SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL);
7540
+ Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL,
7541
+ DAG.getVTList(MVT::i64, MVT::Other),
7542
+ {Chain, Size, DAG.getConstant(1, DL, MVT::i64)});
7543
+ MFI.CreateVariableSizedObject(Align(16), nullptr);
7544
+ }
7545
+ Chain = DAG.getNode(
7546
+ AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other),
7547
+ {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)});
7492
7548
}
7493
7549
7494
7550
if (CallConv == CallingConv::PreserveNone) {
@@ -8174,9 +8230,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
8174
8230
8175
8231
bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
8176
8232
if (RequiresLazySave) {
8177
- unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
8178
- MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
8179
- SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
8233
+ const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
8234
+ MachinePointerInfo MPI =
8235
+ MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex);
8236
+ SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
8237
+ TPIDR2.FrameIndex,
8180
8238
DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8181
8239
SDValue NumZaSaveSlicesAddr =
8182
8240
DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
@@ -8719,7 +8777,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
8719
8777
8720
8778
if (RequiresLazySave) {
8721
8779
// Conditionally restore the lazy save using a pseudo node.
8722
- unsigned FI = FuncInfo->getLazySaveTPIDR2Obj ();
8780
+ TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj ();
8723
8781
SDValue RegMask = DAG.getRegisterMask(
8724
8782
TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
8725
8783
SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
@@ -8732,7 +8790,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
8732
8790
// RESTORE_ZA pseudo.
8733
8791
SDValue Glue;
8734
8792
SDValue TPIDR2Block = DAG.getFrameIndex(
8735
- FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8793
+ TPIDR2.FrameIndex,
8794
+ DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
8736
8795
Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
8737
8796
Result =
8738
8797
DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
@@ -8744,6 +8803,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
8744
8803
ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
8745
8804
DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
8746
8805
DAG.getConstant(0, DL, MVT::i64));
8806
+ TPIDR2.Uses++;
8747
8807
}
8748
8808
8749
8809
if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
0 commit comments