 // | async context if needed           |
 // | (a.k.a. "frame record")           |
 // |-----------------------------------| <- fp(=x29)
+// |   <hazard padding>                |
+// |-----------------------------------|
 // |                                   |
 // | callee-saved fp/simd/SVE regs     |
 // |                                   |
 // |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
 // |.the.standard.16-byte.alignment....| compile time; if present)
 // |-----------------------------------|
-// |                                   |
 // | local variables of fixed size     |
 // | including spill slots             |
+// |   <FPR>                           |
+// |   <hazard padding>                |
+// |   <GPR>                           |
 // |-----------------------------------| <- bp(not defined by ABI,
 // |.variable-sized.local.variables....|    LLVM chooses X19)
 // |.(VLAs)............................| (size of this area is unknown at
 //
 // FIXME: also explain the redzone concept.
 //
+// About stack hazards: Under some SME contexts, a coprocessor with its own
+// separate cache can be used for FP operations. This can create hazards if
+// the CPU and the SME unit try to access the same area of memory, including
+// if the access is to an area of the stack. To alleviate this, we attempt to
+// introduce extra padding into the stack frame between FP and GPR accesses,
+// controlled by the StackHazardSize option. Without changing the layout of
+// the stack frame in the diagram above, a stack object of size
+// StackHazardSize is added between GPR and FPR CSRs. Another is added to the
+// stack objects section, and stack objects are sorted so that FPR > Hazard
+// padding slot > GPRs (where possible). Unfortunately some things are not
+// handled well (VLA area, arguments on the stack, objects with both GPR and
+// FPR accesses), but if those are controlled by the user then the entire
+// stack frame becomes GPR at the start/end with FPR in the middle, surrounded
+// by Hazard padding.
+//
 // An example of the prologue:
 //
 //     .globl __foo
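To make the padding idea concrete, here is a minimal, self-contained sketch (illustrative only, not LLVM code; the `Obj` struct and `layoutWithHazardPadding` helper are invented for this example). It lays FPR-accessed objects out first and inserts one hazard-sized gap before the first GPR-accessed object, which is the separation the frame lowering described above aims for:

```cpp
#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Toy stack object: a name, a size in bytes, and whether it is accessed with
// FP/SIMD instructions (true) or general-purpose instructions (false).
struct Obj {
  std::string Name;
  uint64_t Size;
  bool IsFPR;
};

// Assign descending offsets from the frame pointer, inserting HazardSize bytes
// of padding the first time we move from FPR objects to GPR objects. Assumes
// Objects is already sorted FPR-first, mirroring the object sorting described
// above.
void layoutWithHazardPadding(const std::vector<Obj> &Objects,
                             uint64_t HazardSize) {
  uint64_t Offset = 0;
  bool SeenFPR = false, PaddingEmitted = false;
  for (const Obj &O : Objects) {
    if (SeenFPR && !O.IsFPR && !PaddingEmitted) {
      Offset += HazardSize; // hazard padding separates FPR and GPR accesses
      PaddingEmitted = true;
    }
    SeenFPR |= O.IsFPR;
    Offset += O.Size;
    std::cout << O.Name << " at [fp, #-" << Offset << "]\n";
  }
}

int main() {
  std::vector<Obj> Objects = {{"q.spill", 16, true},
                              {"acc.d", 8, true},
                              {"counter", 8, false},
                              {"buffer", 32, false}};
  layoutWithHazardPadding(Objects, 1024); // e.g. a hazard size of 1024 bytes
}
```

With a 1024-byte hazard size, every GPR-accessed object ends up at least 1024 bytes away from every FPR-accessed object, so CPU and SME-unit accesses never land in the same region of the stack.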
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -253,6 +272,14 @@ cl::opt<bool> EnableHomogeneousPrologEpilog(
     cl::desc("Emit homogeneous prologue and epilogue for the size "
              "optimization (default = off)"));
 
+// Stack hazard padding size. 0 = disabled.
+static cl::opt<unsigned> StackHazardSize("aarch64-stack-hazard-size",
+                                         cl::init(0), cl::Hidden);
+// Whether to insert padding into non-streaming functions (for testing).
+static cl::opt<bool>
+    StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming",
+                              cl::init(false), cl::Hidden);
+
 STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
 
 /// Returns how much of the incoming argument stack area (in bytes) we should
@@ -1461,6 +1488,10 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
   // update in so create a normal arithmetic instruction instead.
   if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
       CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
+    // If we are destroying the frame, make sure we add the increment after the
+    // last frame operation.
+    if (FrameFlag == MachineInstr::FrameDestroy)
+      ++MBBI;
     emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
                     StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag,
                     false, false, nullptr, EmitCFI,
@@ -2901,6 +2932,7 @@ static void computeCalleeSaveRegisterPairs(
   }
   int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
   bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
+  Register LastReg = 0;
 
   // When iterating backwards, the loop condition relies on unsigned wraparound.
   for (unsigned i = FirstReg; i < Count; i += RegInc) {
@@ -2922,8 +2954,15 @@ static void computeCalleeSaveRegisterPairs(
     else
       llvm_unreachable("Unsupported register class.");
 
+    // Add the stack hazard size as we transition from GPR->FPR CSRs.
+    if (AFI->hasStackHazardSlotIndex() &&
+        (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) &&
+        AArch64InstrInfo::isFpOrNEON(RPI.Reg1))
+      ByteOffset += StackFillDir * StackHazardSize;
+    LastReg = RPI.Reg1;
+
     // Add the next reg to the pair if it is in the same register class.
-    if (unsigned(i + RegInc) < Count) {
+    if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) {
       Register NextReg = CSI[i + RegInc].getReg();
       bool IsFirst = i == FirstReg;
       switch (RPI.Type) {
@@ -3034,7 +3073,8 @@ static void computeCalleeSaveRegisterPairs(
       Offset += 8;
     RPI.Offset = Offset / Scale;
 
-    assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
+    assert((!RPI.isPaired() ||
+            (!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
             (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
            "Offset out of bounds for LDP/STP immediate");
@@ -3455,6 +3495,81 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
   return true;
 }
 
+// Return the FrameID for a Load/Store instruction by looking at the MMO.
+static std::optional<int> getLdStFrameID(const MachineInstr &MI,
+                                         const MachineFrameInfo &MFI) {
+  if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
+    return std::nullopt;
+
+  MachineMemOperand *MMO = *MI.memoperands_begin();
+  auto *PSV =
+      dyn_cast_or_null<FixedStackPseudoSourceValue>(MMO->getPseudoValue());
+  if (PSV)
+    return std::optional<int>(PSV->getFrameIndex());
+
+  if (MMO->getValue()) {
+    if (auto *Al = dyn_cast<AllocaInst>(getUnderlyingObject(MMO->getValue()))) {
+      for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd();
+           FI++)
+        if (MFI.getObjectAllocation(FI) == Al)
+          return FI;
+    }
+  }
+
+  return std::nullopt;
+}
+
+// Check if a Hazard slot is needed for the current function, and if so create
+// one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex,
+// which can be used to determine if any hazard padding is needed.
+void AArch64FrameLowering::determineStackHazardSlot(
+    MachineFunction &MF, BitVector &SavedRegs) const {
+  if (StackHazardSize == 0 || StackHazardSize % 16 != 0 ||
+      MF.getInfo<AArch64FunctionInfo>()->hasStackHazardSlotIndex())
+    return;
+
+  // Stack hazards are only needed in streaming functions.
+  SMEAttrs Attrs(MF.getFunction());
+  if (!StackHazardInNonStreaming &&
+      Attrs.hasNonStreamingInterfaceAndBody())
+    return;
+
+  MachineFrameInfo &MFI = MF.getFrameInfo();
+
+  // Add a hazard slot if there are any CSR FPR registers, or any FP-only
+  // stack objects.
+  bool HasFPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) {
+    return AArch64::FPR64RegClass.contains(Reg) ||
+           AArch64::FPR128RegClass.contains(Reg) ||
+           AArch64::ZPRRegClass.contains(Reg) ||
+           AArch64::PPRRegClass.contains(Reg);
+  });
+  bool HasFPRStackObjects = false;
+  if (!HasFPRCSRs) {
+    std::vector<unsigned> FrameObjects(MFI.getObjectIndexEnd());
+    for (auto &MBB : MF) {
+      for (auto &MI : MBB) {
+        std::optional<int> FI = getLdStFrameID(MI, MFI);
+        if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
+          if (MFI.getStackID(*FI) == 2 || AArch64InstrInfo::isFpOrNEON(MI))
+            FrameObjects[*FI] |= 2;
+          else
+            FrameObjects[*FI] |= 1;
+        }
+      }
+    }
+    HasFPRStackObjects =
+        any_of(FrameObjects, [](unsigned B) { return (B & 3) == 2; });
+  }
+
+  if (HasFPRCSRs || HasFPRStackObjects) {
+    int ID = MFI.CreateStackObject(StackHazardSize, Align(16), false);
+    LLVM_DEBUG(dbgs() << "Created Hazard slot at " << ID << " size "
+                      << StackHazardSize << "\n");
+    MF.getInfo<AArch64FunctionInfo>()->setStackHazardSlotIndex(ID);
+  }
+}
+
 void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
                                                 BitVector &SavedRegs,
                                                 RegScavenger *RS) const {
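determineStackHazardSlot above classifies every frame index by or-ing 1 into its entry for each GPR access and 2 for each FP/SIMD access, and it only treats an object as FP-only when the resulting mask is exactly 2. A standalone sketch of that classification (illustrative only; the `Access` enum and `isFprOnly` helper are invented here, not LLVM API):

```cpp
#include <cassert>
#include <vector>

// Per-frame-index access mask, mirroring the bookkeeping above:
// bit 0 records a GPR access, bit 1 records an FPR access.
enum Access : unsigned { AccGPR = 1, AccFPR = 2 };

// An object should sit on the FPR side of the hazard padding only when every
// recorded access was an FPR access, i.e. the low two bits are exactly AccFPR.
bool isFprOnly(unsigned Mask) { return (Mask & 3) == AccFPR; }

int main() {
  std::vector<unsigned> FrameObjects(3, 0);
  FrameObjects[0] |= AccFPR;          // only ever touched by FP/SIMD loads
  FrameObjects[1] |= AccGPR;          // only integer accesses
  FrameObjects[2] |= AccFPR | AccGPR; // mixed: not considered FPR-only

  assert(isFprOnly(FrameObjects[0]));
  assert(!isFprOnly(FrameObjects[1]));
  assert(!isFprOnly(FrameObjects[2]));
  return 0;
}
```

An object with mixed accesses keeps its GPR bit set, which is why the header comment notes that objects with both GPR and FPR accesses are not handled well: they end up on the GPR side of the padding.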
@@ -3595,6 +3710,12 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
     CSStackSize += 8;
   }
 
+  // Determine if a Hazard slot should be used, and increase the CSStackSize by
+  // StackHazardSize if so.
+  determineStackHazardSlot(MF, SavedRegs);
+  if (AFI->hasStackHazardSlotIndex())
+    CSStackSize += StackHazardSize;
+
   // Save number of saved regs, so we can easily update CSStackSize later.
   unsigned NumSavedRegs = SavedRegs.count();
@@ -3761,10 +3882,28 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
     CSI.insert(CSI.end(), VGSaves.begin(), VGSaves.end());
   }
 
+  Register LastReg = 0;
+  int HazardSlotIndex = std::numeric_limits<int>::max();
   for (auto &CS : CSI) {
     Register Reg = CS.getReg();
     const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
 
+    // Create a hazard slot as we switch between GPR and FPR CSRs.
+    if (AFI->hasStackHazardSlotIndex() &&
+        (!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) &&
+        AArch64InstrInfo::isFpOrNEON(Reg)) {
+      assert(HazardSlotIndex == std::numeric_limits<int>::max() &&
+             "Unexpected register order for hazard slot");
+      HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true);
+      LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
+                        << "\n");
+      AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
+      if ((unsigned)HazardSlotIndex < MinCSFrameIndex)
+        MinCSFrameIndex = HazardSlotIndex;
+      if ((unsigned)HazardSlotIndex > MaxCSFrameIndex)
+        MaxCSFrameIndex = HazardSlotIndex;
+    }
+
     unsigned Size = RegInfo->getSpillSize(*RC);
     Align Alignment(RegInfo->getSpillAlign(*RC));
     int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
@@ -3785,7 +3924,22 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
       if ((unsigned)FrameIdx > MaxCSFrameIndex)
         MaxCSFrameIndex = FrameIdx;
     }
+    LastReg = Reg;
+  }
+
+  // Add hazard slot in the case where no FPR CSRs are present.
+  if (AFI->hasStackHazardSlotIndex() &&
+      HazardSlotIndex == std::numeric_limits<int>::max()) {
+    HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true);
+    LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
+                      << "\n");
+    AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
+    if ((unsigned)HazardSlotIndex < MinCSFrameIndex)
+      MinCSFrameIndex = HazardSlotIndex;
+    if ((unsigned)HazardSlotIndex > MaxCSFrameIndex)
+      MaxCSFrameIndex = HazardSlotIndex;
   }
+
   return true;
 }
@@ -3798,6 +3952,10 @@ bool AArch64FrameLowering::enableStackSlotScavenging(
   // function doesn't use a FP.
   if (AFI->hasStreamingModeChanges() && !hasFP(MF))
     return false;
+  // Don't allow register scavenging with hazard slots, in case it moves
+  // objects into the wrong place.
+  if (AFI->hasStackHazardSlotIndex())
+    return false;
   return AFI->hasCalleeSaveStackFreeSpace();
 }
@@ -4492,6 +4650,10 @@ struct FrameObject {
   // This object's group (which always contains the object with
   // ObjectFirst==true) should be placed first.
   bool GroupFirst = false;
+
+  // Used to distinguish between FP and GPR accesses.
+  // 1 = GPR, 2 = FPR, 8 = Hazard Object.
+  unsigned Accesses = 0;
 };
 
 class GroupBuilder {
4527
4689
// at the end. This also allows us to stop walking when we hit the
4528
4690
// first invalid item after it's all sorted.
4529
4691
//
4530
- // The "first" object goes first (closest to SP), followed by the members of
4531
- // the "first" group.
4692
+ // If we want to include a stack hazard region, order FPR accesses < the
4693
+ // hazard object < GPRs accesses in order to create a separation between the
4694
+ // two. For the Accesses field 1 = GPR, 2 = FPR, 8 = Hazard Object.
4695
+ //
4696
+ // Otherwise the "first" object goes first (closest to SP), followed by the
4697
+ // members of the "first" group.
4532
4698
//
4533
4699
// The rest are sorted by the group index to keep the groups together.
4534
4700
// Higher numbered groups are more likely to be around longer (i.e. untagged
@@ -4537,9 +4703,15 @@ bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
4537
4703
//
4538
4704
// If all else equal, sort by the object index to keep the objects in the
4539
4705
// original order.
4540
- return std::make_tuple (!A.IsValid , A.ObjectFirst , A.GroupFirst , A.GroupIndex ,
4706
+ if (A.IsValid != B.IsValid )
4707
+ return A.IsValid ;
4708
+ if (A.Accesses == 2 && B.Accesses != 2 )
4709
+ return true ;
4710
+ if (A.Accesses == 8 && B.Accesses != 2 )
4711
+ return true ;
4712
+ return std::make_tuple (A.ObjectFirst , A.GroupFirst , A.GroupIndex ,
4541
4713
A.ObjectIndex ) <
4542
- std::make_tuple (!B. IsValid , B.ObjectFirst , B.GroupFirst , B.GroupIndex ,
4714
+ std::make_tuple (B.ObjectFirst , B.GroupFirst , B.GroupIndex ,
4543
4715
B.ObjectIndex );
4544
4716
}
4545
4717
} // namespace
@@ -4549,6 +4721,7 @@ void AArch64FrameLowering::orderFrameObjects(
4549
4721
if (!OrderFrameObjects || ObjectsToAllocate.empty ())
4550
4722
return ;
4551
4723
4724
+ const AArch64FunctionInfo &AFI = *MF.getInfo <AArch64FunctionInfo>();
4552
4725
const MachineFrameInfo &MFI = MF.getFrameInfo ();
4553
4726
std::vector<FrameObject> FrameObjects (MFI.getObjectIndexEnd ());
4554
4727
for (auto &Obj : ObjectsToAllocate) {
@@ -4595,16 +4768,28 @@ void AArch64FrameLowering::orderFrameObjects(
4595
4768
GB.AddMember (TaggedFI);
4596
4769
else
4597
4770
GB.EndCurrentGroup ();
4771
+
4772
+ if (AFI.hasStackHazardSlotIndex ()) {
4773
+ std::optional<int > FI = getLdStFrameID (MI, MFI);
4774
+ if (FI && *FI >= 0 && *FI < (int )FrameObjects.size ()) {
4775
+ if (MFI.getStackID (*FI) == 2 || AArch64InstrInfo::isFpOrNEON (MI))
4776
+ FrameObjects[*FI].Accesses |= 2 ;
4777
+ else
4778
+ FrameObjects[*FI].Accesses |= 1 ;
4779
+ }
4780
+ }
4598
4781
}
4599
4782
// Groups should never span multiple basic blocks.
4600
4783
GB.EndCurrentGroup ();
4601
4784
}
4602
4785
4786
+ if (AFI.hasStackHazardSlotIndex ())
4787
+ FrameObjects[AFI.getStackHazardSlotIndex ()].Accesses = 8 ;
4788
+
4603
4789
// If the function's tagged base pointer is pinned to a stack slot, we want to
4604
4790
// put that slot first when possible. This will likely place it at SP + 0,
4605
4791
// and save one instruction when generating the base pointer because IRG does
4606
4792
// not allow an immediate offset.
4607
- const AArch64FunctionInfo &AFI = *MF.getInfo <AArch64FunctionInfo>();
4608
4793
std::optional<int > TBPI = AFI.getTaggedBasePointerIndex ();
4609
4794
if (TBPI) {
4610
4795
FrameObjects[*TBPI].ObjectFirst = true ;
0 commit comments