[AArch64] Add streaming-mode stack hazards. #98956

Merged Jul 18, 2024 (3 commits)
210 changes: 199 additions & 11 deletions llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -52,6 +52,8 @@
// | async context if needed |
// | (a.k.a. "frame record") |
// |-----------------------------------| <- fp(=x29)
// | <hazard padding> |
// |-----------------------------------|
// | |
// | callee-saved fp/simd/SVE regs |
// | |
@@ -64,9 +66,11 @@
// |.aligned.in.case.it.needs.more.than| (size of this area is unknown at
// |.the.standard.16-byte.alignment....| compile time; if present)
// |-----------------------------------|
// | |
// | local variables of fixed size |
// | including spill slots |
// | <FPR> |
// | <hazard padding> |
// | <GPR> |
// |-----------------------------------| <- bp(not defined by ABI,
// |.variable-sized.local.variables....| LLVM chooses X19)
// |.(VLAs)............................| (size of this area is unknown at
@@ -117,6 +121,20 @@
//
// FIXME: also explain the redzone concept.
//
// About stack hazards: Under some SME contexts, a coprocessor with its own
// separate cache can be used for FP operations. This can create hazards if
// the CPU and the SME unit try to access the same area of memory, including
// if the access is to an area of the stack. To try to alleviate this we
// attempt to introduce extra padding into the stack frame between FP and GPR
// accesses, controlled by the StackHazardSize option. Without changing the
// layout of the stack frame in the diagram above, a stack object of size
// StackHazardSize is added between GPR and FPR CSRs. Another is added to the
// stack objects section, and stack objects are sorted so that FPR > Hazard
// padding slot > GPRs (where possible). Unfortunately some things are not
// handled well (VLA area, arguments on the stack, objects with both GPR and
// FPR accesses), but if those are controlled by the user then the entire
// stack frame becomes GPR at the start/end with FPR in the middle, surrounded
// by Hazard padding.
//
// An example of the prologue:
//
// .globl __foo
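
A hedged illustration of when this padding applies (not part of the patch): a streaming function with both GPR and FPR stack traffic should receive the padding once the hidden flag added below is set. The flag name is taken from the cl::opt in this diff; the -mllvm plumbing, target flags, and __arm_streaming attribute are the usual clang conventions, assumed here.

// hazard.cpp -- assumed invocation:
//   clang --target=aarch64-linux-gnu -march=armv9-a+sme -O2 \
//         -mllvm -aarch64-stack-hazard-size=1024 -S hazard.cpp
// The frame should then gain 1024 bytes of padding separating the GPR- and
// FPR-accessed regions described above.
extern "C" float hazard_demo(float X) __arm_streaming {
  volatile int GprSlot = 42;  // forces a GPR-accessed stack object
  volatile float FprSlot = X; // forces an FPR-accessed stack object
  return FprSlot + static_cast<float>(GprSlot);
}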
@@ -196,6 +214,7 @@
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -253,6 +272,14 @@ cl::opt<bool> EnableHomogeneousPrologEpilog(
cl::desc("Emit homogeneous prologue and epilogue for the size "
"optimization (default = off)"));

// Stack hazard padding size. 0 = disabled.
static cl::opt<unsigned> StackHazardSize("aarch64-stack-hazard-size",
cl::init(0), cl::Hidden);
// Whether to insert padding into non-streaming functions (for testing).
static cl::opt<bool>
StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming",
cl::init(false), cl::Hidden);

STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");

/// Returns how much of the incoming argument stack area (in bytes) we should
@@ -1461,6 +1488,10 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
// update in so create a normal arithmetic instruction instead.
if (MBBI->getOperand(MBBI->getNumOperands() - 1).getImm() != 0 ||
CSStackSizeInc < MinOffset || CSStackSizeInc > MaxOffset) {
// If we are destroying the frame, make sure we add the increment after the
// last frame operation.
if (FrameFlag == MachineInstr::FrameDestroy)
++MBBI;
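// For example (assumed epilogue shape), the final SP increment should
// follow the last callee-save reload:
//   ldp x29, x30, [sp, #16]  // last FrameDestroy load
//   add sp, sp, #96          // increment emitted after it
// rather than landing before that reload.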
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
StackOffset::getFixed(CSStackSizeInc), TII, FrameFlag,
false, false, nullptr, EmitCFI,
@@ -2901,6 +2932,7 @@ static void computeCalleeSaveRegisterPairs(
}
int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
bool NeedGapToAlignStack = AFI->hasCalleeSaveStackFreeSpace();
Register LastReg = 0;

// When iterating backwards, the loop condition relies on unsigned wraparound.
for (unsigned i = FirstReg; i < Count; i += RegInc) {
@@ -2922,8 +2954,15 @@
else
llvm_unreachable("Unsupported register class.");

// Add the stack hazard size as we transition from GPR->FPR CSRs.
if (AFI->hasStackHazardSlotIndex() &&
(!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) &&
AArch64InstrInfo::isFpOrNEON(RPI.Reg1))
ByteOffset += StackFillDir * StackHazardSize;
LastReg = RPI.Reg1;

// Add the next reg to the pair if it is in the same register class.
if (unsigned(i + RegInc) < Count) {
if (unsigned(i + RegInc) < Count && !AFI->hasStackHazardSlotIndex()) {
Register NextReg = CSI[i + RegInc].getReg();
bool IsFirst = i == FirstReg;
switch (RPI.Type) {
@@ -3034,7 +3073,8 @@
Offset += 8;
RPI.Offset = Offset / Scale;

assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
assert((!RPI.isPaired() ||
(!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
(RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
"Offset out of bounds for LDP/STP immediate");

@@ -3455,6 +3495,80 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
return true;
}

// Return the FrameID for a Load/Store instruction by looking at the MMO.
static std::optional<int> getLdStFrameID(const MachineInstr &MI,
const MachineFrameInfo &MFI) {
if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1)
return std::nullopt;

MachineMemOperand *MMO = *MI.memoperands_begin();
auto *PSV =
dyn_cast_or_null<FixedStackPseudoSourceValue>(MMO->getPseudoValue());
if (PSV)
return std::optional<int>(PSV->getFrameIndex());

if (MMO->getValue()) {
if (auto *Al = dyn_cast<AllocaInst>(getUnderlyingObject(MMO->getValue()))) {
for (int FI = MFI.getObjectIndexBegin(); FI < MFI.getObjectIndexEnd();
FI++)
if (MFI.getObjectAllocation(FI) == Al)
return FI;
}
}

return std::nullopt;
}

// Check if a Hazard slot is needed for the current function, and if so create
// one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex,
// which can be used to determine if any hazard padding is needed.
void AArch64FrameLowering::determineStackHazardSlot(
MachineFunction &MF, BitVector &SavedRegs) const {
if (StackHazardSize == 0 || StackHazardSize % 16 != 0 ||
MF.getInfo<AArch64FunctionInfo>()->hasStackHazardSlotIndex())
return;

// Stack hazards are only needed in streaming functions.
SMEAttrs Attrs(MF.getFunction());
if (!StackHazardInNonStreaming && Attrs.hasNonStreamingInterfaceAndBody())
return;

MachineFrameInfo &MFI = MF.getFrameInfo();

// Add a hazard slot if there are any CSR FPR registers, or there are any
// FP-only stack objects.
bool HasFPRCSRs = any_of(SavedRegs.set_bits(), [](unsigned Reg) {
return AArch64::FPR64RegClass.contains(Reg) ||
AArch64::FPR128RegClass.contains(Reg) ||
AArch64::ZPRRegClass.contains(Reg) ||
AArch64::PPRRegClass.contains(Reg);
});
bool HasFPRStackObjects = false;
if (!HasFPRCSRs) {
std::vector<unsigned> FrameObjects(MFI.getObjectIndexEnd());
for (auto &MBB : MF) {
for (auto &MI : MBB) {
std::optional<int> FI = getLdStFrameID(MI, MFI);
if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
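// Note: stack ID 2 is TargetStackID::ScalableVector, so SVE stack
// objects count as FPR-accessed even without an FP/NEON instruction.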
if (MFI.getStackID(*FI) == 2 || AArch64InstrInfo::isFpOrNEON(MI))
FrameObjects[*FI] |= 2;
else
FrameObjects[*FI] |= 1;
}
}
}
HasFPRStackObjects =
any_of(FrameObjects, [](unsigned B) { return (B & 3) == 2; });
}

if (HasFPRCSRs || HasFPRStackObjects) {
int ID = MFI.CreateStackObject(StackHazardSize, Align(16), false);
LLVM_DEBUG(dbgs() << "Created Hazard slot at " << ID << " size "
<< StackHazardSize << "\n");
MF.getInfo<AArch64FunctionInfo>()->setStackHazardSlotIndex(ID);
}
}
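
To make the FPR-only test above concrete, here is a standalone sketch (hypothetical, not LLVM code) of the access-bit bookkeeping: bit 1 records a GPR access, bit 2 an FPR access, and only an FPR-only slot ((bits & 3) == 2) forces a hazard slot.

#include <iostream>
#include <vector>

int main() {
  // One entry per frame index; 1 = GPR access seen, 2 = FPR access seen.
  std::vector<unsigned> FrameObjects(3, 0);
  FrameObjects[0] |= 1;     // FI#0: GPR-only
  FrameObjects[1] |= 2;     // FI#1: FPR-only
  FrameObjects[2] |= 1 | 2; // FI#2: mixed GPR and FPR accesses
  for (unsigned FI = 0; FI < FrameObjects.size(); ++FI)
    std::cout << "FI#" << FI << " triggers hazard slot: "
              << ((FrameObjects[FI] & 3) == 2) << "\n";
  // Only FI#1 prints 1, matching the HasFPRStackObjects check above.
}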

void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
@@ -3595,6 +3709,12 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
CSStackSize += 8;
}

// Determine if a Hazard slot should be used, and increase the CSStackSize by
// StackHazardSize if so.
determineStackHazardSlot(MF, SavedRegs);
if (AFI->hasStackHazardSlotIndex())
CSStackSize += StackHazardSize;

// Save number of saved regs, so we can easily update CSStackSize later.
unsigned NumSavedRegs = SavedRegs.count();

@@ -3761,10 +3881,28 @@ bool AArch64FrameLowering::assignCalleeSavedSpillSlots(
CSI.insert(CSI.end(), VGSaves.begin(), VGSaves.end());
}

Register LastReg = 0;
int HazardSlotIndex = std::numeric_limits<int>::max();
for (auto &CS : CSI) {
Register Reg = CS.getReg();
const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);

// Create a hazard slot as we switch between GPR and FPR CSRs.
if (AFI->hasStackHazardSlotIndex() &&
(!LastReg || !AArch64InstrInfo::isFpOrNEON(LastReg)) &&
AArch64InstrInfo::isFpOrNEON(Reg)) {
assert(HazardSlotIndex == std::numeric_limits<int>::max() &&
"Unexpected register order for hazard slot");
HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true);
LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
<< "\n");
AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
if ((unsigned)HazardSlotIndex < MinCSFrameIndex)
MinCSFrameIndex = HazardSlotIndex;
if ((unsigned)HazardSlotIndex > MaxCSFrameIndex)
MaxCSFrameIndex = HazardSlotIndex;
}

unsigned Size = RegInfo->getSpillSize(*RC);
Align Alignment(RegInfo->getSpillAlign(*RC));
int FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
@@ -3785,7 +3923,22 @@
if ((unsigned)FrameIdx > MaxCSFrameIndex)
MaxCSFrameIndex = FrameIdx;
}
LastReg = Reg;
}

// Add hazard slot in the case where no FPR CSRs are present.
if (AFI->hasStackHazardSlotIndex() &&
HazardSlotIndex == std::numeric_limits<int>::max()) {
HazardSlotIndex = MFI.CreateStackObject(StackHazardSize, Align(8), true);
LLVM_DEBUG(dbgs() << "Created CSR Hazard at slot " << HazardSlotIndex
<< "\n");
AFI->setStackHazardCSRSlotIndex(HazardSlotIndex);
if ((unsigned)HazardSlotIndex < MinCSFrameIndex)
MinCSFrameIndex = HazardSlotIndex;
if ((unsigned)HazardSlotIndex > MaxCSFrameIndex)
MaxCSFrameIndex = HazardSlotIndex;
}

return true;
}
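
A standalone sketch (hypothetical register names, toy classification) of where the loop above lands the CSR hazard slot -- at the first GPR-to-FPR transition, exactly once:

#include <iostream>
#include <string>
#include <vector>

// Toy stand-in for AArch64InstrInfo::isFpOrNEON, keyed off register names.
static bool isFpr(const std::string &Reg) {
  return !Reg.empty() && (Reg[0] == 'd' || Reg[0] == 'q' || Reg[0] == 'z');
}

int main() {
  std::vector<std::string> CSRs = {"x29", "x30", "x19", "d8", "d9"};
  std::string LastReg;
  for (const std::string &Reg : CSRs) {
    if ((LastReg.empty() || !isFpr(LastReg)) && isFpr(Reg))
      std::cout << "<hazard padding slot>\n"; // created exactly once
    std::cout << Reg << "\n";
    LastReg = Reg;
  }
  // Prints: x29, x30, x19, <hazard padding slot>, d8, d9.
}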

Expand All @@ -3798,6 +3951,10 @@ bool AArch64FrameLowering::enableStackSlotScavenging(
// function doesn't use a FP.
if (AFI->hasStreamingModeChanges() && !hasFP(MF))
return false;
// Don't allow stack slot scavenging with hazard slots, in case it moves
// objects into the wrong place.
if (AFI->hasStackHazardSlotIndex())
return false;
Collaborator:

What's the interaction here? Just that scavenging could introduce hazards?

Collaborator (Author):

Yep - in some cases it was moving the FPR registers into the empty slot next to fp or the scavenged register.
return AFI->hasCalleeSaveStackFreeSpace();
}

Expand Down Expand Up @@ -4492,6 +4649,11 @@ struct FrameObject {
// This object's group (which always contains the object with
// ObjectFirst==true) should be placed first.
bool GroupFirst = false;

// Used to distinguish between FP and GPR accesses. The values are decided so
// that they sort FPR < Hazard < GPR and they can be or'd together.
unsigned Accesses = 0;
enum { AccessFPR = 1, AccessHazard = 2, AccessGPR = 4 };
};

class GroupBuilder {
Expand Down Expand Up @@ -4527,8 +4689,12 @@ bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
// at the end. This also allows us to stop walking when we hit the
// first invalid item after it's all sorted.
//
// The "first" object goes first (closest to SP), followed by the members of
// the "first" group.
// If we want to include a stack hazard region, order FPR accesses < the
// hazard object < GPR accesses in order to create a separation between the
// two. For the Accesses field 1 = FPR, 2 = Hazard Object, 4 = GPR.
//
// Otherwise the "first" object goes first (closest to SP), followed by the
// members of the "first" group.
//
// The rest are sorted by the group index to keep the groups together.
// Higher numbered groups are more likely to be around longer (i.e. untagged
Expand All @@ -4537,10 +4703,10 @@ bool FrameObjectCompare(const FrameObject &A, const FrameObject &B) {
//
// If all else equal, sort by the object index to keep the objects in the
// original order.
return std::make_tuple(!A.IsValid, A.ObjectFirst, A.GroupFirst, A.GroupIndex,
A.ObjectIndex) <
std::make_tuple(!B.IsValid, B.ObjectFirst, B.GroupFirst, B.GroupIndex,
B.ObjectIndex);
return std::make_tuple(!A.IsValid, A.Accesses, A.ObjectFirst, A.GroupFirst,
A.GroupIndex, A.ObjectIndex) <
std::make_tuple(!B.IsValid, B.Accesses, B.ObjectFirst, B.GroupFirst,
B.GroupIndex, B.ObjectIndex);
}
} // namespace
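
To see the ordering the Accesses field produces, a small self-contained sketch (simplified tuple, invented object names):

#include <algorithm>
#include <iostream>
#include <tuple>
#include <vector>

struct Obj {
  bool IsValid;
  unsigned Accesses; // 1 = FPR, 2 = hazard slot, 4 = GPR, as in the enum
  int ObjectIndex;
  const char *Name;
};

int main() {
  std::vector<Obj> Objs = {{true, 4, 0, "gpr spill"},
                           {true, 1, 1, "fpr spill"},
                           {true, 2, 2, "hazard slot"},
                           {true, 4, 3, "mixed access (normalized to GPR)"}};
  std::sort(Objs.begin(), Objs.end(), [](const Obj &A, const Obj &B) {
    return std::make_tuple(!A.IsValid, A.Accesses, A.ObjectIndex) <
           std::make_tuple(!B.IsValid, B.Accesses, B.ObjectIndex);
  });
  for (const Obj &O : Objs)
    std::cout << O.Name << "\n";
  // Output order: fpr spill, hazard slot, gpr spill, mixed -- i.e.
  // FPR < hazard < GPR, keeping GPR objects away from the FPR region.
}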

Expand All @@ -4549,19 +4715,32 @@ void AArch64FrameLowering::orderFrameObjects(
if (!OrderFrameObjects || ObjectsToAllocate.empty())
return;

const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
const MachineFrameInfo &MFI = MF.getFrameInfo();
std::vector<FrameObject> FrameObjects(MFI.getObjectIndexEnd());
for (auto &Obj : ObjectsToAllocate) {
FrameObjects[Obj].IsValid = true;
FrameObjects[Obj].ObjectIndex = Obj;
}

// Identify stack slots that are tagged at the same time.
// Identify FPR vs GPR slots for hazards, and stack slots that are tagged at
// the same time.
GroupBuilder GB(FrameObjects);
for (auto &MBB : MF) {
for (auto &MI : MBB) {
if (MI.isDebugInstr())
continue;

if (AFI.hasStackHazardSlotIndex()) {
std::optional<int> FI = getLdStFrameID(MI, MFI);
if (FI && *FI >= 0 && *FI < (int)FrameObjects.size()) {
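// As in determineStackHazardSlot, stack ID 2 (ScalableVector) counts as
// an FPR access.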
if (MFI.getStackID(*FI) == 2 || AArch64InstrInfo::isFpOrNEON(MI))
FrameObjects[*FI].Accesses |= FrameObject::AccessFPR;
else
FrameObjects[*FI].Accesses |= FrameObject::AccessGPR;
}
}

int OpIndex;
switch (MI.getOpcode()) {
case AArch64::STGloop:
@@ -4600,11 +4779,20 @@
GB.EndCurrentGroup();
}

if (AFI.hasStackHazardSlotIndex()) {
FrameObjects[AFI.getStackHazardSlotIndex()].Accesses =
FrameObject::AccessHazard;
// If a stack object is unknown or both GPR and FPR, sort it into GPR.
for (auto &Obj : FrameObjects)
if (!Obj.Accesses ||
Obj.Accesses == (FrameObject::AccessGPR | FrameObject::AccessFPR))
Obj.Accesses = FrameObject::AccessGPR;
}

// If the function's tagged base pointer is pinned to a stack slot, we want to
// put that slot first when possible. This will likely place it at SP + 0,
// and save one instruction when generating the base pointer because IRG does
// not allow an immediate offset.
const AArch64FunctionInfo &AFI = *MF.getInfo<AArch64FunctionInfo>();
std::optional<int> TBPI = AFI.getTaggedBasePointerIndex();
if (TBPI) {
FrameObjects[*TBPI].ObjectFirst = true;
4 changes: 4 additions & 0 deletions llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -155,6 +155,10 @@ class AArch64FrameLowering : public TargetFrameLowering {
int64_t RealignmentPadding, StackOffset AllocSize,
bool NeedsWinCFI, bool *HasWinCFI, bool EmitCFI,
StackOffset InitialOffset, bool FollowupAllocs) const;
/// Make a determination whether a Hazard slot is used and create it if
/// needed.
void determineStackHazardSlot(MachineFunction &MF,
BitVector &SavedRegs) const;

/// Emit target zero call-used regs.
void emitZeroCallUsedRegs(BitVector RegsToZero,