Skip to content

[X86][CodeGen] Teach frame lowering to spill/reload registers w/ PUSHP/POPP, PUSH2[P]/POP2[P] #73292

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Nov 27, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions llvm/lib/Target/X86/X86.td
Original file line number Diff line number Diff line change
Expand Up @@ -343,6 +343,10 @@ def FeatureAVX10_1_512 : SubtargetFeature<"avx10.1-512", "HasAVX10_1_512", "true
[FeatureAVX10_1, FeatureEVEX512]>;
def FeatureEGPR : SubtargetFeature<"egpr", "HasEGPR", "true",
"Support extended general purpose register">;
def FeaturePush2Pop2 : SubtargetFeature<"push2pop2", "HasPush2Pop2", "true",
"Support PUSH2/POP2 instructions">;
def FeaturePPX : SubtargetFeature<"ppx", "HasPPX", "true",
"Support Push-Pop Acceleration">;

// Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka
// "string operations"). See "REP String Enhancement" in the Intel Software
Expand Down
189 changes: 148 additions & 41 deletions llvm/lib/Target/X86/X86FrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue");
STATISTIC(NumFrameExtraProbe,
"Number of extra stack probes generated in prologue");
// Counts functions that end up with at least one push2/pop2 candidate
// register during callee-saved spill slot assignment.
STATISTIC(NumFunctionUsingPush2Pop2, "Number of functions using push2/pop2");

using namespace llvm;

Expand Down Expand Up @@ -139,6 +140,38 @@ static unsigned getMOVriOpcode(bool Use64BitReg, int64_t Imm) {
return X86::MOV32ri;
}

// Push-Pop Acceleration (PPX) hint is used to indicate that the POP reads the
// value written by the PUSH from the stack. The processor tracks these marked
// instructions internally and fast-forwards register data between matching PUSH
// and POP instructions, without going through memory or through the training
// loop of the Fast Store Forwarding Predictor (FSFP). Instead, a more efficient
// memory-renaming optimization can be used.
//
// The PPX hint is purely a performance hint. Instructions with this hint have
// the same functional semantics as those without. PPX hints set by the
// compiler that violate the balancing rule may turn off the PPX optimization,
// but they will not affect program semantics.
//
// Hence, PPX is used for balanced spill/reloads (Exceptions and setjmp/longjmp
// are not considered).
//
// PUSH2 and POP2 are instructions for (respectively) pushing/popping 2
// GPRs at a time to/from the stack.
static unsigned getPUSHOpcode(const X86Subtarget &ST) {
  // Single-register push: 32-bit subtargets always use PUSH32r; 64-bit
  // subtargets use the PPX-hinted PUSHP64r when PPX is available.
  if (!ST.is64Bit())
    return X86::PUSH32r;
  if (ST.hasPPX())
    return X86::PUSHP64r;
  return X86::PUSH64r;
}
static unsigned getPOPOpcode(const X86Subtarget &ST) {
  // Single-register pop: 32-bit subtargets always use POP32r; 64-bit
  // subtargets use the PPX-hinted POPP64r when PPX is available.
  if (!ST.is64Bit())
    return X86::POP32r;
  if (ST.hasPPX())
    return X86::POPP64r;
  return X86::POP64r;
}
static unsigned getPUSH2Opcode(const X86Subtarget &ST) {
  // Two-register push: pick the PPX-hinted form when the subtarget has PPX.
  if (ST.hasPPX())
    return X86::PUSH2P;
  return X86::PUSH2;
}
static unsigned getPOP2Opcode(const X86Subtarget &ST) {
  // Two-register pop: pick the PPX-hinted form when the subtarget has PPX.
  if (ST.hasPPX())
    return X86::POP2P;
  return X86::POP2;
}

static bool isEAXLiveIn(MachineBasicBlock &MBB) {
for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) {
unsigned Reg = RegMask.PhysReg;
Expand Down Expand Up @@ -1679,7 +1712,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
NumBytes = alignTo(NumBytes, MaxAlign);

// Save EBP/RBP into the appropriate stack slot.
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
BuildMI(MBB, MBBI, DL,
TII.get(getPUSHOpcode(MF.getSubtarget<X86Subtarget>())))
.addReg(MachineFramePtr, RegState::Kill)
.setMIFlag(MachineInstr::FrameSetup);

Expand Down Expand Up @@ -1818,18 +1852,30 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// Skip the callee-saved push instructions.
bool PushedRegs = false;
int StackOffset = 2 * stackGrowth;
MachineBasicBlock::const_iterator LastCSPush = MBBI;
auto IsCSPush = [&](const MachineBasicBlock::iterator &MBBI) {
  // Only a FrameSetup-flagged instruction inside the block can qualify.
  const bool IsFrameSetupInst =
      MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup);
  if (!IsFrameSetupInst)
    return false;
  // Accept every push form: plain, PPX-hinted, and the two-register
  // variants.
  switch (MBBI->getOpcode()) {
  case X86::PUSH32r:
  case X86::PUSH64r:
  case X86::PUSHP64r:
  case X86::PUSH2:
  case X86::PUSH2P:
    return true;
  default:
    return false;
  }
};

while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup) &&
(MBBI->getOpcode() == X86::PUSH32r ||
MBBI->getOpcode() == X86::PUSH64r)) {
while (IsCSPush(MBBI)) {
PushedRegs = true;
Register Reg = MBBI->getOperand(0).getReg();
LastCSPush = MBBI;
++MBBI;
unsigned Opc = LastCSPush->getOpcode();

if (!HasFP && NeedsDwarfCFI) {
// Mark callee-saved push instruction.
// Define the current CFA rule to use the provided offset.
assert(StackSize);
// Compared to push, push2 introduces more stack offset (one more
// register).
if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)
StackOffset += stackGrowth;
BuildCFI(MBB, MBBI, DL,
MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset),
MachineInstr::FrameSetup);
Expand All @@ -1841,6 +1887,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
.addImm(Reg)
.setMIFlag(MachineInstr::FrameSetup);
if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
.addImm(LastCSPush->getOperand(1).getReg())
.setMIFlag(MachineInstr::FrameSetup);
}
}

Expand Down Expand Up @@ -2317,7 +2367,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true);
}
// Pop EBP.
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
BuildMI(MBB, MBBI, DL,
TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())),
MachineFramePtr)
.setMIFlag(MachineInstr::FrameDestroy);

Expand Down Expand Up @@ -2357,10 +2408,10 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
unsigned Opc = PI->getOpcode();

if (Opc != X86::DBG_VALUE && !PI->isTerminator()) {
if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
(Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
(Opc != X86::BTR64ri8 || !PI->getFlag(MachineInstr::FrameDestroy)) &&
(Opc != X86::ADD64ri32 || !PI->getFlag(MachineInstr::FrameDestroy)))
if (!PI->getFlag(MachineInstr::FrameDestroy) ||
(Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::BTR64ri8 &&
Opc != X86::ADD64ri32 && Opc != X86::POPP64r && Opc != X86::POP2 &&
Opc != X86::POP2P && Opc != X86::LEA64r))
break;
FirstCSPop = PI;
}
Expand Down Expand Up @@ -2451,8 +2502,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock::iterator PI = MBBI;
unsigned Opc = PI->getOpcode();
++MBBI;
if (Opc == X86::POP32r || Opc == X86::POP64r) {
if (Opc == X86::POP32r || Opc == X86::POP64r || Opc == X86::POPP64r ||
Opc == X86::POP2 || Opc == X86::POP2P) {
Offset += SlotSize;
// Compared to pop, pop2 introduces more stack offset (one more
// register).
if (Opc == X86::POP2 || Opc == X86::POP2P)
Offset += SlotSize;
BuildCFI(MBB, MBBI, DL,
MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset),
MachineInstr::FrameDestroy);
Expand Down Expand Up @@ -2735,13 +2791,44 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
}
}

// Strategy:
// 1. Use push2 when
//    a) the number of CSRs > 1 and no padding is needed, or
//    b) the number of CSRs > 2 and padding is needed.
// 2. When the number of CSR pushes is odd:
//    a) start using push2 from the 1st push if the stack is 16B aligned;
//    b) start using push2 from the 2nd push if the stack is not 16B aligned.
// 3. When the number of CSR pushes is even, start using push2 from the 1st
//    push and make the stack 16B aligned before that push.
unsigned NumRegsForPush2 = 0;
if (STI.hasPush2Pop2()) {
unsigned NumCSGPR = llvm::count_if(CSI, [](const CalleeSavedInfo &I) {
return X86::GR64RegClass.contains(I.getReg());
});
bool NeedPadding = (SpillSlotOffset % 16 != 0) && (NumCSGPR % 2 == 0);
bool UsePush2Pop2 = NeedPadding ? NumCSGPR > 2 : NumCSGPR > 1;
X86FI->setPadForPush2Pop2(NeedPadding && UsePush2Pop2);
NumRegsForPush2 = UsePush2Pop2 ? alignDown(NumCSGPR, 2) : 0;
if (X86FI->padForPush2Pop2()) {
SpillSlotOffset -= SlotSize;
MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
}
}

// Assign slots for GPRs. It increases frame size.
for (CalleeSavedInfo &I : llvm::reverse(CSI)) {
Register Reg = I.getReg();

if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
continue;

// A CSR is a candidate for push2/pop2 when its slot offset is 16B aligned,
// or when the candidates currently hold an odd number of registers.
if (X86FI->getNumCandidatesForPush2Pop2() < NumRegsForPush2 &&
(SpillSlotOffset % 16 == 0 ||
X86FI->getNumCandidatesForPush2Pop2() % 2))
X86FI->addCandidateForPush2Pop2(Reg);

SpillSlotOffset -= SlotSize;
CalleeSavedFrameSize += SlotSize;

Expand All @@ -2759,6 +2846,10 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
// TODO: saving the slot index is better?
X86FI->setRestoreBasePointer(CalleeSavedFrameSize);
}
assert(X86FI->getNumCandidatesForPush2Pop2() % 2 == 0 &&
"Expect even candidates for push2/pop2");
if (X86FI->getNumCandidatesForPush2Pop2())
++NumFunctionUsingPush2Pop2;
X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);
MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize);

Expand Down Expand Up @@ -2808,41 +2899,50 @@ bool X86FrameLowering::spillCalleeSavedRegisters(

// Push GPRs. It increases frame size.
const MachineFunction &MF = *MBB.getParent();
unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
for (const CalleeSavedInfo &I : llvm::reverse(CSI)) {
Register Reg = I.getReg();

if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
continue;
const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
if (X86FI->padForPush2Pop2())
emitSPUpdate(MBB, MI, DL, -(int64_t)SlotSize, /*InEpilogue=*/false);

// Update LiveIn of the basic block and decide whether we can add a kill flag
// to the use.
auto UpdateLiveInCheckCanKill = [&](Register Reg) {
const MachineRegisterInfo &MRI = MF.getRegInfo();
bool isLiveIn = MRI.isLiveIn(Reg);
if (!isLiveIn)
MBB.addLiveIn(Reg);

// Decide whether we can add a kill flag to the use.
bool CanKill = !isLiveIn;
// Check if any subregister is live-in
if (CanKill) {
for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg) {
if (MRI.isLiveIn(*AReg)) {
CanKill = false;
break;
}
}
}

// Do not set a kill flag on values that are also marked as live-in. This
// happens with the @llvm-returnaddress intrinsic and with arguments
// passed in callee saved registers.
// Omitting the kill flags is conservatively correct even if the live-in
// is not used after all.
BuildMI(MBB, MI, DL, TII.get(Opc))
.addReg(Reg, getKillRegState(CanKill))
.setMIFlag(MachineInstr::FrameSetup);
if (MRI.isLiveIn(Reg))
return false;
MBB.addLiveIn(Reg);
// Check if any subregister is live-in
for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg)
if (MRI.isLiveIn(*AReg))
return false;
return true;
};
auto UpdateLiveInGetKillRegState = [&](Register Reg) {
  // Update the block live-ins for Reg first, then translate the resulting
  // "can kill" decision into the operand kill flag.
  const bool CanKill = UpdateLiveInCheckCanKill(Reg);
  return getKillRegState(CanKill);
};

for (auto RI = CSI.rbegin(), RE = CSI.rend(); RI != RE; ++RI) {
Register Reg = RI->getReg();
if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
continue;

if (X86FI->isCandidateForPush2Pop2(Reg)) {
Register Reg2 = (++RI)->getReg();
BuildMI(MBB, MI, DL, TII.get(getPUSH2Opcode(STI)))
.addReg(Reg, UpdateLiveInGetKillRegState(Reg))
.addReg(Reg2, UpdateLiveInGetKillRegState(Reg2))
.setMIFlag(MachineInstr::FrameSetup);
} else {
BuildMI(MBB, MI, DL, TII.get(getPUSHOpcode(STI)))
.addReg(Reg, UpdateLiveInGetKillRegState(Reg))
.setMIFlag(MachineInstr::FrameSetup);
}
}

const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
if (X86FI->getRestoreBasePointer()) {
unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
Register BaseReg = this->TRI->getBaseRegister();
Expand Down Expand Up @@ -2958,15 +3058,22 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(
}

// POP GPRs.
unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
for (const CalleeSavedInfo &I : CSI) {
Register Reg = I.getReg();
for (auto I = CSI.begin(), E = CSI.end(); I != E; ++I) {
Register Reg = I->getReg();
if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
continue;

BuildMI(MBB, MI, DL, TII.get(Opc), Reg)
.setMIFlag(MachineInstr::FrameDestroy);
if (X86FI->isCandidateForPush2Pop2(Reg))
BuildMI(MBB, MI, DL, TII.get(getPOP2Opcode(STI)), Reg)
.addReg((++I)->getReg(), RegState::Define)
.setMIFlag(MachineInstr::FrameDestroy);
else
BuildMI(MBB, MI, DL, TII.get(getPOPOpcode(STI)), Reg)
.setMIFlag(MachineInstr::FrameDestroy);
}
if (X86FI->padForPush2Pop2())
emitSPUpdate(MBB, MI, DL, SlotSize, /*InEpilogue=*/true);

return true;
}

Expand Down
24 changes: 23 additions & 1 deletion llvm/lib/Target/X86/X86MachineFunctionInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
#include <set>

namespace llvm {

Expand Down Expand Up @@ -117,6 +118,12 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// determine if we should insert tilerelease in frame lowering.
bool HasVirtualTileReg = false;

/// Adjust stack for push2/pop2
bool PadForPush2Pop2 = false;

/// Candidate registers for push2/pop2
std::set<Register> CandidatesForPush2Pop2;

/// True if this function has CFI directives that adjust the CFA.
/// This is used to determine if we should direct the debugger to use
/// the CFA instead of the stack pointer.
Expand Down Expand Up @@ -165,7 +172,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// Returns a read-only reference to the WinEHXMMSlotInfo map.
const DenseMap<int, unsigned>& getWinEHXMMSlotInfo() const {
return WinEHXMMSlotInfo; }

unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
/// Returns the callee-saved frame size, plus 8 extra bytes when
/// padForPush2Pop2() is true (the bool multiplies as 0 or 1).
unsigned getCalleeSavedFrameSize() const {
return CalleeSavedFrameSize + 8 * padForPush2Pop2();
}
/// Stores \p bytes as the callee-saved frame size.
void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }

/// Returns the stored BytesToPopOnReturn value.
unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
Expand Down Expand Up @@ -232,6 +241,19 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// Returns the HasVirtualTileReg flag.
bool hasVirtualTileReg() const { return HasVirtualTileReg; }
/// Sets the HasVirtualTileReg flag to \p v.
void setHasVirtualTileReg(bool v) { HasVirtualTileReg = v; }

/// Returns true if the stack is to be padded for push2/pop2.
bool padForPush2Pop2() const { return PadForPush2Pop2; }
/// Records whether the stack is to be padded for push2/pop2.
void setPadForPush2Pop2(bool V) { PadForPush2Pop2 = V; }

/// Returns true if \p Reg has been recorded as a push2/pop2 candidate.
bool isCandidateForPush2Pop2(Register Reg) const {
  return CandidatesForPush2Pop2.count(Reg) != 0;
}
/// Records \p Reg as a push2/pop2 candidate. The candidates live in a set,
/// so inserting an already-present register leaves the count unchanged.
void addCandidateForPush2Pop2(Register Reg) {
CandidatesForPush2Pop2.insert(Reg);
}
/// Returns how many registers are currently recorded as push2/pop2
/// candidates.
size_t getNumCandidatesForPush2Pop2() const {
return CandidatesForPush2Pop2.size();
}

/// Returns the HasCFIAdjustCfa flag.
bool hasCFIAdjustCfa() const { return HasCFIAdjustCfa; }
/// Sets the HasCFIAdjustCfa flag to \p v.
void setHasCFIAdjustCfa(bool v) { HasCFIAdjustCfa = v; }

Expand Down
Loading