Skip to content

Commit cb112eb

Browse files
authored
[X86][CodeGen] Teach frame lowering to spill/reload registers w/ PUSHP/POPP, PUSH2[P]/POP2[P] (#73292)
#73092 supported the encoding/decoding for PUSHP/POPP, and #73233 supported the encoding/decoding for PUSH2[P]/POP2[P]. In this patch, we teach frame lowering to spill/reload registers w/ these instructions: 1. Use PPX for balanced spill/reload. 2. Use PUSH2/POP2 for consecutive spills/reloads. 3. PUSH2/POP2 must be 16B-aligned on the stack, so pad when necessary.
1 parent d7c03a1 commit cb112eb

File tree

7 files changed

+912
-42
lines changed

7 files changed

+912
-42
lines changed

llvm/lib/Target/X86/X86.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,10 @@ def FeatureAVX10_1_512 : SubtargetFeature<"avx10.1-512", "HasAVX10_1_512", "true
343343
[FeatureAVX10_1, FeatureEVEX512]>;
344344
def FeatureEGPR : SubtargetFeature<"egpr", "HasEGPR", "true",
345345
"Support extended general purpose register">;
346+
def FeaturePush2Pop2 : SubtargetFeature<"push2pop2", "HasPush2Pop2", "true",
347+
"Support PUSH2/POP2 instructions">;
348+
def FeaturePPX : SubtargetFeature<"ppx", "HasPPX", "true",
349+
"Support Push-Pop Acceleration">;
346350

347351
// Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka
348352
// "string operations"). See "REP String Enhancement" in the Intel Software

llvm/lib/Target/X86/X86FrameLowering.cpp

Lines changed: 148 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue");
4242
STATISTIC(NumFrameExtraProbe,
4343
"Number of extra stack probes generated in prologue");
44+
STATISTIC(NumFunctionUsingPush2Pop2, "Number of functions using push2/pop2");
4445

4546
using namespace llvm;
4647

@@ -139,6 +140,38 @@ static unsigned getMOVriOpcode(bool Use64BitReg, int64_t Imm) {
139140
return X86::MOV32ri;
140141
}
141142

143+
// Push-Pop Acceleration (PPX) hint is used to indicate that the POP reads the
144+
// value written by the PUSH from the stack. The processor tracks these marked
145+
// instructions internally and fast-forwards register data between matching PUSH
146+
// and POP instructions, without going through memory or through the training
147+
// loop of the Fast Store Forwarding Predictor (FSFP). Instead, a more efficient
148+
// memory-renaming optimization can be used.
149+
//
150+
// The PPX hint is purely a performance hint. Instructions with this hint have
151+
// the same functional semantics as those without. PPX hints set by the
152+
// compiler that violate the balancing rule may turn off the PPX optimization,
153+
// but they will not affect program semantics.
154+
//
155+
// Hence, PPX is used for balanced spill/reloads (Exceptions and setjmp/longjmp
156+
// are not considered).
157+
//
158+
// PUSH2 and POP2 are instructions for (respectively) pushing/popping 2
159+
// GPRs at a time to/from the stack.
160+
// Selects the single-register push opcode: PUSH32r when the subtarget is not
// 64-bit; otherwise PUSHP64r if the subtarget has PPX, else PUSH64r.
static unsigned getPUSHOpcode(const X86Subtarget &ST) {
161+
return ST.is64Bit() ? (ST.hasPPX() ? X86::PUSHP64r : X86::PUSH64r)
162+
: X86::PUSH32r;
163+
}
164+
// Selects the single-register pop opcode: POP32r when the subtarget is not
// 64-bit; otherwise POPP64r if the subtarget has PPX, else POP64r.
static unsigned getPOPOpcode(const X86Subtarget &ST) {
165+
return ST.is64Bit() ? (ST.hasPPX() ? X86::POPP64r : X86::POP64r)
166+
: X86::POP32r;
167+
}
168+
// Selects the two-register push opcode: PUSH2P if the subtarget has PPX,
// else PUSH2. Unlike getPUSHOpcode, no is64Bit() check is made here.
static unsigned getPUSH2Opcode(const X86Subtarget &ST) {
169+
return ST.hasPPX() ? X86::PUSH2P : X86::PUSH2;
170+
}
171+
// Selects the two-register pop opcode: POP2P if the subtarget has PPX,
// else POP2. Unlike getPOPOpcode, no is64Bit() check is made here.
static unsigned getPOP2Opcode(const X86Subtarget &ST) {
172+
return ST.hasPPX() ? X86::POP2P : X86::POP2;
173+
}
174+
142175
static bool isEAXLiveIn(MachineBasicBlock &MBB) {
143176
for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) {
144177
unsigned Reg = RegMask.PhysReg;
@@ -1679,7 +1712,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
16791712
NumBytes = alignTo(NumBytes, MaxAlign);
16801713

16811714
// Save EBP/RBP into the appropriate stack slot.
1682-
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
1715+
BuildMI(MBB, MBBI, DL,
1716+
TII.get(getPUSHOpcode(MF.getSubtarget<X86Subtarget>())))
16831717
.addReg(MachineFramePtr, RegState::Kill)
16841718
.setMIFlag(MachineInstr::FrameSetup);
16851719

@@ -1818,18 +1852,30 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
18181852
// Skip the callee-saved push instructions.
18191853
bool PushedRegs = false;
18201854
int StackOffset = 2 * stackGrowth;
1855+
MachineBasicBlock::const_iterator LastCSPush = MBBI;
1856+
auto IsCSPush = [&](const MachineBasicBlock::iterator &MBBI) {
1857+
if (MBBI == MBB.end() || !MBBI->getFlag(MachineInstr::FrameSetup))
1858+
return false;
1859+
unsigned Opc = MBBI->getOpcode();
1860+
return Opc == X86::PUSH32r || Opc == X86::PUSH64r || Opc == X86::PUSHP64r ||
1861+
Opc == X86::PUSH2 || Opc == X86::PUSH2P;
1862+
};
18211863

1822-
while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup) &&
1823-
(MBBI->getOpcode() == X86::PUSH32r ||
1824-
MBBI->getOpcode() == X86::PUSH64r)) {
1864+
while (IsCSPush(MBBI)) {
18251865
PushedRegs = true;
18261866
Register Reg = MBBI->getOperand(0).getReg();
1867+
LastCSPush = MBBI;
18271868
++MBBI;
1869+
unsigned Opc = LastCSPush->getOpcode();
18281870

18291871
if (!HasFP && NeedsDwarfCFI) {
18301872
// Mark callee-saved push instruction.
18311873
// Define the current CFA rule to use the provided offset.
18321874
assert(StackSize);
1875+
// Compared to push, push2 introduces more stack offset (one more
1876+
// register).
1877+
if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)
1878+
StackOffset += stackGrowth;
18331879
BuildCFI(MBB, MBBI, DL,
18341880
MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset),
18351881
MachineInstr::FrameSetup);
@@ -1841,6 +1887,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
18411887
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
18421888
.addImm(Reg)
18431889
.setMIFlag(MachineInstr::FrameSetup);
1890+
if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)
1891+
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
1892+
.addImm(LastCSPush->getOperand(1).getReg())
1893+
.setMIFlag(MachineInstr::FrameSetup);
18441894
}
18451895
}
18461896

@@ -2317,7 +2367,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
23172367
emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true);
23182368
}
23192369
// Pop EBP.
2320-
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
2370+
BuildMI(MBB, MBBI, DL,
2371+
TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())),
23212372
MachineFramePtr)
23222373
.setMIFlag(MachineInstr::FrameDestroy);
23232374

@@ -2357,10 +2408,10 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
23572408
unsigned Opc = PI->getOpcode();
23582409

23592410
if (Opc != X86::DBG_VALUE && !PI->isTerminator()) {
2360-
if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
2361-
(Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
2362-
(Opc != X86::BTR64ri8 || !PI->getFlag(MachineInstr::FrameDestroy)) &&
2363-
(Opc != X86::ADD64ri32 || !PI->getFlag(MachineInstr::FrameDestroy)))
2411+
if (!PI->getFlag(MachineInstr::FrameDestroy) ||
2412+
(Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::BTR64ri8 &&
2413+
Opc != X86::ADD64ri32 && Opc != X86::POPP64r && Opc != X86::POP2 &&
2414+
Opc != X86::POP2P && Opc != X86::LEA64r))
23642415
break;
23652416
FirstCSPop = PI;
23662417
}
@@ -2451,8 +2502,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
24512502
MachineBasicBlock::iterator PI = MBBI;
24522503
unsigned Opc = PI->getOpcode();
24532504
++MBBI;
2454-
if (Opc == X86::POP32r || Opc == X86::POP64r) {
2505+
if (Opc == X86::POP32r || Opc == X86::POP64r || Opc == X86::POPP64r ||
2506+
Opc == X86::POP2 || Opc == X86::POP2P) {
24552507
Offset += SlotSize;
2508+
// Compared to pop, pop2 introduces more stack offset (one more
2509+
// register).
2510+
if (Opc == X86::POP2 || Opc == X86::POP2P)
2511+
Offset += SlotSize;
24562512
BuildCFI(MBB, MBBI, DL,
24572513
MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset),
24582514
MachineInstr::FrameDestroy);
@@ -2735,13 +2791,44 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
27352791
}
27362792
}
27372793

2794+
// Strategy:
2795+
// 1. Use push2 when
2796+
// a) number of CSR > 1 if no padding is needed
2797+
// b) number of CSR > 2 if padding is needed
2798+
// 2. When the number of CSR push is odd
2799+
// a. Start to use push2 from the 1st push if stack is 16B aligned.
2800+
// b. Start to use push2 from the 2nd push if stack is not 16B aligned.
2801+
// 3. When the number of CSR push is even, start to use push2 from the 1st
2802+
// push and make the stack 16B aligned before the push
2803+
unsigned NumRegsForPush2 = 0;
2804+
if (STI.hasPush2Pop2()) {
2805+
unsigned NumCSGPR = llvm::count_if(CSI, [](const CalleeSavedInfo &I) {
2806+
return X86::GR64RegClass.contains(I.getReg());
2807+
});
2808+
bool NeedPadding = (SpillSlotOffset % 16 != 0) && (NumCSGPR % 2 == 0);
2809+
bool UsePush2Pop2 = NeedPadding ? NumCSGPR > 2 : NumCSGPR > 1;
2810+
X86FI->setPadForPush2Pop2(NeedPadding && UsePush2Pop2);
2811+
NumRegsForPush2 = UsePush2Pop2 ? alignDown(NumCSGPR, 2) : 0;
2812+
if (X86FI->padForPush2Pop2()) {
2813+
SpillSlotOffset -= SlotSize;
2814+
MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
2815+
}
2816+
}
2817+
27382818
// Assign slots for GPRs. It increases frame size.
27392819
for (CalleeSavedInfo &I : llvm::reverse(CSI)) {
27402820
Register Reg = I.getReg();
27412821

27422822
if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
27432823
continue;
27442824

2825+
// A CSR is a candidate for push2/pop2 when its slot offset is 16B aligned
2826+
// or when the candidates collected so far number an odd count.
2827+
if (X86FI->getNumCandidatesForPush2Pop2() < NumRegsForPush2 &&
2828+
(SpillSlotOffset % 16 == 0 ||
2829+
X86FI->getNumCandidatesForPush2Pop2() % 2))
2830+
X86FI->addCandidateForPush2Pop2(Reg);
2831+
27452832
SpillSlotOffset -= SlotSize;
27462833
CalleeSavedFrameSize += SlotSize;
27472834

@@ -2759,6 +2846,10 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
27592846
// TODO: saving the slot index is better?
27602847
X86FI->setRestoreBasePointer(CalleeSavedFrameSize);
27612848
}
2849+
assert(X86FI->getNumCandidatesForPush2Pop2() % 2 == 0 &&
2850+
"Expect even candidates for push2/pop2");
2851+
if (X86FI->getNumCandidatesForPush2Pop2())
2852+
++NumFunctionUsingPush2Pop2;
27622853
X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);
27632854
MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize);
27642855

@@ -2808,41 +2899,50 @@ bool X86FrameLowering::spillCalleeSavedRegisters(
28082899

28092900
// Push GPRs. It increases frame size.
28102901
const MachineFunction &MF = *MBB.getParent();
2811-
unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
2812-
for (const CalleeSavedInfo &I : llvm::reverse(CSI)) {
2813-
Register Reg = I.getReg();
2814-
2815-
if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
2816-
continue;
2902+
const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
2903+
if (X86FI->padForPush2Pop2())
2904+
emitSPUpdate(MBB, MI, DL, -(int64_t)SlotSize, /*InEpilogue=*/false);
28172905

2906+
// Update LiveIn of the basic block and decide whether we can add a kill flag
2907+
// to the use.
2908+
auto UpdateLiveInCheckCanKill = [&](Register Reg) {
28182909
const MachineRegisterInfo &MRI = MF.getRegInfo();
2819-
bool isLiveIn = MRI.isLiveIn(Reg);
2820-
if (!isLiveIn)
2821-
MBB.addLiveIn(Reg);
2822-
2823-
// Decide whether we can add a kill flag to the use.
2824-
bool CanKill = !isLiveIn;
2825-
// Check if any subregister is live-in
2826-
if (CanKill) {
2827-
for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg) {
2828-
if (MRI.isLiveIn(*AReg)) {
2829-
CanKill = false;
2830-
break;
2831-
}
2832-
}
2833-
}
2834-
28352910
// Do not set a kill flag on values that are also marked as live-in. This
28362911
// happens with the @llvm-returnaddress intrinsic and with arguments
28372912
// passed in callee saved registers.
28382913
// Omitting the kill flags is conservatively correct even if the live-in
28392914
// is not used after all.
2840-
BuildMI(MBB, MI, DL, TII.get(Opc))
2841-
.addReg(Reg, getKillRegState(CanKill))
2842-
.setMIFlag(MachineInstr::FrameSetup);
2915+
if (MRI.isLiveIn(Reg))
2916+
return false;
2917+
MBB.addLiveIn(Reg);
2918+
// Check if any subregister is live-in
2919+
for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg)
2920+
if (MRI.isLiveIn(*AReg))
2921+
return false;
2922+
return true;
2923+
};
2924+
auto UpdateLiveInGetKillRegState = [&](Register Reg) {
2925+
return getKillRegState(UpdateLiveInCheckCanKill(Reg));
2926+
};
2927+
2928+
for (auto RI = CSI.rbegin(), RE = CSI.rend(); RI != RE; ++RI) {
2929+
Register Reg = RI->getReg();
2930+
if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
2931+
continue;
2932+
2933+
if (X86FI->isCandidateForPush2Pop2(Reg)) {
2934+
Register Reg2 = (++RI)->getReg();
2935+
BuildMI(MBB, MI, DL, TII.get(getPUSH2Opcode(STI)))
2936+
.addReg(Reg, UpdateLiveInGetKillRegState(Reg))
2937+
.addReg(Reg2, UpdateLiveInGetKillRegState(Reg2))
2938+
.setMIFlag(MachineInstr::FrameSetup);
2939+
} else {
2940+
BuildMI(MBB, MI, DL, TII.get(getPUSHOpcode(STI)))
2941+
.addReg(Reg, UpdateLiveInGetKillRegState(Reg))
2942+
.setMIFlag(MachineInstr::FrameSetup);
2943+
}
28432944
}
28442945

2845-
const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
28462946
if (X86FI->getRestoreBasePointer()) {
28472947
unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
28482948
Register BaseReg = this->TRI->getBaseRegister();
@@ -2958,15 +3058,22 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(
29583058
}
29593059

29603060
// POP GPRs.
2961-
unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
2962-
for (const CalleeSavedInfo &I : CSI) {
2963-
Register Reg = I.getReg();
3061+
for (auto I = CSI.begin(), E = CSI.end(); I != E; ++I) {
3062+
Register Reg = I->getReg();
29643063
if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
29653064
continue;
29663065

2967-
BuildMI(MBB, MI, DL, TII.get(Opc), Reg)
2968-
.setMIFlag(MachineInstr::FrameDestroy);
3066+
if (X86FI->isCandidateForPush2Pop2(Reg))
3067+
BuildMI(MBB, MI, DL, TII.get(getPOP2Opcode(STI)), Reg)
3068+
.addReg((++I)->getReg(), RegState::Define)
3069+
.setMIFlag(MachineInstr::FrameDestroy);
3070+
else
3071+
BuildMI(MBB, MI, DL, TII.get(getPOPOpcode(STI)), Reg)
3072+
.setMIFlag(MachineInstr::FrameDestroy);
29693073
}
3074+
if (X86FI->padForPush2Pop2())
3075+
emitSPUpdate(MBB, MI, DL, SlotSize, /*InEpilogue=*/true);
3076+
29703077
return true;
29713078
}
29723079

llvm/lib/Target/X86/X86MachineFunctionInfo.h

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#include "llvm/ADT/SmallVector.h"
1818
#include "llvm/CodeGen/CallingConvLower.h"
1919
#include "llvm/CodeGen/MachineFunction.h"
20+
#include <set>
2021

2122
namespace llvm {
2223

@@ -117,6 +118,12 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
117118
/// determine if we should insert tilerelease in frame lowering.
118119
bool HasVirtualTileReg = false;
119120

121+
/// Adjust stack for push2/pop2
122+
bool PadForPush2Pop2 = false;
123+
124+
/// Candidate registers for push2/pop2
125+
std::set<Register> CandidatesForPush2Pop2;
126+
120127
/// True if this function has CFI directives that adjust the CFA.
121128
/// This is used to determine if we should direct the debugger to use
122129
/// the CFA instead of the stack pointer.
@@ -165,7 +172,9 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
165172
const DenseMap<int, unsigned>& getWinEHXMMSlotInfo() const {
166173
return WinEHXMMSlotInfo; }
167174

168-
unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
175+
/// Returns the recorded callee-saved frame size, plus 8 bytes when
/// padForPush2Pop2() is true (the bool contributes 0 or 1 to the multiply).
unsigned getCalleeSavedFrameSize() const {
176+
return CalleeSavedFrameSize + 8 * padForPush2Pop2();
177+
}
169178
void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
170179

171180
unsigned getBytesToPopOnReturn() const { return BytesToPopOnReturn; }
@@ -232,6 +241,19 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
232241
bool hasVirtualTileReg() const { return HasVirtualTileReg; }
233242
void setHasVirtualTileReg(bool v) { HasVirtualTileReg = v; }
234243

244+
/// Returns the current value of the PadForPush2Pop2 flag.
bool padForPush2Pop2() const { return PadForPush2Pop2; }
245+
/// Sets the PadForPush2Pop2 flag to \p V.
void setPadForPush2Pop2(bool V) { PadForPush2Pop2 = V; }
246+
247+
/// Returns true if \p Reg is present in the CandidatesForPush2Pop2 set.
bool isCandidateForPush2Pop2(Register Reg) const {
248+
return CandidatesForPush2Pop2.find(Reg) != CandidatesForPush2Pop2.end();
249+
}
250+
/// Inserts \p Reg into the CandidatesForPush2Pop2 set. The container is a
/// std::set, so inserting a register that is already present has no effect.
void addCandidateForPush2Pop2(Register Reg) {
251+
CandidatesForPush2Pop2.insert(Reg);
252+
}
253+
/// Returns how many registers are currently in the CandidatesForPush2Pop2
/// set.
size_t getNumCandidatesForPush2Pop2() const {
254+
return CandidatesForPush2Pop2.size();
255+
}
256+
235257
bool hasCFIAdjustCfa() const { return HasCFIAdjustCfa; }
236258
void setHasCFIAdjustCfa(bool v) { HasCFIAdjustCfa = v; }
237259

0 commit comments

Comments
 (0)