 STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue");
 STATISTIC(NumFrameExtraProbe,
           "Number of extra stack probes generated in prologue");
+STATISTIC(NumFunctionUsingPush2Pop2, "Number of functions using push2/pop2");

 using namespace llvm;

@@ -139,6 +140,38 @@ static unsigned getMOVriOpcode(bool Use64BitReg, int64_t Imm) {
   return X86::MOV32ri;
 }

+// Push-Pop Acceleration (PPX) hint is used to indicate that the POP reads the
+// value written by the PUSH from the stack. The processor tracks these marked
+// instructions internally and fast-forwards register data between matching PUSH
+// and POP instructions, without going through memory or through the training
+// loop of the Fast Store Forwarding Predictor (FSFP). Instead, a more efficient
+// memory-renaming optimization can be used.
+//
+// The PPX hint is purely a performance hint. Instructions with this hint have
+// the same functional semantics as those without. PPX hints set by the
+// compiler that violate the balancing rule may turn off the PPX optimization,
+// but they will not affect program semantics.
+//
+// Hence, PPX is used for balanced spill/reloads (Exceptions and setjmp/longjmp
+// are not considered).
+//
+// PUSH2 and POP2 are instructions for (respectively) pushing/popping 2
+// GPRs at a time to/from the stack.
+static unsigned getPUSHOpcode(const X86Subtarget &ST) {
+  return ST.is64Bit() ? (ST.hasPPX() ? X86::PUSHP64r : X86::PUSH64r)
+                      : X86::PUSH32r;
+}
+static unsigned getPOPOpcode(const X86Subtarget &ST) {
+  return ST.is64Bit() ? (ST.hasPPX() ? X86::POPP64r : X86::POP64r)
+                      : X86::POP32r;
+}
+static unsigned getPUSH2Opcode(const X86Subtarget &ST) {
+  return ST.hasPPX() ? X86::PUSH2P : X86::PUSH2;
+}
+static unsigned getPOP2Opcode(const X86Subtarget &ST) {
+  return ST.hasPPX() ? X86::POP2P : X86::POP2;
+}
+
 static bool isEAXLiveIn(MachineBasicBlock &MBB) {
   for (MachineBasicBlock::RegisterMaskPair RegMask : MBB.liveins()) {
     unsigned Reg = RegMask.PhysReg;
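Aside (not part of the patch): the four helpers above boil down to a small opcode matrix over is64Bit and hasPPX. The stand-alone sketch below mirrors that selection with mock types so it can be compiled in isolation; MockSubtarget, PushOpc, and pickPushOpcode are stand-ins invented for this note, not LLVM API.

#include <cstdio>

// Mock stand-ins for X86Subtarget and the X86:: push opcodes (illustrative).
enum PushOpc { PUSH32r, PUSH64r, PUSHP64r };
struct MockSubtarget { bool Is64Bit; bool HasPPX; };

static PushOpc pickPushOpcode(const MockSubtarget &ST) {
  // Same shape as getPUSHOpcode(): the PPX variant only exists in 64-bit mode.
  return ST.Is64Bit ? (ST.HasPPX ? PUSHP64r : PUSH64r) : PUSH32r;
}

int main() {
  const char *Names[] = {"PUSH32r", "PUSH64r", "PUSHP64r"};
  MockSubtarget Configs[] = {{false, false}, {true, false}, {true, true}};
  for (const MockSubtarget &ST : Configs)
    std::printf("64-bit=%d PPX=%d -> %s\n", ST.Is64Bit, ST.HasPPX,
                Names[pickPushOpcode(ST)]);
  // Prints PUSH32r, PUSH64r, PUSHP64r; PUSH2/POP2 follow the same PPX split
  // (PUSH2 vs. PUSH2P, POP2 vs. POP2P).
}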
@@ -1679,7 +1712,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
       NumBytes = alignTo(NumBytes, MaxAlign);

     // Save EBP/RBP into the appropriate stack slot.
-    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::PUSH64r : X86::PUSH32r))
+    BuildMI(MBB, MBBI, DL,
+            TII.get(getPUSHOpcode(MF.getSubtarget<X86Subtarget>())))
         .addReg(MachineFramePtr, RegState::Kill)
         .setMIFlag(MachineInstr::FrameSetup);

@@ -1818,18 +1852,30 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
   // Skip the callee-saved push instructions.
   bool PushedRegs = false;
   int StackOffset = 2 * stackGrowth;
+  MachineBasicBlock::const_iterator LastCSPush = MBBI;
+  auto IsCSPush = [&](const MachineBasicBlock::iterator &MBBI) {
+    if (MBBI == MBB.end() || !MBBI->getFlag(MachineInstr::FrameSetup))
+      return false;
+    unsigned Opc = MBBI->getOpcode();
+    return Opc == X86::PUSH32r || Opc == X86::PUSH64r || Opc == X86::PUSHP64r ||
+           Opc == X86::PUSH2 || Opc == X86::PUSH2P;
+  };

-  while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup) &&
-         (MBBI->getOpcode() == X86::PUSH32r ||
-          MBBI->getOpcode() == X86::PUSH64r)) {
+  while (IsCSPush(MBBI)) {
     PushedRegs = true;
     Register Reg = MBBI->getOperand(0).getReg();
+    LastCSPush = MBBI;
     ++MBBI;
+    unsigned Opc = LastCSPush->getOpcode();

     if (!HasFP && NeedsDwarfCFI) {
       // Mark callee-saved push instruction.
       // Define the current CFA rule to use the provided offset.
       assert(StackSize);
+      // Compared to push, push2 introduces more stack offset (one more
+      // register).
+      if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)
+        StackOffset += stackGrowth;
       BuildCFI(MBB, MBBI, DL,
                MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset),
                MachineInstr::FrameSetup);
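Aside (not part of the patch): with SlotSize = 8 on x86-64, stackGrowth is -8, so StackOffset starts at -16 (return address plus the first saved slot); if the first callee-saved push is a push2, the extra stackGrowth above takes it to -24 and the emitted CFA offset is 24. A minimal stand-alone check of that arithmetic:

#include <cstdio>

int main() {
  const int SlotSize = 8;
  const int stackGrowth = -SlotSize;  // x86 stacks grow down
  int StackOffset = 2 * stackGrowth;  // return address + first CSR slot
  bool IsPush2 = true;                // first CS push is a push2/push2p
  if (IsPush2)
    StackOffset += stackGrowth;       // the pair saves one extra register
  std::printf(".cfi_def_cfa_offset %d\n", -StackOffset); // prints 24
}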
@@ -1841,6 +1887,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
       BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
           .addImm(Reg)
           .setMIFlag(MachineInstr::FrameSetup);
+      if (Opc == X86::PUSH2 || Opc == X86::PUSH2P)
+        BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
+            .addImm(LastCSPush->getOperand(1).getReg())
+            .setMIFlag(MachineInstr::FrameSetup);
     }
   }

@@ -2317,7 +2367,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
       emitSPUpdate(MBB, MBBI, DL, Offset, /*InEpilogue*/ true);
     }
     // Pop EBP.
-    BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
+    BuildMI(MBB, MBBI, DL,
+            TII.get(getPOPOpcode(MF.getSubtarget<X86Subtarget>())),
             MachineFramePtr)
         .setMIFlag(MachineInstr::FrameDestroy);

@@ -2357,10 +2408,10 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
     unsigned Opc = PI->getOpcode();

     if (Opc != X86::DBG_VALUE && !PI->isTerminator()) {
-      if ((Opc != X86::POP32r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
-          (Opc != X86::POP64r || !PI->getFlag(MachineInstr::FrameDestroy)) &&
-          (Opc != X86::BTR64ri8 || !PI->getFlag(MachineInstr::FrameDestroy)) &&
-          (Opc != X86::ADD64ri32 || !PI->getFlag(MachineInstr::FrameDestroy)))
+      if (!PI->getFlag(MachineInstr::FrameDestroy) ||
+          (Opc != X86::POP32r && Opc != X86::POP64r && Opc != X86::BTR64ri8 &&
+           Opc != X86::ADD64ri32 && Opc != X86::POPP64r && Opc != X86::POP2 &&
+           Opc != X86::POP2P && Opc != X86::LEA64r))
         break;
       FirstCSPop = PI;
     }
@@ -2451,8 +2502,13 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
       MachineBasicBlock::iterator PI = MBBI;
       unsigned Opc = PI->getOpcode();
       ++MBBI;
-      if (Opc == X86::POP32r || Opc == X86::POP64r) {
+      if (Opc == X86::POP32r || Opc == X86::POP64r || Opc == X86::POPP64r ||
+          Opc == X86::POP2 || Opc == X86::POP2P) {
         Offset += SlotSize;
+        // Compared to pop, pop2 introduces more stack offset (one more
+        // register).
+        if (Opc == X86::POP2 || Opc == X86::POP2P)
+          Offset += SlotSize;
         BuildCFI(MBB, MBBI, DL,
                  MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset),
                  MachineInstr::FrameDestroy);
@@ -2735,13 +2791,44 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
     }
   }

+  // Strategy:
+  // 1. Use push2 when
+  //       a) number of CSRs > 1 if no padding is needed
+  //       b) number of CSRs > 2 if padding is needed
+  // 2. When the number of CSR pushes is odd:
+  //    a. Start to use push2 from the 1st push if the stack is 16B aligned.
+  //    b. Start to use push2 from the 2nd push if the stack is not 16B aligned.
+  // 3. When the number of CSR pushes is even, start to use push2 from the 1st
+  //    push and make the stack 16B aligned before the push.
+  unsigned NumRegsForPush2 = 0;
+  if (STI.hasPush2Pop2()) {
+    unsigned NumCSGPR = llvm::count_if(CSI, [](const CalleeSavedInfo &I) {
+      return X86::GR64RegClass.contains(I.getReg());
+    });
+    bool NeedPadding = (SpillSlotOffset % 16 != 0) && (NumCSGPR % 2 == 0);
+    bool UsePush2Pop2 = NeedPadding ? NumCSGPR > 2 : NumCSGPR > 1;
+    X86FI->setPadForPush2Pop2(NeedPadding && UsePush2Pop2);
+    NumRegsForPush2 = UsePush2Pop2 ? alignDown(NumCSGPR, 2) : 0;
+    if (X86FI->padForPush2Pop2()) {
+      SpillSlotOffset -= SlotSize;
+      MFI.CreateFixedSpillStackObject(SlotSize, SpillSlotOffset);
+    }
+  }
+
   // Assign slots for GPRs. It increases frame size.
   for (CalleeSavedInfo &I : llvm::reverse(CSI)) {
     Register Reg = I.getReg();

     if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
       continue;

+    // A CSR is a candidate for push2/pop2 when its slot offset is 16B aligned
+    // or there is currently an odd number of registers among the candidates.
+    if (X86FI->getNumCandidatesForPush2Pop2() < NumRegsForPush2 &&
+        (SpillSlotOffset % 16 == 0 ||
+         X86FI->getNumCandidatesForPush2Pop2() % 2))
+      X86FI->addCandidateForPush2Pop2(Reg);
+
     SpillSlotOffset -= SlotSize;
     CalleeSavedFrameSize += SlotSize;

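Aside (not part of the patch): the strategy comment is easiest to follow with concrete numbers. The stand-alone sketch below replays the NeedPadding / UsePush2Pop2 / NumRegsForPush2 computation for a few CSR counts, assuming SlotSize = 8 and a starting SpillSlotOffset of -8 or -16; decide and alignDownTo are helpers written for this note, not the LLVM ones.

#include <cstdio>

static unsigned alignDownTo(unsigned Value, unsigned Align) {
  return Value - Value % Align; // stand-in for llvm::alignDown
}

static void decide(int SpillSlotOffset, unsigned NumCSGPR) {
  bool NeedPadding = (SpillSlotOffset % 16 != 0) && (NumCSGPR % 2 == 0);
  bool UsePush2Pop2 = NeedPadding ? NumCSGPR > 2 : NumCSGPR > 1;
  unsigned NumRegsForPush2 = UsePush2Pop2 ? alignDownTo(NumCSGPR, 2) : 0;
  std::printf("CSRs=%u offset=%d -> pad=%d, regs paired for push2=%u\n",
              NumCSGPR, SpillSlotOffset, int(NeedPadding && UsePush2Pop2),
              NumRegsForPush2);
}

int main() {
  decide(-8, 3);  // odd count: pair 2 of 3, no padding needed
  decide(-8, 4);  // even count, misaligned start: pad one slot, pair all 4
  decide(-16, 2); // even count, aligned start: pair both, no padding
}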
@@ -2759,6 +2846,10 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
     // TODO: saving the slot index is better?
     X86FI->setRestoreBasePointer(CalleeSavedFrameSize);
   }
+  assert(X86FI->getNumCandidatesForPush2Pop2() % 2 == 0 &&
+         "Expect even candidates for push2/pop2");
+  if (X86FI->getNumCandidatesForPush2Pop2())
+    ++NumFunctionUsingPush2Pop2;
   X86FI->setCalleeSavedFrameSize(CalleeSavedFrameSize);
   MFI.setCVBytesOfCalleeSavedRegisters(CalleeSavedFrameSize);

@@ -2808,41 +2899,50 @@ bool X86FrameLowering::spillCalleeSavedRegisters(

   // Push GPRs. It increases frame size.
   const MachineFunction &MF = *MBB.getParent();
-  unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
-  for (const CalleeSavedInfo &I : llvm::reverse(CSI)) {
-    Register Reg = I.getReg();
-
-    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
-      continue;
+  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  if (X86FI->padForPush2Pop2())
+    emitSPUpdate(MBB, MI, DL, -(int64_t)SlotSize, /*InEpilogue=*/false);

+  // Update LiveIn of the basic block and decide whether we can add a kill flag
+  // to the use.
+  auto UpdateLiveInCheckCanKill = [&](Register Reg) {
     const MachineRegisterInfo &MRI = MF.getRegInfo();
-    bool isLiveIn = MRI.isLiveIn(Reg);
-    if (!isLiveIn)
-      MBB.addLiveIn(Reg);
-
-    // Decide whether we can add a kill flag to the use.
-    bool CanKill = !isLiveIn;
-    // Check if any subregister is live-in
-    if (CanKill) {
-      for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg) {
-        if (MRI.isLiveIn(*AReg)) {
-          CanKill = false;
-          break;
-        }
-      }
-    }
-
     // Do not set a kill flag on values that are also marked as live-in. This
     // happens with the @llvm-returnaddress intrinsic and with arguments
     // passed in callee saved registers.
     // Omitting the kill flags is conservatively correct even if the live-in
     // is not used after all.
-    BuildMI(MBB, MI, DL, TII.get(Opc))
-        .addReg(Reg, getKillRegState(CanKill))
-        .setMIFlag(MachineInstr::FrameSetup);
+    if (MRI.isLiveIn(Reg))
+      return false;
+    MBB.addLiveIn(Reg);
+    // Check if any subregister is live-in
+    for (MCRegAliasIterator AReg(Reg, TRI, false); AReg.isValid(); ++AReg)
+      if (MRI.isLiveIn(*AReg))
+        return false;
+    return true;
+  };
+  auto UpdateLiveInGetKillRegState = [&](Register Reg) {
+    return getKillRegState(UpdateLiveInCheckCanKill(Reg));
+  };
+
+  for (auto RI = CSI.rbegin(), RE = CSI.rend(); RI != RE; ++RI) {
+    Register Reg = RI->getReg();
+    if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
+      continue;
+
+    if (X86FI->isCandidateForPush2Pop2(Reg)) {
+      Register Reg2 = (++RI)->getReg();
+      BuildMI(MBB, MI, DL, TII.get(getPUSH2Opcode(STI)))
+          .addReg(Reg, UpdateLiveInGetKillRegState(Reg))
+          .addReg(Reg2, UpdateLiveInGetKillRegState(Reg2))
+          .setMIFlag(MachineInstr::FrameSetup);
+    } else {
+      BuildMI(MBB, MI, DL, TII.get(getPUSHOpcode(STI)))
+          .addReg(Reg, UpdateLiveInGetKillRegState(Reg))
+          .setMIFlag(MachineInstr::FrameSetup);
+    }
   }

-  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   if (X86FI->getRestoreBasePointer()) {
     unsigned Opc = STI.is64Bit() ? X86::PUSH64r : X86::PUSH32r;
     Register BaseReg = this->TRI->getBaseRegister();
@@ -2958,15 +3058,22 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(
   }

   // POP GPRs.
-  unsigned Opc = STI.is64Bit() ? X86::POP64r : X86::POP32r;
-  for (const CalleeSavedInfo &I : CSI) {
-    Register Reg = I.getReg();
+  for (auto I = CSI.begin(), E = CSI.end(); I != E; ++I) {
+    Register Reg = I->getReg();
     if (!X86::GR64RegClass.contains(Reg) && !X86::GR32RegClass.contains(Reg))
       continue;

-    BuildMI(MBB, MI, DL, TII.get(Opc), Reg)
-        .setMIFlag(MachineInstr::FrameDestroy);
+    if (X86FI->isCandidateForPush2Pop2(Reg))
+      BuildMI(MBB, MI, DL, TII.get(getPOP2Opcode(STI)), Reg)
+          .addReg((++I)->getReg(), RegState::Define)
+          .setMIFlag(MachineInstr::FrameDestroy);
+    else
+      BuildMI(MBB, MI, DL, TII.get(getPOPOpcode(STI)), Reg)
+          .setMIFlag(MachineInstr::FrameDestroy);
   }
+  if (X86FI->padForPush2Pop2())
+    emitSPUpdate(MBB, MI, DL, SlotSize, /*InEpilogue=*/true);
+
   return true;
 }

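Aside (not part of the patch): the reverse-iteration spill loop and the forward-iteration restore loop pair registers in mirror order, so each push2 is matched by a pop2 naming the same two registers, which is the balanced pairing the PPX comment at the top of the patch calls for. A stand-alone sketch of that pairing order (register names and mnemonics here are illustrative; the real code emits MachineInstrs via BuildMI):

#include <cstdio>
#include <string>
#include <vector>

int main() {
  // Four callee-saved GPRs, all assumed to be push2/pop2 candidates.
  std::vector<std::string> CSI = {"RBX", "R14", "R15", "R12"};

  // Prologue side: reverse iteration, consuming two entries per PUSH2.
  for (auto RI = CSI.rbegin(); RI != CSI.rend(); ++RI) {
    std::string Reg = *RI;
    std::string Reg2 = *++RI; // second operand of the pair
    std::printf("push2 %s, %s\n", Reg.c_str(), Reg2.c_str());
  }

  // Epilogue side: forward iteration, consuming two entries per POP2.
  for (auto I = CSI.begin(); I != CSI.end(); ++I) {
    std::string Reg = *I;
    std::string Reg2 = *++I;
    std::printf("pop2  %s, %s\n", Reg.c_str(), Reg2.c_str());
  }
  // Output: push2 R12, R15 / push2 R14, RBX, then pop2 RBX, R14 / pop2 R15, R12.
}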