Skip to content

Commit 230c13d

Browse files
authored
[AMDGPU] Pick available high VGPR for CSR SGPR spilling (#78669)
CSR SGPR spilling currently uses the earliest available physical VGPRs. This imposes high register pressure when trying to allocate large VGPR tuples within the default register budget. This patch changes the spilling strategy to pick VGPRs in reverse order, taking the highest available VGPR first, and then, after register allocation, shifting them back to the lowest available range. With that, the initial VGPRs remain available for allocation, and the chance of finding a large number of contiguous registers is higher.
1 parent 7e50f00 commit 230c13d

31 files changed

+4513
-4231
lines changed

llvm/lib/Target/AMDGPU/SIFrameLowering.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,8 @@ static void getVGPRSpillLaneOrTempRegister(
9595
TargetStackID::SGPRSpill);
9696

9797
if (TRI->spillSGPRToVGPR() &&
98-
MFI->allocateSGPRSpillToVGPRLane(MF, FI, /* IsPrologEpilog */ true)) {
98+
MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
99+
/*IsPrologEpilog=*/true)) {
99100
// 2: There's no free lane to spill, and no free register to save the
100101
// SGPR, so we're forced to take another VGPR to use for the spill.
101102
MFI->addToPrologEpilogSGPRSpills(
@@ -1560,6 +1561,8 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
15601561
if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
15611562
return;
15621563

1564+
MFI->shiftSpillPhysVGPRsToLowestRange(MF);
1565+
15631566
TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
15641567
if (MFI->isEntryFunction())
15651568
return;

llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -368,7 +368,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
368368
// regalloc aware CFI generation to insert new CFIs along with the
369369
// intermediate spills is implemented. There is no such support
370370
// currently in the LLVM compiler.
371-
if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) {
371+
if (FuncInfo->allocateSGPRSpillToVGPRLane(
372+
MF, FI, /*SpillToPhysVGPRLane=*/true)) {
372373
bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
373374
MI, FI, nullptr, Indexes, LIS, true);
374375
if (!Spilled)

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,33 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
312312
return false;
313313
}
314314

315+
void SIMachineFunctionInfo::shiftSpillPhysVGPRsToLowestRange(
316+
MachineFunction &MF) {
317+
const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
318+
MachineRegisterInfo &MRI = MF.getRegInfo();
319+
for (unsigned I = 0, E = SpillPhysVGPRs.size(); I < E; ++I) {
320+
Register Reg = SpillPhysVGPRs[I];
321+
Register NewReg =
322+
TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
323+
if (!NewReg || NewReg >= Reg)
324+
break;
325+
326+
MRI.replaceRegWith(Reg, NewReg);
327+
328+
// Update various tables with the new VGPR.
329+
SpillPhysVGPRs[I] = NewReg;
330+
WWMReservedRegs.remove(Reg);
331+
WWMReservedRegs.insert(NewReg);
332+
WWMSpills.insert(std::make_pair(NewReg, WWMSpills[Reg]));
333+
WWMSpills.erase(Reg);
334+
335+
for (MachineBasicBlock &MBB : MF) {
336+
MBB.removeLiveIn(Reg);
337+
MBB.sortUniqueLiveIns();
338+
}
339+
}
340+
}
341+
315342
bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
316343
MachineFunction &MF, int FI, unsigned LaneIndex) {
317344
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -329,13 +356,17 @@ bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
329356
}
330357

331358
bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
332-
MachineFunction &MF, int FI, unsigned LaneIndex) {
359+
MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
333360
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
334361
const SIRegisterInfo *TRI = ST.getRegisterInfo();
335362
MachineRegisterInfo &MRI = MF.getRegInfo();
336363
Register LaneVGPR;
337364
if (!LaneIndex) {
338-
LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
365+
// Find the highest available register if called before RA to ensure the
366+
// lowest registers are available for allocation. The LaneVGPR, in that
367+
// case, will be shifted back to the lowest range after VGPR allocation.
368+
LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF,
369+
!IsPrologEpilog);
339370
if (LaneVGPR == AMDGPU::NoRegister) {
340371
// We have no VGPRs left for spilling SGPRs. Reset because we will not
341372
// partially spill the SGPR to VGPRs.
@@ -359,12 +390,12 @@ bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
359390
return true;
360391
}
361392

362-
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
363-
int FI,
364-
bool IsPrologEpilog) {
393+
bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(
394+
MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
395+
bool IsPrologEpilog) {
365396
std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
366-
IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI]
367-
: SGPRSpillsToVirtualVGPRLanes[FI];
397+
SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI]
398+
: SGPRSpillsToVirtualVGPRLanes[FI];
368399

369400
// This has already been allocated.
370401
if (!SpillLanes.empty())
@@ -384,14 +415,15 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
384415
assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
385416
"not spilling SGPRs to VGPRs");
386417

387-
unsigned &NumSpillLanes =
388-
IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes;
418+
unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
419+
: NumVirtualVGPRSpillLanes;
389420

390421
for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
391422
unsigned LaneIndex = (NumSpillLanes % WaveSize);
392423

393-
bool Allocated = IsPrologEpilog
394-
? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex)
424+
bool Allocated = SpillToPhysVGPRLane
425+
? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
426+
IsPrologEpilog)
395427
: allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
396428
if (!Allocated) {
397429
NumSpillLanes -= I;

llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -548,7 +548,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
548548
bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI,
549549
unsigned LaneIndex);
550550
bool allocatePhysicalVGPRForSGPRSpills(MachineFunction &MF, int FI,
551-
unsigned LaneIndex);
551+
unsigned LaneIndex,
552+
bool IsPrologEpilog);
552553

553554
public:
554555
Register getVGPRForAGPRCopy() const {
@@ -588,6 +589,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
588589
}
589590

590591
ArrayRef<Register> getSGPRSpillVGPRs() const { return SpillVGPRs; }
592+
591593
const WWMSpillsMap &getWWMSpills() const { return WWMSpills; }
592594
const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; }
593595

@@ -702,7 +704,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
702704
I->second.IsDead = true;
703705
}
704706

707+
// To bring the Physical VGPRs in the highest range allocated for CSR SGPR
708+
// spilling into the lowest available range.
709+
void shiftSpillPhysVGPRsToLowestRange(MachineFunction &MF);
710+
705711
bool allocateSGPRSpillToVGPRLane(MachineFunction &MF, int FI,
712+
bool SpillToPhysVGPRLane = false,
706713
bool IsPrologEpilog = false);
707714
bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);
708715

0 commit comments

Comments
 (0)