Skip to content

Commit c74ed8a

Browse files
authored
[AMDGPU][Scheduler] Support for rematerializing SGPRs and AGPRs (#140036)
This adds the ability to rematerialize SGPRs and AGPRs to the scheduler's `PreRARematStage`, which can currently only rematerialize ArchVGPRs. This also fixes a small potential issue in the stage where, in case of spilling, the target occupancy could be set to a lower than expected value when the function had either one of the "amdgpu-num-sgpr" or "amdgpu-num-vgpr" attributes set.
1 parent 09b43a5 commit c74ed8a

File tree

4 files changed

+670
-336
lines changed

4 files changed

+670
-336
lines changed

llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp

Lines changed: 81 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1703,11 +1703,13 @@ namespace {
17031703
/// Models excess register pressure in a region and tracks our progress as we
17041704
/// identify rematerialization opportunities.
17051705
struct ExcessRP {
1706+
/// Number of excess SGPRs.
1707+
unsigned SGPRs = 0;
17061708
/// Number of excess ArchVGPRs.
17071709
unsigned ArchVGPRs = 0;
17081710
/// Number of excess AGPRs.
17091711
unsigned AGPRs = 0;
1710-
/// For unified register files, number of excess VGPRs.
1712+
/// For unified register files, number of excess VGPRs. 0 otherwise.
17111713
unsigned VGPRs = 0;
17121714
/// For unified register files with AGPR usage, number of excess ArchVGPRs to
17131715
/// save before we are able to save a whole allocation granule.
@@ -1716,28 +1718,37 @@ struct ExcessRP {
17161718
bool HasAGPRs = false;
17171719
/// Whether the subtarget has a unified RF.
17181720
bool UnifiedRF;
1721+
/// Whether we consider that the register allocator will be able to swap
1722+
/// between ArchVGPRs and AGPRs by copying them to a super register class.
1723+
/// Concretely, this allows savings of one kind of VGPR to help toward saving
1724+
/// the other kind of VGPR.
1725+
bool CombineVGPRSavings;
17191726

17201727
/// Constructs the excess RP model; determines the excess pressure w.r.t. a
1721-
/// maximum number of allowed VGPRs.
1722-
ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP, unsigned MaxVGPRs);
1728+
/// maximum number of allowed SGPRs/VGPRs.
1729+
ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP, unsigned MaxSGPRs,
1730+
unsigned MaxVGPRs, bool CombineVGPRSavings);
17231731

1724-
/// Accounts for \p NumRegs saved ArchVGPRs in the model. If \p
1725-
/// UseArchVGPRForAGPRSpill is true, saved ArchVGPRs are used to save excess
1726-
/// AGPRs once excess ArchVGPR pressure has been eliminated. Returns whether
1727-
/// saving these ArchVGPRs helped reduce excess pressure.
1728-
bool saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill);
1732+
/// Accounts for \p NumRegs saved SGPRs in the model. Returns whether saving
1733+
/// these SGPRs helped reduce excess pressure.
1734+
bool saveSGPRs(unsigned NumRegs) { return saveRegs(SGPRs, NumRegs); }
17291735

1730-
/// Accounts for \p NumRegs saved AGPRS in the model. Returns whether saving
1731-
/// these ArchVGPRs helped reduce excess pressure.
1736+
/// Accounts for \p NumRegs saved ArchVGPRs in the model. Returns whether
1737+
/// saving these ArchVGPRs helped reduce excess pressure.
1738+
bool saveArchVGPRs(unsigned NumRegs);
1739+
1740+
/// Accounts for \p NumRegs saved AGPRs in the model. Returns whether saving
1741+
/// these AGPRs helped reduce excess pressure.
17321742
bool saveAGPRs(unsigned NumRegs);
17331743

17341744
/// Returns whether there is any excess register pressure.
1735-
operator bool() const { return ArchVGPRs != 0 || AGPRs != 0 || VGPRs != 0; }
1745+
operator bool() const { return SGPRs || ArchVGPRs || AGPRs || VGPRs; }
17361746

17371747
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
17381748
friend raw_ostream &operator<<(raw_ostream &OS, const ExcessRP &Excess) {
1739-
OS << Excess.ArchVGPRs << " ArchVGPRs, " << Excess.AGPRs << " AGPRs, and "
1740-
<< Excess.VGPRs << " VGPRs (next ArchVGPR aligment in "
1749+
OS << Excess.SGPRs << " SGPRs, " << Excess.ArchVGPRs << " ArchVGPRs, and "
1750+
<< Excess.AGPRs << " AGPRs, (" << Excess.VGPRs
1751+
<< " VGPRs in total, next ArchVGPR alignment in "
17411752
<< Excess.ArchVGPRsToAlignment << " registers)\n";
17421753
return OS;
17431754
}
@@ -1754,12 +1765,18 @@ struct ExcessRP {
17541765
} // namespace
17551766

17561767
ExcessRP::ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP,
1757-
unsigned MaxVGPRs)
1758-
: UnifiedRF(ST.hasGFX90AInsts()) {
1768+
unsigned MaxSGPRs, unsigned MaxVGPRs,
1769+
bool CombineVGPRSavings)
1770+
: UnifiedRF(ST.hasGFX90AInsts()), CombineVGPRSavings(CombineVGPRSavings) {
1771+
// Compute excess SGPR pressure.
1772+
unsigned NumSGPRs = RP.getSGPRNum();
1773+
if (NumSGPRs > MaxSGPRs)
1774+
SGPRs = NumSGPRs - MaxSGPRs;
1775+
1776+
// Compute excess ArchVGPR/AGPR pressure.
17591777
unsigned NumArchVGPRs = RP.getArchVGPRNum();
17601778
unsigned NumAGPRs = RP.getAGPRNum();
17611779
HasAGPRs = NumAGPRs;
1762-
17631780
if (!UnifiedRF) {
17641781
// Non-unified RF. Account for excess pressure for ArchVGPRs and AGPRs
17651782
// independently.
@@ -1795,15 +1812,15 @@ ExcessRP::ExcessRP(const GCNSubtarget &ST, const GCNRegPressure &RP,
17951812
}
17961813
}
17971814

1798-
bool ExcessRP::saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill) {
1815+
bool ExcessRP::saveArchVGPRs(unsigned NumRegs) {
17991816
bool Progress = saveRegs(ArchVGPRs, NumRegs);
18001817
if (!NumRegs)
18011818
return Progress;
18021819

18031820
if (!UnifiedRF) {
1804-
if (UseArchVGPRForAGPRSpill)
1821+
if (CombineVGPRSavings)
18051822
Progress |= saveRegs(AGPRs, NumRegs);
1806-
} else if (HasAGPRs && (VGPRs || (UseArchVGPRForAGPRSpill && AGPRs))) {
1823+
} else if (HasAGPRs && (VGPRs || (CombineVGPRSavings && AGPRs))) {
18071824
// There is progress as long as there are VGPRs left to save, even if the
18081825
// save induced by this particular call does not cross an ArchVGPR alignment
18091826
// barrier.
@@ -1827,21 +1844,25 @@ bool ExcessRP::saveArchVGPRs(unsigned NumRegs, bool UseArchVGPRForAGPRSpill) {
18271844
ArchVGPRsToAlignment -= NumRegs;
18281845
}
18291846

1830-
// Prioritize saving generic VGPRs, then AGPRs if we allow AGPR-to-ArchVGPR
1831-
// spilling and have some free ArchVGPR slots.
1847+
// Prioritize saving generic VGPRs, then AGPRs if we consider that the
1848+
// register allocator will be able to replace an AGPR with an ArchVGPR.
18321849
saveRegs(VGPRs, NumSavedRegs);
1833-
if (UseArchVGPRForAGPRSpill)
1850+
if (CombineVGPRSavings)
18341851
saveRegs(AGPRs, NumSavedRegs);
18351852
} else {
18361853
// No AGPR usage in the region i.e., no allocation granule to worry about.
18371854
Progress |= saveRegs(VGPRs, NumRegs);
18381855
}
1839-
18401856
return Progress;
18411857
}
18421858

18431859
bool ExcessRP::saveAGPRs(unsigned NumRegs) {
1844-
return saveRegs(AGPRs, NumRegs) || saveRegs(VGPRs, NumRegs);
1860+
bool Progress = saveRegs(AGPRs, NumRegs);
1861+
if (UnifiedRF)
1862+
Progress |= saveRegs(VGPRs, NumRegs);
1863+
if (CombineVGPRSavings)
1864+
Progress |= saveRegs(ArchVGPRs, NumRegs);
1865+
return Progress;
18451866
}
18461867

18471868
bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
@@ -1869,46 +1890,28 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
18691890
ST.getMaxNumVGPRs(DAG.MinOccupancy + 1, DynamicVGPRBlockSize);
18701891
IncreaseOccupancy = WavesPerEU.second > DAG.MinOccupancy;
18711892

1872-
auto ClearOptRegionsIf = [&](bool Cond) -> bool {
1873-
if (Cond) {
1874-
// We won't try to increase occupancy.
1875-
IncreaseOccupancy = false;
1876-
OptRegions.clear();
1877-
}
1878-
return Cond;
1879-
};
1880-
18811893
// Collect optimizable regions. If there is spilling in any region we will
1882-
// just try to reduce ArchVGPR spilling. Otherwise we will try to increase
1883-
// occupancy by one in the whole function.
1894+
// just try to reduce spilling. Otherwise we will try to increase occupancy by
1895+
// one in the whole function.
18841896
for (unsigned I = 0, E = DAG.Regions.size(); I != E; ++I) {
18851897
GCNRegPressure &RP = DAG.Pressure[I];
1886-
1887-
// Check whether SGPR pressures prevents us from eliminating spilling.
1888-
unsigned NumSGPRs = RP.getSGPRNum();
1889-
if (NumSGPRs > MaxSGPRsNoSpill)
1890-
ClearOptRegionsIf(IncreaseOccupancy);
1891-
1892-
ExcessRP Excess(ST, RP, MaxVGPRsNoSpill);
1893-
if (Excess) {
1894-
ClearOptRegionsIf(IncreaseOccupancy);
1898+
// We allow ArchVGPR or AGPR savings to count as savings of the other kind
1899+
// of VGPR only when trying to eliminate spilling. We cannot do this when
1900+
// trying to increase occupancy since VGPR class swaps only occur later in
1901+
// the register allocator i.e., the scheduler will not be able to reason
1902+
// about these savings and will not report an increase in the achievable
1903+
// occupancy, triggering rollbacks.
1904+
ExcessRP Excess(ST, RP, MaxSGPRsNoSpill, MaxVGPRsNoSpill,
1905+
/*CombineVGPRSavings=*/true);
1906+
if (Excess && IncreaseOccupancy) {
1907+
// There is spilling in the region and we were so far trying to increase
1908+
// occupancy. Stop trying that and focus on reducing spilling.
1909+
IncreaseOccupancy = false;
1910+
OptRegions.clear();
18951911
} else if (IncreaseOccupancy) {
1896-
// Check whether SGPR pressure prevents us from increasing occupancy.
1897-
if (ClearOptRegionsIf(NumSGPRs > MaxSGPRsIncOcc)) {
1898-
if (DAG.MinOccupancy >= WavesPerEU.first)
1899-
return false;
1900-
continue;
1901-
}
1902-
if ((Excess = ExcessRP(ST, RP, MaxVGPRsIncOcc))) {
1903-
// We can only rematerialize ArchVGPRs at this point.
1904-
unsigned NumArchVGPRsToRemat = Excess.ArchVGPRs + Excess.VGPRs;
1905-
bool NotEnoughArchVGPRs = NumArchVGPRsToRemat > RP.getArchVGPRNum();
1906-
if (ClearOptRegionsIf(Excess.AGPRs || NotEnoughArchVGPRs)) {
1907-
if (DAG.MinOccupancy >= WavesPerEU.first)
1908-
return false;
1909-
continue;
1910-
}
1911-
}
1912+
// There is no spilling in the region, try to increase occupancy.
1913+
Excess = ExcessRP(ST, RP, MaxSGPRsIncOcc, MaxVGPRsIncOcc,
1914+
/*CombineVGPRSavings=*/false);
19121915
}
19131916
if (Excess)
19141917
OptRegions.insert({I, Excess});
@@ -1928,23 +1931,27 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
19281931
#endif
19291932

19301933
// When we are reducing spilling, the target is the minimum target number of
1931-
// waves/EU determined by the subtarget.
1932-
TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1 : WavesPerEU.first;
1934+
// waves/EU determined by the subtarget. In cases where either one of
1935+
// "amdgpu-num-sgpr" or "amdgpu-num-vgpr" are set on the function, the current
1936+
// minimum region occupancy may be higher than the latter.
1937+
TargetOcc = IncreaseOccupancy ? DAG.MinOccupancy + 1
1938+
: std::max(DAG.MinOccupancy, WavesPerEU.first);
19331939

19341940
// Accounts for a reduction in RP in an optimizable region. Returns whether we
19351941
// estimate that we have identified enough rematerialization opportunities to
19361942
// achieve our goal, and sets Progress to true when this particular reduction
19371943
// in pressure was helpful toward that goal.
19381944
auto ReduceRPInRegion = [&](auto OptIt, LaneBitmask Mask,
1945+
const TargetRegisterClass *RC,
19391946
bool &Progress) -> bool {
19401947
ExcessRP &Excess = OptIt->getSecond();
1941-
// We allow saved ArchVGPRs to be considered as free spill slots for AGPRs
1942-
// only when we are just trying to eliminate spilling to memory. At this
1943-
// point we err on the conservative side and do not increase
1944-
// register-to-register spilling for the sake of increasing occupancy.
1945-
Progress |=
1946-
Excess.saveArchVGPRs(SIRegisterInfo::getNumCoveredRegs(Mask),
1947-
/*UseArchVGPRForAGPRSpill=*/!IncreaseOccupancy);
1948+
unsigned NumRegs = SIRegisterInfo::getNumCoveredRegs(Mask);
1949+
if (SRI->isSGPRClass(RC))
1950+
Progress |= Excess.saveSGPRs(NumRegs);
1951+
else if (SRI->isAGPRClass(RC))
1952+
Progress |= Excess.saveAGPRs(NumRegs);
1953+
else
1954+
Progress |= Excess.saveArchVGPRs(NumRegs);
19481955
if (!Excess)
19491956
OptRegions.erase(OptIt->getFirst());
19501957
return OptRegions.empty();
@@ -1966,10 +1973,9 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
19661973
if (!isTriviallyReMaterializable(DefMI))
19671974
continue;
19681975

1969-
// We only support rematerializing virtual VGPRs with one definition.
1976+
// We only support rematerializing virtual registers with one definition.
19701977
Register Reg = DefMI.getOperand(0).getReg();
1971-
if (!Reg.isVirtual() || !SRI->isVGPRClass(DAG.MRI.getRegClass(Reg)) ||
1972-
!DAG.MRI.hasOneDef(Reg))
1978+
if (!Reg.isVirtual() || !DAG.MRI.hasOneDef(Reg))
19731979
continue;
19741980

19751981
// We only care to rematerialize the instruction if it has a single
@@ -2007,14 +2013,15 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
20072013
Rematerializations.try_emplace(&DefMI, UseMI).first->second;
20082014

20092015
bool RematUseful = false;
2016+
const TargetRegisterClass *RC = DAG.MRI.getRegClass(Reg);
20102017
if (auto It = OptRegions.find(I); It != OptRegions.end()) {
20112018
// Optimistically consider that moving the instruction out of its
20122019
// defining region will reduce RP in the latter; this assumes that
20132020
// maximum RP in the region is reached somewhere between the defining
20142021
// instruction and the end of the region.
20152022
REMAT_DEBUG(dbgs() << " Defining region is optimizable\n");
20162023
LaneBitmask Mask = DAG.RegionLiveOuts.getLiveRegsForRegionIdx(I)[Reg];
2017-
if (ReduceRPInRegion(It, Mask, RematUseful))
2024+
if (ReduceRPInRegion(It, Mask, RC, RematUseful))
20182025
return true;
20192026
}
20202027

@@ -2034,7 +2041,7 @@ bool PreRARematStage::canIncreaseOccupancyOrReduceSpill() {
20342041
// instruction's use.
20352042
if (auto It = OptRegions.find(LIRegion); It != OptRegions.end()) {
20362043
REMAT_DEBUG(dbgs() << " Live-in in region " << LIRegion << '\n');
2037-
if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg], RematUseful))
2044+
if (ReduceRPInRegion(It, DAG.LiveIns[LIRegion][Reg], RC, RematUseful))
20382045
return true;
20392046
}
20402047
}

llvm/lib/Target/AMDGPU/GCNSchedStrategy.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -440,8 +440,6 @@ class ClusteredLowOccStage : public GCNSchedStage {
440440
/// estimates reducing spilling or increasing occupancy is possible, as few
441441
/// instructions as possible are rematerialized to reduce potential negative
442442
/// effects on function latency.
443-
///
444-
/// TODO: We should extend this to work on SGPRs and AGPRs as well.
445443
class PreRARematStage : public GCNSchedStage {
446444
private:
447445
/// Useful information about a rematerializable instruction.

0 commit comments

Comments
 (0)