Commit 6b97132
[AMDGPU] Fold more AGPR copies/PHIs in SIFoldOperands
Generalize `tryFoldLCSSAPhi` into `tryFoldPhiAGPR`, which works on any kind of PHI node (not just LCSSA ones) and attempts to create AGPR PHIs more aggressively.

Also add a GFX908-only "cleanup" function, `tryOptimizeAGPRPhis`, which tries to minimize AGPR-to-AGPR copies on GFX908. That target has no ACCVGPR MOV instruction, so AGPR-AGPR copies become 2 or 3 instructions as they need a VGPR temporary. This is needed because D143731 plus the new `tryFoldPhiAGPR` may create many more PHIs (one 32xfloat PHI becomes 32 float PHIs), and if each PHI reads the same AGPR (as in `test_mfma_loop_agpr_init`), they will be lowered to 32 copies from the same AGPR, each of which becomes 2-3 instructions. Caching the value in a VGPR in that case prevents all of those copies from being generated; we get trivial AGPR-VGPR copies instead.

This is a preparation patch intended to prevent regressions in D143731 when AGPRs are involved.

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D144099
1 parent a7dcf39 commit 6b97132
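Both new functions in this commit walk a machine PHI's incoming values in steps of two. As a reader's aid, here is a minimal standalone sketch (an illustration, not code from this commit) of the operand layout those loops rely on: operand 0 defines the result, and each incoming value is a (register, predecessor-block) operand pair.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include <cassert>

// Visit every incoming (value, predecessor) pair of a machine-level PHI:
//   %dst = PHI %val0, %bb0, %val1, %bb1, ...
static void forEachPhiIncoming(llvm::MachineInstr &PHI) {
  assert(PHI.isPHI() && "expected a PHI");
  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
    llvm::MachineOperand &Val = PHI.getOperand(K);          // incoming value
    llvm::MachineBasicBlock *Pred =
        PHI.getOperand(K + 1).getMBB();                     // its predecessor
    (void)Val;
    (void)Pred;
  }
}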

File tree

2 files changed (+631, -31)

llvm/lib/Target/AMDGPU/SIFoldOperands.cpp

Lines changed: 221 additions & 31 deletions
@@ -111,9 +111,11 @@ class SIFoldOperands : public MachineFunctionPass {
   std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
   bool tryFoldOMod(MachineInstr &MI);
   bool tryFoldRegSequence(MachineInstr &MI);
-  bool tryFoldLCSSAPhi(MachineInstr &MI);
+  bool tryFoldPhiAGPR(MachineInstr &MI);
   bool tryFoldLoad(MachineInstr &MI);
 
+  bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
+
 public:
   SIFoldOperands() : MachineFunctionPass(ID) {
     initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
@@ -138,6 +140,16 @@ char SIFoldOperands::ID = 0;
 
 char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
 
+static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
+                                             const TargetRegisterInfo &TRI,
+                                             const MachineOperand &MO) {
+  const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
+  if (const TargetRegisterClass *SubRC =
+          TRI.getSubRegisterClass(RC, MO.getSubReg()))
+    RC = SubRC;
+  return RC;
+}
+
 // Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
 static unsigned macToMad(unsigned Opc) {
   switch (Opc) {
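A short usage note on the `getRegOpRC` helper added above (the concrete classes named in the comment are assumptions for illustration): it returns the class of the value actually read through an operand, narrowing a tuple class to the sub-register's class when a sub-register index is present. A hypothetical caller might look like this:

#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"

// Illustration only: create a fresh register whose class matches what a PHI
// operand actually reads. E.g. for %in.sub0, where %in has a 256-bit AGPR
// tuple class, this yields a 32-bit AGPR register rather than another
// 256-bit tuple.
static llvm::Register
createMatchingVReg(llvm::MachineRegisterInfo &MRI,
                   const llvm::TargetRegisterInfo &TRI,
                   const llvm::MachineOperand &MO) {
  return MRI.createVirtualRegister(getRegOpRC(MRI, TRI, MO));
}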
@@ -1631,52 +1643,133 @@ bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
   return true;
 }
 
-// Try to hoist an AGPR to VGPR copy out of the loop across a LCSSA PHI.
+// Try to hoist an AGPR to VGPR copy across a PHI.
 // This should allow folding of an AGPR into a consumer which may support it.
-// I.e.:
 //
-// loop:                        // loop:
-//   %1:vreg = COPY %0:areg     // exit:
-// exit:                    =>  //   %1:areg = PHI %0:areg, %loop
-//   %2:vreg = PHI %1:vreg, %loop  //   %2:vreg = COPY %1:areg
-bool SIFoldOperands::tryFoldLCSSAPhi(MachineInstr &PHI) {
+// Example 1: LCSSA PHI
+// loop:
+//   %1:vreg = COPY %0:areg
+// exit:
+//   %2:vreg = PHI %1:vreg, %loop
+// =>
+// loop:
+// exit:
+//   %1:areg = PHI %0:areg, %loop
+//   %2:vreg = COPY %1:areg
+//
+// Example 2: PHI with multiple incoming values:
+// entry:
+//   %1:vreg = GLOBAL_LOAD(..)
+// loop:
+//   %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
+//   %3:areg = COPY %2:vreg
+//   %4:areg = (instr using %3:areg)
+//   %5:vreg = COPY %4:areg
+// =>
+// entry:
+//   %1:vreg = GLOBAL_LOAD(..)
+//   %2:areg = COPY %1:vreg
+// loop:
+//   %3:areg = PHI %2:areg, %entry, %X:areg, %loop
+//   %4:areg = (instr using %3:areg)
+bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
   assert(PHI.isPHI());
 
-  if (PHI.getNumExplicitOperands() != 3) // Single input LCSSA PHI
-    return false;
-
-  Register PhiIn = PHI.getOperand(1).getReg();
   Register PhiOut = PHI.getOperand(0).getReg();
-  if (PHI.getOperand(1).getSubReg() ||
-      !TRI->isVGPR(*MRI, PhiIn) || !TRI->isVGPR(*MRI, PhiOut))
+  if (!TRI->isVGPR(*MRI, PhiOut))
     return false;
 
-  // A single use should not matter for correctness, but if it has another use
-  // inside the loop we may perform copy twice in a worst case.
-  if (!MRI->hasOneNonDBGUse(PhiIn))
-    return false;
+  // Iterate once over all incoming values of the PHI to check if this PHI is
+  // eligible, and determine the exact AGPR RC we'll target.
+  const TargetRegisterClass *ARC = nullptr;
+  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
+    MachineOperand &MO = PHI.getOperand(K);
 
-  MachineInstr *Copy = MRI->getVRegDef(PhiIn);
-  if (!Copy || !Copy->isCopy())
-    return false;
+    Register PhiIn = MO.getReg();
+    if (MO.getSubReg() || !TRI->isVGPR(*MRI, PhiIn))
+      return false;
+
+    MachineInstr *Copy = MRI->getVRegDef(PhiIn);
+    if (!Copy || !Copy->isCopy())
+      continue;
 
-  Register CopyIn = Copy->getOperand(1).getReg();
-  if (!TRI->isAGPR(*MRI, CopyIn) || Copy->getOperand(1).getSubReg())
+    Register CopyIn = Copy->getOperand(1).getReg();
+    if (CopyIn.isVirtual() && TRI->isAGPR(*MRI, CopyIn)) {
+      const TargetRegisterClass *CopyInRC =
+          getRegOpRC(*MRI, *TRI, Copy->getOperand(1));
+      if (ARC && !ARC->hasSubClassEq(CopyInRC))
+        return false;
+      ARC = CopyInRC;
+    }
+  }
+
+  if (!ARC)
     return false;
 
-  const TargetRegisterClass *ARC = MRI->getRegClass(CopyIn);
+  // Rewrite the PHI's incoming values to ARC.
+  LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
+  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
+    MachineOperand &MO = PHI.getOperand(K);
+    Register Reg = MO.getReg();
+
+    MachineBasicBlock::iterator InsertPt;
+    MachineBasicBlock *InsertMBB = nullptr;
+
+    // Look at the def of Reg, ignoring all copies.
+    bool UseAccVGPRWrite = false;
+    if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
+
+      // Look at pre-existing COPY instructions from ARC: Steal the operand. If
+      // the copy was single-use, it will be removed by DCE later.
+      if (Def->isCopy()) {
+        MachineOperand &CopyIn = Def->getOperand(1);
+        if (CopyIn.getReg().isVirtual() &&
+            getRegOpRC(*MRI, *TRI, CopyIn)->hasSubClassEq(ARC)) {
+          MO.setReg(CopyIn.getReg());
+          MO.setSubReg(CopyIn.getSubReg());
+          continue;
+        }
+
+        // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
+        // GFX908 directly instead of a COPY. Otherwise, SIFoldOperands may try
+        // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
+        // is unlikely to be profitable.
+        if (!ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
+            TRI->isSGPRReg(*MRI, CopyIn.getReg()))
+          UseAccVGPRWrite = true;
+      }
+
+      InsertPt = ++Def->getIterator();
+      InsertMBB = Def->getParent();
+    } else {
+      InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
+      InsertPt = InsertMBB->getFirstTerminator();
+    }
+
+    const unsigned CopyOpc =
+        UseAccVGPRWrite ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::COPY;
+    Register NewReg = MRI->createVirtualRegister(ARC);
+    MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
+                               TII->get(CopyOpc), NewReg)
+                           .addReg(Reg);
+    MO.setReg(NewReg);
+
+    (void)MI;
+    LLVM_DEBUG(dbgs() << "  Created COPY: " << *MI);
+  }
+
+  // Replace the PHI's result with a new register.
   Register NewReg = MRI->createVirtualRegister(ARC);
-  PHI.getOperand(1).setReg(CopyIn);
   PHI.getOperand(0).setReg(NewReg);
 
+  // COPY that new register back to the original PhiOut register. This COPY
+  // will usually be folded out later.
   MachineBasicBlock *MBB = PHI.getParent();
-  BuildMI(*MBB, MBB->getFirstNonPHI(), Copy->getDebugLoc(),
+  BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
           TII->get(AMDGPU::COPY), PhiOut)
-      .addReg(NewReg, RegState::Kill);
-  Copy->eraseFromParent(); // We know this copy had a single use.
-
-  LLVM_DEBUG(dbgs() << "Folded " << PHI);
+      .addReg(NewReg);
 
+  LLVM_DEBUG(dbgs() << "  Done: Folded " << PHI);
   return true;
 }
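One detail of the rewrite loop above that is easy to miss is where the new AGPR copies go. A simplified restatement of that rule (a sketch under the same names, not part of the commit): if the incoming value has a def, insert the copy right after it; otherwise fall back to the end of the matching predecessor block, before its terminators, so the copied value is available on that CFG edge.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include <utility>

// Sketch: pick the block and iterator where a copy feeding a PHI operand
// should be inserted. The predecessor-block operand immediately follows the
// value operand in the PHI, hence getOperandNo() + 1.
static std::pair<llvm::MachineBasicBlock *, llvm::MachineBasicBlock::iterator>
chooseCopyInsertPt(llvm::MachineRegisterInfo &MRI, llvm::MachineInstr &PHI,
                   llvm::MachineOperand &MO) {
  if (llvm::MachineInstr *Def = MRI.getVRegDef(MO.getReg())) {
    llvm::MachineBasicBlock::iterator InsertPt = ++Def->getIterator();
    return {Def->getParent(), InsertPt};
  }
  llvm::MachineBasicBlock *Pred =
      PHI.getOperand(MO.getOperandNo() + 1).getMBB();
  return {Pred, Pred->getFirstTerminator()};
}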

@@ -1736,6 +1829,101 @@ bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
   return true;
 }
 
+// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
+// For GFX90A and later, this is pretty much always a good thing, but for
+// GFX908 there are cases where it can create a lot more AGPR-AGPR copies,
+// which are expensive on this architecture due to the lack of V_ACCVGPR_MOV.
+//
+// This function looks at all AGPR PHIs in a basic block and collects their
+// operands. Then, it checks for registers that are used more than once across
+// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
+// having to create one VGPR temporary per use, which can get very messy if
+// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per
+// vector element).
+//
+// Example
+//  a:
+//    %in:agpr_256 = COPY %foo:vgpr_256
+//  c:
+//    %x:agpr_32 = ..
+//  b:
+//    %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
+//    %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
+//    %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
+// =>
+//  a:
+//    %in:agpr_256 = COPY %foo:vgpr_256
+//    %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
+//    %tmp_agpr:agpr_32 = COPY %tmp
+//  c:
+//    %x:agpr_32 = ..
+//  b:
+//    %0:areg = PHI %tmp_agpr, %a, %x, %c
+//    %1:areg = PHI %tmp_agpr, %a, %y, %c
+//    %2:areg = PHI %tmp_agpr, %a, %z, %c
+bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
+  // This is only really needed on GFX908 where AGPR-AGPR copies are
+  // unreasonably difficult.
+  if (ST->hasGFX90AInsts())
+    return false;
+
+  // Look at all AGPR Phis and collect the register + subregister used.
+  DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
+      RegToMO;
+
+  for (auto &MI : MBB) {
+    if (!MI.isPHI())
+      break;
+
+    if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
+      continue;
+
+    for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
+      MachineOperand &PhiMO = MI.getOperand(K);
+      RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
+    }
+  }
+
+  // For all (Reg, SubReg) pairs that are used more than once, cache the value
+  // in a VGPR.
+  bool Changed = false;
+  for (const auto &[Entry, MOs] : RegToMO) {
+    if (MOs.size() == 1)
+      continue;
+
+    const auto [Reg, SubReg] = Entry;
+    MachineInstr *Def = MRI->getVRegDef(Reg);
+    MachineBasicBlock *DefMBB = Def->getParent();
+
+    // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
+    // out.
+    const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
+    Register TempVGPR =
+        MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
+    MachineInstr *VGPRCopy =
+        BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
+                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
+            .addReg(Reg, /* flags */ 0, SubReg);
+
+    // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
+    Register TempAGPR = MRI->createVirtualRegister(ARC);
+    BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
+            TII->get(AMDGPU::COPY), TempAGPR)
+        .addReg(TempVGPR);
+
+    LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
+    for (MachineOperand *MO : MOs) {
+      MO->setReg(TempAGPR);
+      MO->setSubReg(AMDGPU::NoSubRegister);
+      LLVM_DEBUG(dbgs() << "  Changed PHI Operand: " << *MO << "\n");
+    }
+
+    Changed = true;
+  }
+
+  return Changed;
+}
+
 bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
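The deduplication in tryOptimizeAGPRPhis keys its map on a (Register, sub-register index) pair, so two PHI operands reading the same lane of the same tuple end up in the same bucket and share one cached value. A minimal standalone sketch of that grouping pattern (illustrative assumptions only; it mirrors, rather than reuses, the code above):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/Register.h"
#include <utility>
#include <vector>

// Group PHI operands by the exact value (register + lane) they read; any
// group with more than one member is a candidate for caching through a
// single VGPR on GFX908.
using LaneKey = std::pair<llvm::Register, unsigned>;

static llvm::DenseMap<LaneKey, std::vector<llvm::MachineOperand *>>
groupByLane(llvm::ArrayRef<llvm::MachineOperand *> PhiUses) {
  llvm::DenseMap<LaneKey, std::vector<llvm::MachineOperand *>> Groups;
  for (llvm::MachineOperand *MO : PhiUses)
    Groups[{MO->getReg(), MO->getSubReg()}].push_back(MO);
  return Groups;
}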
@@ -1769,7 +1957,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
         continue;
       }
 
-      if (MI.isPHI() && tryFoldLCSSAPhi(MI)) {
+      if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
         Changed = true;
         continue;
       }
@@ -1794,6 +1982,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
           !tryFoldOMod(MI))
         Changed |= tryFoldClamp(MI);
     }
+
+    Changed |= tryOptimizeAGPRPhis(*MBB);
   }
 
   return Changed;
