@@ -111,9 +111,11 @@ class SIFoldOperands : public MachineFunctionPass {
   std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
   bool tryFoldOMod(MachineInstr &MI);
   bool tryFoldRegSequence(MachineInstr &MI);
-  bool tryFoldLCSSAPhi(MachineInstr &MI);
+  bool tryFoldPhiAGPR(MachineInstr &MI);
   bool tryFoldLoad(MachineInstr &MI);
 
+  bool tryOptimizeAGPRPhis(MachineBasicBlock &MBB);
+
 public:
   SIFoldOperands() : MachineFunctionPass(ID) {
     initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
@@ -138,6 +140,16 @@ char SIFoldOperands::ID = 0;
 
 char &llvm::SIFoldOperandsID = SIFoldOperands::ID;
 
+static const TargetRegisterClass *getRegOpRC(const MachineRegisterInfo &MRI,
+                                             const TargetRegisterInfo &TRI,
+                                             const MachineOperand &MO) {
+  const TargetRegisterClass *RC = MRI.getRegClass(MO.getReg());
+  if (const TargetRegisterClass *SubRC =
+          TRI.getSubRegisterClass(RC, MO.getSubReg()))
+    RC = SubRC;
+  return RC;
+}
+
 // Map multiply-accumulate opcode to corresponding multiply-add opcode if any.
 static unsigned macToMad(unsigned Opc) {
   switch (Opc) {
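
The new getRegOpRC helper returns the most specific class for an operand: the subregister's class when the operand reads a subregister, otherwise the full class of the virtual register. Below is a minimal standalone sketch of that narrowing rule using toy types rather than the LLVM API (RegClass, subRegisterClass, and regOpRC are illustrative stand-ins, not LLVM names):

    #include <cstdio>

    struct RegClass { const char *Name; };

    static const RegClass AGPR256{"agpr_256"};
    static const RegClass AGPR32{"agpr_32"};

    // Stand-in for TRI.getSubRegisterClass(): null when there is no subreg.
    static const RegClass *subRegisterClass(const RegClass *, unsigned SubReg) {
      return SubReg ? &AGPR32 : nullptr; // toy model: every subreg is 32-bit
    }

    // Mirrors the helper's shape: prefer the narrower subregister class.
    static const RegClass *regOpRC(const RegClass *FullRC, unsigned SubReg) {
      if (const RegClass *SubRC = subRegisterClass(FullRC, SubReg))
        return SubRC;
      return FullRC;
    }

    int main() {
      printf("%s\n", regOpRC(&AGPR256, /*SubReg=*/1)->Name); // agpr_32
      printf("%s\n", regOpRC(&AGPR256, /*SubReg=*/0)->Name); // agpr_256
      return 0;
    }

This matters for the PHI handling below: a PHI input like %in.sub0 of an agpr_256 tuple must be rewritten to the 32-bit AGPR class, not the full tuple class.
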
@@ -1631,52 +1643,133 @@ bool SIFoldOperands::tryFoldRegSequence(MachineInstr &MI) {
   return true;
 }
 
-// Try to hoist an AGPR to VGPR copy out of the loop across a LCSSA PHI.
+// Try to hoist an AGPR to VGPR copy across a PHI.
 // This should allow folding of an AGPR into a consumer which may support it.
-// I.e.:
 //
-// loop:                          // loop:
-//   %1:vreg = COPY %0:areg       // exit:
-// exit:                       => //   %1:areg = PHI %0:areg, %loop
-//   %2:vreg = PHI %1:vreg, %loop //   %2:vreg = COPY %1:areg
-bool SIFoldOperands::tryFoldLCSSAPhi(MachineInstr &PHI) {
+// Example 1: LCSSA PHI
+// loop:
+//   %1:vreg = COPY %0:areg
+// exit:
+//   %2:vreg = PHI %1:vreg, %loop
+// =>
+// loop:
+// exit:
+//   %1:areg = PHI %0:areg, %loop
+//   %2:vreg = COPY %1:areg
+//
+// Example 2: PHI with multiple incoming values:
+// entry:
+//   %1:vreg = GLOBAL_LOAD(..)
+// loop:
+//   %2:vreg = PHI %1:vreg, %entry, %5:vreg, %loop
+//   %3:areg = COPY %2:vreg
+//   %4:areg = (instr using %3:areg)
+//   %5:vreg = COPY %4:areg
+// =>
+// entry:
+//   %1:vreg = GLOBAL_LOAD(..)
+//   %2:areg = COPY %1:vreg
+// loop:
+//   %3:areg = PHI %2:areg, %entry, %X:areg,
+//   %4:areg = (instr using %3:areg)
+bool SIFoldOperands::tryFoldPhiAGPR(MachineInstr &PHI) {
   assert(PHI.isPHI());
 
-  if (PHI.getNumExplicitOperands() != 3) // Single input LCSSA PHI
-    return false;
-
-  Register PhiIn = PHI.getOperand(1).getReg();
   Register PhiOut = PHI.getOperand(0).getReg();
-  if (PHI.getOperand(1).getSubReg() ||
-      !TRI->isVGPR(*MRI, PhiIn) || !TRI->isVGPR(*MRI, PhiOut))
+  if (!TRI->isVGPR(*MRI, PhiOut))
     return false;
 
-  // A single use should not matter for correctness, but if it has another use
-  // inside the loop we may perform copy twice in a worst case.
-  if (!MRI->hasOneNonDBGUse(PhiIn))
-    return false;
+  // Iterate once over all incoming values of the PHI to check if this PHI is
+  // eligible, and determine the exact AGPR RC we'll target.
+  const TargetRegisterClass *ARC = nullptr;
+  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
+    MachineOperand &MO = PHI.getOperand(K);
 
-  MachineInstr *Copy = MRI->getVRegDef(PhiIn);
-  if (!Copy || !Copy->isCopy())
-    return false;
+    Register PhiIn = MO.getReg();
+    if (MO.getSubReg() || !TRI->isVGPR(*MRI, PhiIn))
+      return false;
+
+    MachineInstr *Copy = MRI->getVRegDef(PhiIn);
+    if (!Copy || !Copy->isCopy())
+      continue;
 
-  Register CopyIn = Copy->getOperand(1).getReg();
-  if (!TRI->isAGPR(*MRI, CopyIn) || Copy->getOperand(1).getSubReg())
+    Register CopyIn = Copy->getOperand(1).getReg();
+    if (CopyIn.isVirtual() && TRI->isAGPR(*MRI, CopyIn)) {
+      const TargetRegisterClass *CopyInRC =
+          getRegOpRC(*MRI, *TRI, Copy->getOperand(1));
+      if (ARC && !ARC->hasSubClassEq(CopyInRC))
+        return false;
+      ARC = CopyInRC;
+    }
+  }
+
+  if (!ARC)
     return false;
 
-  const TargetRegisterClass *ARC = MRI->getRegClass(CopyIn);
+  // Rewrite the PHI's incoming values to ARC.
+  LLVM_DEBUG(dbgs() << "Folding AGPR copies into: " << PHI);
+  for (unsigned K = 1; K < PHI.getNumExplicitOperands(); K += 2) {
+    MachineOperand &MO = PHI.getOperand(K);
+    Register Reg = MO.getReg();
+
+    MachineBasicBlock::iterator InsertPt;
+    MachineBasicBlock *InsertMBB = nullptr;
+
+    // Look at the def of Reg, ignoring all copies.
+    bool UseAccVGPRWrite = false;
+    if (MachineInstr *Def = MRI->getVRegDef(Reg)) {
+
+      // Look at pre-existing COPY instructions from ARC: Steal the operand. If
+      // the copy was single-use, it will be removed by DCE later.
+      if (Def->isCopy()) {
+        MachineOperand &CopyIn = Def->getOperand(1);
+        if (CopyIn.getReg().isVirtual() &&
+            getRegOpRC(*MRI, *TRI, CopyIn)->hasSubClassEq(ARC)) {
+          MO.setReg(CopyIn.getReg());
+          MO.setSubReg(CopyIn.getSubReg());
+          continue;
+        }
+
+        // If this is a multi-use SGPR -> VGPR copy, use V_ACCVGPR_WRITE on
+        // GFX908 directly instead of a COPY. Otherwise, SIFoldOperands may try
+        // to fold the sgpr -> vgpr -> agpr copy into a sgpr -> agpr copy which
+        // is unlikely to be profitable.
+        if (!ST->hasGFX90AInsts() && !MRI->hasOneNonDBGUse(Reg) &&
+            TRI->isSGPRReg(*MRI, CopyIn.getReg()))
+          UseAccVGPRWrite = true;
+      }
+
+      InsertPt = ++Def->getIterator();
+      InsertMBB = Def->getParent();
+    } else {
+      InsertMBB = PHI.getOperand(MO.getOperandNo() + 1).getMBB();
+      InsertPt = InsertMBB->getFirstTerminator();
+    }
+
+    const unsigned CopyOpc =
+        UseAccVGPRWrite ? AMDGPU::V_ACCVGPR_WRITE_B32_e64 : AMDGPU::COPY;
+    Register NewReg = MRI->createVirtualRegister(ARC);
+    MachineInstr *MI = BuildMI(*InsertMBB, InsertPt, PHI.getDebugLoc(),
+                               TII->get(CopyOpc), NewReg)
+                           .addReg(Reg);
+    MO.setReg(NewReg);
+
+    (void)MI;
+    LLVM_DEBUG(dbgs() << "  Created COPY: " << *MI);
+  }
+
+  // Replace the PHI's result with a new register.
   Register NewReg = MRI->createVirtualRegister(ARC);
-  PHI.getOperand(1).setReg(CopyIn);
   PHI.getOperand(0).setReg(NewReg);
 
+  // COPY that new register back to the original PhiOut register. This COPY will
+  // usually be folded out later.
   MachineBasicBlock *MBB = PHI.getParent();
-  BuildMI(*MBB, MBB->getFirstNonPHI(), Copy->getDebugLoc(),
+  BuildMI(*MBB, MBB->getFirstNonPHI(), PHI.getDebugLoc(),
           TII->get(AMDGPU::COPY), PhiOut)
-      .addReg(NewReg, RegState::Kill);
-  Copy->eraseFromParent(); // We know this copy had a single use.
-
-  LLVM_DEBUG(dbgs() << "Folded " << PHI);
+      .addReg(NewReg);
 
+  LLVM_DEBUG(dbgs() << "Done: Folded " << PHI);
   return true;
 }
 
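
Note the two-pass shape of tryFoldPhiAGPR: the first loop only checks eligibility and settles on a single target AGPR class (ARC), so the function can bail out before anything is modified; only the second loop mutates operands. A rough standalone sketch of that first pass, with plain strings standing in for register classes and equality standing in for hasSubClassEq (all names here are illustrative, not LLVM's):

    #include <iostream>
    #include <optional>
    #include <string>
    #include <vector>

    // Toy stand-in: each incoming value carries the register class of the
    // COPY feeding it, or no class at all if it is not fed by an AGPR copy.
    struct Incoming { std::optional<std::string> CopySrcClass; };

    // Pass 1: agree on one target class across all incoming values, and
    // reject the PHI before anything is modified if two classes conflict.
    static std::optional<std::string>
    pickTargetClass(const std::vector<Incoming> &Ins) {
      std::optional<std::string> ARC;
      for (const Incoming &In : Ins) {
        if (!In.CopySrcClass)
          continue; // not fed by an AGPR copy; skipped, like `continue` above
        if (ARC && *ARC != *In.CopySrcClass)
          return std::nullopt; // conflicting classes: reject the whole PHI
        ARC = In.CopySrcClass;
      }
      return ARC; // nullopt also covers "no AGPR input at all": nothing to do
    }

    int main() {
      std::vector<Incoming> Ins = {{std::nullopt}, {"agpr_32"}, {"agpr_32"}};
      if (auto ARC = pickTargetClass(Ins))
        std::cout << "rewrite PHI to " << *ARC << "\n"; // pass 2 mutates here
      else
        std::cout << "leave PHI alone\n";
    }
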
@@ -1736,6 +1829,101 @@ bool SIFoldOperands::tryFoldLoad(MachineInstr &MI) {
   return true;
 }
 
+// tryFoldPhiAGPR will aggressively try to create AGPR PHIs.
+// For GFX90A and later, this is pretty much always a good thing, but for GFX908
+// there are cases where it can create a lot more AGPR-AGPR copies, which are
+// expensive on this architecture due to the lack of V_ACCVGPR_MOV.
+//
+// This function looks at all AGPR PHIs in a basic block and collects their
+// operands. Then, it checks for registers that are used more than once across
+// all PHIs and caches them in a VGPR. This prevents ExpandPostRAPseudo from
+// having to create one VGPR temporary per use, which can get very messy if
+// these PHIs come from a broken-up large PHI (e.g. 32 AGPR phis, one per vector
+// element).
+//
+// Example
+//  a:
+//    %in:agpr_256 = COPY %foo:vgpr_256
+//  c:
+//    %x:agpr_32 = ..
+//  b:
+//    %0:areg = PHI %in.sub0:agpr_32, %a, %x, %c
+//    %1:areg = PHI %in.sub0:agpr_32, %a, %y, %c
+//    %2:areg = PHI %in.sub0:agpr_32, %a, %z, %c
+// =>
+//  a:
+//    %in:agpr_256 = COPY %foo:vgpr_256
+//    %tmp:vgpr_32 = V_ACCVGPR_READ_B32_e64 %in.sub0:agpr_32
+//    %tmp_agpr:agpr_32 = COPY %tmp
+//  c:
+//    %x:agpr_32 = ..
+//  b:
+//    %0:areg = PHI %tmp_agpr, %a, %x, %c
+//    %1:areg = PHI %tmp_agpr, %a, %y, %c
+//    %2:areg = PHI %tmp_agpr, %a, %z, %c
+bool SIFoldOperands::tryOptimizeAGPRPhis(MachineBasicBlock &MBB) {
+  // This is only really needed on GFX908 where AGPR-AGPR copies are
+  // unreasonably difficult.
+  if (ST->hasGFX90AInsts())
+    return false;
+
+  // Look at all AGPR Phis and collect the register + subregister used.
+  DenseMap<std::pair<Register, unsigned>, std::vector<MachineOperand *>>
+      RegToMO;
+
+  for (auto &MI : MBB) {
+    if (!MI.isPHI())
+      break;
+
+    if (!TRI->isAGPR(*MRI, MI.getOperand(0).getReg()))
+      continue;
+
+    for (unsigned K = 1; K < MI.getNumOperands(); K += 2) {
+      MachineOperand &PhiMO = MI.getOperand(K);
+      RegToMO[{PhiMO.getReg(), PhiMO.getSubReg()}].push_back(&PhiMO);
+    }
+  }
+
+  // For all (Reg, SubReg) pairs that are used more than once, cache the value
+  // in a VGPR.
+  bool Changed = false;
+  for (const auto &[Entry, MOs] : RegToMO) {
+    if (MOs.size() == 1)
+      continue;
+
+    const auto [Reg, SubReg] = Entry;
+    MachineInstr *Def = MRI->getVRegDef(Reg);
+    MachineBasicBlock *DefMBB = Def->getParent();
+
+    // Create a copy in a VGPR using V_ACCVGPR_READ_B32_e64 so it's not folded
+    // out.
+    const TargetRegisterClass *ARC = getRegOpRC(*MRI, *TRI, *MOs.front());
+    Register TempVGPR =
+        MRI->createVirtualRegister(TRI->getEquivalentVGPRClass(ARC));
+    MachineInstr *VGPRCopy =
+        BuildMI(*DefMBB, ++Def->getIterator(), Def->getDebugLoc(),
+                TII->get(AMDGPU::V_ACCVGPR_READ_B32_e64), TempVGPR)
+            .addReg(Reg, /* flags */ 0, SubReg);
+
+    // Copy back to an AGPR and use that instead of the AGPR subreg in all MOs.
+    Register TempAGPR = MRI->createVirtualRegister(ARC);
+    BuildMI(*DefMBB, ++VGPRCopy->getIterator(), Def->getDebugLoc(),
+            TII->get(AMDGPU::COPY), TempAGPR)
+        .addReg(TempVGPR);
+
+    LLVM_DEBUG(dbgs() << "Caching AGPR into VGPR: " << *VGPRCopy);
+    for (MachineOperand *MO : MOs) {
+      MO->setReg(TempAGPR);
+      MO->setSubReg(AMDGPU::NoSubRegister);
+      LLVM_DEBUG(dbgs() << "  Changed PHI Operand: " << *MO << "\n");
+    }
+
+    Changed = true;
+  }
+
+  return Changed;
+}
+
 bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
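
The core of tryOptimizeAGPRPhis is a count-then-deduplicate pattern: group PHI operands by their (Reg, SubReg) key, then give every key that occurs more than once a single shared temporary and redirect all its uses to it. A small self-contained sketch of just that bookkeeping, with ints standing in for registers and operand indices standing in for MachineOperand pointers (a toy model under those assumptions, not the pass itself):

    #include <cstdio>
    #include <map>
    #include <utility>
    #include <vector>

    int main() {
      // Toy model of RegToMO: operand indices grouped by (Reg, SubReg) key.
      std::vector<std::pair<int, int>> ops = {
          {5, 0}, {5, 0}, {5, 1}, {7, 0}, {5, 0}};
      std::map<std::pair<int, int>, std::vector<size_t>> regToMO;
      for (size_t i = 0; i < ops.size(); ++i)
        regToMO[ops[i]].push_back(i);

      // Any key used more than once gets one shared temporary; every
      // duplicate use is redirected to it, mirroring the TempAGPR rewrite.
      int nextTemp = 100;
      for (const auto &[key, uses] : regToMO) {
        if (uses.size() == 1)
          continue;
        int temp = nextTemp++;
        for (size_t i : uses)
          ops[i] = {temp, 0}; // subreg cleared, like setSubReg(NoSubRegister)
      }

      for (auto [reg, sub] : ops)
        printf("%%%d.sub%d\n", reg, sub); // %5.sub0 uses now share %100
    }
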
@@ -1769,7 +1957,7 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
         continue;
       }
 
-      if (MI.isPHI() && tryFoldLCSSAPhi(MI)) {
+      if (MI.isPHI() && tryFoldPhiAGPR(MI)) {
         Changed = true;
         continue;
       }
@@ -1794,6 +1982,8 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
           !tryFoldOMod(MI))
         Changed |= tryFoldClamp(MI);
     }
+
+    Changed |= tryOptimizeAGPRPhis(*MBB);
   }
 
   return Changed;