@@ -300,6 +300,20 @@ void GCNHazardRecognizer::processBundle() {
   CurrCycleInstr = nullptr;
 }
 
+void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
+  assert(IsHazardRecognizerMode);
+
+  unsigned NumPreNoops = PreEmitNoops(MI);
+  EmitNoops(NumPreNoops);
+  if (MI->isInsideBundle())
+    insertNoopsInBundle(MI, TII, NumPreNoops);
+  else
+    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
+                    NumPreNoops);
+  EmitInstruction(MI);
+  AdvanceCycle();
+}
+
 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
   IsHazardRecognizerMode = true;
   CurrCycleInstr = MI;
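
The helper added above lets a hazard fix run the recognizer over an instruction it has just inserted: the parent loop has already advanced past the insertion point, so a newly built instruction would otherwise never get its pre-noops emitted or the cycle state advanced. A minimal sketch of the call pattern, reusing names from the fixShift64HighRegBug hunk below:

    // Sketch only; MBB, MI, DL, NewAmt and AmtReg are stand-ins from that hunk.
    runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
                         .addDef(AmtReg)
                         .addReg(AmtReg)
                         .addReg(NewAmt));
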
@@ -1087,6 +1101,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
   fixVALUPartialForwardingHazard(MI);
   fixVALUTransUseHazard(MI);
   fixWMMAHazards(MI);
+  fixShift64HighRegBug(MI);
 }
 
 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
@@ -1739,6 +1754,105 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
   return true;
 }
 
+bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
+  if (!ST.hasShift64HighRegBug())
+    return false;
+
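+  // Only 64-bit shifts are affected by this bug.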
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case AMDGPU::V_LSHLREV_B64_e64:
+  case AMDGPU::V_LSHRREV_B64_e64:
+  case AMDGPU::V_ASHRREV_I64_e64:
+    break;
+  }
+
+  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
+  if (!Amt->isReg())
+    return false;
+
+  Register AmtReg = Amt->getReg();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  // Check if this is the last VGPR in the allocation block.
+  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
+    return false;
+
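+  // If the next VGPR is already allocated, the amount register is not the
+  // last VGPR in use and no fix is needed.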
+  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
+    return false;
+
+  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
+  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
+  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
+  bool Overlapped = OverlappedSrc || OverlappedDst;
+
+  assert(!OverlappedDst || !OverlappedSrc ||
+         Src1->getReg() == MI->getOperand(0).getReg());
+  assert(ST.needsAlignedVGPRs());
+  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
+
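+  // Find a replacement register that MI itself neither reads nor writes: a
+  // 32-bit VGPR, or an even-aligned VGPR pair when the amount overlaps the
+  // shift's source or destination.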
+  Register NewReg;
+  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
+                                   : AMDGPU::VGPR_32RegClass) {
+    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
+      NewReg = Reg;
+      break;
+    }
+  }
+
+  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
+                               : NewReg;
+  Register NewAmtLo;
+
+  if (Overlapped)
+    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
+
+  DebugLoc DL = MI->getDebugLoc();
+  MachineBasicBlock *MBB = MI->getParent();
+  // Insert a full wait count because the chosen register might be pending a
+  // wait.
+  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
+      .addImm(0);
+
+  // Insert the V_SWAP_B32 instruction(s) and run the hazard recognizer on
+  // them.
+  if (Overlapped)
+    runOnInstruction(
+        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
+            .addDef(AmtReg - 1)
+            .addReg(AmtReg - 1)
+            .addReg(NewAmtLo));
+  runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
+                       .addDef(AmtReg)
+                       .addReg(AmtReg)
+                       .addReg(NewAmt));
+
+  // Instructions emitted after the current one are picked up by the parent
+  // loop of the hazard recognizer in the natural way.
+  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
+          AmtReg)
+      .addDef(NewAmt)
+      .addReg(NewAmt)
+      .addReg(AmtReg);
+  if (Overlapped)
+    BuildMI(*MBB, std::next(MI->getIterator()), DL,
+            TII.get(AMDGPU::V_SWAP_B32), AmtReg - 1)
+        .addDef(NewAmtLo)
+        .addReg(NewAmtLo)
+        .addReg(AmtReg - 1);
+
+  // Re-running the hazard recognizer on the modified instruction is not
+  // necessary: the inserted V_SWAP_B32s already both read and write the new
+  // registers, so hazards involving them have already been handled.
+  Amt->setReg(NewAmt);
+  Amt->setIsKill(false);
+  if (OverlappedDst)
+    MI->getOperand(0).setReg(NewReg);
+  if (OverlappedSrc) {
+    Src1->setReg(NewReg);
+    Src1->setIsKill(false);
+  }
+
+  return true;
+}
+
 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
   int NSAtoVMEMWaitStates = 1;
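
Taken together, the workaround swaps the shift amount out of the affected VGPR, lets the shift read it from a safe register, and swaps everything back immediately afterwards. The block check ((AmtReg - AMDGPU::VGPR0) & 7) == 7 matches v7, v15, v23, ..., v255, i.e. the last VGPR of each 8-register allocation block. A hand-written sketch of the emitted sequence, assuming the amount lives in v7 (with v8 unallocated) and the search settles on v0; registers chosen for illustration, not taken from the commit:

    s_waitcnt 0                        ; full wait: v0 may have a pending use
    v_swap_b32 v0, v7                  ; move the amount to a safe VGPR
    v_lshlrev_b64 v[2:3], v0, v[4:5]   ; the rewritten shift reads v0
    v_swap_b32 v7, v0                  ; restore both registers after the shift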