
Commit 95d497f

[AMDGPU] W/a hazard if 64 bit shift amount is a highest allocated VGPR
In this case gfx90a uses v0 instead of the correct register. Swap the value temporarily with a lower register and then swap it back. Unfortunately, the hazard recognizer runs after wait count insertion, so we cannot simply reuse an arbitrary register; hence the workaround also includes a full waitcnt. This could be avoided if we ran it from expandPostRAPseudo, but that would be a complete misplacement.

Differential Revision: https://reviews.llvm.org/D133067
Parent: e321c8d
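For illustration only (this sequence is not part of the commit, and the registers are hypothetical): assuming the shift amount of a v_lshlrev_b64 lives in v7, the last VGPR of its allocation block with v8 and above unused, and v1 is free, the workaround expands the shift roughly into:

    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)  ; full wait: the scratch VGPR may have a pending count
    v_swap_b32 v1, v7                        ; temporarily move the shift amount into a lower VGPR
    v_lshlrev_b64 v[2:3], v1, v[2:3]         ; the shift now reads a register unaffected by the bug
    v_swap_b32 v7, v1                        ; swap back, restoring both v7 and v1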

File tree

4 files changed: +375, -0 lines changed
llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 114 additions & 0 deletions
@@ -300,6 +300,20 @@ void GCNHazardRecognizer::processBundle() {
   CurrCycleInstr = nullptr;
 }
 
+void GCNHazardRecognizer::runOnInstruction(MachineInstr *MI) {
+  assert(IsHazardRecognizerMode);
+
+  unsigned NumPreNoops = PreEmitNoops(MI);
+  EmitNoops(NumPreNoops);
+  if (MI->isInsideBundle())
+    insertNoopsInBundle(MI, TII, NumPreNoops);
+  else
+    TII.insertNoops(*MI->getParent(), MachineBasicBlock::iterator(MI),
+                    NumPreNoops);
+  EmitInstruction(MI);
+  AdvanceCycle();
+}
+
 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
   IsHazardRecognizerMode = true;
   CurrCycleInstr = MI;
@@ -1087,6 +1101,7 @@ void GCNHazardRecognizer::fixHazards(MachineInstr *MI) {
   fixVALUPartialForwardingHazard(MI);
   fixVALUTransUseHazard(MI);
   fixWMMAHazards(MI);
+  fixShift64HighRegBug(MI);
 }
 
 bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) {
@@ -1739,6 +1754,105 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
   return true;
 }
 
+bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) {
+  if (!ST.hasShift64HighRegBug())
+    return false;
+
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case AMDGPU::V_LSHLREV_B64_e64:
+  case AMDGPU::V_LSHRREV_B64_e64:
+  case AMDGPU::V_ASHRREV_I64_e64:
+    break;
+  }
+
+  MachineOperand *Amt = TII.getNamedOperand(*MI, AMDGPU::OpName::src0);
+  if (!Amt->isReg())
+    return false;
+
+  Register AmtReg = Amt->getReg();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  // Check if this is the last VGPR in the allocation block.
+  if (!TRI.isVGPR(MRI, AmtReg) || ((AmtReg - AMDGPU::VGPR0) & 7) != 7)
+    return false;
+
+  if (AmtReg != AMDGPU::VGPR255 && MRI.isPhysRegUsed(AmtReg + 1))
+    return false;
+
+  MachineOperand *Src1 = TII.getNamedOperand(*MI, AMDGPU::OpName::src1);
+  bool OverlappedSrc = Src1->isReg() && TRI.regsOverlap(Src1->getReg(), AmtReg);
+  bool OverlappedDst = MI->modifiesRegister(AmtReg, &TRI);
+  bool Overlapped = OverlappedSrc || OverlappedDst;
+
+  assert(!OverlappedDst || !OverlappedSrc ||
+         Src1->getReg() == MI->getOperand(0).getReg());
+  assert(ST.needsAlignedVGPRs());
+  static_assert(AMDGPU::VGPR0 + 1 == AMDGPU::VGPR1);
+
+  Register NewReg;
+  for (MCRegister Reg : Overlapped ? AMDGPU::VReg_64_Align2RegClass
+                                   : AMDGPU::VGPR_32RegClass) {
+    if (!MI->modifiesRegister(Reg, &TRI) && !MI->readsRegister(Reg, &TRI)) {
+      NewReg = Reg;
+      break;
+    }
+  }
+
+  Register NewAmt = Overlapped ? (Register)TRI.getSubReg(NewReg, AMDGPU::sub1)
+                               : NewReg;
+  Register NewAmtLo;
+
+  if (Overlapped)
+    NewAmtLo = TRI.getSubReg(NewReg, AMDGPU::sub0);
+
+  DebugLoc DL = MI->getDebugLoc();
+  MachineBasicBlock *MBB = MI->getParent();
+  // Insert a full waitcnt because the found register might have a pending wait.
+  BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_WAITCNT))
+      .addImm(0);
+
+  // Insert V_SWAP_B32 instruction(s) and run the hazard recognizer on them.
+  if (Overlapped)
+    runOnInstruction(
+        BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmtLo)
+            .addDef(AmtReg - 1)
+            .addReg(AmtReg - 1)
+            .addReg(NewAmtLo));
+  runOnInstruction(BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_SWAP_B32), NewAmt)
+                       .addDef(AmtReg)
+                       .addReg(AmtReg)
+                       .addReg(NewAmt));
+
+  // Instructions emitted after the current instruction will be processed by
+  // the parent loop of the hazard recognizer in a natural way.
+  BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
+          AmtReg)
+      .addDef(NewAmt)
+      .addReg(NewAmt)
+      .addReg(AmtReg);
+  if (Overlapped)
+    BuildMI(*MBB, std::next(MI->getIterator()), DL, TII.get(AMDGPU::V_SWAP_B32),
+            AmtReg - 1)
+        .addDef(NewAmtLo)
+        .addReg(NewAmtLo)
+        .addReg(AmtReg - 1);
+
+  // Re-running the hazard recognizer on the modified instruction is not needed;
+  // the inserted V_SWAP_B32 has already both read and written the new registers,
+  // so hazards related to these registers have already been handled.
+  Amt->setReg(NewAmt);
+  Amt->setIsKill(false);
+  if (OverlappedDst)
+    MI->getOperand(0).setReg(NewReg);
+  if (OverlappedSrc) {
+    Src1->setReg(NewReg);
+    Src1->setIsKill(false);
+  }
+
+  return true;
+}
+
 int GCNHazardRecognizer::checkNSAtoVMEMHazard(MachineInstr *MI) {
   int NSAtoVMEMWaitStates = 1;

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h

Lines changed: 5 additions & 0 deletions
@@ -70,6 +70,10 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   // instructions.
   void processBundle();
 
+  // Run on an individual instruction in hazard recognizer mode. This can be
+  // used on a newly inserted instruction before returning from PreEmitNoops.
+  void runOnInstruction(MachineInstr *MI);
+
   int getWaitStatesSince(IsHazardFn IsHazard, int Limit);
   int getWaitStatesSinceDef(unsigned Reg, IsHazardFn IsHazardDef, int Limit);
   int getWaitStatesSinceSetReg(IsHazardFn IsHazard, int Limit);
@@ -101,6 +105,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   bool fixVALUPartialForwardingHazard(MachineInstr *MI);
   bool fixVALUTransUseHazard(MachineInstr *MI);
   bool fixWMMAHazards(MachineInstr *MI);
+  bool fixShift64HighRegBug(MachineInstr *MI);
 
   int checkMAIHazards(MachineInstr *MI);
   int checkMAIHazards908(MachineInstr *MI);

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 6 additions & 0 deletions
@@ -1008,6 +1008,12 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return HasLdsBranchVmemWARHazard;
   }
 
+  // The shift amount of a 64-bit shift cannot be the highest allocated
+  // register if it is also at the end of the allocation block.
+  bool hasShift64HighRegBug() const {
+    return GFX90AInsts && !GFX940Insts;
+  }
+
   // Has one cycle hazard on transcendental instruction feeding a
   // non transcendental VALU.
   bool hasTransForwardingHazard() const { return GFX940Insts; }
