Skip to content

Commit 1bc6e71

Browse files
committed
[AMDGPU] Do not only rely on BB number when finding bottom loop
We should also check that the "bottom" basic block of a loopis a successor of the "header" basic block, otherwise we don't propagate the information correctly when the CFG is complex. This fixes an important rendering problem with Wolfsentein 2, because of one vector-memory wait was missing. Differential Revision: https://reviews.llvm.org/D43831 llvm-svn: 330337
1 parent 9b20c24 commit 1bc6e71

File tree

2 files changed

+104
-20
lines changed

2 files changed

+104
-20
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 45 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -367,7 +367,7 @@ class SIInsertWaitcnts : public MachineFunctionPass {
367367
DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
368368
BlockWaitcntBracketsMap;
369369

370-
DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;
370+
std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
371371

372372
DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
373373

@@ -403,7 +403,8 @@ class SIInsertWaitcnts : public MachineFunctionPass {
403403
void updateEventWaitCntAfter(MachineInstr &Inst,
404404
BlockWaitcntBrackets *ScoreBrackets);
405405
void mergeInputScoreBrackets(MachineBasicBlock &Block);
406-
MachineBasicBlock *loopBottom(const MachineLoop *Loop);
406+
bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
407+
unsigned countNumBottomBlocks(const MachineLoop *Loop);
407408
void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
408409
void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
409410
bool isWaitcntStronger(unsigned LHS, unsigned RHS);
@@ -1568,15 +1569,29 @@ void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
15681569
}
15691570
}
15701571

1571-
/// Return the "bottom" block of a loop. This differs from
1572-
/// MachineLoop::getBottomBlock in that it works even if the loop is
1573-
/// discontiguous.
1574-
MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
1575-
MachineBasicBlock *Bottom = Loop->getHeader();
1576-
for (MachineBasicBlock *MBB : Loop->blocks())
1577-
if (MBB->getNumber() > Bottom->getNumber())
1578-
Bottom = MBB;
1579-
return Bottom;
1572+
/// Return true if the given basic block is a "bottom" block of a loop. This
1573+
/// differs from MachineLoop::getBottomBlock in that it works even if the loop
1574+
/// is discontiguous. This also handles multiple back-edges for the same
1575+
/// "header" block of a loop.
1576+
bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
1577+
const MachineBasicBlock *Block) {
1578+
for (MachineBasicBlock *MBB : Loop->blocks()) {
1579+
if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
1580+
return true;
1581+
}
1582+
}
1583+
return false;
1584+
}
1585+
1586+
/// Count the number of "bottom" basic blocks of a loop.
1587+
unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
1588+
unsigned Count = 0;
1589+
for (MachineBasicBlock *MBB : Loop->blocks()) {
1590+
if (MBB->isSuccessor(Loop->getHeader())) {
1591+
Count++;
1592+
}
1593+
}
1594+
return Count;
15801595
}
15811596

15821597
// Generate s_waitcnt instructions where needed.
@@ -1685,7 +1700,7 @@ void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
16851700

16861701
// Check if we need to force convergence at loop footer.
16871702
MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
1688-
if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
1703+
if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
16891704
LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
16901705
WaitcntData->print();
16911706
DEBUG(dbgs() << '\n';);
@@ -1773,6 +1788,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
17731788
TrackedWaitcntSet.clear();
17741789
BlockVisitedSet.clear();
17751790
VCCZBugHandledSet.clear();
1791+
LoopWaitcntDataMap.clear();
17761792

17771793
// Walk over the blocks in reverse post-dominator order, inserting
17781794
// s_waitcnt where needed.
@@ -1799,21 +1815,30 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
17991815
// If we are walking into the block from before the loop, then guarantee
18001816
// at least 1 re-walk over the loop to propagate the information, even if
18011817
// no S_WAITCNT instructions were generated.
1802-
if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
1803-
(!BlockWaitcntProcessedSet.count(&MBB))) {
1804-
BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
1805-
DEBUG(dbgs() << "set-revisit: Block"
1806-
<< ContainingLoop->getHeader()->getNumber() << '\n';);
1818+
if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
1819+
unsigned Count = countNumBottomBlocks(ContainingLoop);
1820+
1821+
// If the loop has multiple back-edges, and so more than one "bottom"
1822+
// basic block, we have to guarantee a re-walk over every blocks.
1823+
if ((std::count(BlockWaitcntProcessedSet.begin(),
1824+
BlockWaitcntProcessedSet.end(), &MBB) < Count)) {
1825+
BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
1826+
DEBUG(dbgs() << "set-revisit: Block"
1827+
<< ContainingLoop->getHeader()->getNumber() << '\n';);
1828+
}
18071829
}
18081830

18091831
// Walk over the instructions.
18101832
insertWaitcntInBlock(MF, MBB);
18111833

18121834
// Flag that waitcnts have been processed at least once.
1813-
BlockWaitcntProcessedSet.insert(&MBB);
1835+
BlockWaitcntProcessedSet.push_back(&MBB);
18141836

1815-
// See if we want to revisit the loop.
1816-
if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
1837+
// See if we want to revisit the loop. If a loop has multiple back-edges,
1838+
// we shouldn't revisit the same "bottom" basic block.
1839+
if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
1840+
std::count(BlockWaitcntProcessedSet.begin(),
1841+
BlockWaitcntProcessedSet.end(), &MBB) == 1) {
18171842
MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
18181843
BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
18191844
if (EntrySB && EntrySB->getRevisitLoop()) {
Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,59 @@
1+
# RUN: llc -o - %s -march=amdgcn -mcpu=fiji -run-pass=si-insert-waitcnts -verify-machineinstrs | FileCheck -check-prefix=GCN %s
2+
3+
# GCN-LABEL: waitcnt-back-edge-loop
4+
# GCN: bb.2
5+
# GCN: S_WAITCNT 112
6+
# GCN: $vgpr5 = V_CVT_I32_F32_e32 killed $vgpr5, implicit $exec
7+
8+
---
9+
name: waitcnt-back-edge-loop
10+
body: |
11+
bb.0:
12+
successors: %bb.1
13+
14+
$vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1_vgpr2
15+
$vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1_vgpr2
16+
$vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
17+
$vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
18+
$sgpr0_sgpr1 = V_CMP_EQ_U32_e64 3, killed $sgpr4, implicit $exec
19+
$vgpr3 = V_CNDMASK_B32_e64 -1082130432, 1065353216, killed $sgpr0_sgpr1, implicit $exec
20+
$vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec
21+
S_BRANCH %bb.1
22+
23+
bb.3:
24+
successors: %bb.1
25+
26+
$vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
27+
28+
bb.1:
29+
successors: %bb.5, %bb.2
30+
31+
$vgpr5 = V_CVT_I32_F32_e32 killed $vgpr5, implicit $exec
32+
V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
33+
$vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
34+
S_CBRANCH_VCCZ %bb.5, implicit killed $vcc
35+
36+
bb.2:
37+
successors: %bb.4, %bb.3
38+
39+
V_CMP_EQ_U32_e32 9, killed $vgpr5, implicit-def $vcc, implicit $exec
40+
$vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
41+
S_CBRANCH_VCCZ %bb.3, implicit killed $vcc
42+
43+
bb.4:
44+
successors: %bb.3, %bb.1
45+
46+
$vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
47+
$vgpr4 = V_CVT_I32_F32_e32 $vgpr5, implicit $exec
48+
V_CMP_EQ_U32_e32 2, killed $vgpr4, implicit-def $vcc, implicit $exec
49+
$vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
50+
$vgpr4 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $exec
51+
S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
52+
S_BRANCH %bb.3
53+
54+
bb.5:
55+
56+
$vgpr4 = V_MAC_F32_e32 killed $vgpr0, killed $vgpr3, killed $vgpr4, implicit $exec
57+
EXP_DONE 12, killed $vgpr4, undef $vgpr0, undef $vgpr0, undef $vgpr0, 0, 0, 15, implicit $exec
58+
S_ENDPGM
59+
...

0 commit comments

Comments
 (0)