Skip to content

Commit 3cf539f

Browse files
authored
[AMDGPU] Combine or remove redundant waitcnts at the end of each MBB (#87539)
Call generateWaitcnt unconditionally at the end of SIInsertWaitcnts::insertWaitcntInBlock. Even if we don't need to generate a new waitcnt instruction, it has the effect of combining or removing redundant waitcnts that were already present. Tests show various small improvements in waitcnt placement.
1 parent 61efea7 commit 3cf539f

13 files changed

+73
-126
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 15 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -708,9 +708,6 @@ class SIInsertWaitcnts : public MachineFunctionPass {
708708
WaitcntBrackets &ScoreBrackets,
709709
MachineInstr *OldWaitcntInstr,
710710
bool FlushVmCnt);
711-
bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
712-
WaitcntBrackets &ScoreBrackets,
713-
MachineInstr *OldWaitcntInstr);
714711
bool generateWaitcnt(AMDGPU::Waitcnt Wait,
715712
MachineBasicBlock::instr_iterator It,
716713
MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
@@ -1902,31 +1899,6 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
19021899
OldWaitcntInstr);
19031900
}
19041901

1905-
// Add a waitcnt to flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the
1906-
// end of the given block if needed.
1907-
bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
1908-
WaitcntBrackets &ScoreBrackets,
1909-
MachineInstr *OldWaitcntInstr) {
1910-
AMDGPU::Waitcnt Wait;
1911-
1912-
unsigned LoadCntPending = ScoreBrackets.hasPendingEvent(LOAD_CNT);
1913-
unsigned SampleCntPending = ScoreBrackets.hasPendingEvent(SAMPLE_CNT);
1914-
unsigned BvhCntPending = ScoreBrackets.hasPendingEvent(BVH_CNT);
1915-
1916-
if (LoadCntPending == 0 && SampleCntPending == 0 && BvhCntPending == 0)
1917-
return false;
1918-
1919-
if (LoadCntPending != 0)
1920-
Wait.LoadCnt = 0;
1921-
if (SampleCntPending != 0)
1922-
Wait.SampleCnt = 0;
1923-
if (BvhCntPending != 0)
1924-
Wait.BvhCnt = 0;
1925-
1926-
return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
1927-
OldWaitcntInstr);
1928-
}
1929-
19301902
bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
19311903
MachineBasicBlock::instr_iterator It,
19321904
MachineBasicBlock &Block,
@@ -2355,9 +2327,22 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
23552327
++Iter;
23562328
}
23572329

2330+
// Flush the LOADcnt, SAMPLEcnt and BVHcnt counters at the end of the block if
2331+
// needed.
2332+
AMDGPU::Waitcnt Wait;
23582333
if (Block.getFirstTerminator() == Block.end() &&
2359-
isPreheaderToFlush(Block, ScoreBrackets))
2360-
Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
2334+
isPreheaderToFlush(Block, ScoreBrackets)) {
2335+
if (ScoreBrackets.hasPendingEvent(LOAD_CNT))
2336+
Wait.LoadCnt = 0;
2337+
if (ScoreBrackets.hasPendingEvent(SAMPLE_CNT))
2338+
Wait.SampleCnt = 0;
2339+
if (ScoreBrackets.hasPendingEvent(BVH_CNT))
2340+
Wait.BvhCnt = 0;
2341+
}
2342+
2343+
// Combine or remove any redundant waitcnts at the end of the block.
2344+
Modified |= generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
2345+
OldWaitcntInstr);
23612346

23622347
return Modified;
23632348
}

llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ define i32 @divergent_if_swap_brtarget_order0(i32 %value) {
1616
; CHECK-NEXT: s_waitcnt vmcnt(0)
1717
; CHECK-NEXT: .LBB0_2: ; %endif
1818
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
19-
; CHECK-NEXT: s_waitcnt vmcnt(0)
2019
; CHECK-NEXT: s_setpc_b64 s[30:31]
2120
entry:
2221
%c = icmp ne i32 %value, 0
@@ -44,7 +43,6 @@ define i32 @divergent_if_swap_brtarget_order1(i32 %value) {
4443
; CHECK-NEXT: s_waitcnt vmcnt(0)
4544
; CHECK-NEXT: .LBB1_2: ; %endif
4645
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
47-
; CHECK-NEXT: s_waitcnt vmcnt(0)
4846
; CHECK-NEXT: s_setpc_b64 s[30:31]
4947
entry:
5048
%c = icmp ne i32 %value, 0
@@ -74,7 +72,6 @@ define i32 @divergent_if_nonboolean_condition0(i32 %value) {
7472
; CHECK-NEXT: s_waitcnt vmcnt(0)
7573
; CHECK-NEXT: .LBB2_2: ; %endif
7674
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
77-
; CHECK-NEXT: s_waitcnt vmcnt(0)
7875
; CHECK-NEXT: s_setpc_b64 s[30:31]
7976
entry:
8077
%c = trunc i32 %value to i1
@@ -106,7 +103,6 @@ define i32 @divergent_if_nonboolean_condition1(ptr addrspace(1) %ptr) {
106103
; CHECK-NEXT: s_waitcnt vmcnt(0)
107104
; CHECK-NEXT: .LBB3_2: ; %endif
108105
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
109-
; CHECK-NEXT: s_waitcnt vmcnt(0)
110106
; CHECK-NEXT: s_setpc_b64 s[30:31]
111107
entry:
112108
%value = load i32, ptr addrspace(1) %ptr

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.private.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,6 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
131131
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
132132
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
133133
; GFX11-NEXT: .LBB1_2: ; %bb1
134-
; GFX11-NEXT: s_nop 0
135-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
136134
; GFX11-NEXT: s_endpgm
137135
%val = call i1 @llvm.amdgcn.is.private(ptr %ptr)
138136
br i1 %val, label %bb0, label %bb1

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.is.shared.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,6 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
131131
; GFX11-NEXT: global_store_b32 v[0:1], v0, off dlc
132132
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
133133
; GFX11-NEXT: .LBB1_2: ; %bb1
134-
; GFX11-NEXT: s_nop 0
135-
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
136134
; GFX11-NEXT: s_endpgm
137135
%val = call i1 @llvm.amdgcn.is.shared(ptr %ptr)
138136
br i1 %val, label %bb0, label %bb1

0 commit comments

Comments
 (0)