Skip to content

Commit 60ca220

Browse files
committed
[AMDGPU] Update EXECZ retention in SIPreEmitPeephole for GFX11/12
The check that decides whether an EXECZ branch must be retained only recognizes S_WAITCNT. Add handling for the additional waitcnt instructions in GFX11 and GFX12. Also retain uniform branches that jump over barrier instructions.
1 parent 73f5f83 commit 60ca220

File tree

4 files changed

+832
-1
lines changed

4 files changed

+832
-1
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -936,6 +936,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
936936
Opcode == AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM;
937937
}
938938

939+
// Returns true if Opcode is a barrier-related instruction: either an opcode
// accepted by isBarrierStart(), or one of S_BARRIER_WAIT, S_BARRIER_INIT_M0,
// S_BARRIER_INIT_IMM, S_BARRIER_JOIN_IMM or S_BARRIER_LEAVE.
// SIPreEmitPeephole::mustRetainExeczBranch uses this so that a uniform bypass
// of a barrier is respected (the EXECZ branch over it is kept).
bool isBarrierRelated(unsigned Opcode) const {
940+
return isBarrierStart(Opcode) || Opcode == AMDGPU::S_BARRIER_WAIT ||
941+
Opcode == AMDGPU::S_BARRIER_INIT_M0 ||
942+
Opcode == AMDGPU::S_BARRIER_INIT_IMM ||
943+
Opcode == AMDGPU::S_BARRIER_JOIN_IMM ||
944+
Opcode == AMDGPU::S_BARRIER_LEAVE;
945+
}
946+
939947
static bool doesNotReadTiedSource(const MachineInstr &MI) {
940948
return MI.getDesc().TSFlags & SIInstrFlags::TiedSourceNotRead;
941949
}
@@ -967,6 +975,29 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
967975
}
968976
}
969977

978+
// Returns true if Opcode is one of the wait instructions listed below
// (the S_WAITCNT* forms, the S_WAIT_*CNT forms, and S_WAIT_IDLE); returns
// false for every other opcode.
// The opcode is first passed through getNonSoftWaitcntOpcode(), which is
// defined outside this hunk -- presumably it maps "soft" waitcnt pseudos to
// their real opcode so both forms match the same case; TODO confirm.
// SIPreEmitPeephole::mustRetainExeczBranch uses this in place of a bare
// comparison against S_WAITCNT when looking for instructions that are
// potentially expensive even if EXEC = 0.
bool isWaitcnt(unsigned Opcode) const {
979+
switch (getNonSoftWaitcntOpcode(Opcode)) {
980+
case AMDGPU::S_WAITCNT:
981+
case AMDGPU::S_WAITCNT_VSCNT:
982+
case AMDGPU::S_WAITCNT_VMCNT:
983+
case AMDGPU::S_WAITCNT_EXPCNT:
984+
case AMDGPU::S_WAITCNT_LGKMCNT:
985+
case AMDGPU::S_WAIT_LOADCNT:
986+
case AMDGPU::S_WAIT_LOADCNT_DSCNT:
987+
case AMDGPU::S_WAIT_STORECNT:
988+
case AMDGPU::S_WAIT_STORECNT_DSCNT:
989+
case AMDGPU::S_WAIT_SAMPLECNT:
990+
case AMDGPU::S_WAIT_BVHCNT:
991+
case AMDGPU::S_WAIT_EXPCNT:
992+
case AMDGPU::S_WAIT_DSCNT:
993+
case AMDGPU::S_WAIT_KMCNT:
994+
case AMDGPU::S_WAIT_IDLE:
995+
return true;
996+
default:
997+
return false;
998+
}
999+
}
1000+
9701001
bool isVGPRCopy(const MachineInstr &MI) const {
9711002
assert(isCopyInstr(MI));
9721003
Register Dest = MI.getOperand(0).getReg();

llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -328,7 +328,11 @@ bool SIPreEmitPeephole::mustRetainExeczBranch(
328328

329329
// These instructions are potentially expensive even if EXEC = 0.
330330
if (TII->isSMRD(MI) || TII->isVMEM(MI) || TII->isFLAT(MI) ||
331-
TII->isDS(MI) || MI.getOpcode() == AMDGPU::S_WAITCNT)
331+
TII->isDS(MI) || TII->isWaitcnt(MI.getOpcode()))
332+
return true;
333+
334+
// Uniform bypass of barriers should be respected.
335+
if (TII->isBarrierRelated(MI.getOpcode()))
332336
return true;
333337

334338
++NumInstr;
Lines changed: 216 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,216 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass si-pre-emit-peephole -amdgpu-skip-threshold=10 -verify-machineinstrs %s -o - | FileCheck %s
3+
4+
---
5+
name: skip_waitcnt_vscnt
6+
body: |
7+
; CHECK-LABEL: name: skip_waitcnt_vscnt
8+
; CHECK: bb.0:
9+
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
10+
; CHECK-NEXT: {{ $}}
11+
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
12+
; CHECK-NEXT: {{ $}}
13+
; CHECK-NEXT: bb.1:
14+
; CHECK-NEXT: successors: %bb.2(0x80000000)
15+
; CHECK-NEXT: {{ $}}
16+
; CHECK-NEXT: V_NOP_e32 implicit $exec
17+
; CHECK-NEXT: S_WAITCNT_VSCNT $sgpr_null, 0
18+
; CHECK-NEXT: {{ $}}
19+
; CHECK-NEXT: bb.2:
20+
; CHECK-NEXT: S_ENDPGM 0
21+
bb.0:
22+
successors: %bb.1, %bb.2
23+
S_CBRANCH_EXECZ %bb.2, implicit $exec
24+
25+
bb.1:
26+
successors: %bb.2
27+
V_NOP_e32 implicit $exec
28+
S_WAITCNT_VSCNT $sgpr_null, 0
29+
30+
bb.2:
31+
S_ENDPGM 0
32+
...
33+
34+
---
35+
name: skip_waitcnt_expcnt
36+
body: |
37+
; CHECK-LABEL: name: skip_waitcnt_expcnt
38+
; CHECK: bb.0:
39+
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
40+
; CHECK-NEXT: {{ $}}
41+
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
42+
; CHECK-NEXT: {{ $}}
43+
; CHECK-NEXT: bb.1:
44+
; CHECK-NEXT: successors: %bb.2(0x80000000)
45+
; CHECK-NEXT: {{ $}}
46+
; CHECK-NEXT: V_NOP_e32 implicit $exec
47+
; CHECK-NEXT: S_WAITCNT_EXPCNT $sgpr_null, 0
48+
; CHECK-NEXT: {{ $}}
49+
; CHECK-NEXT: bb.2:
50+
; CHECK-NEXT: S_ENDPGM 0
51+
bb.0:
52+
successors: %bb.1, %bb.2
53+
S_CBRANCH_EXECZ %bb.2, implicit $exec
54+
55+
bb.1:
56+
successors: %bb.2
57+
V_NOP_e32 implicit $exec
58+
S_WAITCNT_EXPCNT $sgpr_null, 0
59+
60+
bb.2:
61+
S_ENDPGM 0
62+
...
63+
64+
---
65+
name: skip_waitcnt_vmcnt
66+
body: |
67+
; CHECK-LABEL: name: skip_waitcnt_vmcnt
68+
; CHECK: bb.0:
69+
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
70+
; CHECK-NEXT: {{ $}}
71+
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
72+
; CHECK-NEXT: {{ $}}
73+
; CHECK-NEXT: bb.1:
74+
; CHECK-NEXT: successors: %bb.2(0x80000000)
75+
; CHECK-NEXT: {{ $}}
76+
; CHECK-NEXT: V_NOP_e32 implicit $exec
77+
; CHECK-NEXT: S_WAITCNT_VMCNT $sgpr_null, 0
78+
; CHECK-NEXT: {{ $}}
79+
; CHECK-NEXT: bb.2:
80+
; CHECK-NEXT: S_ENDPGM 0
81+
bb.0:
82+
successors: %bb.1, %bb.2
83+
S_CBRANCH_EXECZ %bb.2, implicit $exec
84+
85+
bb.1:
86+
successors: %bb.2
87+
V_NOP_e32 implicit $exec
88+
S_WAITCNT_VMCNT $sgpr_null, 0
89+
90+
bb.2:
91+
S_ENDPGM 0
92+
...
93+
94+
---
95+
name: skip_waitcnt_lgkmcnt
96+
body: |
97+
; CHECK-LABEL: name: skip_waitcnt_lgkmcnt
98+
; CHECK: bb.0:
99+
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
100+
; CHECK-NEXT: {{ $}}
101+
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
102+
; CHECK-NEXT: {{ $}}
103+
; CHECK-NEXT: bb.1:
104+
; CHECK-NEXT: successors: %bb.2(0x80000000)
105+
; CHECK-NEXT: {{ $}}
106+
; CHECK-NEXT: V_NOP_e32 implicit $exec
107+
; CHECK-NEXT: S_WAITCNT_LGKMCNT $sgpr_null, 0
108+
; CHECK-NEXT: {{ $}}
109+
; CHECK-NEXT: bb.2:
110+
; CHECK-NEXT: S_ENDPGM 0
111+
bb.0:
112+
successors: %bb.1, %bb.2
113+
S_CBRANCH_EXECZ %bb.2, implicit $exec
114+
115+
bb.1:
116+
successors: %bb.2
117+
V_NOP_e32 implicit $exec
118+
S_WAITCNT_LGKMCNT $sgpr_null, 0
119+
120+
bb.2:
121+
S_ENDPGM 0
122+
...
123+
124+
---
125+
name: skip_wait_idle
126+
body: |
127+
; CHECK-LABEL: name: skip_wait_idle
128+
; CHECK: bb.0:
129+
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
130+
; CHECK-NEXT: {{ $}}
131+
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
132+
; CHECK-NEXT: {{ $}}
133+
; CHECK-NEXT: bb.1:
134+
; CHECK-NEXT: successors: %bb.2(0x80000000)
135+
; CHECK-NEXT: {{ $}}
136+
; CHECK-NEXT: V_NOP_e32 implicit $exec
137+
; CHECK-NEXT: S_WAIT_IDLE
138+
; CHECK-NEXT: {{ $}}
139+
; CHECK-NEXT: bb.2:
140+
; CHECK-NEXT: S_ENDPGM 0
141+
bb.0:
142+
successors: %bb.1, %bb.2
143+
S_CBRANCH_EXECZ %bb.2, implicit $exec
144+
145+
bb.1:
146+
successors: %bb.2
147+
V_NOP_e32 implicit $exec
148+
S_WAIT_IDLE
149+
150+
bb.2:
151+
S_ENDPGM 0
152+
...
153+
154+
---
155+
name: skip_barrier
156+
body: |
157+
; CHECK-LABEL: name: skip_barrier
158+
; CHECK: bb.0:
159+
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
160+
; CHECK-NEXT: {{ $}}
161+
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
162+
; CHECK-NEXT: {{ $}}
163+
; CHECK-NEXT: bb.1:
164+
; CHECK-NEXT: successors: %bb.2(0x80000000)
165+
; CHECK-NEXT: {{ $}}
166+
; CHECK-NEXT: V_NOP_e32 implicit $exec
167+
; CHECK-NEXT: S_BARRIER
168+
; CHECK-NEXT: {{ $}}
169+
; CHECK-NEXT: bb.2:
170+
; CHECK-NEXT: S_ENDPGM 0
171+
bb.0:
172+
successors: %bb.1, %bb.2
173+
S_CBRANCH_EXECZ %bb.2, implicit $exec
174+
175+
bb.1:
176+
successors: %bb.2
177+
V_NOP_e32 implicit $exec
178+
S_BARRIER
179+
180+
bb.2:
181+
S_ENDPGM 0
182+
...
183+
184+
---
185+
name: skip_bvh
186+
body: |
187+
; CHECK-LABEL: name: skip_bvh
188+
; CHECK: bb.0:
189+
; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
190+
; CHECK-NEXT: {{ $}}
191+
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec
192+
; CHECK-NEXT: {{ $}}
193+
; CHECK-NEXT: bb.1:
194+
; CHECK-NEXT: successors: %bb.2(0x80000000)
195+
; CHECK-NEXT: {{ $}}
196+
; CHECK-NEXT: V_NOP_e32 implicit $exec
197+
; CHECK-NEXT: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14 = IMPLICIT_DEF
198+
; CHECK-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
199+
; CHECK-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx11 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
200+
; CHECK-NEXT: {{ $}}
201+
; CHECK-NEXT: bb.2:
202+
; CHECK-NEXT: S_ENDPGM 0
203+
bb.0:
204+
successors: %bb.1, %bb.2
205+
S_CBRANCH_EXECZ %bb.2, implicit $exec
206+
207+
bb.1:
208+
successors: %bb.2
209+
V_NOP_e32 implicit $exec
210+
$vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14 = IMPLICIT_DEF
211+
$sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
212+
$vgpr0_vgpr1_vgpr2_vgpr3 = IMAGE_BVH_INTERSECT_RAY_sa_gfx11 $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, implicit $exec :: (dereferenceable load (s128), addrspace 7)
213+
214+
bb.2:
215+
S_ENDPGM 0
216+
...

0 commit comments

Comments
 (0)