Skip to content

Commit f811482

Browse files
authored
[AMDGPU] SIWholeQuadMode: Ensure earliest WQM entry point for PS (#123266)
Ensure that shaders which run in WQM (PS) enter WQM at the earliest point, irrespective of WQM marking.
1 parent 0e4a10d commit f811482

File tree

3 files changed

+21
-7
lines changed

3 files changed

+21
-7
lines changed

llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1305,7 +1305,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
13051305
// Record initial state in block information.
13061306
BI.InitialState = State;
13071307

1308-
for (;;) {
1308+
for (unsigned Idx = 0;; ++Idx) {
13091309
MachineBasicBlock::iterator Next = II;
13101310
char Needs = StateExact | StateWQM; // Strict mode is disabled by default.
13111311
char OutNeeds = 0;
@@ -1316,6 +1316,10 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, bool IsEntry) {
13161316
if (FirstStrict == IE)
13171317
FirstStrict = II;
13181318

1319+
// Adjust needs if this is the first instruction of a WQM-requiring shader.
1320+
if (IsEntry && Idx == 0 && (BI.InNeeds & StateWQM))
1321+
Needs = StateWQM;
1322+
13191323
// First, figure out the allowed states (Needs) based on the propagated
13201324
// flags.
13211325
if (II != IE) {
@@ -1801,6 +1805,9 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
18011805
lowerKillInstrs(true);
18021806
Changed = true;
18031807
} else {
1808+
// Mark entry for WQM if required.
1809+
if (GlobalFlags & StateWQM)
1810+
Blocks[&Entry].InNeeds |= StateWQM;
18041811
// Wave mode switching requires full lowering pass.
18051812
for (auto BII : Blocks)
18061813
processBlock(*BII.first, BII.first == &Entry);

llvm/test/CodeGen/AMDGPU/dagcombine-fma-fmad.ll

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,12 +5,16 @@
55
define amdgpu_ps float @_amdgpu_ps_main() #0 {
66
; GFX10-LABEL: _amdgpu_ps_main:
77
; GFX10: ; %bb.0: ; %.entry
8+
; GFX10-NEXT: s_mov_b32 s0, exec_lo
9+
; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo
810
; GFX10-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D
9-
; GFX10-NEXT: v_mov_b32_e32 v4, 0
11+
; GFX10-NEXT: s_waitcnt_depctr 0xffe3
12+
; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s0
1013
; GFX10-NEXT: s_waitcnt vmcnt(0)
1114
; GFX10-NEXT: s_clause 0x1
1215
; GFX10-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D
1316
; GFX10-NEXT: image_sample v3, v[0:1], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
17+
; GFX10-NEXT: v_mov_b32_e32 v4, 0
1418
; GFX10-NEXT: s_waitcnt vmcnt(0)
1519
; GFX10-NEXT: image_load_mip v4, v[2:4], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm
1620
; GFX10-NEXT: s_clause 0x3
@@ -70,12 +74,15 @@ define amdgpu_ps float @_amdgpu_ps_main() #0 {
7074
;
7175
; GFX11-LABEL: _amdgpu_ps_main:
7276
; GFX11: ; %bb.0: ; %.entry
77+
; GFX11-NEXT: s_mov_b32 s0, exec_lo
78+
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
7379
; GFX11-NEXT: image_sample v[0:1], v[0:1], s[0:7], s[0:3] dmask:0x3 dim:SQ_RSRC_IMG_2D
74-
; GFX11-NEXT: v_mov_b32_e32 v4, 0
80+
; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s0
7581
; GFX11-NEXT: s_waitcnt vmcnt(0)
7682
; GFX11-NEXT: s_clause 0x1
7783
; GFX11-NEXT: image_sample v2, v[0:1], s[0:7], s[0:3] dmask:0x4 dim:SQ_RSRC_IMG_2D
7884
; GFX11-NEXT: image_sample v3, v[0:1], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D
85+
; GFX11-NEXT: v_mov_b32_e32 v4, 0
7986
; GFX11-NEXT: s_waitcnt vmcnt(0)
8087
; GFX11-NEXT: image_load_mip v4, v[2:4], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D unorm
8188
; GFX11-NEXT: s_clause 0x3

llvm/test/CodeGen/AMDGPU/wqm.ll

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1842,10 +1842,10 @@ define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg
18421842
; GFX9-W64-LABEL: test_kill_1:
18431843
; GFX9-W64: ; %bb.0: ; %main_body
18441844
; GFX9-W64-NEXT: s_mov_b64 s[12:13], exec
1845-
; GFX9-W64-NEXT: v_mov_b32_e32 v4, v2
18461845
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
18471846
; GFX9-W64-NEXT: v_mov_b32_e32 v5, v0
18481847
; GFX9-W64-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1
1848+
; GFX9-W64-NEXT: v_mov_b32_e32 v4, v2
18491849
; GFX9-W64-NEXT: s_and_b64 exec, exec, s[12:13]
18501850
; GFX9-W64-NEXT: s_waitcnt vmcnt(0)
18511851
; GFX9-W64-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf
@@ -1866,10 +1866,10 @@ define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg
18661866
; GFX10-W32-LABEL: test_kill_1:
18671867
; GFX10-W32: ; %bb.0: ; %main_body
18681868
; GFX10-W32-NEXT: s_mov_b32 s12, exec_lo
1869-
; GFX10-W32-NEXT: v_mov_b32_e32 v4, v2
18701869
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
18711870
; GFX10-W32-NEXT: v_mov_b32_e32 v5, v0
18721871
; GFX10-W32-NEXT: image_sample v0, v1, s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_1D
1872+
; GFX10-W32-NEXT: v_mov_b32_e32 v4, v2
18731873
; GFX10-W32-NEXT: s_and_b32 exec_lo, exec_lo, s12
18741874
; GFX10-W32-NEXT: s_waitcnt vmcnt(0)
18751875
; GFX10-W32-NEXT: image_sample v[0:3], v0, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D
@@ -2174,8 +2174,8 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
21742174
; GFX9-W64-LABEL: test_scc:
21752175
; GFX9-W64: ; %bb.0: ; %main_body
21762176
; GFX9-W64-NEXT: s_mov_b64 s[2:3], exec
2177-
; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
21782177
; GFX9-W64-NEXT: s_wqm_b64 exec, exec
2178+
; GFX9-W64-NEXT: v_mov_b32_e32 v4, v0
21792179
; GFX9-W64-NEXT: s_cmp_lt_i32 s0, 1
21802180
; GFX9-W64-NEXT: s_cbranch_scc0 .LBB39_2
21812181
; GFX9-W64-NEXT: ; %bb.1: ; %else
@@ -2199,9 +2199,9 @@ define amdgpu_ps <4 x float> @test_scc(i32 inreg %sel, i32 %idx) #1 {
21992199
;
22002200
; GFX10-W32-LABEL: test_scc:
22012201
; GFX10-W32: ; %bb.0: ; %main_body
2202-
; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
22032202
; GFX10-W32-NEXT: s_mov_b32 s1, exec_lo
22042203
; GFX10-W32-NEXT: s_wqm_b32 exec_lo, exec_lo
2204+
; GFX10-W32-NEXT: v_mov_b32_e32 v4, v0
22052205
; GFX10-W32-NEXT: s_cmp_lt_i32 s0, 1
22062206
; GFX10-W32-NEXT: s_cbranch_scc0 .LBB39_2
22072207
; GFX10-W32-NEXT: ; %bb.1: ; %else

0 commit comments

Comments
 (0)