Commit 976f3b3

[AMDGPU] Only allow implicit WQM in pixel shaders
Implicit derivatives are only valid in pixel shaders, hence only implicitly enable WQM for pixel shaders. This avoids unintended WQM in other shader types (e.g. compute) when image sampling instructions are used.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D114414
Parent commit: 581f837
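In essence, SIWholeQuadMode::scanInstructions now checks the function's calling convention before marking image sampling instructions as implicitly needing WQM; only AMDGPU_PS qualifies. The standalone C++ sketch below models that decision in isolation. Note that needsImplicitWQM and the local CallingConv enum are illustrative stand-ins, not the pass's actual API; see the SIWholeQuadMode.cpp diff below for the real change.

// Standalone illustration of the gating logic this commit adds; the enum and
// helper below are stand-ins, not LLVM's actual API.
#include <cassert>

enum class CallingConv { AMDGPU_PS, AMDGPU_CS, AMDGPU_GS, AMDGPU_VS };

// Mirrors the new check in scanInstructions(): implicit derivatives, and hence
// implicit WQM for image sampling, are only valid in pixel shaders.
bool needsImplicitWQM(CallingConv CC, bool IsImageSample,
                      bool HasExtendedImageInsts) {
  if (!IsImageSample)
    return false;
  // If LOD is not supported, WQM is not needed.
  if (!HasExtendedImageInsts)
    return false;
  // Only generate implicit WQM if implicit derivatives are required.
  bool HasImplicitDerivatives = CC == CallingConv::AMDGPU_PS;
  return HasImplicitDerivatives;
}

int main() {
  // A pixel shader that samples an image still gets implicit WQM...
  assert(needsImplicitWQM(CallingConv::AMDGPU_PS, true, true));
  // ...but a compute shader with the same sample instruction no longer does.
  assert(!needsImplicitWQM(CallingConv::AMDGPU_CS, true, true));
  return 0;
}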

4 files changed (+178, -28 lines)


llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp

Lines changed: 7 additions & 0 deletions
@@ -487,6 +487,8 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
   bool WQMOutputs = MF.getFunction().hasFnAttribute("amdgpu-ps-wqm-outputs");
   SmallVector<MachineInstr *, 4> SetInactiveInstrs;
   SmallVector<MachineInstr *, 4> SoftWQMInstrs;
+  bool HasImplicitDerivatives =
+      MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
 
   // We need to visit the basic blocks in reverse post-order so that we visit
   // defs before uses, in particular so that we don't accidentally mark an
@@ -506,6 +508,11 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
       // If LOD is not supported WQM is not needed.
       if (!ST->hasExtendedImageInsts())
         continue;
+      // Only generate implicit WQM if implicit derivatives are required.
+      // This avoids inserting unintended WQM if a shader type without
+      // implicit derivatives uses an image sampling instruction.
+      if (!HasImplicitDerivatives)
+        continue;
       // Sampling instructions don't need to produce results for all pixels
       // in a quad, they just require all inputs of a quad to have been
       // computed for derivatives.

llvm/test/CodeGen/AMDGPU/memory_clause.ll

Lines changed: 10 additions & 15 deletions
@@ -391,27 +391,25 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc
 ; GCN-NEXT: s_mov_b32 s16, SCRATCH_RSRC_DWORD0
 ; GCN-NEXT: s_mov_b32 s17, SCRATCH_RSRC_DWORD1
 ; GCN-NEXT: s_mov_b32 s18, -1
+; GCN-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x24
+; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
 ; GCN-NEXT: s_mov_b32 s19, 0xe00000
 ; GCN-NEXT: s_add_u32 s16, s16, s3
 ; GCN-NEXT: s_addc_u32 s17, s17, 0
-; GCN-NEXT: s_mov_b64 s[12:13], exec
-; GCN-NEXT: s_wqm_b64 exec, exec
 ; GCN-NEXT: v_mov_b32_e32 v0, 0x40b00000
-; GCN-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x24
-; GCN-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x44
 ; GCN-NEXT: buffer_store_dword v0, off, s[16:19], 0 offset:4
 ; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: ;;#ASMSTART
-; GCN-NEXT: ;;#ASMEND
-; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4
 ; GCN-NEXT: s_brev_b32 s0, 1
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
-; GCN-NEXT: v_mov_b32_e32 v0, s14
+; GCN-NEXT: v_mov_b32_e32 v0, s12
 ; GCN-NEXT: s_mov_b32 s3, 0
 ; GCN-NEXT: s_mov_b32 s1, s0
 ; GCN-NEXT: s_mov_b32 s2, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s15
-; GCN-NEXT: s_and_b64 exec, exec, s[12:13]
+; GCN-NEXT: v_mov_b32_e32 v1, s13
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4
+; GCN-NEXT: s_nop 0
 ; GCN-NEXT: image_sample v0, v[0:1], s[4:11], s[0:3] dmask:0x1
 ; GCN-NEXT: s_waitcnt vmcnt(0)
 ; GCN-NEXT: v_add_f32_e32 v0, v2, v0
@@ -424,25 +422,22 @@ define amdgpu_kernel void @flat_scratch_load(float %a, float %b, <8 x i32> %desc
 ; GCN-SCRATCH-NEXT: s_addc_u32 s3, s3, 0
 ; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2
 ; GCN-SCRATCH-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3
-; GCN-SCRATCH-NEXT: s_mov_b32 s9, exec_lo
-; GCN-SCRATCH-NEXT: s_wqm_b32 exec_lo, exec_lo
 ; GCN-SCRATCH-NEXT: s_clause 0x1
 ; GCN-SCRATCH-NEXT: s_load_dwordx2 s[10:11], s[0:1], 0x24
 ; GCN-SCRATCH-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x44
 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40b00000
 ; GCN-SCRATCH-NEXT: s_brev_b32 s8, 1
+; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8
 ; GCN-SCRATCH-NEXT: scratch_store_dword off, v0, off offset:4
 ; GCN-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0
 ; GCN-SCRATCH-NEXT: ;;#ASMSTART
 ; GCN-SCRATCH-NEXT: ;;#ASMEND
+; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off offset:4
 ; GCN-SCRATCH-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v0, s10
 ; GCN-SCRATCH-NEXT: v_mov_b32_e32 v1, s11
-; GCN-SCRATCH-NEXT: s_and_b32 exec_lo, exec_lo, s9
 ; GCN-SCRATCH-NEXT: s_mov_b32 s11, 0
-; GCN-SCRATCH-NEXT: s_mov_b32 s9, s8
 ; GCN-SCRATCH-NEXT: s_mov_b32 s10, s8
-; GCN-SCRATCH-NEXT: scratch_load_dword v2, off, off offset:4
 ; GCN-SCRATCH-NEXT: image_sample v0, v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D
 ; GCN-SCRATCH-NEXT: s_waitcnt vmcnt(0)
 ; GCN-SCRATCH-NEXT: v_add_f32_e32 v0, v2, v0

llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll

Lines changed: 16 additions & 13 deletions
@@ -9,19 +9,22 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 
 ; GFX9-LABEL: non_preserved_vgpr_tuple8:
 ; GFX9: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
 
 ; GFX9: v_mov_b32_e32 v36, v16
 ; GFX9-NEXT: v_mov_b32_e32 v35, v15
 ; GFX9-NEXT: v_mov_b32_e32 v34, v14
 ; GFX9-NEXT: v_mov_b32_e32 v33, v13
 ; GFX9-NEXT: v_mov_b32_e32 v32, v12
+
+; GFX9: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+
 ; GFX9: ;;#ASMSTART
 ; GFX9-NEXT: ;;#ASMEND
 ; GFX9: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1
+; GFX9-NEXT: s_addk_i32 s32, 0x800
 ; GFX9-NEXT: s_getpc_b64 s[4:5]
 ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
@@ -39,21 +42,23 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ;
 ; GFX10-LABEL: non_preserved_vgpr_tuple8:
 ; GFX10: buffer_store_dword v40, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
-; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
-; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
 
 ; GFX10: v_mov_b32_e32 v36, v16
 ; GFX10-NEXT: v_mov_b32_e32 v35, v15
 ; GFX10-NEXT: v_mov_b32_e32 v34, v14
 ; GFX10-NEXT: v_mov_b32_e32 v33, v13
 ; GFX10-NEXT: v_mov_b32_e32 v32, v12
 
+; GFX10: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill
+
 ; GFX10: ;;#ASMSTART
 ; GFX10-NEXT: ;;#ASMEND
 
 ; GFX10: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX10-NEXT: s_addk_i32 s32, 0x400
 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3
 ; GFX10-NEXT: s_getpc_b64 s[4:5]
 ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
@@ -100,6 +105,7 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX9-NEXT: v_mov_b32_e32 v41, v12
 
 ; GFX9: image_gather4_c_b_cl v[0:3], v[41:45], s[36:43], s[4:7] dmask:0x1
+; GFX9-NEXT: s_addk_i32 s32, 0x800
 ; GFX9-NEXT: s_getpc_b64 s[4:5]
 ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
@@ -133,12 +139,9 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp,
 ; GFX10-NEXT: s_getpc_b64 s[4:5]
 ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4
 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12
-; GFX10-NEXT: v_mov_b32_e32 v41, v16
+; GFX10-NEXT: v_writelane_b32 v40, s30, 8
 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v42, v15
-; GFX10-NEXT: v_mov_b32_e32 v43, v14
-; GFX10-NEXT: v_mov_b32_e32 v44, v13
-; GFX10-NEXT: v_mov_b32_e32 v45, v12
+; GFX10-NEXT: v_writelane_b32 v40, s31, 9
 ; GFX10-NEXT: s_waitcnt vmcnt(0)
 ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off
 ; GFX10-NEXT: s_waitcnt lgkmcnt(0)

llvm/test/CodeGen/AMDGPU/wqm.mir

Lines changed: 145 additions & 0 deletions
@@ -1,5 +1,48 @@
 # RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs -run-pass si-wqm -o - %s | FileCheck %s
 
+--- |
+  define amdgpu_ps void @test_strict_wwm_scc() {
+    ret void
+  }
+  define amdgpu_ps void @test_strict_wwm_scc2() {
+    ret void
+  }
+  define amdgpu_ps void @no_cfg() {
+    ret void
+  }
+  define amdgpu_ps void @copy_exec() {
+    ret void
+  }
+  define amdgpu_ps void @scc_always_live() {
+    ret void
+  }
+  define amdgpu_ps void @test_wwm_set_inactive_propagation() {
+    ret void
+  }
+  define amdgpu_ps void @test_wqm_lr_phi() {
+    ret void
+  }
+  define amdgpu_cs void @no_wqm_in_cs() {
+    ret void
+  }
+  define amdgpu_es void @no_wqm_in_es() {
+    ret void
+  }
+  define amdgpu_gs void @no_wqm_in_gs() {
+    ret void
+  }
+  define amdgpu_hs void @no_wqm_in_hs() {
+    ret void
+  }
+  define amdgpu_ls void @no_wqm_in_ls() {
+    ret void
+  }
+  define amdgpu_vs void @no_wqm_in_vs() {
+    ret void
+  }
+...
+---
+
 ---
 # Check for awareness that s_or_saveexec_b64 clobbers SCC
 #
@@ -298,3 +341,105 @@ body: |
     $vgpr1 = COPY %4.sub1:vreg_128
     SI_RETURN_TO_EPILOG $vgpr0, $vgpr1
 ...
+
+---
+#CHECK-LABEL: name: no_wqm_in_cs
+#CHECK-NOT: S_WQM
+name: no_wqm_in_cs
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_es
+#CHECK-NOT: S_WQM
+name: no_wqm_in_es
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_gs
+#CHECK-NOT: S_WQM
+name: no_wqm_in_gs
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_hs
+#CHECK-NOT: S_WQM
+name: no_wqm_in_hs
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_ls
+#CHECK-NOT: S_WQM
+name: no_wqm_in_ls
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
+
+---
+#CHECK-LABEL: name: no_wqm_in_vs
+#CHECK-NOT: S_WQM
+name: no_wqm_in_vs
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr1, $vgpr2
+
+    undef %0.sub0:vreg_64 = COPY $vgpr1
+    %0.sub1:vreg_64 = COPY $vgpr2
+    %100:sgpr_256 = IMPLICIT_DEF
+    %101:sgpr_128 = IMPLICIT_DEF
+
+    %4:vreg_128 = IMAGE_SAMPLE_V4_V2 %0:vreg_64, %100:sgpr_256, %101:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+...
