Skip to content

Commit 14a6133

Browse files
committed
[AMDGPU] - Fix gfx12 waitcnt type for image_msaa_load
image_msaa_load is actually encoded as a VSAMPLE instruction and requires the appropriate waitcnt variant.
1 parent e1622e1 commit 14a6133

File tree

2 files changed

+19
-15
lines changed

2 files changed

+19
-15
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -187,8 +187,12 @@ VmemType getVmemType(const MachineInstr &Inst) {
187187
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
188188
const AMDGPU::MIMGBaseOpcodeInfo *BaseInfo =
189189
AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode);
190-
return BaseInfo->BVH ? VMEM_BVH
191-
: BaseInfo->Sampler ? VMEM_SAMPLER : VMEM_NOSAMPLER;
190+
// The test for MSAA here is because gfx12+ image_msaa_load is actually
191+
// encoded as VSAMPLE and requires the appropriate s_waitcnt variant for that.
192+
// Pre-gfx12 doesn't care since all vmem types result in the same s_waitcnt.
193+
return BaseInfo->BVH ? VMEM_BVH
194+
: BaseInfo->Sampler || BaseInfo->MSAA ? VMEM_SAMPLER
195+
: VMEM_NOSAMPLER;
192196
}
193197

194198
unsigned &getCounterRef(AMDGPU::Waitcnt &Wait, InstCounterType T) {

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa(<8 x i32> inreg %rsrc, i32 %s, i32 %t,
1212
; GFX12-LABEL: load_2dmsaa:
1313
; GFX12: ; %bb.0: ; %main_body
1414
; GFX12-NEXT: image_msaa_load v[0:3], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm ; encoding: [0x06,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x00]
15-
; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
15+
; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
1616
; GFX12-NEXT: ; return to shader part epilog
1717
main_body:
1818
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -50,7 +50,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrsp
5050
; GFX12-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; encoding: [0x0a,0x01,0x10,0xca,0x0b,0x01,0x02,0x02]
5151
; GFX12-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
5252
; GFX12-NEXT: image_msaa_load v[0:4], [v7, v6, v5], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x0e,0x20,0x86,0xe4,0x00,0x01,0x00,0x00,0x07,0x06,0x05,0x00]
53-
; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
53+
; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
5454
; GFX12-NEXT: global_store_b32 v8, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x08,0x00,0x00,0x00]
5555
; GFX12-NEXT: ; return to shader part epilog
5656
main_body:
@@ -71,7 +71,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa(<8 x i32> inreg %rsrc, i32 %s, i3
7171
; GFX12-LABEL: load_2darraymsaa:
7272
; GFX12: ; %bb.0: ; %main_body
7373
; GFX12-NEXT: image_msaa_load v[0:3], [v0, v1, v2, v3], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm ; encoding: [0x07,0x20,0x06,0xe5,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03]
74-
; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
74+
; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
7575
; GFX12-NEXT: ; return to shader part epilog
7676
main_body:
7777
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i32(i32 4, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -110,7 +110,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr ad
110110
; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 ; encoding: [0x0b,0x01,0x10,0xca,0x0c,0x01,0x02,0x02]
111111
; GFX12-NEXT: v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e]
112112
; GFX12-NEXT: image_msaa_load v[0:4], [v8, v7, v6, v5], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x0f,0x20,0x06,0xe6,0x00,0x00,0x00,0x00,0x08,0x07,0x06,0x05]
113-
; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
113+
; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
114114
; GFX12-NEXT: global_store_b32 v9, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x09,0x00,0x00,0x00]
115115
; GFX12-NEXT: ; return to shader part epilog
116116
main_body:
@@ -131,7 +131,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_glc(<8 x i32> inreg %rsrc, i32 %s, i32
131131
; GFX12-LABEL: load_2dmsaa_glc:
132132
; GFX12: ; %bb.0: ; %main_body
133133
; GFX12-NEXT: image_msaa_load v[0:3], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm th:TH_LOAD_NT ; encoding: [0x06,0x20,0x46,0xe4,0x00,0x00,0x10,0x00,0x00,0x01,0x02,0x00]
134-
; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
134+
; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
135135
; GFX12-NEXT: ; return to shader part epilog
136136
main_body:
137137
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 1)
@@ -148,7 +148,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_slc(<8 x i32> inreg %rsrc, i32 %s, i32
148148
; GFX12-LABEL: load_2dmsaa_slc:
149149
; GFX12: ; %bb.0: ; %main_body
150150
; GFX12-NEXT: image_msaa_load v[0:3], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm th:TH_LOAD_HT ; encoding: [0x06,0x20,0x46,0xe4,0x00,0x00,0x20,0x00,0x00,0x01,0x02,0x00]
151-
; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
151+
; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
152152
; GFX12-NEXT: ; return to shader part epilog
153153
main_body:
154154
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 2)
@@ -165,7 +165,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_glc_slc(<8 x i32> inreg %rsrc, i32 %s,
165165
; GFX12-LABEL: load_2dmsaa_glc_slc:
166166
; GFX12: ; %bb.0: ; %main_body
167167
; GFX12-NEXT: image_msaa_load v[0:3], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm th:TH_LOAD_LU ; encoding: [0x06,0x20,0x46,0xe4,0x00,0x00,0x30,0x00,0x00,0x01,0x02,0x00]
168-
; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
168+
; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
169169
; GFX12-NEXT: ; return to shader part epilog
170170
main_body:
171171
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 3)
@@ -182,7 +182,7 @@ define amdgpu_ps <4 x half> @load_2dmsaa_d16(<8 x i32> inreg %rsrc, i32 %s, i32
182182
; GFX12-LABEL: load_2dmsaa_d16:
183183
; GFX12: ; %bb.0: ; %main_body
184184
; GFX12-NEXT: image_msaa_load v[0:1], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm d16 ; encoding: [0x26,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x00]
185-
; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
185+
; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
186186
; GFX12-NEXT: ; return to shader part epilog
187187
main_body:
188188
%v = call <4 x half> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -215,7 +215,7 @@ define amdgpu_ps <4 x half> @load_2dmsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addr
215215
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf]
216216
; GFX12-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
217217
; GFX12-NEXT: image_msaa_load v[0:2], [v5, v4, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x2e,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x05,0x04,0x03,0x00]
218-
; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
218+
; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
219219
; GFX12-NEXT: global_store_b32 v6, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x06,0x00,0x00,0x00]
220220
; GFX12-NEXT: ; return to shader part epilog
221221
main_body:
@@ -236,7 +236,7 @@ define amdgpu_ps <4 x half> @load_2darraymsaa_d16(<8 x i32> inreg %rsrc, i32 %s,
236236
; GFX12-LABEL: load_2darraymsaa_d16:
237237
; GFX12: ; %bb.0: ; %main_body
238238
; GFX12-NEXT: image_msaa_load v[0:1], [v0, v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm d16 ; encoding: [0x27,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03]
239-
; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
239+
; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
240240
; GFX12-NEXT: ; return to shader part epilog
241241
main_body:
242242
%v = call <4 x half> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f16.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -269,7 +269,7 @@ define amdgpu_ps <4 x half> @load_2darraymsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr
269269
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf]
270270
; GFX12-NEXT: v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e]
271271
; GFX12-NEXT: image_msaa_load v[0:2], [v6, v5, v4, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x2f,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x06,0x05,0x04,0x03]
272-
; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
272+
; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
273273
; GFX12-NEXT: global_store_b32 v7, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x07,0x00,0x00,0x00]
274274
; GFX12-NEXT: ; return to shader part epilog
275275
main_body:
@@ -292,7 +292,7 @@ define amdgpu_ps <4 x float> @load_2dmsaa_a16(<8 x i32> inreg %rsrc, i16 %s, i16
292292
; GFX12: ; %bb.0: ; %main_body
293293
; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
294294
; GFX12-NEXT: image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm a16 ; encoding: [0x46,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x02,0x00,0x00]
295-
; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
295+
; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
296296
; GFX12-NEXT: ; return to shader part epilog
297297
main_body:
298298
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32.i16(i32 1, i16 %s, i16 %t, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
@@ -313,7 +313,7 @@ define amdgpu_ps <4 x float> @load_2darraymsaa_a16(<8 x i32> inreg %rsrc, i16 %s
313313
; GFX12-NEXT: v_perm_b32 v2, v3, v2, 0x5040100 ; encoding: [0x02,0x00,0x44,0xd6,0x03,0x05,0xfe,0x03,0x00,0x01,0x04,0x05]
314314
; GFX12-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; encoding: [0x00,0x00,0x44,0xd6,0x01,0x01,0xfe,0x03,0x00,0x01,0x04,0x05]
315315
; GFX12-NEXT: image_msaa_load v[0:3], [v0, v2], s[0:7] dmask:0x4 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm a16 ; encoding: [0x47,0x20,0x06,0xe5,0x00,0x00,0x00,0x00,0x00,0x02,0x00,0x00]
316-
; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
316+
; GFX12-NEXT: s_wait_samplecnt 0x0 ; encoding: [0x00,0x00,0xc2,0xbf]
317317
; GFX12-NEXT: ; return to shader part epilog
318318
main_body:
319319
%v = call <4 x float> @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32.i16(i32 4, i16 %s, i16 %t, i16 %slice, i16 %fragid, <8 x i32> %rsrc, i32 0, i32 0)

0 commit comments

Comments
 (0)