1
1
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2
2
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
3
3
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
4
+ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
5
+ ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
4
6
5
7
define amdgpu_ps void @v_interp_f32 (float inreg %i , float inreg %j , i32 inreg %m0 ) #0 {
6
8
; GFX11-LABEL: v_interp_f32:
@@ -21,6 +23,25 @@ define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m
21
23
; GFX11-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
22
24
; GFX11-NEXT: exp mrt0 v3, v2, v5, v4 done
23
25
; GFX11-NEXT: s_endpgm
26
+ ;
27
+ ; GFX12-LABEL: v_interp_f32:
28
+ ; GFX12: ; %bb.0: ; %main_body
29
+ ; GFX12-NEXT: s_mov_b32 s3, exec_lo
30
+ ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo
31
+ ; GFX12-NEXT: s_mov_b32 m0, s2
32
+ ; GFX12-NEXT: ds_param_load v0, attr0.y wait_va_vdst:15 wait_vm_vsrc:1
33
+ ; GFX12-NEXT: ds_param_load v1, attr1.x wait_va_vdst:15 wait_vm_vsrc:1
34
+ ; GFX12-NEXT: s_mov_b32 exec_lo, s3
35
+ ; GFX12-NEXT: v_mov_b32_e32 v2, s0
36
+ ; GFX12-NEXT: v_mov_b32_e32 v4, s1
37
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
38
+ ; GFX12-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
39
+ ; GFX12-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0
40
+ ; GFX12-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
41
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
42
+ ; GFX12-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
43
+ ; GFX12-NEXT: export mrt0 v3, v2, v5, v4 done
44
+ ; GFX12-NEXT: s_endpgm
24
45
main_body:
25
46
%p0 = call float @llvm.amdgcn.lds.param.load (i32 1 , i32 0 , i32 %m0 )
26
47
%p1 = call float @llvm.amdgcn.lds.param.load (i32 0 , i32 1 , i32 %m0 )
@@ -57,6 +78,31 @@ define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inr
57
78
; GFX11-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
58
79
; GFX11-NEXT: exp mrt0 v6, v7, v8, v4 done
59
80
; GFX11-NEXT: s_endpgm
81
+ ;
82
+ ; GFX12-LABEL: v_interp_f32_many:
83
+ ; GFX12: ; %bb.0: ; %main_body
84
+ ; GFX12-NEXT: s_mov_b32 s3, exec_lo
85
+ ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo
86
+ ; GFX12-NEXT: s_mov_b32 m0, s2
87
+ ; GFX12-NEXT: ds_param_load v0, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
88
+ ; GFX12-NEXT: ds_param_load v1, attr1.x wait_va_vdst:15 wait_vm_vsrc:1
89
+ ; GFX12-NEXT: ds_param_load v2, attr2.x wait_va_vdst:15 wait_vm_vsrc:1
90
+ ; GFX12-NEXT: ds_param_load v3, attr3.x wait_va_vdst:15 wait_vm_vsrc:1
91
+ ; GFX12-NEXT: s_mov_b32 exec_lo, s3
92
+ ; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
93
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
94
+ ; GFX12-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
95
+ ; GFX12-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
96
+ ; GFX12-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
97
+ ; GFX12-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0
98
+ ; GFX12-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
99
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
100
+ ; GFX12-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
101
+ ; GFX12-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
102
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4)
103
+ ; GFX12-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
104
+ ; GFX12-NEXT: export mrt0 v6, v7, v8, v4 done
105
+ ; GFX12-NEXT: s_endpgm
60
106
main_body:
61
107
%p0 = call float @llvm.amdgcn.lds.param.load (i32 0 , i32 0 , i32 %m0 )
62
108
%p1 = call float @llvm.amdgcn.lds.param.load (i32 0 , i32 1 , i32 %m0 )
@@ -99,6 +145,31 @@ define amdgpu_ps void @v_interp_f32_many_vm(ptr addrspace(1) %ptr, i32 inreg %m0
99
145
; GFX11-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
100
146
; GFX11-NEXT: exp mrt0 v6, v7, v8, v0 done
101
147
; GFX11-NEXT: s_endpgm
148
+ ;
149
+ ; GFX12-LABEL: v_interp_f32_many_vm:
150
+ ; GFX12: ; %bb.0: ; %main_body
151
+ ; GFX12-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4
152
+ ; GFX12-NEXT: s_mov_b32 m0, s0
153
+ ; GFX12-NEXT: s_mov_b32 s0, exec_lo
154
+ ; GFX12-NEXT: s_wqm_b32 exec_lo, exec_lo
155
+ ; GFX12-NEXT: ds_param_load v2, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
156
+ ; GFX12-NEXT: ds_param_load v3, attr1.x wait_va_vdst:15 wait_vm_vsrc:1
157
+ ; GFX12-NEXT: ds_param_load v4, attr2.x wait_va_vdst:15 wait_vm_vsrc:1
158
+ ; GFX12-NEXT: ds_param_load v5, attr3.x wait_va_vdst:15 wait_vm_vsrc:1
159
+ ; GFX12-NEXT: s_mov_b32 exec_lo, s0
160
+ ; GFX12-NEXT: s_wait_loadcnt 0x0
161
+ ; GFX12-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
162
+ ; GFX12-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
163
+ ; GFX12-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
164
+ ; GFX12-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0
165
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
166
+ ; GFX12-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
167
+ ; GFX12-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
168
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
169
+ ; GFX12-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
170
+ ; GFX12-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
171
+ ; GFX12-NEXT: export mrt0 v6, v7, v8, v0 done
172
+ ; GFX12-NEXT: s_endpgm
102
173
main_body:
103
174
%i.ptr = getelementptr float , ptr addrspace (1 ) %ptr , i32 1
104
175
%i = load float , ptr addrspace (1 ) %i.ptr , align 4
@@ -156,6 +227,42 @@ define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m
156
227
; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
157
228
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
158
229
; GFX11-FAKE16-NEXT: ; return to shader part epilog
230
+ ;
231
+ ; GFX12-TRUE16-LABEL: v_interp_f16:
232
+ ; GFX12-TRUE16: ; %bb.0: ; %main_body
233
+ ; GFX12-TRUE16-NEXT: s_mov_b32 s3, exec_lo
234
+ ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo
235
+ ; GFX12-TRUE16-NEXT: s_mov_b32 m0, s2
236
+ ; GFX12-TRUE16-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
237
+ ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s3
238
+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, s0
239
+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s1
240
+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
241
+ ; GFX12-TRUE16-NEXT: v_interp_p10_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
242
+ ; GFX12-TRUE16-NEXT: v_interp_p10_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
243
+ ; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
244
+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
245
+ ; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
246
+ ; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
247
+ ; GFX12-TRUE16-NEXT: ; return to shader part epilog
248
+ ;
249
+ ; GFX12-FAKE16-LABEL: v_interp_f16:
250
+ ; GFX12-FAKE16: ; %bb.0: ; %main_body
251
+ ; GFX12-FAKE16-NEXT: s_mov_b32 s3, exec_lo
252
+ ; GFX12-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo
253
+ ; GFX12-FAKE16-NEXT: s_mov_b32 m0, s2
254
+ ; GFX12-FAKE16-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
255
+ ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s3
256
+ ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s0
257
+ ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s1
258
+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
259
+ ; GFX12-FAKE16-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
260
+ ; GFX12-FAKE16-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
261
+ ; GFX12-FAKE16-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
262
+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
263
+ ; GFX12-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
264
+ ; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
265
+ ; GFX12-FAKE16-NEXT: ; return to shader part epilog
159
266
main_body:
160
267
%p0 = call float @llvm.amdgcn.lds.param.load (i32 0 , i32 0 , i32 %m0 )
161
268
%l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16 (float %p0 , float %i , float %p0 , i1 0 )
@@ -202,6 +309,42 @@ define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inre
202
309
; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
203
310
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
204
311
; GFX11-FAKE16-NEXT: ; return to shader part epilog
312
+ ;
313
+ ; GFX12-TRUE16-LABEL: v_interp_rtz_f16:
314
+ ; GFX12-TRUE16: ; %bb.0: ; %main_body
315
+ ; GFX12-TRUE16-NEXT: s_mov_b32 s3, exec_lo
316
+ ; GFX12-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo
317
+ ; GFX12-TRUE16-NEXT: s_mov_b32 m0, s2
318
+ ; GFX12-TRUE16-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
319
+ ; GFX12-TRUE16-NEXT: s_mov_b32 exec_lo, s3
320
+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, s0
321
+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v2, s1
322
+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
323
+ ; GFX12-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
324
+ ; GFX12-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
325
+ ; GFX12-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
326
+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
327
+ ; GFX12-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
328
+ ; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
329
+ ; GFX12-TRUE16-NEXT: ; return to shader part epilog
330
+ ;
331
+ ; GFX12-FAKE16-LABEL: v_interp_rtz_f16:
332
+ ; GFX12-FAKE16: ; %bb.0: ; %main_body
333
+ ; GFX12-FAKE16-NEXT: s_mov_b32 s3, exec_lo
334
+ ; GFX12-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo
335
+ ; GFX12-FAKE16-NEXT: s_mov_b32 m0, s2
336
+ ; GFX12-FAKE16-NEXT: ds_param_load v1, attr0.x wait_va_vdst:15 wait_vm_vsrc:1
337
+ ; GFX12-FAKE16-NEXT: s_mov_b32 exec_lo, s3
338
+ ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, s0
339
+ ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s1
340
+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
341
+ ; GFX12-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
342
+ ; GFX12-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
343
+ ; GFX12-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
344
+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
345
+ ; GFX12-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
346
+ ; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
347
+ ; GFX12-FAKE16-NEXT: ; return to shader part epilog
205
348
main_body:
206
349
%p0 = call float @llvm.amdgcn.lds.param.load (i32 0 , i32 0 , i32 %m0 )
207
350
%l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16 (float %p0 , float %i , float %p0 , i1 0 )
@@ -237,6 +380,31 @@ define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #
237
380
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
238
381
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v1, v0
239
382
; GFX11-FAKE16-NEXT: ; return to shader part epilog
383
+ ;
384
+ ; GFX12-TRUE16-LABEL: v_interp_f16_imm_params:
385
+ ; GFX12-TRUE16: ; %bb.0: ; %main_body
386
+ ; GFX12-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0
387
+ ; GFX12-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
388
+ ; GFX12-TRUE16-NEXT: v_mov_b32_e32 v3, s1
389
+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
390
+ ; GFX12-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7
391
+ ; GFX12-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7
392
+ ; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
393
+ ; GFX12-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
394
+ ; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
395
+ ; GFX12-TRUE16-NEXT: ; return to shader part epilog
396
+ ;
397
+ ; GFX12-FAKE16-LABEL: v_interp_f16_imm_params:
398
+ ; GFX12-FAKE16: ; %bb.0: ; %main_body
399
+ ; GFX12-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
400
+ ; GFX12-FAKE16-NEXT: v_mov_b32_e32 v2, s1
401
+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
402
+ ; GFX12-FAKE16-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
403
+ ; GFX12-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
404
+ ; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
405
+ ; GFX12-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
406
+ ; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v1, v0
407
+ ; GFX12-FAKE16-NEXT: ; return to shader part epilog
240
408
main_body:
241
409
%l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16 (float 0 .0 , float %i , float 0 .0 , i1 0 )
242
410
%l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16 (float 0 .0 , float %j , float 0 .0 , i1 0 )
0 commit comments