Skip to content

Commit 85142f5

Browse files
authored
[AMDGPU][True16][CodeGen] support for true16 for vinterp 16bit instructions (#116702)
Add CodeGen support for the 16-bit VINTERP instructions in True16 format. Currently only two tests are enabled; more will be enabled as more True16 instructions are supported.
1 parent 4c4606a commit 85142f5

File tree

3 files changed

+358
-224
lines changed

3 files changed

+358
-224
lines changed

llvm/lib/Target/AMDGPU/VINTERPInstructions.td

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,9 +181,43 @@ multiclass VInterpF16Pat <SDPatternOperator op, Instruction inst,
181181
def : VInterpF16Pat<op, inst, dst_type, 1, high_pat>;
182182
}
183183

184+
// Selection pattern (a GCNPat) that rewrites `op`, applied to three f32
// sources matched through VINTERPMods plus a trailing i1 flag, into `inst`.
//   dstVT - result type of the matched `op` node.
//   high  - when set, the pattern matches the i1 operand as -1 and takes the
//           hi16 subregister of the sources; otherwise it matches 0 and takes
//           lo16.
//   isP2  - when set, src2 is passed as the full f32 VGPR_32; otherwise src2
//           is passed as an f16 subregister extract, like src0.
// src0 is always passed as an f16 EXTRACT_SUBREG of its VGPR_32 and src1 is
// always passed as the full VGPR_32. The last two operands are emitted as
// the constants 0 (clamp) and 7 (wait_exp).
class VInterpF16Pat_t16 <SDPatternOperator op, Instruction inst,
185+
ValueType dstVT, bit high, bit isP2> : GCNPat <
186+
(dstVT (op
187+
(VINTERPMods f32:$src0, i32:$src0_modifiers),
188+
(VINTERPMods f32:$src1, i32:$src1_modifiers),
189+
(VINTERPMods f32:$src2, i32:$src2_modifiers),
190+
!if(high, (i1 -1), (i1 0)))),
191+
(inst $src0_modifiers,
192+
(f16 (EXTRACT_SUBREG VGPR_32:$src0, !if(high, hi16, lo16))),
193+
$src1_modifiers, VGPR_32:$src1,
194+
$src2_modifiers,
195+
!if(isP2, (f32 VGPR_32:$src2),
196+
(f16 (EXTRACT_SUBREG VGPR_32:$src2, !if(high, hi16, lo16)))),
197+
0, /* clamp */
198+
7) /* wait_exp */
199+
>;
200+
201+
// Instantiates the VInterpF16Pat_t16 pattern class twice for the same
// op/inst/dstVT/isP2: once with high = 0 and once with high = 1.
multiclass VInterpF16Pat_t16 <SDPatternOperator op, Instruction inst,
202+
ValueType dstVT, bit isP2> {
203+
def : VInterpF16Pat_t16<op, inst, dstVT, 0, isP2>;
204+
def : VInterpF16Pat_t16<op, inst, dstVT, 1, isP2>;
205+
}
206+
184207
def : VInterpF32Pat<int_amdgcn_interp_inreg_p10, V_INTERP_P10_F32_inreg>;
185208
def : VInterpF32Pat<int_amdgcn_interp_inreg_p2, V_INTERP_P2_F32_inreg>;
186209

210+
// Patterns defined with True16Predicate = UseRealTrue16Insts: each of the
// four f16 interp intrinsics is mapped onto its `_t16` instruction variant.
// The p10 forms are instantiated with an f32 result and isP2 = 0; the p2
// forms are instantiated with an f16 result and isP2 = 1.
let True16Predicate = UseRealTrue16Insts in {
211+
defm : VInterpF16Pat_t16<int_amdgcn_interp_inreg_p10_f16,
212+
V_INTERP_P10_F16_F32_inreg_t16, f32, 0>;
213+
defm : VInterpF16Pat_t16<int_amdgcn_interp_inreg_p2_f16,
214+
V_INTERP_P2_F16_F32_inreg_t16, f16, 1>;
215+
defm : VInterpF16Pat_t16<int_amdgcn_interp_p10_rtz_f16,
216+
V_INTERP_P10_RTZ_F16_F32_inreg_t16, f32, 0>;
217+
defm : VInterpF16Pat_t16<int_amdgcn_interp_p2_rtz_f16,
218+
V_INTERP_P2_RTZ_F16_F32_inreg_t16, f16, 1>;
219+
}
220+
187221
let True16Predicate = UseFakeTrue16Insts in {
188222
defm : VInterpF16Pat<int_amdgcn_interp_inreg_p10_f16,
189223
V_INTERP_P10_F16_F32_inreg_fake16, f32,

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll

Lines changed: 162 additions & 112 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,26 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
2+
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
3+
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
34

45
define amdgpu_ps void @v_interp_f32(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
5-
; GCN-LABEL: v_interp_f32:
6-
; GCN: ; %bb.0: ; %main_body
7-
; GCN-NEXT: s_mov_b32 s3, exec_lo
8-
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
9-
; GCN-NEXT: s_mov_b32 m0, s2
10-
; GCN-NEXT: lds_param_load v0, attr0.y wait_vdst:15
11-
; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15
12-
; GCN-NEXT: s_mov_b32 exec_lo, s3
13-
; GCN-NEXT: v_mov_b32_e32 v2, s0
14-
; GCN-NEXT: v_mov_b32_e32 v4, s1
15-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
16-
; GCN-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
17-
; GCN-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0
18-
; GCN-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
19-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
20-
; GCN-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
21-
; GCN-NEXT: exp mrt0 v3, v2, v5, v4 done
22-
; GCN-NEXT: s_endpgm
6+
; GFX11-LABEL: v_interp_f32:
7+
; GFX11: ; %bb.0: ; %main_body
8+
; GFX11-NEXT: s_mov_b32 s3, exec_lo
9+
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
10+
; GFX11-NEXT: s_mov_b32 m0, s2
11+
; GFX11-NEXT: lds_param_load v0, attr0.y wait_vdst:15
12+
; GFX11-NEXT: lds_param_load v1, attr1.x wait_vdst:15
13+
; GFX11-NEXT: s_mov_b32 exec_lo, s3
14+
; GFX11-NEXT: v_mov_b32_e32 v2, s0
15+
; GFX11-NEXT: v_mov_b32_e32 v4, s1
16+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
17+
; GFX11-NEXT: v_interp_p10_f32 v3, v0, v2, v0 wait_exp:1
18+
; GFX11-NEXT: v_interp_p10_f32 v2, v1, v2, v1 wait_exp:0
19+
; GFX11-NEXT: v_interp_p2_f32 v5, v0, v4, v3 wait_exp:7
20+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
21+
; GFX11-NEXT: v_interp_p2_f32 v4, v1, v4, v5 wait_exp:7
22+
; GFX11-NEXT: exp mrt0 v3, v2, v5, v4 done
23+
; GFX11-NEXT: s_endpgm
2324
main_body:
2425
%p0 = call float @llvm.amdgcn.lds.param.load(i32 1, i32 0, i32 %m0)
2526
%p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
@@ -32,30 +33,30 @@ main_body:
3233
}
3334

3435
define amdgpu_ps void @v_interp_f32_many(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
35-
; GCN-LABEL: v_interp_f32_many:
36-
; GCN: ; %bb.0: ; %main_body
37-
; GCN-NEXT: s_mov_b32 s3, exec_lo
38-
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
39-
; GCN-NEXT: s_mov_b32 m0, s2
40-
; GCN-NEXT: lds_param_load v0, attr0.x wait_vdst:15
41-
; GCN-NEXT: lds_param_load v1, attr1.x wait_vdst:15
42-
; GCN-NEXT: lds_param_load v2, attr2.x wait_vdst:15
43-
; GCN-NEXT: lds_param_load v3, attr3.x wait_vdst:15
44-
; GCN-NEXT: s_mov_b32 exec_lo, s3
45-
; GCN-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
46-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
47-
; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
48-
; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
49-
; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
50-
; GCN-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0
51-
; GCN-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
52-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
53-
; GCN-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
54-
; GCN-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
55-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4)
56-
; GCN-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
57-
; GCN-NEXT: exp mrt0 v6, v7, v8, v4 done
58-
; GCN-NEXT: s_endpgm
36+
; GFX11-LABEL: v_interp_f32_many:
37+
; GFX11: ; %bb.0: ; %main_body
38+
; GFX11-NEXT: s_mov_b32 s3, exec_lo
39+
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
40+
; GFX11-NEXT: s_mov_b32 m0, s2
41+
; GFX11-NEXT: lds_param_load v0, attr0.x wait_vdst:15
42+
; GFX11-NEXT: lds_param_load v1, attr1.x wait_vdst:15
43+
; GFX11-NEXT: lds_param_load v2, attr2.x wait_vdst:15
44+
; GFX11-NEXT: lds_param_load v3, attr3.x wait_vdst:15
45+
; GFX11-NEXT: s_mov_b32 exec_lo, s3
46+
; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
47+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4)
48+
; GFX11-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3
49+
; GFX11-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2
50+
; GFX11-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1
51+
; GFX11-NEXT: v_interp_p10_f32 v4, v3, v4, v3 wait_exp:0
52+
; GFX11-NEXT: v_interp_p2_f32 v6, v0, v5, v6 wait_exp:7
53+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
54+
; GFX11-NEXT: v_interp_p2_f32 v7, v1, v5, v7 wait_exp:7
55+
; GFX11-NEXT: v_interp_p2_f32 v8, v2, v5, v8 wait_exp:7
56+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
57+
; GFX11-NEXT: v_interp_p2_f32 v4, v3, v5, v4 wait_exp:7
58+
; GFX11-NEXT: exp mrt0 v6, v7, v8, v4 done
59+
; GFX11-NEXT: s_endpgm
5960
main_body:
6061
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
6162
%p1 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 1, i32 %m0)
@@ -74,30 +75,30 @@ main_body:
7475
}
7576

7677
define amdgpu_ps void @v_interp_f32_many_vm(ptr addrspace(1) %ptr, i32 inreg %m0) #0 {
77-
; GCN-LABEL: v_interp_f32_many_vm:
78-
; GCN: ; %bb.0: ; %main_body
79-
; GCN-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4
80-
; GCN-NEXT: s_mov_b32 m0, s0
81-
; GCN-NEXT: s_mov_b32 s0, exec_lo
82-
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
83-
; GCN-NEXT: lds_param_load v2, attr0.x wait_vdst:15
84-
; GCN-NEXT: lds_param_load v3, attr1.x wait_vdst:15
85-
; GCN-NEXT: lds_param_load v4, attr2.x wait_vdst:15
86-
; GCN-NEXT: lds_param_load v5, attr3.x wait_vdst:15
87-
; GCN-NEXT: s_mov_b32 exec_lo, s0
88-
; GCN-NEXT: s_waitcnt vmcnt(0)
89-
; GCN-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
90-
; GCN-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
91-
; GCN-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
92-
; GCN-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0
93-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
94-
; GCN-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
95-
; GCN-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
96-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
97-
; GCN-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
98-
; GCN-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
99-
; GCN-NEXT: exp mrt0 v6, v7, v8, v0 done
100-
; GCN-NEXT: s_endpgm
78+
; GFX11-LABEL: v_interp_f32_many_vm:
79+
; GFX11: ; %bb.0: ; %main_body
80+
; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off offset:4
81+
; GFX11-NEXT: s_mov_b32 m0, s0
82+
; GFX11-NEXT: s_mov_b32 s0, exec_lo
83+
; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo
84+
; GFX11-NEXT: lds_param_load v2, attr0.x wait_vdst:15
85+
; GFX11-NEXT: lds_param_load v3, attr1.x wait_vdst:15
86+
; GFX11-NEXT: lds_param_load v4, attr2.x wait_vdst:15
87+
; GFX11-NEXT: lds_param_load v5, attr3.x wait_vdst:15
88+
; GFX11-NEXT: s_mov_b32 exec_lo, s0
89+
; GFX11-NEXT: s_waitcnt vmcnt(0)
90+
; GFX11-NEXT: v_interp_p10_f32 v6, v2, v0, v2 wait_exp:3
91+
; GFX11-NEXT: v_interp_p10_f32 v7, v3, v0, v3 wait_exp:2
92+
; GFX11-NEXT: v_interp_p10_f32 v8, v4, v0, v4 wait_exp:1
93+
; GFX11-NEXT: v_interp_p10_f32 v0, v5, v0, v5 wait_exp:0
94+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
95+
; GFX11-NEXT: v_interp_p2_f32 v6, v2, v1, v6 wait_exp:7
96+
; GFX11-NEXT: v_interp_p2_f32 v7, v3, v1, v7 wait_exp:7
97+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
98+
; GFX11-NEXT: v_interp_p2_f32 v8, v4, v1, v8 wait_exp:7
99+
; GFX11-NEXT: v_interp_p2_f32 v0, v5, v1, v0 wait_exp:7
100+
; GFX11-NEXT: exp mrt0 v6, v7, v8, v0 done
101+
; GFX11-NEXT: s_endpgm
101102
main_body:
102103
%i.ptr = getelementptr float, ptr addrspace(1) %ptr, i32 1
103104
%i = load float, ptr addrspace(1) %i.ptr, align 4
@@ -120,23 +121,41 @@ main_body:
120121
}
121122

122123
define amdgpu_ps half @v_interp_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
123-
; GCN-LABEL: v_interp_f16:
124-
; GCN: ; %bb.0: ; %main_body
125-
; GCN-NEXT: s_mov_b32 s3, exec_lo
126-
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
127-
; GCN-NEXT: s_mov_b32 m0, s2
128-
; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15
129-
; GCN-NEXT: s_mov_b32 exec_lo, s3
130-
; GCN-NEXT: v_mov_b32_e32 v0, s0
131-
; GCN-NEXT: v_mov_b32_e32 v2, s1
132-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
133-
; GCN-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
134-
; GCN-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
135-
; GCN-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
136-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
137-
; GCN-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
138-
; GCN-NEXT: v_add_f16_e32 v0, v3, v0
139-
; GCN-NEXT: ; return to shader part epilog
124+
; GFX11-TRUE16-LABEL: v_interp_f16:
125+
; GFX11-TRUE16: ; %bb.0: ; %main_body
126+
; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo
127+
; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo
128+
; GFX11-TRUE16-NEXT: s_mov_b32 m0, s2
129+
; GFX11-TRUE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
130+
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s3
131+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
132+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s1
133+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
134+
; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
135+
; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
136+
; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
137+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
138+
; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
139+
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
140+
; GFX11-TRUE16-NEXT: ; return to shader part epilog
141+
;
142+
; GFX11-FAKE16-LABEL: v_interp_f16:
143+
; GFX11-FAKE16: ; %bb.0: ; %main_body
144+
; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo
145+
; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo
146+
; GFX11-FAKE16-NEXT: s_mov_b32 m0, s2
147+
; GFX11-FAKE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
148+
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s3
149+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
150+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
151+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
152+
; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v3, v1, v0, v1 wait_exp:0
153+
; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
154+
; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v3, v1, v2, v3 wait_exp:7
155+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
156+
; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
157+
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
158+
; GFX11-FAKE16-NEXT: ; return to shader part epilog
140159
main_body:
141160
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
142161
%l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float %p0, float %i, float %p0, i1 0)
@@ -148,23 +167,41 @@ main_body:
148167
}
149168

150169
define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
151-
; GCN-LABEL: v_interp_rtz_f16:
152-
; GCN: ; %bb.0: ; %main_body
153-
; GCN-NEXT: s_mov_b32 s3, exec_lo
154-
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
155-
; GCN-NEXT: s_mov_b32 m0, s2
156-
; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15
157-
; GCN-NEXT: s_mov_b32 exec_lo, s3
158-
; GCN-NEXT: v_mov_b32_e32 v0, s0
159-
; GCN-NEXT: v_mov_b32_e32 v2, s1
160-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
161-
; GCN-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
162-
; GCN-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
163-
; GCN-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
164-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
165-
; GCN-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
166-
; GCN-NEXT: v_add_f16_e32 v0, v3, v0
167-
; GCN-NEXT: ; return to shader part epilog
170+
; GFX11-TRUE16-LABEL: v_interp_rtz_f16:
171+
; GFX11-TRUE16: ; %bb.0: ; %main_body
172+
; GFX11-TRUE16-NEXT: s_mov_b32 s3, exec_lo
173+
; GFX11-TRUE16-NEXT: s_wqm_b32 exec_lo, exec_lo
174+
; GFX11-TRUE16-NEXT: s_mov_b32 m0, s2
175+
; GFX11-TRUE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
176+
; GFX11-TRUE16-NEXT: s_mov_b32 exec_lo, s3
177+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, s0
178+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v2, s1
179+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
180+
; GFX11-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1.l, v0, v1.l wait_exp:0
181+
; GFX11-TRUE16-NEXT: v_interp_p10_rtz_f16_f32 v4, v1.h, v0, v1.h wait_exp:7
182+
; GFX11-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.l, v1.l, v2, v3 wait_exp:7
183+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
184+
; GFX11-TRUE16-NEXT: v_interp_p2_rtz_f16_f32 v0.h, v1.h, v2, v4 wait_exp:7
185+
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v0.h
186+
; GFX11-TRUE16-NEXT: ; return to shader part epilog
187+
;
188+
; GFX11-FAKE16-LABEL: v_interp_rtz_f16:
189+
; GFX11-FAKE16: ; %bb.0: ; %main_body
190+
; GFX11-FAKE16-NEXT: s_mov_b32 s3, exec_lo
191+
; GFX11-FAKE16-NEXT: s_wqm_b32 exec_lo, exec_lo
192+
; GFX11-FAKE16-NEXT: s_mov_b32 m0, s2
193+
; GFX11-FAKE16-NEXT: lds_param_load v1, attr0.x wait_vdst:15
194+
; GFX11-FAKE16-NEXT: s_mov_b32 exec_lo, s3
195+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, s0
196+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
197+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
198+
; GFX11-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
199+
; GFX11-FAKE16-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
200+
; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
201+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
202+
; GFX11-FAKE16-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
203+
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v3, v0
204+
; GFX11-FAKE16-NEXT: ; return to shader part epilog
168205
main_body:
169206
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
170207
%l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0)
@@ -176,17 +213,30 @@ main_body:
176213
}
177214

178215
define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 {
179-
; GCN-LABEL: v_interp_f16_imm_params:
180-
; GCN: ; %bb.0: ; %main_body
181-
; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
182-
; GCN-NEXT: v_mov_b32_e32 v2, s1
183-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
184-
; GCN-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
185-
; GCN-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
186-
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
187-
; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
188-
; GCN-NEXT: v_add_f16_e32 v0, v1, v0
189-
; GCN-NEXT: ; return to shader part epilog
216+
; GFX11-TRUE16-LABEL: v_interp_f16_imm_params:
217+
; GFX11-TRUE16: ; %bb.0: ; %main_body
218+
; GFX11-TRUE16-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, 0
219+
; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0
220+
; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, s1
221+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
222+
; GFX11-TRUE16-NEXT: v_interp_p10_f16_f32 v1, v0.l, v1, v0.l wait_exp:7
223+
; GFX11-TRUE16-NEXT: v_interp_p2_f16_f32 v0.l, v0.l, v3, v2 wait_exp:7
224+
; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
225+
; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1
226+
; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.h, v0.l
227+
; GFX11-TRUE16-NEXT: ; return to shader part epilog
228+
;
229+
; GFX11-FAKE16-LABEL: v_interp_f16_imm_params:
230+
; GFX11-FAKE16: ; %bb.0: ; %main_body
231+
; GFX11-FAKE16-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
232+
; GFX11-FAKE16-NEXT: v_mov_b32_e32 v2, s1
233+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
234+
; GFX11-FAKE16-NEXT: v_interp_p10_f16_f32 v1, v0, v1, v0 wait_exp:7
235+
; GFX11-FAKE16-NEXT: v_interp_p2_f16_f32 v0, v0, v2, v0 wait_exp:7
236+
; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
237+
; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1
238+
; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v1, v0
239+
; GFX11-FAKE16-NEXT: ; return to shader part epilog
190240
main_body:
191241
%l_p0 = call float @llvm.amdgcn.interp.inreg.p10.f16(float 0.0, float %i, float 0.0, i1 0)
192242
%l_p1 = call half @llvm.amdgcn.interp.inreg.p2.f16(float 0.0, float %j, float 0.0, i1 0)

0 commit comments

Comments
 (0)