Skip to content

Commit 216b5e9

Browse files
authored
[AMDGPU] Expose RTZ version of f16 interpolation for gfx11+ (#86614)
1 parent f87bde2 commit 216b5e9

File tree

5 files changed

+89
-1
lines changed

5 files changed

+89
-1
lines changed

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2066,6 +2066,24 @@ def int_amdgcn_interp_inreg_p2_f16 :
20662066
[IntrNoMem, IntrSpeculatable,
20672067
ImmArg<ArgIndex<3>>]>;
20682068

2069+
// llvm.amdgcn.interp.p10.rtz.f16 <p>, <i>, <p0>, <high>
2070+
// gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode.
2071+
// <high> selects whether the high or low 16 bits of the <p> and <p0> operands are used.
2072+
def int_amdgcn_interp_p10_rtz_f16:
2073+
DefaultAttrsIntrinsic<[llvm_float_ty],
2074+
[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
2075+
[IntrNoMem, IntrSpeculatable,
2076+
ImmArg<ArgIndex<3>>]>;
2077+
2078+
// llvm.amdgcn.interp.p2.rtz.f16 <p>, <j>, <tmp>, <high>
2079+
// gfx11+ fp16 interpolation intrinsic, with round-toward-zero rounding mode.
2080+
// <high> selects whether the high or low 16 bits of the <p> operand are used.
2081+
def int_amdgcn_interp_p2_rtz_f16 :
2082+
DefaultAttrsIntrinsic<[llvm_half_ty],
2083+
[llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_i1_ty],
2084+
[IntrNoMem, IntrSpeculatable,
2085+
ImmArg<ArgIndex<3>>]>;
2086+
20692087
// Deprecated: use llvm.amdgcn.live.mask instead.
20702088
def int_amdgcn_ps_live : DefaultAttrsIntrinsic <
20712089
[llvm_i1_ty],

llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3135,6 +3135,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
31353135
case Intrinsic::amdgcn_interp_inreg_p2:
31363136
case Intrinsic::amdgcn_interp_inreg_p10_f16:
31373137
case Intrinsic::amdgcn_interp_inreg_p2_f16:
3138+
case Intrinsic::amdgcn_interp_p10_rtz_f16:
3139+
case Intrinsic::amdgcn_interp_p2_rtz_f16:
31383140
applyDefaultMapping(OpdMapper);
31393141
return;
31403142
case Intrinsic::amdgcn_permlane16:
@@ -4778,7 +4780,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
47784780
case Intrinsic::amdgcn_interp_inreg_p10:
47794781
case Intrinsic::amdgcn_interp_inreg_p2:
47804782
case Intrinsic::amdgcn_interp_inreg_p10_f16:
4781-
case Intrinsic::amdgcn_interp_inreg_p2_f16: {
4783+
case Intrinsic::amdgcn_interp_inreg_p2_f16:
4784+
case Intrinsic::amdgcn_interp_p10_rtz_f16:
4785+
case Intrinsic::amdgcn_interp_p2_rtz_f16: {
47824786
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
47834787
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
47844788
OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);

llvm/lib/Target/AMDGPU/VINTERPInstructions.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,12 @@ defm : VInterpF16Pat<int_amdgcn_interp_inreg_p10_f16,
173173
defm : VInterpF16Pat<int_amdgcn_interp_inreg_p2_f16,
174174
V_INTERP_P2_F16_F32_inreg, f16,
175175
[VINTERPModsHi, VINTERPMods, VINTERPMods]>;
176+
defm : VInterpF16Pat<int_amdgcn_interp_p10_rtz_f16,
177+
V_INTERP_P10_RTZ_F16_F32_inreg, f32,
178+
[VINTERPModsHi, VINTERPMods, VINTERPModsHi]>;
179+
defm : VInterpF16Pat<int_amdgcn_interp_p2_rtz_f16,
180+
V_INTERP_P2_RTZ_F16_F32_inreg, f16,
181+
[VINTERPModsHi, VINTERPMods, VINTERPMods]>;
176182

177183
//===----------------------------------------------------------------------===//
178184
// VINTERP Real Instructions

llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,34 @@ main_body:
147147
ret half %res
148148
}
149149

150+
define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
151+
; GCN-LABEL: v_interp_rtz_f16:
152+
; GCN: ; %bb.0: ; %main_body
153+
; GCN-NEXT: s_mov_b32 s3, exec_lo
154+
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
155+
; GCN-NEXT: s_mov_b32 m0, s2
156+
; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15
157+
; GCN-NEXT: s_mov_b32 exec_lo, s3
158+
; GCN-NEXT: v_mov_b32_e32 v0, s0
159+
; GCN-NEXT: v_mov_b32_e32 v2, s1
160+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
161+
; GCN-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
162+
; GCN-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
163+
; GCN-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
164+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
165+
; GCN-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
166+
; GCN-NEXT: v_add_f16_e32 v0, v3, v0
167+
; GCN-NEXT: ; return to shader part epilog
168+
main_body:
169+
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
170+
%l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0)
171+
%l_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %l_p0, i1 0)
172+
%h_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 1)
173+
%h_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %h_p0, i1 1)
174+
%res = fadd half %l_p1, %h_p1
175+
ret half %res
176+
}
177+
150178
define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 {
151179
; GCN-LABEL: v_interp_f16_imm_params:
152180
; GCN: ; %bb.0: ; %main_body
@@ -172,6 +200,8 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0
172200
declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0
173201
declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0
174202
declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0
203+
declare float @llvm.amdgcn.interp.p10.rtz.f16(float, float, float, i1) #0
204+
declare half @llvm.amdgcn.interp.p2.rtz.f16(float, float, float, i1) #0
175205
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
176206
declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0
177207

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,34 @@ main_body:
147147
ret half %res
148148
}
149149

150+
define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
151+
; GCN-LABEL: v_interp_rtz_f16:
152+
; GCN: ; %bb.0: ; %main_body
153+
; GCN-NEXT: s_mov_b32 s3, exec_lo
154+
; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
155+
; GCN-NEXT: s_mov_b32 m0, s2
156+
; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15
157+
; GCN-NEXT: s_mov_b32 exec_lo, s3
158+
; GCN-NEXT: v_mov_b32_e32 v0, s0
159+
; GCN-NEXT: v_mov_b32_e32 v2, s1
160+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
161+
; GCN-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
162+
; GCN-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
163+
; GCN-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
164+
; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
165+
; GCN-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
166+
; GCN-NEXT: v_add_f16_e32 v0, v3, v0
167+
; GCN-NEXT: ; return to shader part epilog
168+
main_body:
169+
%p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
170+
%l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0)
171+
%l_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %l_p0, i1 0)
172+
%h_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 1)
173+
%h_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %h_p0, i1 1)
174+
%res = fadd half %l_p1, %h_p1
175+
ret half %res
176+
}
177+
150178
define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 {
151179
; GCN-LABEL: v_interp_f16_imm_params:
152180
; GCN: ; %bb.0: ; %main_body
@@ -172,6 +200,8 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0
172200
declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0
173201
declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0
174202
declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0
203+
declare float @llvm.amdgcn.interp.p10.rtz.f16(float, float, float, i1) #0
204+
declare half @llvm.amdgcn.interp.p2.rtz.f16(float, float, float, i1) #0
175205
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
176206
declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0
177207

0 commit comments

Comments
 (0)