Skip to content

Commit 69b9ddb

Browse files
committed
true16 codegen for v_fma_mixlo/hi_f16
1 parent e269c2b commit 69b9ddb

File tree

4 files changed

+694
-242
lines changed

4 files changed

+694
-242
lines changed

llvm/lib/Target/AMDGPU/VOP3PInstructions.td

Lines changed: 79 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -154,10 +154,12 @@ defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_
154154
multiclass MadFmaMixPats<SDPatternOperator fma_like,
155155
Instruction mix_inst,
156156
Instruction mixlo_inst,
157-
Instruction mixhi_inst> {
157+
Instruction mixhi_inst,
158+
bit HasFP32Denormals> {
158159
// At least one of the operands needs to be an fpextend of an f16
159160
// for this to be worthwhile, so we need three patterns here.
160161
// TODO: Could we use a predicate to inspect src1/2/3 instead?
162+
let OtherPredicates = !if(HasFP32Denormals, [TruePredicate], [NoFP32Denormals]) in {
161163
def : GCNPat <
162164
(f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)),
163165
(f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)),
@@ -177,6 +179,45 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
177179
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
178180
DSTCLAMP.NONE)>;
179181

182+
def : GCNPat <
183+
(AMDGPUclamp (build_vector
184+
(f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
185+
(f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
186+
(f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
187+
(f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
188+
(f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
189+
(f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
190+
(v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
191+
$hi_src1_modifiers, $hi_src1,
192+
$hi_src2_modifiers, $hi_src2,
193+
DSTCLAMP.ENABLE,
194+
(mixlo_inst $lo_src0_modifiers, $lo_src0,
195+
$lo_src1_modifiers, $lo_src1,
196+
$lo_src2_modifiers, $lo_src2,
197+
DSTCLAMP.ENABLE,
198+
(i32 (IMPLICIT_DEF)))))
199+
>;
200+
201+
def : GCNPat <
202+
(f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
203+
(f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
204+
(mixlo_inst $src0_modifiers, $src0,
205+
$src1_modifiers, $src1,
206+
(i32 0), (i32 0),
207+
DSTCLAMP.NONE,
208+
(i32 (IMPLICIT_DEF)))
209+
>;
210+
211+
def : GCNPat <
212+
(build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
213+
(f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
214+
(v2f16 (mixhi_inst $src0_modifiers, $src0,
215+
$src1_modifiers, $src1,
216+
(i32 0), (i32 0),
217+
DSTCLAMP.NONE,
218+
VGPR_32:$elt0))
219+
>;
220+
180221
def : GCNPat <
181222
(f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
182223
(f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
@@ -187,10 +228,14 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
187228
DSTCLAMP.NONE,
188229
(i32 (IMPLICIT_DEF)))
189230
>;
231+
} // End OtherPredicates
190232

191233
// FIXME: Special case handling for mixhi (especially for clamp)
192234
// because dealing with the write to high half of the register is
193235
// difficult.
236+
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
237+
let OtherPredicates = !if(HasFP32Denormals, [TruePredicate], [NoFP32Denormals]), True16Predicate = p in {
238+
194239
def : GCNPat <
195240
(build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
196241
(f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
@@ -215,44 +260,44 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
215260
VGPR_32:$elt0))
216261
>;
217262

218-
def : GCNPat <
219-
(AMDGPUclamp (build_vector
220-
(f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
221-
(f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
222-
(f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
223-
(f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
224-
(f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
225-
(f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
226-
(v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
227-
$hi_src1_modifiers, $hi_src1,
228-
$hi_src2_modifiers, $hi_src2,
229-
DSTCLAMP.ENABLE,
230-
(mixlo_inst $lo_src0_modifiers, $lo_src0,
231-
$lo_src1_modifiers, $lo_src1,
232-
$lo_src2_modifiers, $lo_src2,
233-
DSTCLAMP.ENABLE,
234-
(i32 (IMPLICIT_DEF)))))
235-
>;
263+
} // End OtherPredicates, True16Predicate = p
236264

265+
let OtherPredicates = !if(HasFP32Denormals, [TruePredicate], [NoFP32Denormals]), True16Predicate = UseRealTrue16Insts in {
237266
def : GCNPat <
238-
(f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
239-
(f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
240-
(mixlo_inst $src0_modifiers, $src0,
267+
(build_vector (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
268+
(f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
269+
(f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), f16:$elt1),
270+
(v2f16 (mixlo_inst $src0_modifiers, $src0,
241271
$src1_modifiers, $src1,
242-
(i32 0), (i32 0),
272+
$src2_modifiers, $src2,
243273
DSTCLAMP.NONE,
244-
(i32 (IMPLICIT_DEF)))
274+
(REG_SEQUENCE VGPR_32, (f16 (IMPLICIT_DEF)), lo16, $elt1, hi16)))
245275
>;
246276

247277
def : GCNPat <
248-
(build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
249-
(f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
278+
(build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
279+
(f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
280+
(f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
250281
(v2f16 (mixhi_inst $src0_modifiers, $src0,
251282
$src1_modifiers, $src1,
252-
(i32 0), (i32 0),
283+
$src2_modifiers, $src2,
253284
DSTCLAMP.NONE,
254-
VGPR_32:$elt0))
285+
(REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
286+
>;
287+
288+
def : GCNPat <
289+
(build_vector
290+
f16:$elt0,
291+
(AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
292+
(f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
293+
(f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
294+
(v2f16 (mixhi_inst $src0_modifiers, $src0,
295+
$src1_modifiers, $src1,
296+
$src2_modifiers, $src2,
297+
DSTCLAMP.ENABLE,
298+
(REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
255299
>;
300+
} // End OtherPredicates, True16Predicate = UseRealTrue16Insts
256301
}
257302

258303
class MinimumMaximumByMinimum3Maximum3VOP3P<SDPatternOperator node,
@@ -266,7 +311,8 @@ def : MinimumMaximumByMinimum3Maximum3VOP3P<fminimum, V_PK_MINIMUM3_F16>;
266311
def : MinimumMaximumByMinimum3Maximum3VOP3P<fmaximum, V_PK_MAXIMUM3_F16>;
267312
}
268313

269-
let SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals] in {
314+
let SubtargetPredicate = HasMadMixInsts in {
315+
let OtherPredicates = [NoFP32Denormals] in {
270316

271317
// These are VOP3a-like opcodes which accept no omod.
272318
// Size of src arguments (16/32) is controlled by op_sel.
@@ -284,9 +330,10 @@ defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile<VOP_F
284330
}
285331
} // End FPDPRounding = 1
286332
}
333+
} // End OtherPredicates = [NoFP32Denormals]
287334

288-
defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
289-
} // End SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals]
335+
defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16, 0 /*HasFP32Denormals*/>;
336+
} // End SubtargetPredicate = HasMadMixInsts
290337

291338

292339
// Essentially the same as the mad_mix versions
@@ -306,7 +353,7 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F
306353
} // End FPDPRounding = 1
307354
}
308355

309-
defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
356+
defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16, 1 /*HasFP32Denormals*/>;
310357
}
311358

312359
// Defines patterns that extract signed 4bit from each Idx[0].

llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll

Lines changed: 28 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11 %s
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-TRUE16 %s
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11,SDAG-GFX11,SDAG-GFX11-FAKE16 %s
34
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,SDAG-GFX9 %s
45
; RUN: llc -mtriple=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefixes=VI,SDAG-VI %s
56
; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefixes=SDAG-CI %s
@@ -329,14 +330,23 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src
329330
}
330331

331332
define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %src0, half %src1, half %src2) #0 {
332-
; GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
333-
; GFX11: ; %bb.0:
334-
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
335-
; GFX11-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
336-
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
337-
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
338-
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
339-
; GFX11-NEXT: s_setpc_b64 s[30:31]
333+
; SDAG-GFX11-TRUE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
334+
; SDAG-GFX11-TRUE16: ; %bb.0:
335+
; SDAG-GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
336+
; SDAG-GFX11-TRUE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
337+
; SDAG-GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
338+
; SDAG-GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0
339+
; SDAG-GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
340+
; SDAG-GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31]
341+
;
342+
; SDAG-GFX11-FAKE16-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
343+
; SDAG-GFX11-FAKE16: ; %bb.0:
344+
; SDAG-GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
345+
; SDAG-GFX11-FAKE16-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
346+
; SDAG-GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
347+
; SDAG-GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
348+
; SDAG-GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
349+
; SDAG-GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
340350
;
341351
; GFX9-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
342352
; GFX9: ; %bb.0:
@@ -363,6 +373,15 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %
363373
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
364374
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
365375
;
376+
; GISEL-GFX11-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
377+
; GISEL-GFX11: ; %bb.0:
378+
; GISEL-GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
379+
; GISEL-GFX11-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1] clamp
380+
; GISEL-GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
381+
; GISEL-GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
382+
; GISEL-GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
383+
; GISEL-GFX11-NEXT: s_setpc_b64 s[30:31]
384+
;
366385
; GISEL-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
367386
; GISEL-CI: ; %bb.0:
368387
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)

0 commit comments

Comments
 (0)