Skip to content

Commit 5048d82

Browse files
committed
true16 for v_cvt_pk_bf8/fp8_f32
1 parent 5fc3e76 commit 5048d82

File tree

10 files changed

+419
-149
lines changed

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8960,10 +8960,14 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
89608960
// Adding vdst_in operand is already covered for these DPP instructions in
89618961
// cvtVOP3DPP.
89628962
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) &&
8963-
!(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 ||
8964-
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 ||
8965-
Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 ||
8966-
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12 ||
8963+
!(Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp_gfx12 ||
8964+
Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp_gfx12 ||
8965+
Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp8_gfx12 ||
8966+
Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp8_gfx12 ||
8967+
Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp_gfx12 ||
8968+
Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp_gfx12 ||
8969+
Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp8_gfx12 ||
8970+
Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp8_gfx12 ||
89678971
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 ||
89688972
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
89698973
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2822,6 +2822,7 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
28222822
def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
28232823
def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
28242824
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
2825+
def VOP_I16_F32_F32 : VOPProfile <[i16, f32, f32, untyped]>;
28252826
def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=*/1>;
28262827
def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
28272828
def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 64 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,36 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
570570
let HasExtVOP3DPP = 1;
571571
}
572572

573+
def VOP3_CVT_PK_F8_F32_Profile_fake16 : VOP3_Profile_Fake16<VOP_I16_F32_F32, VOP3_OPSEL> {
574+
defvar Tail = (ins VGPR_32:$vdst_in, op_sel0:$op_sel);
575+
let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
576+
0, HasModifiers, HasSrc2Mods,
577+
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
578+
Tail);
579+
let InsVOP3Base = !con(getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP,
580+
Src2VOP3DPP, NumSrcArgs, 0, HasModifiers,
581+
HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
582+
Src2ModVOP3DPP, false>.ret,
583+
Tail);
584+
let HasClamp = 0;
585+
let HasExtVOP3DPP = 1;
586+
}
587+
588+
def VOP3_CVT_PK_F8_F32_Profile_t16 : VOP3_Profile_True16<VOP_I16_F32_F32, VOP3_OPSEL> {
589+
defvar Tail = (ins VGPR_16:$vdst_in, op_sel0:$op_sel);
590+
let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
591+
0, HasModifiers, HasSrc2Mods,
592+
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
593+
Tail);
594+
let InsVOP3Base = !con(getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP,
595+
Src2VOP3DPP, NumSrcArgs, 0, HasModifiers,
596+
HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
597+
Src2ModVOP3DPP, false>.ret,
598+
Tail);
599+
let HasClamp = 0;
600+
let HasExtVOP3DPP = 1;
601+
}
602+
573603
def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
574604
VOP3_OPSEL> {
575605
let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
@@ -675,8 +705,12 @@ defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", VOP3_Profile<VOP_I64_I64_I32_I
675705
let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
676706
SchedRW = [WriteFloatCvt] in {
677707
let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
678-
defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>;
679-
defm V_CVT_PK_BF8_F32 : VOP3Inst<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile>;
708+
defm V_CVT_PK_FP8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile,
709+
VOP3_CVT_PK_F8_F32_Profile_t16,
710+
VOP3_CVT_PK_F8_F32_Profile_fake16>;
711+
defm V_CVT_PK_BF8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile,
712+
VOP3_CVT_PK_F8_F32_Profile_t16,
713+
VOP3_CVT_PK_F8_F32_Profile_fake16>;
680714

681715
let SubtargetPredicate = isGFX12Plus in {
682716
defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
@@ -698,6 +732,21 @@ class Cvt_PK_F8_F32_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst> : G
698732
(inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, 0)
699733
>;
700734

735+
multiclass Cvt_PK_F8_F32_t16_Pat<SDPatternOperator node, VOP3_Pseudo inst> {
736+
def : GCNPat<
737+
(i32 (node f32:$src0, f32:$src1, i32:$old, -1)),
738+
(REG_SEQUENCE VGPR_32,
739+
(i16 (EXTRACT_SUBREG $old, lo16)), lo16,
740+
(i16 (inst SRCMODS.DST_OP_SEL, $src0, 0, $src1, (i16 (EXTRACT_SUBREG $old, hi16)), 0)), hi16)
741+
>;
742+
def : GCNPat<
743+
(i32 (node f32:$src0, f32:$src1, i32:$old, 0)),
744+
(REG_SEQUENCE VGPR_32,
745+
(i16 (inst 0, $src0, 0, $src1, (i16 (EXTRACT_SUBREG $old, lo16)), 0)), lo16,
746+
(i16 (EXTRACT_SUBREG $old, hi16)), hi16)
747+
>;
748+
}
749+
701750
class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst> : GCNPat<
702751
(i32 (node f32:$src0, i32:$src1, i32:$old, index)),
703752
(inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1,
@@ -712,9 +761,20 @@ class Cvt_SR_F8_ByteSel_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType
712761

713762
let OtherPredicates = [HasFP8ConversionInsts] in {
714763
foreach Index = [0, -1] in {
764+
let True16Predicate = NotHasTrue16BitInsts in {
715765
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
716766
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>;
717767
}
768+
let True16Predicate = UseFakeTrue16Insts in {
769+
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_fake16_e64>;
770+
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_fake16_e64>;
771+
}
772+
}
773+
774+
let True16Predicate = UseRealTrue16Insts in {
775+
defm : Cvt_PK_F8_F32_t16_Pat<int_amdgcn_cvt_pk_fp8_f32, V_CVT_PK_FP8_F32_t16_e64>;
776+
defm : Cvt_PK_F8_F32_t16_Pat<int_amdgcn_cvt_pk_bf8_f32, V_CVT_PK_BF8_F32_t16_e64>;
777+
}
718778

719779
let SubtargetPredicate = isGFX940Plus in {
720780
foreach Index = [0, 1, 2, 3] in {
@@ -1642,8 +1702,8 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x368, "v_m
16421702
defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>;
16431703
defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
16441704

1645-
defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_gfx12<0x369>;
1646-
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36a>;
1705+
defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">;
1706+
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">;
16471707
defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >;
16481708
defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">;
16491709

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll

Lines changed: 94 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX942 %s
44
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX950 %s
55
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX950 %s
6-
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
6+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
7+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
78
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
89

910
declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32)
@@ -275,17 +276,29 @@ define i32 @test_cvt_pk_bf8_f32_word0(float %x, float %y, i32 %old) {
275276
; GFX9X-NEXT: v_mov_b32_e32 v0, v2
276277
; GFX9X-NEXT: s_setpc_b64 s[30:31]
277278
;
278-
; GFX12-LABEL: test_cvt_pk_bf8_f32_word0:
279-
; GFX12: ; %bb.0:
280-
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
281-
; GFX12-NEXT: s_wait_expcnt 0x0
282-
; GFX12-NEXT: s_wait_samplecnt 0x0
283-
; GFX12-NEXT: s_wait_bvhcnt 0x0
284-
; GFX12-NEXT: s_wait_kmcnt 0x0
285-
; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1
286-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
287-
; GFX12-NEXT: v_mov_b32_e32 v0, v2
288-
; GFX12-NEXT: s_setpc_b64 s[30:31]
279+
; GFX12-TRUE16-LABEL: test_cvt_pk_bf8_f32_word0:
280+
; GFX12-TRUE16: ; %bb.0:
281+
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
282+
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
283+
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
284+
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
285+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
286+
; GFX12-TRUE16-NEXT: v_cvt_pk_bf8_f32 v2.l, v0, v1
287+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
288+
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
289+
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
290+
;
291+
; GFX12-FAKE16-LABEL: test_cvt_pk_bf8_f32_word0:
292+
; GFX12-FAKE16: ; %bb.0:
293+
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
294+
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
295+
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
296+
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
297+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
298+
; GFX12-FAKE16-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1
299+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
300+
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
301+
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
289302
%ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 false)
290303
ret i32 %ret
291304
}
@@ -299,17 +312,29 @@ define i32 @test_cvt_pk_bf8_f32_word1(float %x, float %y, i32 %old) {
299312
; GFX9X-NEXT: v_mov_b32_e32 v0, v2
300313
; GFX9X-NEXT: s_setpc_b64 s[30:31]
301314
;
302-
; GFX12-LABEL: test_cvt_pk_bf8_f32_word1:
303-
; GFX12: ; %bb.0:
304-
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
305-
; GFX12-NEXT: s_wait_expcnt 0x0
306-
; GFX12-NEXT: s_wait_samplecnt 0x0
307-
; GFX12-NEXT: s_wait_bvhcnt 0x0
308-
; GFX12-NEXT: s_wait_kmcnt 0x0
309-
; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1]
310-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
311-
; GFX12-NEXT: v_mov_b32_e32 v0, v2
312-
; GFX12-NEXT: s_setpc_b64 s[30:31]
315+
; GFX12-TRUE16-LABEL: test_cvt_pk_bf8_f32_word1:
316+
; GFX12-TRUE16: ; %bb.0:
317+
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
318+
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
319+
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
320+
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
321+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
322+
; GFX12-TRUE16-NEXT: v_cvt_pk_bf8_f32 v2.h, v0, v1 op_sel:[0,0,1]
323+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
324+
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
325+
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
326+
;
327+
; GFX12-FAKE16-LABEL: test_cvt_pk_bf8_f32_word1:
328+
; GFX12-FAKE16: ; %bb.0:
329+
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
330+
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
331+
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
332+
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
333+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
334+
; GFX12-FAKE16-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1]
335+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
336+
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
337+
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
313338
%ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 true)
314339
ret i32 %ret
315340
}
@@ -322,17 +347,29 @@ define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) {
322347
; GFX9X-NEXT: v_mov_b32_e32 v0, v2
323348
; GFX9X-NEXT: s_setpc_b64 s[30:31]
324349
;
325-
; GFX12-LABEL: test_cvt_pk_fp8_f32_word0:
326-
; GFX12: ; %bb.0:
327-
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
328-
; GFX12-NEXT: s_wait_expcnt 0x0
329-
; GFX12-NEXT: s_wait_samplecnt 0x0
330-
; GFX12-NEXT: s_wait_bvhcnt 0x0
331-
; GFX12-NEXT: s_wait_kmcnt 0x0
332-
; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1
333-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
334-
; GFX12-NEXT: v_mov_b32_e32 v0, v2
335-
; GFX12-NEXT: s_setpc_b64 s[30:31]
350+
; GFX12-TRUE16-LABEL: test_cvt_pk_fp8_f32_word0:
351+
; GFX12-TRUE16: ; %bb.0:
352+
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
353+
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
354+
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
355+
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
356+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
357+
; GFX12-TRUE16-NEXT: v_cvt_pk_fp8_f32 v2.l, v0, v1
358+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
359+
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
360+
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
361+
;
362+
; GFX12-FAKE16-LABEL: test_cvt_pk_fp8_f32_word0:
363+
; GFX12-FAKE16: ; %bb.0:
364+
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
365+
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
366+
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
367+
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
368+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
369+
; GFX12-FAKE16-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1
370+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
371+
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
372+
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
336373
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 false)
337374
ret i32 %ret
338375
}
@@ -346,17 +383,29 @@ define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) {
346383
; GFX9X-NEXT: v_mov_b32_e32 v0, v2
347384
; GFX9X-NEXT: s_setpc_b64 s[30:31]
348385
;
349-
; GFX12-LABEL: test_cvt_pk_fp8_f32_word1:
350-
; GFX12: ; %bb.0:
351-
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
352-
; GFX12-NEXT: s_wait_expcnt 0x0
353-
; GFX12-NEXT: s_wait_samplecnt 0x0
354-
; GFX12-NEXT: s_wait_bvhcnt 0x0
355-
; GFX12-NEXT: s_wait_kmcnt 0x0
356-
; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
357-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
358-
; GFX12-NEXT: v_mov_b32_e32 v0, v2
359-
; GFX12-NEXT: s_setpc_b64 s[30:31]
386+
; GFX12-TRUE16-LABEL: test_cvt_pk_fp8_f32_word1:
387+
; GFX12-TRUE16: ; %bb.0:
388+
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
389+
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
390+
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
391+
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
392+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
393+
; GFX12-TRUE16-NEXT: v_cvt_pk_fp8_f32 v2.h, v0, v1 op_sel:[0,0,1]
394+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
395+
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
396+
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
397+
;
398+
; GFX12-FAKE16-LABEL: test_cvt_pk_fp8_f32_word1:
399+
; GFX12-FAKE16: ; %bb.0:
400+
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
401+
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
402+
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
403+
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
404+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
405+
; GFX12-FAKE16-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
406+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
407+
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
408+
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
360409
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 true)
361410
ret i32 %ret
362411
}

llvm/test/MC/AMDGPU/gfx12_asm_vop3.s

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1169,23 +1169,35 @@ v_cubetc_f32 v5, -src_scc, |vcc_lo|, -1 mul:4
11691169
v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2
11701170
// GFX12: v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]
11711171

1172-
v_cvt_pk_fp8_f32 v1, v2, v3
1173-
// GFX12: v_cvt_pk_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00]
1172+
v_cvt_pk_fp8_f32 v1.l, v2, v3
1173+
// GFX12: v_cvt_pk_fp8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00]
11741174

1175-
v_cvt_pk_fp8_f32 v1, -v2, |v3|
1176-
// GFX12: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
1175+
v_cvt_pk_fp8_f32 v1.l, -v2, |v3|
1176+
// GFX12: v_cvt_pk_fp8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
11771177

1178-
v_cvt_pk_fp8_f32 v1, s2, 3
1179-
// GFX12: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00]
1178+
v_cvt_pk_fp8_f32 v1.l, s2, 3
1179+
// GFX12: v_cvt_pk_fp8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00]
11801180

1181-
v_cvt_pk_bf8_f32 v1, v2, v3
1182-
// GFX12: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00]
1181+
v_cvt_pk_fp8_f32 v1.h, v2, v3
1182+
// GFX12: v_cvt_pk_fp8_f32 v1.h, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0x69,0xd7,0x02,0x07,0x02,0x00]
11831183

1184-
v_cvt_pk_bf8_f32 v1, -v2, |v3|
1185-
// GFX12: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20]
1184+
v_cvt_pk_fp8_f32 v255.h, v2, v3
1185+
// GFX12: v_cvt_pk_fp8_f32 v255.h, v2, v3 op_sel:[0,0,1] ; encoding: [0xff,0x40,0x69,0xd7,0x02,0x07,0x02,0x00]
11861186

1187-
v_cvt_pk_bf8_f32 v1, s2, 3
1188-
// GFX12: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00]
1187+
v_cvt_pk_bf8_f32 v1.l, v2, v3
1188+
// GFX12: v_cvt_pk_bf8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00]
1189+
1190+
v_cvt_pk_bf8_f32 v1.l, -v2, |v3|
1191+
// GFX12: v_cvt_pk_bf8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20]
1192+
1193+
v_cvt_pk_bf8_f32 v1.l, s2, 3
1194+
// GFX12: v_cvt_pk_bf8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00]
1195+
1196+
v_cvt_pk_bf8_f32 v1.h, v2, v3
1197+
// GFX12: v_cvt_pk_bf8_f32 v1.h, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0x6a,0xd7,0x02,0x07,0x02,0x00]
1198+
1199+
v_cvt_pk_bf8_f32 v255.h, -v2, |v3|
1200+
// GFX12: v_cvt_pk_bf8_f32 v255.h, -v2, |v3| op_sel:[0,0,1] ; encoding: [0xff,0x42,0x6a,0xd7,0x02,0x07,0x02,0x20]
11891201

11901202
v_cvt_sr_fp8_f32 v1, v2, v3
11911203
// GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00]

0 commit comments

Comments
 (0)