Skip to content

Commit d2f06b2

Browse files
authored
[AMDGPU][True16][MC][CodeGen] true16 mode for v_cvt_pk_bf8/fp8_f32 (#141881)
Add true16/fake16 profiles for v_cvt_pk_bf8_f32 and v_cvt_pk_fp8_f32, keeping the vdst_in operand in the profiles, and update the codegen patterns accordingly. Also update the MC and codegen tests.
1 parent 90e906a commit d2f06b2

File tree

10 files changed

+421
-149
lines changed

10 files changed

+421
-149
lines changed

llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -8975,10 +8975,14 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
89758975
// Adding vdst_in operand is already covered for these DPP instructions in
89768976
// cvtVOP3DPP.
89778977
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) &&
8978-
!(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 ||
8979-
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 ||
8980-
Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 ||
8981-
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12 ||
8978+
!(Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp_gfx12 ||
8979+
Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp_gfx12 ||
8980+
Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp8_gfx12 ||
8981+
Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp8_gfx12 ||
8982+
Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp_gfx12 ||
8983+
Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp_gfx12 ||
8984+
Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp8_gfx12 ||
8985+
Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp8_gfx12 ||
89828986
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 ||
89838987
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
89848988
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2822,6 +2822,7 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
28222822
def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
28232823
def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
28242824
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
2825+
def VOP_I16_F32_F32 : VOPProfile <[i16, f32, f32, untyped]>;
28252826
def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=*/1>;
28262827
def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
28272828
def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 66 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -570,6 +570,38 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
570570
let HasExtVOP3DPP = 1;
571571
}
572572

573+
def VOP3_CVT_PK_F8_F32_Profile_fake16 : VOP3_Profile_Fake16<VOP_I16_F32_F32, VOP3_OPSEL> {
574+
defvar Tail = (ins VGPR_32:$vdst_in, op_sel0:$op_sel);
575+
let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
576+
0, HasModifiers, HasSrc2Mods,
577+
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
578+
Tail);
579+
let InsVOP3Base = !con(getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP,
580+
Src2VOP3DPP, NumSrcArgs, 0, HasModifiers,
581+
HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
582+
Src2ModVOP3DPP, false>.ret,
583+
Tail);
584+
let HasClamp = 0;
585+
let HasExtVOP3DPP = 1;
586+
}
587+
588+
// This t16 profile with vdst_in operand is for backward compatibility and is used
589+
// for user controlled packing
590+
def VOP3_CVT_PK_F8_F32_Profile_t16 : VOP3_Profile_True16<VOP_I16_F32_F32, VOP3_OPSEL> {
591+
defvar Tail = (ins VGPR_16:$vdst_in, op_sel0:$op_sel);
592+
let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
593+
0, HasModifiers, HasSrc2Mods,
594+
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
595+
Tail);
596+
let InsVOP3Base = !con(getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP,
597+
Src2VOP3DPP, NumSrcArgs, 0, HasModifiers,
598+
HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
599+
Src2ModVOP3DPP, false>.ret,
600+
Tail);
601+
let HasClamp = 0;
602+
let HasExtVOP3DPP = 1;
603+
}
604+
573605
def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
574606
VOP3_OPSEL> {
575607
let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
@@ -675,8 +707,12 @@ defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", VOP3_Profile<VOP_I64_I64_I32_I
675707
let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
676708
SchedRW = [WriteFloatCvt] in {
677709
let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
678-
defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>;
679-
defm V_CVT_PK_BF8_F32 : VOP3Inst<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile>;
710+
defm V_CVT_PK_FP8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile,
711+
VOP3_CVT_PK_F8_F32_Profile_t16,
712+
VOP3_CVT_PK_F8_F32_Profile_fake16>;
713+
defm V_CVT_PK_BF8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile,
714+
VOP3_CVT_PK_F8_F32_Profile_t16,
715+
VOP3_CVT_PK_F8_F32_Profile_fake16>;
680716

681717
let SubtargetPredicate = isGFX12Plus in {
682718
defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
@@ -698,6 +734,21 @@ class Cvt_PK_F8_F32_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst> : G
698734
(inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, 0)
699735
>;
700736

737+
multiclass Cvt_PK_F8_F32_t16_Pat<SDPatternOperator node, VOP3_Pseudo inst> {
738+
def : GCNPat<
739+
(i32 (node f32:$src0, f32:$src1, i32:$old, -1)),
740+
(REG_SEQUENCE VGPR_32,
741+
(i16 (EXTRACT_SUBREG $old, lo16)), lo16,
742+
(i16 (inst SRCMODS.DST_OP_SEL, $src0, 0, $src1, (i16 (EXTRACT_SUBREG $old, hi16)), 0)), hi16)
743+
>;
744+
def : GCNPat<
745+
(i32 (node f32:$src0, f32:$src1, i32:$old, 0)),
746+
(REG_SEQUENCE VGPR_32,
747+
(i16 (inst 0, $src0, 0, $src1, (i16 (EXTRACT_SUBREG $old, lo16)), 0)), lo16,
748+
(i16 (EXTRACT_SUBREG $old, hi16)), hi16)
749+
>;
750+
}
751+
701752
class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst> : GCNPat<
702753
(i32 (node f32:$src0, i32:$src1, i32:$old, index)),
703754
(inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1,
@@ -712,9 +763,20 @@ class Cvt_SR_F8_ByteSel_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType
712763

713764
let OtherPredicates = [HasFP8ConversionInsts] in {
714765
foreach Index = [0, -1] in {
766+
let True16Predicate = NotHasTrue16BitInsts in {
715767
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
716768
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>;
717769
}
770+
let True16Predicate = UseFakeTrue16Insts in {
771+
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_fake16_e64>;
772+
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_fake16_e64>;
773+
}
774+
}
775+
776+
let True16Predicate = UseRealTrue16Insts in {
777+
defm : Cvt_PK_F8_F32_t16_Pat<int_amdgcn_cvt_pk_fp8_f32, V_CVT_PK_FP8_F32_t16_e64>;
778+
defm : Cvt_PK_F8_F32_t16_Pat<int_amdgcn_cvt_pk_bf8_f32, V_CVT_PK_BF8_F32_t16_e64>;
779+
}
718780

719781
let SubtargetPredicate = isGFX940Plus in {
720782
foreach Index = [0, 1, 2, 3] in {
@@ -1642,8 +1704,8 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x368, "v_m
16421704
defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>;
16431705
defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
16441706

1645-
defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_gfx12<0x369>;
1646-
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36a>;
1707+
defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">;
1708+
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">;
16471709
defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >;
16481710
defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">;
16491711

llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll

Lines changed: 94 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,8 @@
33
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX942 %s
44
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX950 %s
55
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX950 %s
6-
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
6+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
7+
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
78
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
89

910
declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32)
@@ -275,17 +276,29 @@ define i32 @test_cvt_pk_bf8_f32_word0(float %x, float %y, i32 %old) {
275276
; GFX9X-NEXT: v_mov_b32_e32 v0, v2
276277
; GFX9X-NEXT: s_setpc_b64 s[30:31]
277278
;
278-
; GFX12-LABEL: test_cvt_pk_bf8_f32_word0:
279-
; GFX12: ; %bb.0:
280-
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
281-
; GFX12-NEXT: s_wait_expcnt 0x0
282-
; GFX12-NEXT: s_wait_samplecnt 0x0
283-
; GFX12-NEXT: s_wait_bvhcnt 0x0
284-
; GFX12-NEXT: s_wait_kmcnt 0x0
285-
; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1
286-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
287-
; GFX12-NEXT: v_mov_b32_e32 v0, v2
288-
; GFX12-NEXT: s_setpc_b64 s[30:31]
279+
; GFX12-TRUE16-LABEL: test_cvt_pk_bf8_f32_word0:
280+
; GFX12-TRUE16: ; %bb.0:
281+
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
282+
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
283+
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
284+
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
285+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
286+
; GFX12-TRUE16-NEXT: v_cvt_pk_bf8_f32 v2.l, v0, v1
287+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
288+
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
289+
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
290+
;
291+
; GFX12-FAKE16-LABEL: test_cvt_pk_bf8_f32_word0:
292+
; GFX12-FAKE16: ; %bb.0:
293+
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
294+
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
295+
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
296+
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
297+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
298+
; GFX12-FAKE16-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1
299+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
300+
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
301+
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
289302
%ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 false)
290303
ret i32 %ret
291304
}
@@ -299,17 +312,29 @@ define i32 @test_cvt_pk_bf8_f32_word1(float %x, float %y, i32 %old) {
299312
; GFX9X-NEXT: v_mov_b32_e32 v0, v2
300313
; GFX9X-NEXT: s_setpc_b64 s[30:31]
301314
;
302-
; GFX12-LABEL: test_cvt_pk_bf8_f32_word1:
303-
; GFX12: ; %bb.0:
304-
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
305-
; GFX12-NEXT: s_wait_expcnt 0x0
306-
; GFX12-NEXT: s_wait_samplecnt 0x0
307-
; GFX12-NEXT: s_wait_bvhcnt 0x0
308-
; GFX12-NEXT: s_wait_kmcnt 0x0
309-
; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1]
310-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
311-
; GFX12-NEXT: v_mov_b32_e32 v0, v2
312-
; GFX12-NEXT: s_setpc_b64 s[30:31]
315+
; GFX12-TRUE16-LABEL: test_cvt_pk_bf8_f32_word1:
316+
; GFX12-TRUE16: ; %bb.0:
317+
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
318+
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
319+
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
320+
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
321+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
322+
; GFX12-TRUE16-NEXT: v_cvt_pk_bf8_f32 v2.h, v0, v1 op_sel:[0,0,1]
323+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
324+
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
325+
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
326+
;
327+
; GFX12-FAKE16-LABEL: test_cvt_pk_bf8_f32_word1:
328+
; GFX12-FAKE16: ; %bb.0:
329+
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
330+
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
331+
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
332+
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
333+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
334+
; GFX12-FAKE16-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1]
335+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
336+
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
337+
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
313338
%ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 true)
314339
ret i32 %ret
315340
}
@@ -322,17 +347,29 @@ define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) {
322347
; GFX9X-NEXT: v_mov_b32_e32 v0, v2
323348
; GFX9X-NEXT: s_setpc_b64 s[30:31]
324349
;
325-
; GFX12-LABEL: test_cvt_pk_fp8_f32_word0:
326-
; GFX12: ; %bb.0:
327-
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
328-
; GFX12-NEXT: s_wait_expcnt 0x0
329-
; GFX12-NEXT: s_wait_samplecnt 0x0
330-
; GFX12-NEXT: s_wait_bvhcnt 0x0
331-
; GFX12-NEXT: s_wait_kmcnt 0x0
332-
; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1
333-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
334-
; GFX12-NEXT: v_mov_b32_e32 v0, v2
335-
; GFX12-NEXT: s_setpc_b64 s[30:31]
350+
; GFX12-TRUE16-LABEL: test_cvt_pk_fp8_f32_word0:
351+
; GFX12-TRUE16: ; %bb.0:
352+
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
353+
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
354+
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
355+
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
356+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
357+
; GFX12-TRUE16-NEXT: v_cvt_pk_fp8_f32 v2.l, v0, v1
358+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
359+
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
360+
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
361+
;
362+
; GFX12-FAKE16-LABEL: test_cvt_pk_fp8_f32_word0:
363+
; GFX12-FAKE16: ; %bb.0:
364+
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
365+
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
366+
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
367+
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
368+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
369+
; GFX12-FAKE16-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1
370+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
371+
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
372+
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
336373
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 false)
337374
ret i32 %ret
338375
}
@@ -346,17 +383,29 @@ define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) {
346383
; GFX9X-NEXT: v_mov_b32_e32 v0, v2
347384
; GFX9X-NEXT: s_setpc_b64 s[30:31]
348385
;
349-
; GFX12-LABEL: test_cvt_pk_fp8_f32_word1:
350-
; GFX12: ; %bb.0:
351-
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
352-
; GFX12-NEXT: s_wait_expcnt 0x0
353-
; GFX12-NEXT: s_wait_samplecnt 0x0
354-
; GFX12-NEXT: s_wait_bvhcnt 0x0
355-
; GFX12-NEXT: s_wait_kmcnt 0x0
356-
; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
357-
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
358-
; GFX12-NEXT: v_mov_b32_e32 v0, v2
359-
; GFX12-NEXT: s_setpc_b64 s[30:31]
386+
; GFX12-TRUE16-LABEL: test_cvt_pk_fp8_f32_word1:
387+
; GFX12-TRUE16: ; %bb.0:
388+
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
389+
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
390+
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
391+
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
392+
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
393+
; GFX12-TRUE16-NEXT: v_cvt_pk_fp8_f32 v2.h, v0, v1 op_sel:[0,0,1]
394+
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
395+
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
396+
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
397+
;
398+
; GFX12-FAKE16-LABEL: test_cvt_pk_fp8_f32_word1:
399+
; GFX12-FAKE16: ; %bb.0:
400+
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
401+
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
402+
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
403+
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
404+
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
405+
; GFX12-FAKE16-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
406+
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
407+
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
408+
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
360409
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 true)
361410
ret i32 %ret
362411
}

llvm/test/MC/AMDGPU/gfx12_asm_vop3.s

Lines changed: 24 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1169,23 +1169,35 @@ v_cubetc_f32 v5, -src_scc, |vcc_lo|, -1 mul:4
11691169
v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2
11701170
// GFX12: v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]
11711171

1172-
v_cvt_pk_fp8_f32 v1, v2, v3
1173-
// GFX12: v_cvt_pk_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00]
1172+
v_cvt_pk_fp8_f32 v1.l, v2, v3
1173+
// GFX12: v_cvt_pk_fp8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00]
11741174

1175-
v_cvt_pk_fp8_f32 v1, -v2, |v3|
1176-
// GFX12: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
1175+
v_cvt_pk_fp8_f32 v1.l, -v2, |v3|
1176+
// GFX12: v_cvt_pk_fp8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
11771177

1178-
v_cvt_pk_fp8_f32 v1, s2, 3
1179-
// GFX12: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00]
1178+
v_cvt_pk_fp8_f32 v1.l, s2, 3
1179+
// GFX12: v_cvt_pk_fp8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00]
11801180

1181-
v_cvt_pk_bf8_f32 v1, v2, v3
1182-
// GFX12: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00]
1181+
v_cvt_pk_fp8_f32 v1.h, v2, v3
1182+
// GFX12: v_cvt_pk_fp8_f32 v1.h, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0x69,0xd7,0x02,0x07,0x02,0x00]
11831183

1184-
v_cvt_pk_bf8_f32 v1, -v2, |v3|
1185-
// GFX12: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20]
1184+
v_cvt_pk_fp8_f32 v255.h, v2, v3
1185+
// GFX12: v_cvt_pk_fp8_f32 v255.h, v2, v3 op_sel:[0,0,1] ; encoding: [0xff,0x40,0x69,0xd7,0x02,0x07,0x02,0x00]
11861186

1187-
v_cvt_pk_bf8_f32 v1, s2, 3
1188-
// GFX12: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00]
1187+
v_cvt_pk_bf8_f32 v1.l, v2, v3
1188+
// GFX12: v_cvt_pk_bf8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00]
1189+
1190+
v_cvt_pk_bf8_f32 v1.l, -v2, |v3|
1191+
// GFX12: v_cvt_pk_bf8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20]
1192+
1193+
v_cvt_pk_bf8_f32 v1.l, s2, 3
1194+
// GFX12: v_cvt_pk_bf8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00]
1195+
1196+
v_cvt_pk_bf8_f32 v1.h, v2, v3
1197+
// GFX12: v_cvt_pk_bf8_f32 v1.h, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0x6a,0xd7,0x02,0x07,0x02,0x00]
1198+
1199+
v_cvt_pk_bf8_f32 v255.h, -v2, |v3|
1200+
// GFX12: v_cvt_pk_bf8_f32 v255.h, -v2, |v3| op_sel:[0,0,1] ; encoding: [0xff,0x42,0x6a,0xd7,0x02,0x07,0x02,0x20]
11891201

11901202
v_cvt_sr_fp8_f32 v1, v2, v3
11911203
// GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00]

0 commit comments

Comments
 (0)