Skip to content

[AMDGPU][True16][MC][CodeGen] true16 mode for v_cvt_pk_bf8/fp8_f32 #141881

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8961,10 +8961,14 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
// Adding vdst_in operand is already covered for these DPP instructions in
// cvtVOP3DPP.
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) &&
!(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12 ||
!(Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_BF8_F32_t16_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_PK_FP8_F32_t16_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_PK_BF8_F32_fake16_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_PK_FP8_F32_fake16_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp_gfx12 ||
Opc == AMDGPU::V_CVT_SR_FP8_F32_gfx12_e64_dpp8_gfx12 ||
Opc == AMDGPU::V_CVT_SR_BF8_F32_gfx12_e64_dpp_gfx12 ||
Expand Down
1 change: 1 addition & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -2822,6 +2822,7 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
def VOP_I16_F32_F32 : VOPProfile <[i16, f32, f32, untyped]>;
def VOP_I32_I32_I32_ARITH : VOPProfile <[i32, i32, i32, untyped], /*EnableClamp=*/1>;
def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
def VOP_F32_F16_F16_F16 : VOPProfile <[f32, f16, f16, f16]>;
Expand Down
70 changes: 66 additions & 4 deletions llvm/lib/Target/AMDGPU/VOP3Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -570,6 +570,38 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile<VOP_I32_F32_F32, VOP3_OPSEL> {
let HasExtVOP3DPP = 1;
}

// Profile for v_cvt_pk_fp8_f32 / v_cvt_pk_bf8_f32 derived from
// VOP3_Profile_Fake16 over VOP_I16_F32_F32 (i16 result, two f32 sources) with
// the VOP3_OPSEL feature set. It overrides both VOP3 input operand lists so
// that each ends with a VGPR_32 $vdst_in operand followed by $op_sel.
def VOP3_CVT_PK_F8_F32_Profile_fake16 : VOP3_Profile_Fake16<VOP_I16_F32_F32, VOP3_OPSEL> {
// Operands appended after the regular source operands in both lists below.
defvar Tail = (ins VGPR_32:$vdst_in, op_sel0:$op_sel);
// Op-sel form of the VOP3 inputs: the getIns64 result with Tail concatenated.
// NOTE(review): the literal 0 after NumSrcArgs presumably disables the clamp
// operand (matching HasClamp = 0 below) -- confirm against getIns64.
let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
0, HasModifiers, HasSrc2Mods,
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
Tail);
// Base VOP3 inputs built from the *VOP3DPP source classes, again with Tail
// concatenated so $vdst_in and $op_sel are present here as well.
let InsVOP3Base = !con(getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP,
Src2VOP3DPP, NumSrcArgs, 0, HasModifiers,
HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
Src2ModVOP3DPP, false>.ret,
Tail);
// Clear the profile's clamp flag and set its VOP3 DPP extension flag.
let HasClamp = 0;
let HasExtVOP3DPP = 1;
}

// This t16 profile with a vdst_in operand is kept for backward compatibility
// and is used for user-controlled packing.
// It derives from VOP3_Profile_True16 over VOP_I16_F32_F32 (i16 result, two
// f32 sources) with the VOP3_OPSEL feature set, and overrides both VOP3 input
// operand lists so that each ends with a VGPR_16 $vdst_in operand followed by
// $op_sel.
def VOP3_CVT_PK_F8_F32_Profile_t16 : VOP3_Profile_True16<VOP_I16_F32_F32, VOP3_OPSEL> {
// Operands appended after the regular source operands in both lists below.
// Unlike the fake16 profile, $vdst_in is a VGPR_16 operand here.
defvar Tail = (ins VGPR_16:$vdst_in, op_sel0:$op_sel);
// Op-sel form of the VOP3 inputs: the getIns64 result with Tail concatenated.
// NOTE(review): the literal 0 after NumSrcArgs presumably disables the clamp
// operand (matching HasClamp = 0 below) -- confirm against getIns64.
let InsVOP3OpSel = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
0, HasModifiers, HasSrc2Mods,
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
Tail);
// Base VOP3 inputs built from the *VOP3DPP source classes, again with Tail
// concatenated so $vdst_in and $op_sel are present here as well.
let InsVOP3Base = !con(getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP,
Src2VOP3DPP, NumSrcArgs, 0, HasModifiers,
HasSrc2Mods, HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
Src2ModVOP3DPP, false>.ret,
Tail);
// Clear the profile's clamp flag and set its VOP3 DPP extension flag.
let HasClamp = 0;
let HasExtVOP3DPP = 1;
}

def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
VOP3_OPSEL> {
let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
Expand Down Expand Up @@ -675,8 +707,12 @@ defm V_LSHL_ADD_U64 : VOP3Inst <"v_lshl_add_u64", VOP3_Profile<VOP_I64_I64_I32_I
let OtherPredicates = [HasFP8ConversionInsts], mayRaiseFPException = 0,
SchedRW = [WriteFloatCvt] in {
let Constraints = "$vdst = $vdst_in", DisableEncoding = "$vdst_in" in {
defm V_CVT_PK_FP8_F32 : VOP3Inst<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile>;
defm V_CVT_PK_BF8_F32 : VOP3Inst<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile>;
defm V_CVT_PK_FP8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_fp8_f32", VOP3_CVT_PK_F8_F32_Profile,
VOP3_CVT_PK_F8_F32_Profile_t16,
VOP3_CVT_PK_F8_F32_Profile_fake16>;
defm V_CVT_PK_BF8_F32 : VOP3Inst_t16_with_profiles<"v_cvt_pk_bf8_f32", VOP3_CVT_PK_F8_F32_Profile,
VOP3_CVT_PK_F8_F32_Profile_t16,
VOP3_CVT_PK_F8_F32_Profile_fake16>;

let SubtargetPredicate = isGFX12Plus in {
defm V_CVT_SR_FP8_F32_gfx12 : VOP3Inst<"v_cvt_sr_fp8_f32_gfx12", VOP3_CVT_SR_F8_ByteSel_Profile<f32>>;
Expand All @@ -698,6 +734,21 @@ class Cvt_PK_F8_F32_Pat<SDPatternOperator node, int index, VOP3_Pseudo inst> : G
(inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, 0)
>;

// Selection patterns that map the i32-returning intrinsic `node`
// (f32 $src0, f32 $src1, i32 $old, word selector) onto `inst`, which is given
// an i16 result here. Each pattern runs `inst` on one 16-bit half of $old and
// rebuilds the 32-bit result with REG_SEQUENCE, passing the other half of $old
// through unchanged.
multiclass Cvt_PK_F8_F32_t16_Pat<SDPatternOperator node, VOP3_Pseudo inst> {
// Selector -1: `inst` gets SRCMODS.DST_OP_SEL as src0 modifiers and the hi16
// half of $old as its last register operand; its result becomes hi16, while
// lo16 is copied from $old.
def : GCNPat<
(i32 (node f32:$src0, f32:$src1, i32:$old, -1)),
(REG_SEQUENCE VGPR_32,
(i16 (EXTRACT_SUBREG $old, lo16)), lo16,
(i16 (inst SRCMODS.DST_OP_SEL, $src0, 0, $src1, (i16 (EXTRACT_SUBREG $old, hi16)), 0)), hi16)
>;
// Selector 0: `inst` gets 0 as src0 modifiers and the lo16 half of $old as its
// last register operand; its result becomes lo16, while hi16 is copied from
// $old.
def : GCNPat<
(i32 (node f32:$src0, f32:$src1, i32:$old, 0)),
(REG_SEQUENCE VGPR_32,
(i16 (inst 0, $src0, 0, $src1, (i16 (EXTRACT_SUBREG $old, lo16)), 0)), lo16,
(i16 (EXTRACT_SUBREG $old, hi16)), hi16)
>;
}

class Cvt_SR_F8_F32_Pat<SDPatternOperator node, bits<2> index, VOP3_Pseudo inst> : GCNPat<
(i32 (node f32:$src0, i32:$src1, i32:$old, index)),
(inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1,
Expand All @@ -712,9 +763,20 @@ class Cvt_SR_F8_ByteSel_Pat<SDPatternOperator node, VOP3_Pseudo inst, ValueType

let OtherPredicates = [HasFP8ConversionInsts] in {
foreach Index = [0, -1] in {
let True16Predicate = NotHasTrue16BitInsts in {
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_e64>;
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_e64>;
}
let True16Predicate = UseFakeTrue16Insts in {
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_fp8_f32, Index, V_CVT_PK_FP8_F32_fake16_e64>;
def : Cvt_PK_F8_F32_Pat<int_amdgcn_cvt_pk_bf8_f32, Index, V_CVT_PK_BF8_F32_fake16_e64>;
}
}

let True16Predicate = UseRealTrue16Insts in {
defm : Cvt_PK_F8_F32_t16_Pat<int_amdgcn_cvt_pk_fp8_f32, V_CVT_PK_FP8_F32_t16_e64>;
defm : Cvt_PK_F8_F32_t16_Pat<int_amdgcn_cvt_pk_bf8_f32, V_CVT_PK_BF8_F32_t16_e64>;
}

let SubtargetPredicate = isGFX940Plus in {
foreach Index = [0, 1, 2, 3] in {
Expand Down Expand Up @@ -1642,8 +1704,8 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x368, "v_m
defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>;
defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;

defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_gfx12<0x369>;
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36a>;
defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x369, "v_cvt_pk_fp8_f32">;
defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x36a, "v_cvt_pk_bf8_f32">;
defm V_CVT_SR_FP8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36b, "V_CVT_SR_FP8_F32_gfx12", "v_cvt_sr_fp8_f32" >;
defm V_CVT_SR_BF8_F32_gfx12 : VOP3_Realtriple_with_name_gfx12<0x36c, "V_CVT_SR_BF8_F32_gfx12", "v_cvt_sr_bf8_f32">;

Expand Down
139 changes: 94 additions & 45 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX942 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX950 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9X,GFX950 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16 %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s

declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32)
Expand Down Expand Up @@ -275,17 +276,29 @@ define i32 @test_cvt_pk_bf8_f32_word0(float %x, float %y, i32 %old) {
; GFX9X-NEXT: v_mov_b32_e32 v0, v2
; GFX9X-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_pk_bf8_f32_word0:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-TRUE16-LABEL: test_cvt_pk_bf8_f32_word0:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_cvt_pk_bf8_f32 v2.l, v0, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: test_cvt_pk_bf8_f32_word0:
; GFX12-FAKE16: ; %bb.0:
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 false)
ret i32 %ret
}
Expand All @@ -299,17 +312,29 @@ define i32 @test_cvt_pk_bf8_f32_word1(float %x, float %y, i32 %old) {
; GFX9X-NEXT: v_mov_b32_e32 v0, v2
; GFX9X-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_pk_bf8_f32_word1:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-TRUE16-LABEL: test_cvt_pk_bf8_f32_word1:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_cvt_pk_bf8_f32 v2.h, v0, v1 op_sel:[0,0,1]
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: test_cvt_pk_bf8_f32_word1:
; GFX12-FAKE16: ; %bb.0:
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1]
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 true)
ret i32 %ret
}
Expand All @@ -322,17 +347,29 @@ define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) {
; GFX9X-NEXT: v_mov_b32_e32 v0, v2
; GFX9X-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_pk_fp8_f32_word0:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-TRUE16-LABEL: test_cvt_pk_fp8_f32_word0:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_cvt_pk_fp8_f32 v2.l, v0, v1
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: test_cvt_pk_fp8_f32_word0:
; GFX12-FAKE16: ; %bb.0:
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 false)
ret i32 %ret
}
Expand All @@ -346,17 +383,29 @@ define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) {
; GFX9X-NEXT: v_mov_b32_e32 v0, v2
; GFX9X-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: test_cvt_pk_fp8_f32_word1:
; GFX12: ; %bb.0:
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-NEXT: s_wait_expcnt 0x0
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_mov_b32_e32 v0, v2
; GFX12-NEXT: s_setpc_b64 s[30:31]
; GFX12-TRUE16-LABEL: test_cvt_pk_fp8_f32_word1:
; GFX12-TRUE16: ; %bb.0:
; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0
; GFX12-TRUE16-NEXT: v_cvt_pk_fp8_f32 v2.h, v0, v1 op_sel:[0,0,1]
; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-TRUE16-NEXT: v_mov_b32_e32 v0, v2
; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-FAKE16-LABEL: test_cvt_pk_fp8_f32_word1:
; GFX12-FAKE16: ; %bb.0:
; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0
; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0
; GFX12-FAKE16-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1]
; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-FAKE16-NEXT: v_mov_b32_e32 v0, v2
; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31]
%ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 true)
ret i32 %ret
}
Expand Down
36 changes: 24 additions & 12 deletions llvm/test/MC/AMDGPU/gfx12_asm_vop3.s
Original file line number Diff line number Diff line change
Expand Up @@ -1169,23 +1169,35 @@ v_cubetc_f32 v5, -src_scc, |vcc_lo|, -1 mul:4
v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2
// GFX12: v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf]

v_cvt_pk_fp8_f32 v1, v2, v3
// GFX12: v_cvt_pk_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00]
v_cvt_pk_fp8_f32 v1.l, v2, v3
// GFX12: v_cvt_pk_fp8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00]

v_cvt_pk_fp8_f32 v1, -v2, |v3|
// GFX12: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]
v_cvt_pk_fp8_f32 v1.l, -v2, |v3|
// GFX12: v_cvt_pk_fp8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20]

v_cvt_pk_fp8_f32 v1, s2, 3
// GFX12: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00]
v_cvt_pk_fp8_f32 v1.l, s2, 3
// GFX12: v_cvt_pk_fp8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00]

v_cvt_pk_bf8_f32 v1, v2, v3
// GFX12: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00]
v_cvt_pk_fp8_f32 v1.h, v2, v3
// GFX12: v_cvt_pk_fp8_f32 v1.h, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0x69,0xd7,0x02,0x07,0x02,0x00]

v_cvt_pk_bf8_f32 v1, -v2, |v3|
// GFX12: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20]
v_cvt_pk_fp8_f32 v255.h, v2, v3
// GFX12: v_cvt_pk_fp8_f32 v255.h, v2, v3 op_sel:[0,0,1] ; encoding: [0xff,0x40,0x69,0xd7,0x02,0x07,0x02,0x00]

v_cvt_pk_bf8_f32 v1, s2, 3
// GFX12: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00]
v_cvt_pk_bf8_f32 v1.l, v2, v3
// GFX12: v_cvt_pk_bf8_f32 v1.l, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00]

v_cvt_pk_bf8_f32 v1.l, -v2, |v3|
// GFX12: v_cvt_pk_bf8_f32 v1.l, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20]

v_cvt_pk_bf8_f32 v1.l, s2, 3
// GFX12: v_cvt_pk_bf8_f32 v1.l, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00]

v_cvt_pk_bf8_f32 v1.h, v2, v3
// GFX12: v_cvt_pk_bf8_f32 v1.h, v2, v3 op_sel:[0,0,1] ; encoding: [0x01,0x40,0x6a,0xd7,0x02,0x07,0x02,0x00]

v_cvt_pk_bf8_f32 v255.h, -v2, |v3|
// GFX12: v_cvt_pk_bf8_f32 v255.h, -v2, |v3| op_sel:[0,0,1] ; encoding: [0xff,0x42,0x6a,0xd7,0x02,0x07,0x02,0x20]

v_cvt_sr_fp8_f32 v1, v2, v3
// GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00]
Expand Down
Loading
Loading