[AMDGPU] Support byte_sel modifier for v_cvt_f32_fp8 and v_cvt_f32_bf8 #90887

Merged: 1 commit, May 2, 2024
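
In short, the GFX12 byte_sel:N modifier on v_cvt_f32_fp8/v_cvt_f32_bf8 selects which byte of the 32-bit source register is converted, and this patch switches the VOP3 forms from expressing that selection through op_sel source modifiers to a dedicated byte_sel operand. A minimal sketch of the expected codegen, taken from the tests updated in this PR:

    %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2)
    ; GFX12: v_cvt_f32_fp8_e64 v0, v0 byte_sel:2    (previously: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,0])

A non-zero byte_sel is treated as a non-default modifier, so GCNDPPCombine::isShrinkable and SIInstrInfo::canShrink (both updated below) keep such conversions in their VOP3 encoding.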
4 changes: 2 additions & 2 deletions llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -8634,8 +8634,8 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands,
}

if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::byte_sel)) {
assert(AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in));
Inst.addOperand(Inst.getOperand(0));
if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in))
Inst.addOperand(Inst.getOperand(0));
addOptionalImmOperand(Inst, Operands, OptionalIdx,
AMDGPUOperand::ImmTyByteSel);
}
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -140,7 +140,8 @@ bool GCNDPPCombine::isShrinkable(MachineInstr &MI) const {
if (!hasNoImmOrEqual(MI, AMDGPU::OpName::src0_modifiers, 0, Mask) ||
!hasNoImmOrEqual(MI, AMDGPU::OpName::src1_modifiers, 0, Mask) ||
!hasNoImmOrEqual(MI, AMDGPU::OpName::clamp, 0) ||
!hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0)) {
!hasNoImmOrEqual(MI, AMDGPU::OpName::omod, 0) ||
!hasNoImmOrEqual(MI, AMDGPU::OpName::byte_sel, 0)) {
LLVM_DEBUG(dbgs() << " Inst has non-default modifiers\n");
return false;
}
3 changes: 2 additions & 1 deletion llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -4459,7 +4459,8 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,

// Check output modifiers
return !hasModifiersSet(MI, AMDGPU::OpName::omod) &&
!hasModifiersSet(MI, AMDGPU::OpName::clamp);
!hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
!hasModifiersSet(MI, AMDGPU::OpName::byte_sel);
}

// Set VCC operand with all flags from \p Orig, except for setting it as
5 changes: 3 additions & 2 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2306,8 +2306,9 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit IsWMMA = 0;
field bit IsSWMMAC = 0;

field bit IsFP8 = 0;
field bit IsFP8SrcByteSel = 0;
field bit IsFP8DstByteSel = 0;
field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel);

field bit HasDst = !ne(DstVT.Value, untyped.Value);
field bit HasDst32 = HasDst;
@@ -2427,7 +2428,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field string AsmDPP8 = getAsmDPP8<HasDst, NumSrcArgs, 0 /*HasModifiers*/, DstVT>.ret;
field string AsmVOP3Base = getAsmVOP3Base<NumSrcArgs, HasDst, HasClamp,
HasOpSel, HasOMod, IsVOP3P, HasModifiers, HasModifiers, HasModifiers,
HasModifiers, DstVT, IsFP8DstByteSel>.ret;
HasModifiers, DstVT, IsFP8ByteSel>.ret;
field string Asm64 = AsmVOP3Base;
field string AsmVOP3P = getAsmVOP3P<NumSrcArgs, HasModifiers, HasClamp, HasOpSel>.ret;
field string AsmVOP3OpSel = getAsmVOP3OpSel<NumSrcArgs,
53 changes: 26 additions & 27 deletions llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -625,42 +625,44 @@ def VOPProfile_Base_CVT_PK_F32_F8_OpSel : VOPProfileI2F <v2f32, i32> {
let HasExtVOP3DPP = 0;
}

def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, untyped, untyped]> {
let HasOpSel = 1;
class VOPProfile_Base_CVT_F_F8_ByteSel<ValueType DstVT> : VOPProfile<[DstVT, i32, untyped, untyped]> {
let IsFP8SrcByteSel = 1;
let HasOpSel = 0;
let HasExtDPP = 1;
let HasExtVOP3DPP = 1;
let IsFP8 = 1;
let HasExtSDWA = 0;
let HasClamp = 0;
let HasOMod = 0;
let HasModifiers = 1;
let Src1VOP3DPP = Src1RC64;
let HasModifiers = 0;

defvar bytesel = (ins ByteSel:$byte_sel);
let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
HasClamp, HasModifiers, HasSrc2Mods,
HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret,
bytesel);
let InsVOP3Base = !con(getInsVOP3Base<Src0VOP3DPP, Src1VOP3DPP, Src2VOP3DPP,
NumSrcArgs, HasClamp, HasModifiers, HasSrc2Mods,
HasOMod, Src0ModVOP3DPP, Src1ModVOP3DPP,
Src2ModVOP3DPP, HasOpSel>.ret,
bytesel);
}

let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts],
mayRaiseFPException = 0, SchedRW = [WriteFloatCvt] in {
defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>;
defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>;
defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F_F8_ByteSel<f32>>;
defm V_CVT_PK_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_fp8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
defm V_CVT_PK_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_bf8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
}

class Cvt_F32_F8_Pat_OpSel<SDPatternOperator node, bits<2> index,
VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat<
(f32 (node i32:$src, index)),
!if (index,
(inst_e64 !or(!if(index{0}, SRCMODS.OP_SEL_1, 0),
!if(index{1}, SRCMODS.OP_SEL_0, 0)),
$src, 0),
(inst_e32 $src))
class Cvt_F_F8_Pat_ByteSel<SDPatternOperator node, VOP3_Pseudo inst> : GCNPat<
(node i32:$src0, timm:$byte_sel),
(inst $src0, (as_i32timm $byte_sel))
>;

let SubtargetPredicate = isGFX12Plus, OtherPredicates = [HasFP8ConversionInsts] in {
foreach Index = [0, 1, 2, 3] in {
def : Cvt_F32_F8_Pat_OpSel<int_amdgcn_cvt_f32_fp8, Index,
V_CVT_F32_FP8_e32, V_CVT_F32_FP8_OP_SEL_e64>;
def : Cvt_F32_F8_Pat_OpSel<int_amdgcn_cvt_f32_bf8, Index,
V_CVT_F32_BF8_e32, V_CVT_F32_BF8_OP_SEL_e64>;
}
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_fp8, V_CVT_F32_FP8_OP_SEL_e64>;
def : Cvt_F_F8_Pat_ByteSel<int_amdgcn_cvt_f32_bf8, V_CVT_F32_BF8_OP_SEL_e64>;
}

class Cvt_PK_F32_F8_Pat_OpSel<SDPatternOperator node, int index,
@@ -901,14 +903,11 @@ multiclass VOP1_Real_NO_DPP_OP_SEL_with_name<GFXGen Gen, bits<9> op,
VOP3_Real_with_name<Gen, {0, 1, 1, op{6-0}}, opName, asmName>;


defm V_CVT_F32_FP8 : VOP1_Real_FULL_with_name<GFX12Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;
defm V_CVT_F32_BF8 : VOP1_Real_FULL_with_name<GFX12Gen, 0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">;

// Define VOP1 instructions using the pseudo instruction with its old profile and
// VOP3 using the OpSel profile for the pseudo instruction.
defm V_CVT_F32_FP8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06c, "V_CVT_F32_FP8", "v_cvt_f32_fp8">;
defm V_CVT_F32_FP8 : VOP1_Realtriple_e64_with_name<GFX12Gen, 0x06c, "V_CVT_F32_FP8_OP_SEL", "v_cvt_f32_fp8">;

defm V_CVT_F32_BF8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06d, "V_CVT_F32_BF8", "v_cvt_f32_bf8">;
defm V_CVT_F32_BF8 : VOP1_Realtriple_e64_with_name<GFX12Gen, 0x06d, "V_CVT_F32_BF8_OP_SEL", "v_cvt_f32_bf8">;

defm V_CVT_PK_F32_FP8 : VOP1_Real_e32_with_name<GFX12Gen, 0x06e, "V_CVT_PK_F32_FP8", "v_cvt_pk_f32_fp8">;
defm V_CVT_PK_F32_FP8 : VOP3_Real_with_name<GFX12Gen, 0x1ee, "V_CVT_PK_F32_FP8_OP_SEL", "v_cvt_pk_f32_fp8">;

29 changes: 19 additions & 10 deletions llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -306,9 +306,10 @@ class VOP3OpSel_gfx10<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {

class VOP3OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3OpSel_gfx10<op, p>;

class VOP3FP8OpSel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
let Inst{11} = !if(p.HasSrc0, src0_modifiers{2}, 0);
let Inst{12} = !if(p.HasSrc0, src0_modifiers{3}, 0);
class VOP3FP8OpSel_src_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
bits<2> byte_sel;
let Inst{11-12} = byte_sel; // NB: bit order is intentionally reversed!
let Inst{14-13} = 0; // op_sel2/3
}

class VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<bits<10> op, VOPProfile p> : VOP3e_gfx10<op, p> {
@@ -755,10 +756,14 @@ class VOP3_DPPe_Common_Base<bits<10> op, VOPProfile P> : Enc96 {
let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0);
let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
// OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs.
let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?);
let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, !if((P.IsFP8), src0_modifiers{3}, 0)), ?);
let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),!if(P.IsFP8DstByteSel, byte_sel{0}, ?));
let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),!if(P.IsFP8DstByteSel, byte_sel{1}, ?));
let Inst{11} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{2}, 0),
!if(P.IsFP8SrcByteSel, byte_sel{1}, ?));
let Inst{12} = !if(P.HasOpSel, !if(P.HasSrc1Mods, src1_modifiers{2}, 0),
!if(P.IsFP8SrcByteSel, byte_sel{0}, ?));
let Inst{13} = !if(P.HasOpSel, !if(P.HasSrc2Mods, src2_modifiers{2}, 0),
!if(P.IsFP8DstByteSel, byte_sel{0}, ?));
let Inst{14} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{3}, 0),
!if(P.IsFP8DstByteSel, byte_sel{1}, ?));
let Inst{15} = !if(P.HasClamp, clamp, 0);
let Inst{25-16} = op;
let Inst{31-26} = 0x35;
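
Roughly summarizing the encoding above (inferred from this patch's TableGen and test changes, not quoted from ISA documentation): for the source-byte-select profiles, byte_sel reuses the two op_sel bit positions, with byte_sel{1} landing in Inst{11} and byte_sel{0} in Inst{12}, so the new syntax lines up with the old one as

    byte_sel:1  <->  op_sel:[0,1]
    byte_sel:2  <->  op_sel:[1,0]
    byte_sel:3  <->  op_sel:[1,1]

For the destination-byte-select profiles (IsFP8DstByteSel), byte_sel instead occupies Inst{13} and Inst{14}.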
@@ -1397,7 +1402,11 @@ multiclass VOP3_Real_Base<GFXGen Gen, bits<10> op, string opName = NAME,
bit isSingle = 0> {
defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
let IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
if ps.Pfl.IsFP8DstByteSel then {
if ps.Pfl.IsFP8SrcByteSel then {
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
VOP3FP8OpSel_src_bytesel_gfx11_gfx12<op, ps.Pfl>;
} else if ps.Pfl.IsFP8DstByteSel then {
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
VOP3FP8OpSel_dst_bytesel_gfx11_gfx12<op, ps.Pfl>;
@@ -1428,10 +1437,10 @@ multiclass VOP3_Real_with_name<GFXGen Gen, bits<10> op, string opName,
defvar ps = !cast<VOP_Pseudo>(opName#"_e64");
let AsmString = asmName # ps.AsmOperands,
IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
if ps.Pfl.IsFP8 then {
if ps.Pfl.IsFP8SrcByteSel then {
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
VOP3FP8OpSel_gfx11_gfx12<op, ps.Pfl>;
VOP3FP8OpSel_src_bytesel_gfx11_gfx12<op, ps.Pfl>;
} else if ps.Pfl.IsFP8DstByteSel then {
def _e64#Gen.Suffix :
VOP3_Real_Gen<ps, Gen>,
12 changes: 3 additions & 9 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
@@ -14,9 +14,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) {
define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) {
; GFX12-LABEL: test_cvt_f32_bf8_byte1:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1]
; GFX12-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: ; return to shader part epilog
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 1)
@@ -26,9 +24,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) {
define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) {
; GFX12-LABEL: test_cvt_f32_bf8_byte2:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0]
; GFX12-NEXT: v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: ; return to shader part epilog
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 2)
@@ -38,9 +34,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) {
define amdgpu_cs float @test_cvt_f32_fp8_byte3(i32 %a) {
; GFX12-LABEL: test_cvt_f32_fp8_byte3:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1]
; GFX12-NEXT: v_cvt_f32_fp8_e64_dpp v0, v0 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
; GFX12-NEXT: ; return to shader part epilog
%tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %tmp0, i32 3)
22 changes: 11 additions & 11 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir
@@ -13,12 +13,12 @@ body: |
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GFX12-NEXT: [[V_CVT_F32_BF8_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_dpp [[DEF]], [[COPY]], 228, 15, 15, 1, implicit $mode, implicit $exec
; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_dpp]]
; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_dpp [[DEF]], [[COPY]], 228, 15, 15, 1, implicit $mode, implicit $exec
; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_dpp]]
; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec
%2:vgpr_32 = V_CVT_F32_BF8_e32 killed %1, implicit $mode, implicit $exec
%2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e32 killed %1, implicit $mode, implicit $exec
$vgpr0 = COPY %2
SI_RETURN_TO_EPILOG $vgpr0

@@ -34,13 +34,13 @@ body: |
; GFX12: liveins: $vgpr0
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec
; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec
; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_]]
; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64_dpp [[DEF]], [[COPY]], 2, 228, 15, 15, 1, implicit $mode, implicit $exec
; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_dpp]]
; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec
%2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed %1, 0, implicit $mode, implicit $exec
%2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 killed %1, 2, implicit $mode, implicit $exec
$vgpr0 = COPY %2
SI_RETURN_TO_EPILOG $vgpr0

@@ -56,13 +56,13 @@ body: |
; GFX12: liveins: $vgpr0
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec
; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec
; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_]]
; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64_dpp [[DEF]], [[COPY]], 3, 228, 15, 15, 1, implicit $mode, implicit $exec
; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_dpp]]
; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0
%0:vgpr_32 = COPY $vgpr0
%1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec
%2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed %1, 0, implicit $mode, implicit $exec
%2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 killed %1, 3, implicit $mode, implicit $exec
$vgpr0 = COPY %2
SI_RETURN_TO_EPILOG $vgpr0

16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll
@@ -47,7 +47,7 @@ define float @test_cvt_f32_bf8_byte1(i32 %a) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1]
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1)
ret float %ret
@@ -67,7 +67,7 @@ define float @test_cvt_f32_bf8_byte2(i32 %a) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0]
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:2
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2)
ret float %ret
@@ -87,7 +87,7 @@ define float @test_cvt_f32_bf8_byte3(i32 %a) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,1]
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 3)
ret float %ret
@@ -127,7 +127,7 @@ define float @test_cvt_f32_fp8_byte1(i32 %a) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[0,1]
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:1
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
ret float %ret
@@ -147,7 +147,7 @@ define float @test_cvt_f32_fp8_byte2(i32 %a) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,0]
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:2
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2)
ret float %ret
@@ -167,7 +167,7 @@ define float @test_cvt_f32_fp8_byte3(i32 %a) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1]
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:3
; GFX12-NEXT: s_setpc_b64 s[30:31]
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 3)
ret float %ret
@@ -552,7 +552,7 @@ define float @test_sext_cvt_f32_fp8(i16 %a) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[0,1]
; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 byte_sel:1
; GFX12-NEXT: s_setpc_b64 s[30:31]
%a.sext = sext i16 %a to i32
%ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a.sext, i32 1)
@@ -576,7 +576,7 @@ define float @test_sext_cvt_f32_bf8(i16 %a) {
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1]
; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 byte_sel:1
; GFX12-NEXT: s_setpc_b64 s[30:31]
%a.sext = sext i16 %a to i32
%ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a.sext, i32 1)