Skip to content

[AMDGPU][MC][True16] Support VOP2 instructions with true16 format #115233

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,25 @@ static DecodeStatus decodeOperand_VSrcT16_Lo128(MCInst &Inst, unsigned Imm,
(AMDGPU::OperandSemantics)OperandSemantics));
}

/// Decodes a 9-bit source-operand encoding and appends the resulting operand
/// to \p Inst.
///
/// An encoding with the IS_VGPR bit set becomes a 16-bit VGPR operand; any
/// other encoding is forwarded to AMDGPUDisassembler::decodeNonVGPRSrcOp
/// together with the \p OpWidth, \p ImmWidth and \p OperandSemantics template
/// arguments.
template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth,
unsigned OperandSemantics>
static DecodeStatus
decodeOperand_VSrcT16_Lo128_Deferred(MCInst &Inst, unsigned Imm,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Looks like this can be combined with decodeOperand_VSrcT16_Lo128(), e.g., by adding a Deferred template parameter?

Copy link
Contributor Author

@broxigarchen broxigarchen Nov 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it can be combined, but it seems to be a naming convention in this file that a deferred decoder has "Deferred" in its function name. So it might be better to keep them separate?

uint64_t /*Addr*/,
const MCDisassembler *Decoder) {
// The generic decoder is downcast so the AMDGPU-specific operand factories
// (createVGPR16Operand, decodeNonVGPRSrcOp) can be called.
const auto *DAsm = static_cast<const AMDGPUDisassembler *>(Decoder);
assert(isUInt<9>(Imm) && "9-bit encoding expected");

// VGPR case: bit 7 is passed on as IsHi and bits [6:0] as the register index.
// NOTE(review): IsHi presumably selects the high 16-bit half of the VGPR --
// confirm against createVGPR16Operand.
if (Imm & AMDGPU::EncValues::IS_VGPR) {
bool IsHi = Imm & (1 << 7);
unsigned RegIdx = Imm & 0x7f;
return addOperand(Inst, DAsm->createVGPR16Operand(RegIdx, IsHi));
}
// Non-VGPR case: only the low 8 bits of the encoding are forwarded.
// NOTE(review): the literal `true` is presumably the flag that marks the
// literal as mandatory/deferred -- confirm against decodeNonVGPRSrcOp's
// signature.
return addOperand(Inst, DAsm->decodeNonVGPRSrcOp(
OpWidth, Imm & 0xFF, true, ImmWidth,
(AMDGPU::OperandSemantics)OperandSemantics));
}

template <AMDGPUDisassembler::OpWidthTy OpWidth, unsigned ImmWidth,
unsigned OperandSemantics>
static DecodeStatus decodeOperand_VSrcT16(MCInst &Inst, unsigned Imm,
Expand Down
10 changes: 9 additions & 1 deletion llvm/lib/Target/AMDGPU/SIRegisterInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -1161,7 +1161,8 @@ def SSrcOrLds_b32 : SrcRegOrImm9 <SRegOrLds_32, "OPW32", "OPERAND_REG_IMM_INT32"
// Register-or-immediate operand whose DecoderMethod string is assembled as
//   <DecoderMethodName><AMDGPUDisassembler::<opWidth>, <immWidth>, <OperandSemantics>>
// DecoderMethodName defaults to "decodeSrcRegOrImmDeferred9"; a def can
// replace it with `let DecoderMethodName = ...` to select a different decoder
// template while reusing the same template-argument list.
class SrcRegOrImmDeferred9<RegisterClass regClass, string opWidth,
string operandType, int immWidth, int OperandSemantics>
: RegOrImmOperand<regClass, operandType> {
// NOTE(review): the next line appears to be the pre-change (deleted) side of
// the diff, shown here without its +/- marker; the two lines after it are
// its replacement.
let DecoderMethod = "decodeSrcRegOrImmDeferred9<AMDGPUDisassembler::" #
string DecoderMethodName = "decodeSrcRegOrImmDeferred9";
let DecoderMethod = DecoderMethodName # "<AMDGPUDisassembler::" #
opWidth # ", " # immWidth # ", " # OperandSemantics # ">";
}

Expand Down Expand Up @@ -1222,6 +1223,13 @@ def VSrc_bf16_Deferred : SrcRegOrImmDeferred9<VS_32, "OPW16", "OPERAND_REG_IMM_B
// f16 and f32 instances of SrcRegOrImmDeferred9 over VS_32. They differ in
// operand width (OPW16 vs OPW32), operand type, immediate width (16 vs 32)
// and OperandSemantics (FP16 vs FP32).
def VSrc_f16_Deferred : SrcRegOrImmDeferred9<VS_32, "OPW16", "OPERAND_REG_IMM_FP16_DEFERRED", 16, OperandSemantics.FP16>;
def VSrc_f32_Deferred : SrcRegOrImmDeferred9<VS_32, "OPW32", "OPERAND_REG_IMM_FP32_DEFERRED", 32, OperandSemantics.FP32>;

// True 16 Operands
// f16 source operand over the VS_16_Lo128 register class with the
// OPERAND_REG_IMM_FP16_DEFERRED operand type. It overrides the decoder name,
// so DecoderMethod expands to decodeOperand_VSrcT16_Lo128_Deferred<...>, and
// it encodes through getMachineOpValueT16Lo128.
def VSrcT_f16_Lo128_Deferred : SrcRegOrImmDeferred9<VS_16_Lo128, "OPW16",
"OPERAND_REG_IMM_FP16_DEFERRED", 16, OperandSemantics.FP16> {
let DecoderMethodName = "decodeOperand_VSrcT16_Lo128_Deferred";
let EncoderMethod = "getMachineOpValueT16Lo128";
}

// bf16 instance over VS_32_Lo128. No DecoderMethodName override is given, so
// the class default decoder name is used.
def VSrcFake16_bf16_Lo128_Deferred
: SrcRegOrImmDeferred9<VS_32_Lo128, "OPW16", "OPERAND_REG_IMM_BF16_DEFERRED", 16, OperandSemantics.BF16>;
def VSrcFake16_f16_Lo128_Deferred
Expand Down
76 changes: 70 additions & 6 deletions llvm/lib/Target/AMDGPU/VOP2Instructions.td
Original file line number Diff line number Diff line change
Expand Up @@ -374,6 +374,12 @@ class VOP_MADAK <ValueType vt> : VOP_MADK_Base<vt> {
}

def VOP_MADAK_F16 : VOP_MADAK <f16>;
// f16 VOP_MADAK profile with both IsTrue16 and IsRealTrue16 set.
// The destination class comes from getVALUDstForVT with IsTrue16 = 1 and
// IsVOP3Encoding = 0. Ins32 operand order is src0, src1, imm.
def VOP_MADAK_F16_t16 : VOP_MADAK <f16> {
let IsTrue16 = 1;
let IsRealTrue16 = 1;
let DstRC = getVALUDstForVT<DstVT, 1/*IsTrue16*/, 0/*IsVOP3Encoding*/>.ret;
let Ins32 = (ins VSrcT_f16_Lo128_Deferred:$src0, VGPRSrc_16_Lo128:$src1, ImmOpType:$imm);
}
def VOP_MADAK_F16_fake16 : VOP_MADAK <f16> {
let IsTrue16 = 1;
let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
Expand All @@ -399,6 +405,12 @@ class VOP_MADMK <ValueType vt> : VOP_MADK_Base<vt> {
}

def VOP_MADMK_F16 : VOP_MADMK <f16>;
// f16 VOP_MADMK profile with both IsTrue16 and IsRealTrue16 set.
// Same operand classes as VOP_MADAK_F16_t16, but Ins32 places the immediate
// between the two sources: src0, imm, src1.
def VOP_MADMK_F16_t16 : VOP_MADMK <f16> {
let IsTrue16 = 1;
let IsRealTrue16 = 1;
let DstRC = getVALUDstForVT<DstVT, 1/*IsTrue16*/, 0/*IsVOP3Encoding*/>.ret;
let Ins32 = (ins VSrcT_f16_Lo128_Deferred:$src0, ImmOpType:$imm, VGPRSrc_16_Lo128:$src1);
}
def VOP_MADMK_F16_fake16 : VOP_MADMK <f16> {
let IsTrue16 = 1;
let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
Expand Down Expand Up @@ -467,6 +479,42 @@ class VOP_MAC <ValueType vt0, ValueType vt1=vt0> : VOPProfile <[vt0, vt1, vt1, v
}

def VOP_MAC_F16 : VOP_MAC <f16>;
def VOP_MAC_F16_t16 : VOP_MAC <f16> {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a lot of copy paste. Is there a way to avoid replicating so many lets?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see a way to reduce it much. These lets are pretty much standard for a true16 instruction. But we have the true16 lets, and all the VOP_MAC lets to support. We could inherit from VOPProfile_TRUE16 instead of VOP_MAC, but then we'd have all the same lets here as in VOP_MAC. One upside is when the default is switched to true16, we can delete 6 lines related to VOP3DPP.

Copy link
Contributor Author

@broxigarchen broxigarchen Nov 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks, Joe, for the explanation. Let me double-check whether some lines can be removed, but I think the majority of them might have to stay.

// Overrides applied to the f16 VOP_MAC profile. Every helper below is
// instantiated with IsTrue16 = 1 and/or IsFake16 = 0.
let IsTrue16 = 1;
let IsRealTrue16 = 1;
let HasOpSel = 1;
// Destination and sources for the fields suffixed "32"; $src2 reuses the
// VGPR source class derived from Src2VT.
let DstRC = VOPDstOperand_t16Lo128;
let Src0RC32 = getVOPSrc0ForVT<Src0VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret:$src2);
// DPP / DPP8 source classes, modifier operands and operand lists.
let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
let Src2DPP = getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
let Src0ModDPP = getSrcModDPP_t16<Src0VT, 0/*IsFake16*/>.ret;
let Src1ModDPP = getSrcModDPP_t16<Src1VT, 0/*IsFake16*/>.ret;
let Src2ModDPP = getSrcModDPP_t16<Src2VT, 0/*IsFake16*/>.ret;
let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret:$src2, // stub argument
dpp_ctrl:$dpp_ctrl, DppRowMask:$row_mask,
DppBankMask:$bank_mask, DppBoundCtrl:$bound_ctrl);
let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
getVregSrcForVT<Src2VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret:$src2, // stub argument
dpp8:$dpp8, Dpp8FI:$fi);
// Fields suffixed "64": the destination is requested with IsVOP3Encoding = 1.
let DstRC64 = getVALUDstForVT<DstVT, 1/*IsTrue16*/, 1/*IsVOP3Encoding*/>.ret;
let Src0RC64 = getVOP3SrcForVT<Src0VT, 1/*IsTrue16*/>.ret;
let Src1RC64 = getVOP3SrcForVT<Src1VT, 1/*IsTrue16*/>.ret;
// VOP3 DPP sources: src0 is given directly as VGPRSrc_16, while src1/src2 go
// through getVOP3DPPSrcForVT.
let Src0VOP3DPP = VGPRSrc_16;
let Src1VOP3DPP = getVOP3DPPSrcForVT<Src1VT, 0/*IsFake16*/>.ret;
let Src2VOP3DPP = getVOP3DPPSrcForVT<Src2VT, 0/*IsFake16*/>.ret;
let Src0ModVOP3DPP = getSrc0ModVOP3DPP<Src0VT, DstVT, 0/*IsFake16*/>.ret;
let Src1ModVOP3DPP = getSrcModVOP3DPP<Src1VT, 0/*IsFake16*/>.ret;
let Src2ModVOP3DPP = getSrcModVOP3DPP<Src2VT, 0/*IsFake16*/>.ret;
// Source-modifier operands for the non-DPP forms.
let Src0Mod = getSrc0Mod<Src0VT, DstVT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
let Src1Mod = getSrcMod<Src1VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
let Src2Mod = getSrcMod<Src2VT, 1/*IsTrue16*/, 0/*IsFake16*/>.ret;
}
def VOP_MAC_F16_fake16 : VOP_MAC <f16> {
let IsTrue16 = 1;
let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
Expand Down Expand Up @@ -998,6 +1046,9 @@ let FPDPRounding = 1, isReMaterializable = 1, FixedSize = 1 in {
// v_fmamk_f16 pseudos, one per True16Predicate value:
//   NotHasTrue16BitInsts -> V_FMAMK_F16        (profile VOP_MADMK_F16)
//   UseRealTrue16Insts   -> V_FMAMK_F16_t16    (profile VOP_MADMK_F16_t16)
//   UseFakeTrue16Insts   -> V_FMAMK_F16_fake16 (profile VOP_MADMK_F16_fake16)
// Only the first variant also sets SubtargetPredicate = isGFX10Plus here.
let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in {
def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">;
}
let True16Predicate = UseRealTrue16Insts in {
def V_FMAMK_F16_t16 : VOP2_Pseudo <"v_fmamk_f16_t16", VOP_MADMK_F16_t16, [], "">;
}
let True16Predicate = UseFakeTrue16Insts in {
def V_FMAMK_F16_fake16 : VOP2_Pseudo <"v_fmamk_f16_fake16", VOP_MADMK_F16_fake16, [], "">;
}
Expand All @@ -1006,6 +1057,9 @@ let isCommutable = 1 in {
// v_fmaak_f16 pseudos, one per True16Predicate value:
//   NotHasTrue16BitInsts -> V_FMAAK_F16        (profile VOP_MADAK_F16)
//   UseRealTrue16Insts   -> V_FMAAK_F16_t16    (profile VOP_MADAK_F16_t16)
//   UseFakeTrue16Insts   -> V_FMAAK_F16_fake16 (profile VOP_MADAK_F16_fake16)
// Only the first variant also sets SubtargetPredicate = isGFX10Plus here.
let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in {
def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">;
}
let True16Predicate = UseRealTrue16Insts in {
def V_FMAAK_F16_t16 : VOP2_Pseudo <"v_fmaak_f16_t16", VOP_MADAK_F16_t16, [], "">;
}
let True16Predicate = UseFakeTrue16Insts in {
def V_FMAAK_F16_fake16 : VOP2_Pseudo <"v_fmaak_f16_fake16", VOP_MADAK_F16_fake16, [], "">;
}
Expand All @@ -1020,6 +1074,9 @@ let SubtargetPredicate = isGFX10Plus in {
// v_fmac_f16 instantiated through VOP2Inst, one per True16Predicate value:
//   NotHasTrue16BitInsts -> V_FMAC_F16        (profile VOP_MAC_F16)
//   UseRealTrue16Insts   -> V_FMAC_F16_t16    (profile VOP_MAC_F16_t16)
//   UseFakeTrue16Insts   -> V_FMAC_F16_fake16 (profile VOP_MAC_F16_fake16)
let True16Predicate = NotHasTrue16BitInsts in {
defm V_FMAC_F16 : VOP2Inst <"v_fmac_f16", VOP_MAC_F16>;
}
let True16Predicate = UseRealTrue16Insts in {
defm V_FMAC_F16_t16 : VOP2Inst <"v_fmac_f16_t16", VOP_MAC_F16_t16>;
}
let True16Predicate = UseFakeTrue16Insts in {
defm V_FMAC_F16_fake16 : VOP2Inst <"v_fmac_f16_fake16", VOP_MAC_F16_fake16>;
}
Expand Down Expand Up @@ -1692,8 +1749,8 @@ multiclass VOP3Only_Realtriple_t16_gfx11_gfx12<bits<10> op, string asmName, stri
VOP3_Realtriple_t16_gfx12<op, asmName, OpName, "", /*IsSingle*/1>;

// Instantiates VOP3Only_Realtriple_t16_gfx11_gfx12 twice with the same op and
// asmName: once for OpName#"_t16" and once for OpName#"_fake16".
multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<bits<10> op, string asmName, string OpName = NAME> {
// NOTE(review): the next two lines appear to be the pre-change (deleted) side
// of the diff, shown without +/- markers; the `defm _t16` / `defm _fake16`
// lines that follow are their replacements.
defm OpName#"_t16": VOP3Only_Realtriple_t16_gfx11_gfx12<op, asmName, OpName#"_t16">;
defm OpName#"_fake16": VOP3Only_Realtriple_t16_gfx11_gfx12<op, asmName, OpName#"_fake16">;
defm _t16: VOP3Only_Realtriple_t16_gfx11_gfx12<op, asmName, OpName#"_t16">;
defm _fake16: VOP3Only_Realtriple_t16_gfx11_gfx12<op, asmName, OpName#"_fake16">;
}

multiclass VOP3beOnly_Realtriple_gfx11_gfx12<bits<10> op> :
Expand All @@ -1712,7 +1769,14 @@ multiclass VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12<bits<6> op, string asmN

// Real-instruction multiclass taking a 6-bit op, an asm name and an optional
// opName (defaulting to NAME). It has no body of its own and only inherits
// from the parents listed below.
multiclass VOP2_Real_FULL_t16_gfx11_gfx12<bits<6> op, string asmName,
string opName = NAME> :
// NOTE(review): the next line appears to be the pre-change (deleted) parent
// list, shown without its +/- marker; the GFX11Gen / GFX12Gen pair after it
// is the replacement.
VOP2_Real_FULL_with_name_gfx11_gfx12<op, opName, asmName>;
VOP2_Real_FULL_with_name<GFX11Gen, op, opName, asmName>,
VOP2_Real_FULL_with_name<GFX12Gen, op, opName, asmName>;

// Instantiates VOP2_Real_FULL_t16_gfx11_gfx12 twice with the same op and
// asmName: the `_t16` defm passes opName#"_t16" and the `_fake16` defm passes
// opName#"_fake16", so one `defm V_X : ...` covers both variants.
multiclass VOP2_Real_FULL_t16_and_fake16_gfx11_gfx12<bits<6> op, string asmName,
string opName = NAME> {
defm _t16: VOP2_Real_FULL_t16_gfx11_gfx12<op, asmName, opName#"_t16">;
defm _fake16: VOP2_Real_FULL_t16_gfx11_gfx12<op, asmName, opName#"_fake16">;
}

// Shorthand for VOP2_Real_FULL with the generation fixed to GFX11Gen.
multiclass VOP2_Real_FULL_gfx11<bits<6> op> :
VOP2_Real_FULL<GFX11Gen, op>;
Expand Down Expand Up @@ -1747,15 +1811,15 @@ defm V_SUBREV_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x034, "v_subrev_f16
// Real-instruction definitions. In each defm the first template argument is
// the opcode and the second is the asm mnemonic; a `_t16` / `_fake16` pair
// shares the same opcode and mnemonic.
defm V_SUBREV_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x034, "v_subrev_f16">;
defm V_MUL_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">;
defm V_MUL_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">;
// NOTE(review): the next line appears to be the pre-change (deleted) side of
// the diff; the V_FMAC_F16 line after it replaces it and produces both the
// _t16 and _fake16 variants through the *_t16_and_fake16 multiclass.
defm V_FMAC_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x036, "v_fmac_f16">;
defm V_FMAC_F16 : VOP2_Real_FULL_t16_and_fake16_gfx11_gfx12<0x036, "v_fmac_f16">;
defm V_LDEXP_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">;
defm V_LDEXP_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">;
defm V_MAX_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">;
defm V_MAX_F16_fake16 : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">;
defm V_MIN_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x03a, "v_min_f16">;
defm V_MIN_F16_fake16 : VOP2_Real_FULL_t16_gfx11<0x03a, "v_min_f16">;
// NOTE(review): the next two lines appear to be the pre-change (deleted) side
// of the diff; the V_FMAMK_F16 / V_FMAAK_F16 lines after them are the
// replacements using the *_t16_and_fake16 multiclass.
defm V_FMAMK_F16_fake16 : VOP2Only_Real_MADK_t16_gfx11_gfx12<0x037, "v_fmamk_f16">;
defm V_FMAAK_F16_fake16 : VOP2Only_Real_MADK_t16_gfx11_gfx12<0x038, "v_fmaak_f16">;
defm V_FMAMK_F16 : VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12<0x037, "v_fmamk_f16">;
defm V_FMAAK_F16 : VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12<0x038, "v_fmaak_f16">;

// VOP3 only.
defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11_gfx12<0x25d>;
Expand Down
Loading
Loading