Skip to content

Commit 8d7c2cc

Browse files
arsenm, pravinjagtap, jrbyrnes
committed
AMDGPU: Handle cvt_scale F32/F16->F4/F8 gfx950 hazard (llvm#117844)
The gfx950 SP changes doc says: there is no 4-clk forwarding on opcodes that convert from F32/F16 -> F8 or F32/F16 -> F4. A NOP, or an instruction writing some other destination VREG, must be inserted after a conversion to F4/F8, since such a conversion writes only the low/high half or individual bytes of its destination register.
Co-authored-by: Pravin Jagtap <[email protected]>
Co-authored-by: Jeffrey Byrnes <[email protected]>
1 parent 7f10cac commit 8d7c2cc

File tree

7 files changed

+433
-7
lines changed

7 files changed

+433
-7
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -907,17 +907,18 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
907907

908908
// There are three different types of instructions
909909
// which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
910-
// which write hi bits (e.g. op_sel[3] == 1), and 3. CVR_SR_FP8_F32 and
911-
// CVT_SR_BF8_F32 with op_sel[3:2]
910+
// which write hi bits (e.g. op_sel[3] == 1), and 3. FP8DstSelInst
911+
// (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) with
912+
// op_sel[3:2]
912913
// != 0
913914
if (SIInstrInfo::isSDWA(MI)) {
914915
// Type 1: SDWA with dst_sel != DWORD
915916
if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
916917
if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
917918
return nullptr;
918919
} else {
919-
// Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
920-
// CVT_SR_BF8_F32 with op_sel[3:2] != 0)
920+
// Type 2 && Type 3: (VOP3 which write the hi bits) || (FP8DstSelInst
921+
// with op_sel[3:2] != 0)
921922
if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
922923
!(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
923924
SISrcMods::DST_OP_SEL ||
@@ -981,7 +982,7 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
981982
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
982983
}
983984

984-
if (ST.hasDstSelForwardingHazard()) {
985+
if (ST.hasDstSelForwardingHazard() || ST.hasCvtScaleForwardingHazard()) {
985986
const int Shift16DefWaitstates = 1;
986987

987988
auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
@@ -1092,7 +1093,8 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
10921093
// problematic thus far.
10931094

10941095
// see checkVALUHazards()
1095-
if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
1096+
if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard() &&
1097+
!ST.hasCvtScaleForwardingHazard())
10961098
return 0;
10971099

10981100
const MachineRegisterInfo &MRI = MF.getRegInfo();

llvm/lib/Target/AMDGPU/GCNSubtarget.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1265,6 +1265,8 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
12651265

12661266
bool hasVALUTransUseHazard() const { return HasVALUTransUseHazard; }
12671267

1268+
bool hasCvtScaleForwardingHazard() const { return GFX950Insts; }
1269+
12681270
bool hasForceStoreSC0SC1() const { return HasForceStoreSC0SC1; }
12691271

12701272
bool requiresCodeObjectV6() const { return RequiresCOV6; }

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,14 @@ struct SingleUseExceptionInfo {
385385
bool IsInvalidSingleUseProducer;
386386
};
387387

388+
#define GET_FP8DstByteSelTable_DECL
389+
#define GET_FP8DstByteSelTable_IMPL
390+
391+
struct DPMACCInstructionInfo {
392+
uint16_t Opcode;
393+
bool IsDPMACCInstruction;
394+
};
395+
388396
struct FP8DstByteSelInfo {
389397
uint16_t Opcode;
390398
bool HasFP8DstByteSel;
@@ -427,6 +435,8 @@ struct FP8DstByteSelInfo {
427435
#define GET_getMFMA_F8F6F4_WithSize_DECL
428436
#define GET_getMFMA_F8F6F4_WithSize_IMPL
429437
#define GET_isMFMA_F8F6F4Table_IMPL
438+
#define GET_isCvtScaleF32_F32F16ToF8F4Table_IMPL
439+
430440
#include "AMDGPUGenSearchableTables.inc"
431441

432442
int getMTBUFBaseOpcode(unsigned Opc) {

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,10 @@ struct MFMA_F8F6F4_Info {
103103
uint8_t NumRegsSrcB;
104104
};
105105

106+
struct CvtScaleF32_F32F16ToF8F4_Info {
107+
unsigned Opcode;
108+
};
109+
106110
#define GET_MIMGBaseOpcode_DECL
107111
#define GET_MIMGDim_DECL
108112
#define GET_MIMGEncoding_DECL
@@ -112,6 +116,7 @@ struct MFMA_F8F6F4_Info {
112116
#define GET_MAIInstInfoTable_DECL
113117
#define GET_MAIInstInfoTable_DECL
114118
#define GET_isMFMA_F8F6F4Table_DECL
119+
#define GET_isCvtScaleF32_F32F16ToF8F4Table_DECL
115120
#include "AMDGPUGenSearchableTables.inc"
116121

117122
namespace IsaInfo {

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -973,11 +973,16 @@ class VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOPProfile P> : VOP3_Profil
973973
let HasOMod = 0;
974974
}
975975

976+
class VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<P> {
977+
let HasFP8DstByteSel = 1;
978+
}
979+
976980
class VOP3_CVT_SCALE_SR_F8BF8_F16BF16F32_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<P> {
977981
let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
978982
Int32InputMods:$src1_modifiers, Src1RC64:$src1,
979983
FP32InputMods:$src2_modifiers, Src2RC64:$src2,
980984
VGPR_32:$vdst_in, op_sel0:$op_sel);
985+
let HasFP8DstByteSel = 1;
981986
}
982987

983988

@@ -995,6 +1000,7 @@ class VOP3_CVT_SCALE_FP4_F16BF16_TiedInput_Profile<VOPProfile P> : VOP3_Profile<
9951000
HasSrc0FloatMods, HasSrc1FloatMods,
9961001
HasSrc2FloatMods>.ret);
9971002
let HasExtVOP3DPP = 0;
1003+
let HasFP8DstByteSel = 1;
9981004
}
9991005

10001006
class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
@@ -1007,6 +1013,7 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
10071013
let HasExtVOP3DPP = 0;
10081014
let HasOpSel = 1;
10091015
let HasOMod = 0;
1016+
let HasFP8DstByteSel = 1;
10101017
}
10111018

10121019
def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
@@ -1018,6 +1025,7 @@ def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32
10181025
let HasExtVOP3DPP = 0;
10191026
let HasOpSel = 1;
10201027
let HasOMod = 0;
1028+
let HasFP8DstByteSel = 1;
10211029
}
10221030

10231031
class VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<ValueType DstTy> : VOP3_Profile<VOPProfile<[DstTy, i32, f32, untyped]>,
@@ -1093,7 +1101,7 @@ let SubtargetPredicate = HasBF8ConversionScaleInsts, mayRaiseFPException = 0 in
10931101
let SubtargetPredicate = HasFP4ConversionScaleInsts, mayRaiseFPException = 0 in {
10941102
defm V_CVT_SCALEF32_PK_F32_FP4 : VOP3Inst<"v_cvt_scalef32_pk_f32_fp4", VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<v2f32>>;
10951103
let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in {
1096-
defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4FP8BF8_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
1104+
defm V_CVT_SCALEF32_PK_FP4_F32 : VOP3Inst<"v_cvt_scalef32_pk_fp4_f32", VOP3_CVT_SCALE_FP4_F32_TiedInput_Profile<VOP_I32_F32_F32_F32>>;
10971105
let Constraints = "@earlyclobber $vdst" in {
10981106
defm V_CVT_SCALEF32_SR_PK_FP4_F16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_f16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2f16>>;
10991107
defm V_CVT_SCALEF32_SR_PK_FP4_BF16: VOP3Inst<"v_cvt_scalef32_sr_pk_fp4_bf16", VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<v2bf16>>;
@@ -2027,6 +2035,7 @@ multiclass VOP3_Real_BITOP3_gfx9<bits<10> op, string AsmName, bit isSingle = 0>
20272035
}
20282036
}
20292037
}
2038+
20302039
} // End AssemblerPredicate = isGFX9Only, DecoderNamespace = "GFX9"
20312040

20322041
defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>;

0 commit comments

Comments
 (0)