Skip to content

Commit 4891262

Browse files
jrbyrnesDavid Salinas
authored andcommitted
[AMDGPU] Correctly insert s_nops for dst forwarding hazard (llvm#100276)
MI300 ISA section 4.5 states there is a hazard between "VALU op which uses OPSEL or SDWA with changes the result’s bit position" and "VALU op consumes result of that op" This includes the case where the second op is SDWA with same dest and dst_sel != DWORD && dst_unused == UNUSED_PRESERVE. In this case, there is an implicit read of the first op dst and the compiler needs to resolve this hazard. Confirmed with HW team. We model dst_unused == UNUSED_PRESERVE as tied-def of implicit operand, so this PR checks for that. MI300_SP_MAS section 1.3.9.2 specifies that CVT_SR_FP8_F32 and CVT_SR_BF8_F32 with opsel[3:2] !=0 have dest forwarding issue. Currently, we only add check for CVT_SR_FP8_F32 with opsel[3] != 0 -- this PR adds support opsel[2] != 0 as well Change-Id: I43faede4f816f0602000accd5e6ed962c0dbee8b
1 parent c3940cd commit 4891262

File tree

8 files changed

+579
-22
lines changed

8 files changed

+579
-22
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 112 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -875,13 +875,78 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
875875
return DataIdx >= 0 &&
876876
TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
877877
};
878+
878879
int WaitStatesNeededForDef =
879880
VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
880881
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
881882

882883
return WaitStatesNeeded;
883884
}
884885

886+
/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
887+
/// pack the computed value into correct bit position of the dest register. This
888+
/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
889+
/// dst_sel that is not aligned to the register. This function analayzes the \p
890+
/// MI and \returns an operand with dst forwarding issue, or nullptr if
891+
/// none exists.
892+
static const MachineOperand *
893+
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
894+
if (!SIInstrInfo::isVALU(MI))
895+
return nullptr;
896+
897+
const SIInstrInfo *TII = ST.getInstrInfo();
898+
899+
unsigned Opcode = MI.getOpcode();
900+
901+
// There are three different types of instructions
902+
// which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
903+
// which write hi bits (e.g. op_sel[3] == 1), and 3. CVR_SR_FP8_F32 and
904+
// CVT_SR_BF8_F32 with op_sel[3:2]
905+
// != 0
906+
if (SIInstrInfo::isSDWA(MI)) {
907+
// Type 1: SDWA with dst_sel != DWORD
908+
if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
909+
if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
910+
return nullptr;
911+
} else {
912+
// Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
913+
// CVT_SR_BF8_F32 with op_sel[3:2] != 0)
914+
if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
915+
!(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
916+
SISrcMods::DST_OP_SEL ||
917+
(AMDGPU::isFP8DstSelInst(Opcode) &&
918+
(TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
919+
SISrcMods::OP_SEL_0))))
920+
return nullptr;
921+
}
922+
923+
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
924+
}
925+
926+
/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
927+
/// fowarding issue \p Dst . We may "consume" the Dst via a standard explicit
928+
/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
929+
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
930+
const MachineOperand *Dst,
931+
const SIRegisterInfo *TRI) {
932+
// We must consider implicit reads of the VALU. SDWA with dst_sel and
933+
// UNUSED_PRESERVE will implicitly read the result from forwarded dest,
934+
// and we must account for that hazard.
935+
// We also must account for WAW hazards. In particular, WAW with dest
936+
// preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
937+
// !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
938+
// check for ECC. Without accounting for this hazard, the ECC will be
939+
// wrong.
940+
// TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
941+
// complete zeroesHigh16BitsOfDest)
942+
for (auto &Operand : VALU->operands()) {
943+
if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
944+
return true;
945+
}
946+
}
947+
return false;
948+
}
949+
885950
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
886951
int WaitStatesNeeded = 0;
887952

@@ -912,27 +977,18 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
912977
if (ST.hasDstSelForwardingHazard()) {
913978
const int Shift16DefWaitstates = 1;
914979

915-
auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
916-
if (!SIInstrInfo::isVALU(MI))
917-
return false;
918-
const SIInstrInfo *TII = ST.getInstrInfo();
919-
if (SIInstrInfo::isSDWA(MI)) {
920-
if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
921-
if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
922-
return false;
923-
} else {
924-
if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
925-
!(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
926-
->getImm() &
927-
SISrcMods::DST_OP_SEL))
928-
return false;
929-
}
980+
auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
930981
const SIRegisterInfo *TRI = ST.getRegisterInfo();
931-
if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
932-
Register Def = Dst->getReg();
982+
const MachineOperand *ForwardedDst =
983+
getDstSelForwardingOperand(ProducerMI, ST);
984+
if (ForwardedDst) {
985+
return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
986+
}
933987

934-
for (const MachineOperand &Use : VALU->explicit_uses()) {
935-
if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
988+
if (ProducerMI.isInlineAsm()) {
989+
// Assume inline asm has dst forwarding hazard
990+
for (auto &Def : ProducerMI.all_defs()) {
991+
if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
936992
return true;
937993
}
938994
}
@@ -1029,7 +1085,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
10291085
// problematic thus far.
10301086

10311087
// see checkVALUHazards()
1032-
if (!ST.has12DWordStoreHazard())
1088+
if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
10331089
return 0;
10341090

10351091
const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -1038,11 +1094,45 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
10381094
for (const MachineOperand &Op :
10391095
llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
10401096
if (Op.isReg() && Op.isDef()) {
1041-
WaitStatesNeeded =
1042-
std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1097+
if (!TRI.isVectorRegister(MRI, Op.getReg()))
1098+
continue;
1099+
1100+
if (ST.has12DWordStoreHazard()) {
1101+
WaitStatesNeeded =
1102+
std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1103+
}
10431104
}
10441105
}
10451106

1107+
if (ST.hasDstSelForwardingHazard()) {
1108+
const int Shift16DefWaitstates = 1;
1109+
1110+
auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1111+
const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1112+
// Assume inline asm reads the dst
1113+
if (Dst)
1114+
return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1115+
IA->readsRegister(Dst->getReg(), &TRI);
1116+
1117+
if (ProducerMI.isInlineAsm()) {
1118+
// If MI is inline asm, assume it has dst forwarding hazard
1119+
for (auto &Def : ProducerMI.all_defs()) {
1120+
if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1121+
IA->readsRegister(Def.getReg(), &TRI)) {
1122+
return true;
1123+
}
1124+
}
1125+
}
1126+
1127+
return false;
1128+
};
1129+
1130+
int WaitStatesNeededForDef =
1131+
Shift16DefWaitstates -
1132+
getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1133+
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1134+
}
1135+
10461136
return WaitStatesNeeded;
10471137
}
10481138

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2325,6 +2325,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
23252325

23262326
field bit IsFP8SrcByteSel = 0;
23272327
field bit IsFP8DstByteSel = 0;
2328+
field bit HasFP8DstByteSel = 0;
23282329
field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel);
23292330

23302331
field bit HasDst = !ne(DstVT.Value, untyped.Value);
@@ -2904,6 +2905,15 @@ def getVCMPXOpFromVCMP : InstrMapping {
29042905
let ValueCols = [["1"]];
29052906
}
29062907

2908+
def FP8DstByteSelTable : GenericTable {
2909+
let FilterClass = "VOP3_Pseudo";
2910+
let CppTypeName = "FP8DstByteSelInfo";
2911+
let Fields = ["Opcode", "HasFP8DstByteSel"];
2912+
2913+
let PrimaryKey = ["Opcode"];
2914+
let PrimaryKeyName = "getFP8DstByteSelHelper";
2915+
}
2916+
29072917
def VOPDComponentTable : GenericTable {
29082918
let FilterClass = "VOPD_Component";
29092919
let CppTypeName = "VOPDComponentInfo";

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,13 @@ struct SingleUseExceptionInfo {
385385
bool IsInvalidSingleUseProducer;
386386
};
387387

388+
struct FP8DstByteSelInfo {
389+
uint16_t Opcode;
390+
bool HasFP8DstByteSel;
391+
};
392+
393+
#define GET_FP8DstByteSelTable_DECL
394+
#define GET_FP8DstByteSelTable_IMPL
388395
#define GET_MTBUFInfoTable_DECL
389396
#define GET_MTBUFInfoTable_IMPL
390397
#define GET_MUBUFInfoTable_DECL
@@ -629,6 +636,11 @@ bool isInvalidSingleUseProducerInst(unsigned Opc) {
629636
return Info && Info->IsInvalidSingleUseProducer;
630637
}
631638

639+
bool isFP8DstSelInst(unsigned Opc) {
640+
const FP8DstByteSelInfo *Info = getFP8DstByteSelHelper(Opc);
641+
return Info ? Info->HasFP8DstByteSel : false;
642+
}
643+
632644
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
633645
const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
634646
return Info ? Info->Opcode3Addr : ~0u;

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -861,6 +861,9 @@ getVOPDInstInfo(unsigned VOPDOpcode, const MCInstrInfo *InstrInfo);
861861
LLVM_READONLY
862862
bool isTrue16Inst(unsigned Opc);
863863

864+
LLVM_READONLY
865+
bool isFP8DstSelInst(unsigned Opc);
866+
864867
LLVM_READONLY
865868
bool isInvalidSingleUseConsumerInst(unsigned Opc);
866869

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -568,6 +568,7 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
568568
let HasSrc2Mods = 1;
569569
let HasExtVOP3DPP = 1;
570570
let HasOpSel = 1;
571+
let HasFP8DstByteSel = 1;
571572
let AsmVOP3OpSel = !subst(", $src2_modifiers", "",
572573
getAsmVOP3OpSel<3, HasClamp, HasOMod,
573574
HasSrc0FloatMods, HasSrc1FloatMods,
@@ -587,6 +588,7 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
587588
class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT> :
588589
VOP3_Profile<VOPProfile<[i32, SrcVT, i32, untyped]>> {
589590
let IsFP8DstByteSel = 1;
591+
let HasFP8DstByteSel = 1;
590592
let HasClamp = 0;
591593
defvar bytesel = (ins VGPR_32:$vdst_in, ByteSel:$byte_sel);
592594
let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,

llvm/lib/Target/AMDGPU/VOPInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
113113
let IsWMMA = P.IsWMMA;
114114
let IsSWMMAC = P.IsSWMMAC;
115115

116+
bit HasFP8DstByteSel = P.HasFP8DstByteSel;
117+
116118
let AsmOperands = !if(isVop3OpSel,
117119
P.AsmVOP3OpSel,
118120
!if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));

0 commit comments

Comments
 (0)