Skip to content

Commit 7bcf4d6

Browse files
authored
[AMDGPU] Correctly insert s_nops for dst forwarding hazard (#100276)
MI300 ISA section 4.5 states there is a hazard between "VALU op which uses OPSEL or SDWA with changes the result’s bit position" and "VALU op consumes result of that op" This includes the case where the second op is SDWA with same dest and dst_sel != DWORD && dst_unused == UNUSED_PRESERVE. In this case, there is an implicit read of the first op dst and the compiler needs to resolve this hazard. Confirmed with HW team. We model dst_unused == UNUSED_PRESERVE as tied-def of implicit operand, so this PR checks for that. MI300_SP_MAS section 1.3.9.2 specifies that CVT_SR_FP8_F32 and CVT_SR_BF8_F32 with opsel[3:2] !=0 have dest forwarding issue. Currently, we only add check for CVT_SR_FP8_F32 with opsel[3] != 0 -- this PR adds support opsel[2] != 0 as well
1 parent e5140ae commit 7bcf4d6

File tree

8 files changed

+579
-22
lines changed

8 files changed

+579
-22
lines changed

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Lines changed: 112 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -876,13 +876,78 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
876876
return DataIdx >= 0 &&
877877
TRI->regsOverlap(MI.getOperand(DataIdx).getReg(), Reg);
878878
};
879+
879880
int WaitStatesNeededForDef =
880881
VALUWaitStates - getWaitStatesSince(IsHazardFn, VALUWaitStates);
881882
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
882883

883884
return WaitStatesNeeded;
884885
}
885886

887+
/// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
888+
/// pack the computed value into correct bit position of the dest register. This
889+
/// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
890+
/// dst_sel that is not aligned to the register. This function analayzes the \p
891+
/// MI and \returns an operand with dst forwarding issue, or nullptr if
892+
/// none exists.
893+
static const MachineOperand *
894+
getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
895+
if (!SIInstrInfo::isVALU(MI))
896+
return nullptr;
897+
898+
const SIInstrInfo *TII = ST.getInstrInfo();
899+
900+
unsigned Opcode = MI.getOpcode();
901+
902+
// There are three different types of instructions
903+
// which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
904+
// which write hi bits (e.g. op_sel[3] == 1), and 3. CVR_SR_FP8_F32 and
905+
// CVT_SR_BF8_F32 with op_sel[3:2]
906+
// != 0
907+
if (SIInstrInfo::isSDWA(MI)) {
908+
// Type 1: SDWA with dst_sel != DWORD
909+
if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
910+
if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
911+
return nullptr;
912+
} else {
913+
// Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
914+
// CVT_SR_BF8_F32 with op_sel[3:2] != 0)
915+
if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
916+
!(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
917+
SISrcMods::DST_OP_SEL ||
918+
(AMDGPU::isFP8DstSelInst(Opcode) &&
919+
(TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
920+
SISrcMods::OP_SEL_0))))
921+
return nullptr;
922+
}
923+
924+
return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
925+
}
926+
927+
/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
928+
/// fowarding issue \p Dst . We may "consume" the Dst via a standard explicit
929+
/// RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
930+
static bool consumesDstSelForwardingOperand(const MachineInstr *VALU,
931+
const MachineOperand *Dst,
932+
const SIRegisterInfo *TRI) {
933+
// We must consider implicit reads of the VALU. SDWA with dst_sel and
934+
// UNUSED_PRESERVE will implicitly read the result from forwarded dest,
935+
// and we must account for that hazard.
936+
// We also must account for WAW hazards. In particular, WAW with dest
937+
// preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
938+
// !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
939+
// check for ECC. Without accounting for this hazard, the ECC will be
940+
// wrong.
941+
// TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
942+
// complete zeroesHigh16BitsOfDest)
943+
for (auto &Operand : VALU->operands()) {
944+
if (Operand.isReg() && TRI->regsOverlap(Dst->getReg(), Operand.getReg())) {
945+
return true;
946+
}
947+
}
948+
return false;
949+
}
950+
886951
int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
887952
int WaitStatesNeeded = 0;
888953

@@ -913,27 +978,18 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
913978
if (ST.hasDstSelForwardingHazard()) {
914979
const int Shift16DefWaitstates = 1;
915980

916-
auto IsShift16BitDefFn = [this, VALU](const MachineInstr &MI) {
917-
if (!SIInstrInfo::isVALU(MI))
918-
return false;
919-
const SIInstrInfo *TII = ST.getInstrInfo();
920-
if (SIInstrInfo::isSDWA(MI)) {
921-
if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
922-
if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
923-
return false;
924-
} else {
925-
if (!AMDGPU::hasNamedOperand(MI.getOpcode(), AMDGPU::OpName::op_sel) ||
926-
!(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)
927-
->getImm() &
928-
SISrcMods::DST_OP_SEL))
929-
return false;
930-
}
981+
auto IsShift16BitDefFn = [this, VALU](const MachineInstr &ProducerMI) {
931982
const SIRegisterInfo *TRI = ST.getRegisterInfo();
932-
if (auto *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst)) {
933-
Register Def = Dst->getReg();
983+
const MachineOperand *ForwardedDst =
984+
getDstSelForwardingOperand(ProducerMI, ST);
985+
if (ForwardedDst) {
986+
return consumesDstSelForwardingOperand(VALU, ForwardedDst, TRI);
987+
}
934988

935-
for (const MachineOperand &Use : VALU->explicit_uses()) {
936-
if (Use.isReg() && TRI->regsOverlap(Def, Use.getReg()))
989+
if (ProducerMI.isInlineAsm()) {
990+
// Assume inline asm has dst forwarding hazard
991+
for (auto &Def : ProducerMI.all_defs()) {
992+
if (consumesDstSelForwardingOperand(VALU, &Def, TRI))
937993
return true;
938994
}
939995
}
@@ -1030,7 +1086,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
10301086
// problematic thus far.
10311087

10321088
// see checkVALUHazards()
1033-
if (!ST.has12DWordStoreHazard())
1089+
if (!ST.has12DWordStoreHazard() && !ST.hasDstSelForwardingHazard())
10341090
return 0;
10351091

10361092
const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -1039,11 +1095,45 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
10391095
for (const MachineOperand &Op :
10401096
llvm::drop_begin(IA->operands(), InlineAsm::MIOp_FirstOperand)) {
10411097
if (Op.isReg() && Op.isDef()) {
1042-
WaitStatesNeeded =
1043-
std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1098+
if (!TRI.isVectorRegister(MRI, Op.getReg()))
1099+
continue;
1100+
1101+
if (ST.has12DWordStoreHazard()) {
1102+
WaitStatesNeeded =
1103+
std::max(WaitStatesNeeded, checkVALUHazardsHelper(Op, MRI));
1104+
}
10441105
}
10451106
}
10461107

1108+
if (ST.hasDstSelForwardingHazard()) {
1109+
const int Shift16DefWaitstates = 1;
1110+
1111+
auto IsShift16BitDefFn = [this, &IA](const MachineInstr &ProducerMI) {
1112+
const MachineOperand *Dst = getDstSelForwardingOperand(ProducerMI, ST);
1113+
// Assume inline asm reads the dst
1114+
if (Dst)
1115+
return IA->modifiesRegister(Dst->getReg(), &TRI) ||
1116+
IA->readsRegister(Dst->getReg(), &TRI);
1117+
1118+
if (ProducerMI.isInlineAsm()) {
1119+
// If MI is inline asm, assume it has dst forwarding hazard
1120+
for (auto &Def : ProducerMI.all_defs()) {
1121+
if (IA->modifiesRegister(Def.getReg(), &TRI) ||
1122+
IA->readsRegister(Def.getReg(), &TRI)) {
1123+
return true;
1124+
}
1125+
}
1126+
}
1127+
1128+
return false;
1129+
};
1130+
1131+
int WaitStatesNeededForDef =
1132+
Shift16DefWaitstates -
1133+
getWaitStatesSince(IsShift16BitDefFn, Shift16DefWaitstates);
1134+
WaitStatesNeeded = std::max(WaitStatesNeeded, WaitStatesNeededForDef);
1135+
}
1136+
10471137
return WaitStatesNeeded;
10481138
}
10491139

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2342,6 +2342,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
23422342

23432343
field bit IsFP8SrcByteSel = 0;
23442344
field bit IsFP8DstByteSel = 0;
2345+
field bit HasFP8DstByteSel = 0;
23452346
field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel);
23462347

23472348
field bit HasDst = !ne(DstVT.Value, untyped.Value);
@@ -2921,6 +2922,15 @@ def getVCMPXOpFromVCMP : InstrMapping {
29212922
let ValueCols = [["1"]];
29222923
}
29232924

2925+
def FP8DstByteSelTable : GenericTable {
2926+
let FilterClass = "VOP3_Pseudo";
2927+
let CppTypeName = "FP8DstByteSelInfo";
2928+
let Fields = ["Opcode", "HasFP8DstByteSel"];
2929+
2930+
let PrimaryKey = ["Opcode"];
2931+
let PrimaryKeyName = "getFP8DstByteSelHelper";
2932+
}
2933+
29242934
def VOPDComponentTable : GenericTable {
29252935
let FilterClass = "VOPD_Component";
29262936
let CppTypeName = "VOPDComponentInfo";

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,13 @@ struct SingleUseExceptionInfo {
385385
bool IsInvalidSingleUseProducer;
386386
};
387387

388+
struct FP8DstByteSelInfo {
389+
uint16_t Opcode;
390+
bool HasFP8DstByteSel;
391+
};
392+
393+
#define GET_FP8DstByteSelTable_DECL
394+
#define GET_FP8DstByteSelTable_IMPL
388395
#define GET_MTBUFInfoTable_DECL
389396
#define GET_MTBUFInfoTable_IMPL
390397
#define GET_MUBUFInfoTable_DECL
@@ -629,6 +636,11 @@ bool isInvalidSingleUseProducerInst(unsigned Opc) {
629636
return Info && Info->IsInvalidSingleUseProducer;
630637
}
631638

639+
bool isFP8DstSelInst(unsigned Opc) {
640+
const FP8DstByteSelInfo *Info = getFP8DstByteSelHelper(Opc);
641+
return Info ? Info->HasFP8DstByteSel : false;
642+
}
643+
632644
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
633645
const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
634646
return Info ? Info->Opcode3Addr : ~0u;

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -861,6 +861,9 @@ getVOPDInstInfo(unsigned VOPDOpcode, const MCInstrInfo *InstrInfo);
861861
LLVM_READONLY
862862
bool isTrue16Inst(unsigned Opc);
863863

864+
LLVM_READONLY
865+
bool isFP8DstSelInst(unsigned Opc);
866+
864867
LLVM_READONLY
865868
bool isInvalidSingleUseConsumerInst(unsigned Opc);
866869

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -568,6 +568,7 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
568568
let HasSrc2Mods = 1;
569569
let HasExtVOP3DPP = 1;
570570
let HasOpSel = 1;
571+
let HasFP8DstByteSel = 1;
571572
let AsmVOP3OpSel = !subst(", $src2_modifiers", "",
572573
getAsmVOP3OpSel<3, HasClamp, HasOMod,
573574
HasSrc0FloatMods, HasSrc1FloatMods,
@@ -587,6 +588,7 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile<VOPProfile<[i32, f32, i32, f32]>,
587588
class VOP3_CVT_SR_F8_ByteSel_Profile<ValueType SrcVT> :
588589
VOP3_Profile<VOPProfile<[i32, SrcVT, i32, untyped]>> {
589590
let IsFP8DstByteSel = 1;
591+
let HasFP8DstByteSel = 1;
590592
let HasClamp = 0;
591593
defvar bytesel = (ins VGPR_32:$vdst_in, ByteSel:$byte_sel);
592594
let Ins64 = !con(getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,

llvm/lib/Target/AMDGPU/VOPInstructions.td

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
113113
let IsWMMA = P.IsWMMA;
114114
let IsSWMMAC = P.IsSWMMAC;
115115

116+
bit HasFP8DstByteSel = P.HasFP8DstByteSel;
117+
116118
let AsmOperands = !if(isVop3OpSel,
117119
P.AsmVOP3OpSel,
118120
!if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64));

0 commit comments

Comments
 (0)