@@ -876,13 +876,78 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
876
876
return DataIdx >= 0 &&
877
877
TRI->regsOverlap (MI.getOperand (DataIdx).getReg (), Reg);
878
878
};
879
+
879
880
int WaitStatesNeededForDef =
880
881
VALUWaitStates - getWaitStatesSince (IsHazardFn, VALUWaitStates);
881
882
WaitStatesNeeded = std::max (WaitStatesNeeded, WaitStatesNeededForDef);
882
883
883
884
return WaitStatesNeeded;
884
885
}
885
886
887
+ // / Dest sel forwarding issue occurs if additional logic is needed to swizzle /
888
+ // / pack the computed value into correct bit position of the dest register. This
889
+ // / occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
890
+ // / dst_sel that is not aligned to the register. This function analayzes the \p
891
+ // / MI and \returns an operand with dst forwarding issue, or nullptr if
892
+ // / none exists.
893
+ static const MachineOperand *
894
+ getDstSelForwardingOperand (const MachineInstr &MI, const GCNSubtarget &ST) {
895
+ if (!SIInstrInfo::isVALU (MI))
896
+ return nullptr ;
897
+
898
+ const SIInstrInfo *TII = ST.getInstrInfo ();
899
+
900
+ unsigned Opcode = MI.getOpcode ();
901
+
902
+ // There are three different types of instructions
903
+ // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
904
+ // which write hi bits (e.g. op_sel[3] == 1), and 3. CVR_SR_FP8_F32 and
905
+ // CVT_SR_BF8_F32 with op_sel[3:2]
906
+ // != 0
907
+ if (SIInstrInfo::isSDWA (MI)) {
908
+ // Type 1: SDWA with dst_sel != DWORD
909
+ if (auto *DstSel = TII->getNamedOperand (MI, AMDGPU::OpName::dst_sel))
910
+ if (DstSel->getImm () == AMDGPU::SDWA::DWORD)
911
+ return nullptr ;
912
+ } else {
913
+ // Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
914
+ // CVT_SR_BF8_F32 with op_sel[3:2] != 0)
915
+ if (!AMDGPU::hasNamedOperand (Opcode, AMDGPU::OpName::op_sel) ||
916
+ !(TII->getNamedOperand (MI, AMDGPU::OpName::src0_modifiers)->getImm () &
917
+ SISrcMods::DST_OP_SEL ||
918
+ (AMDGPU::isFP8DstSelInst (Opcode) &&
919
+ (TII->getNamedOperand (MI, AMDGPU::OpName::src2_modifiers)->getImm () &
920
+ SISrcMods::OP_SEL_0))))
921
+ return nullptr ;
922
+ }
923
+
924
+ return TII->getNamedOperand (MI, AMDGPU::OpName::vdst);
925
+ }
926
+
927
+ // / Checks whether the provided \p MI "consumes" the operand with a Dest sel
928
+ // / fowarding issue \p Dst . We may "consume" the Dst via a standard explicit
929
+ // / RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
930
+ static bool consumesDstSelForwardingOperand (const MachineInstr *VALU,
931
+ const MachineOperand *Dst,
932
+ const SIRegisterInfo *TRI) {
933
+ // We must consider implicit reads of the VALU. SDWA with dst_sel and
934
+ // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
935
+ // and we must account for that hazard.
936
+ // We also must account for WAW hazards. In particular, WAW with dest
937
+ // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
938
+ // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
939
+ // check for ECC. Without accounting for this hazard, the ECC will be
940
+ // wrong.
941
+ // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
942
+ // complete zeroesHigh16BitsOfDest)
943
+ for (auto &Operand : VALU->operands ()) {
944
+ if (Operand.isReg () && TRI->regsOverlap (Dst->getReg (), Operand.getReg ())) {
945
+ return true ;
946
+ }
947
+ }
948
+ return false ;
949
+ }
950
+
886
951
int GCNHazardRecognizer::checkVALUHazards (MachineInstr *VALU) {
887
952
int WaitStatesNeeded = 0 ;
888
953
@@ -913,27 +978,18 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
913
978
if (ST.hasDstSelForwardingHazard ()) {
914
979
const int Shift16DefWaitstates = 1 ;
915
980
916
- auto IsShift16BitDefFn = [this , VALU](const MachineInstr &MI) {
917
- if (!SIInstrInfo::isVALU (MI))
918
- return false ;
919
- const SIInstrInfo *TII = ST.getInstrInfo ();
920
- if (SIInstrInfo::isSDWA (MI)) {
921
- if (auto *DstSel = TII->getNamedOperand (MI, AMDGPU::OpName::dst_sel))
922
- if (DstSel->getImm () == AMDGPU::SDWA::DWORD)
923
- return false ;
924
- } else {
925
- if (!AMDGPU::hasNamedOperand (MI.getOpcode (), AMDGPU::OpName::op_sel) ||
926
- !(TII->getNamedOperand (MI, AMDGPU::OpName::src0_modifiers)
927
- ->getImm () &
928
- SISrcMods::DST_OP_SEL))
929
- return false ;
930
- }
981
+ auto IsShift16BitDefFn = [this , VALU](const MachineInstr &ProducerMI) {
931
982
const SIRegisterInfo *TRI = ST.getRegisterInfo ();
932
- if (auto *Dst = TII->getNamedOperand (MI, AMDGPU::OpName::vdst)) {
933
- Register Def = Dst->getReg ();
983
+ const MachineOperand *ForwardedDst =
984
+ getDstSelForwardingOperand (ProducerMI, ST);
985
+ if (ForwardedDst) {
986
+ return consumesDstSelForwardingOperand (VALU, ForwardedDst, TRI);
987
+ }
934
988
935
- for (const MachineOperand &Use : VALU->explicit_uses ()) {
936
- if (Use.isReg () && TRI->regsOverlap (Def, Use.getReg ()))
989
+ if (ProducerMI.isInlineAsm ()) {
990
+ // Assume inline asm has dst forwarding hazard
991
+ for (auto &Def : ProducerMI.all_defs ()) {
992
+ if (consumesDstSelForwardingOperand (VALU, &Def, TRI))
937
993
return true ;
938
994
}
939
995
}
@@ -1030,7 +1086,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1030
1086
// problematic thus far.
1031
1087
1032
1088
// see checkVALUHazards()
1033
- if (!ST.has12DWordStoreHazard ())
1089
+ if (!ST.has12DWordStoreHazard () && !ST. hasDstSelForwardingHazard () )
1034
1090
return 0 ;
1035
1091
1036
1092
const MachineRegisterInfo &MRI = MF.getRegInfo ();
@@ -1039,11 +1095,45 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1039
1095
for (const MachineOperand &Op :
1040
1096
llvm::drop_begin (IA->operands (), InlineAsm::MIOp_FirstOperand)) {
1041
1097
if (Op.isReg () && Op.isDef ()) {
1042
- WaitStatesNeeded =
1043
- std::max (WaitStatesNeeded, checkVALUHazardsHelper (Op, MRI));
1098
+ if (!TRI.isVectorRegister (MRI, Op.getReg ()))
1099
+ continue ;
1100
+
1101
+ if (ST.has12DWordStoreHazard ()) {
1102
+ WaitStatesNeeded =
1103
+ std::max (WaitStatesNeeded, checkVALUHazardsHelper (Op, MRI));
1104
+ }
1044
1105
}
1045
1106
}
1046
1107
1108
+ if (ST.hasDstSelForwardingHazard ()) {
1109
+ const int Shift16DefWaitstates = 1 ;
1110
+
1111
+ auto IsShift16BitDefFn = [this , &IA](const MachineInstr &ProducerMI) {
1112
+ const MachineOperand *Dst = getDstSelForwardingOperand (ProducerMI, ST);
1113
+ // Assume inline asm reads the dst
1114
+ if (Dst)
1115
+ return IA->modifiesRegister (Dst->getReg (), &TRI) ||
1116
+ IA->readsRegister (Dst->getReg (), &TRI);
1117
+
1118
+ if (ProducerMI.isInlineAsm ()) {
1119
+ // If MI is inline asm, assume it has dst forwarding hazard
1120
+ for (auto &Def : ProducerMI.all_defs ()) {
1121
+ if (IA->modifiesRegister (Def.getReg (), &TRI) ||
1122
+ IA->readsRegister (Def.getReg (), &TRI)) {
1123
+ return true ;
1124
+ }
1125
+ }
1126
+ }
1127
+
1128
+ return false ;
1129
+ };
1130
+
1131
+ int WaitStatesNeededForDef =
1132
+ Shift16DefWaitstates -
1133
+ getWaitStatesSince (IsShift16BitDefFn, Shift16DefWaitstates);
1134
+ WaitStatesNeeded = std::max (WaitStatesNeeded, WaitStatesNeededForDef);
1135
+ }
1136
+
1047
1137
return WaitStatesNeeded;
1048
1138
}
1049
1139
0 commit comments