@@ -875,13 +875,78 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
875
875
return DataIdx >= 0 &&
876
876
TRI->regsOverlap (MI.getOperand (DataIdx).getReg (), Reg);
877
877
};
878
+
878
879
int WaitStatesNeededForDef =
879
880
VALUWaitStates - getWaitStatesSince (IsHazardFn, VALUWaitStates);
880
881
WaitStatesNeeded = std::max (WaitStatesNeeded, WaitStatesNeededForDef);
881
882
882
883
return WaitStatesNeeded;
883
884
}
884
885
886
+ /// Dest sel forwarding issue occurs if additional logic is needed to swizzle /
887
+ /// pack the computed value into correct bit position of the dest register. This
888
+ /// occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
889
+ /// dst_sel that is not aligned to the register. This function analyzes the \p
890
+ /// MI and \returns its vdst operand if it has a dst forwarding issue, or
891
+ /// nullptr otherwise (non-VALU instructions always yield nullptr).
892
+ static const MachineOperand *
893
+ getDstSelForwardingOperand (const MachineInstr &MI, const GCNSubtarget &ST) {
894
+ if (!SIInstrInfo::isVALU (MI))
895
+ return nullptr ;
896
+
897
+ const SIInstrInfo *TII = ST.getInstrInfo ();
898
+
899
+ unsigned Opcode = MI.getOpcode ();
900
+
901
+ // There are three different types of instructions
902
+ // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
903
+ // which write hi bits (e.g. op_sel[3] == 1), and 3. CVT_SR_FP8_F32 and
904
+ // CVT_SR_BF8_F32 with op_sel[3:2]
905
+ // != 0
906
+ if (SIInstrInfo::isSDWA (MI)) {
907
+ // Type 1: SDWA with dst_sel != DWORD (also taken when no dst_sel operand is found)
908
+ if (auto *DstSel = TII->getNamedOperand (MI, AMDGPU::OpName::dst_sel))
909
+ if (DstSel->getImm () == AMDGPU::SDWA::DWORD)
910
+ return nullptr ;
911
+ } else {
912
+ // Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
913
+ // CVT_SR_BF8_F32 with op_sel[3:2] != 0). Bail out when neither applies.
914
+ if (!AMDGPU::hasNamedOperand (Opcode, AMDGPU::OpName::op_sel) ||
915
+ !(TII->getNamedOperand (MI, AMDGPU::OpName::src0_modifiers)->getImm () &
916
+ SISrcMods::DST_OP_SEL ||
917
+ (AMDGPU::isFP8DstSelInst (Opcode) &&
918
+ (TII->getNamedOperand (MI, AMDGPU::OpName::src2_modifiers)->getImm () &
919
+ SISrcMods::OP_SEL_0))))
920
+ return nullptr ;
921
+ }
922
+
923
+ return TII->getNamedOperand (MI, AMDGPU::OpName::vdst);
924
+ }
925
+
926
+ /// Checks whether the provided \p VALU "consumes" the operand with a Dest sel
927
+ /// forwarding issue \p Dst. We may "consume" the Dst via a standard explicit
928
+ /// RAW, or through irregular ways (e.g. implicit RAW, certain types of WAW)
929
+ static bool consumesDstSelForwardingOperand (const MachineInstr *VALU,
930
+ const MachineOperand *Dst,
931
+ const SIRegisterInfo *TRI) {
932
+ // We must consider implicit reads of the VALU. SDWA with dst_sel and
933
+ // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
934
+ // and we must account for that hazard.
935
+ // We also must account for WAW hazards. In particular, WAW with dest
936
+ // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
937
+ // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
938
+ // check for ECC. Without accounting for this hazard, the ECC will be
939
+ // wrong.
940
+ // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
941
+ // complete zeroesHigh16BitsOfDest)
942
+ for (auto &Operand : VALU->operands ()) {
943
+ if (Operand.isReg () && TRI->regsOverlap (Dst->getReg (), Operand.getReg ())) {
944
+ return true ;
945
+ }
946
+ }
947
+ return false ;
948
+ }
949
+
885
950
int GCNHazardRecognizer::checkVALUHazards (MachineInstr *VALU) {
886
951
int WaitStatesNeeded = 0 ;
887
952
@@ -912,27 +977,18 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
912
977
if (ST.hasDstSelForwardingHazard ()) {
913
978
const int Shift16DefWaitstates = 1 ;
914
979
915
- auto IsShift16BitDefFn = [this , VALU](const MachineInstr &MI) {
916
- if (!SIInstrInfo::isVALU (MI))
917
- return false ;
918
- const SIInstrInfo *TII = ST.getInstrInfo ();
919
- if (SIInstrInfo::isSDWA (MI)) {
920
- if (auto *DstSel = TII->getNamedOperand (MI, AMDGPU::OpName::dst_sel))
921
- if (DstSel->getImm () == AMDGPU::SDWA::DWORD)
922
- return false ;
923
- } else {
924
- if (!AMDGPU::hasNamedOperand (MI.getOpcode (), AMDGPU::OpName::op_sel) ||
925
- !(TII->getNamedOperand (MI, AMDGPU::OpName::src0_modifiers)
926
- ->getImm () &
927
- SISrcMods::DST_OP_SEL))
928
- return false ;
929
- }
980
+ auto IsShift16BitDefFn = [this , VALU](const MachineInstr &ProducerMI) {
930
981
const SIRegisterInfo *TRI = ST.getRegisterInfo ();
931
- if (auto *Dst = TII->getNamedOperand (MI, AMDGPU::OpName::vdst)) {
932
- Register Def = Dst->getReg ();
982
+ const MachineOperand *ForwardedDst =
983
+ getDstSelForwardingOperand (ProducerMI, ST);
984
+ if (ForwardedDst) {
985
+ return consumesDstSelForwardingOperand (VALU, ForwardedDst, TRI);
986
+ }
933
987
934
- for (const MachineOperand &Use : VALU->explicit_uses ()) {
935
- if (Use.isReg () && TRI->regsOverlap (Def, Use.getReg ()))
988
+ if (ProducerMI.isInlineAsm ()) {
989
+ // Assume inline asm has dst forwarding hazard
990
+ for (auto &Def : ProducerMI.all_defs ()) {
991
+ if (consumesDstSelForwardingOperand (VALU, &Def, TRI))
936
992
return true ;
937
993
}
938
994
}
@@ -1029,7 +1085,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1029
1085
// problematic thus far.
1030
1086
1031
1087
// see checkVALUHazards()
1032
- if (!ST.has12DWordStoreHazard ())
1088
+ if (!ST.has12DWordStoreHazard () && !ST. hasDstSelForwardingHazard () )
1033
1089
return 0 ;
1034
1090
1035
1091
const MachineRegisterInfo &MRI = MF.getRegInfo ();
@@ -1038,11 +1094,45 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1038
1094
for (const MachineOperand &Op :
1039
1095
llvm::drop_begin (IA->operands (), InlineAsm::MIOp_FirstOperand)) {
1040
1096
if (Op.isReg () && Op.isDef ()) {
1041
- WaitStatesNeeded =
1042
- std::max (WaitStatesNeeded, checkVALUHazardsHelper (Op, MRI));
1097
+ if (!TRI.isVectorRegister (MRI, Op.getReg ()))
1098
+ continue ;
1099
+
1100
+ if (ST.has12DWordStoreHazard ()) {
1101
+ WaitStatesNeeded =
1102
+ std::max (WaitStatesNeeded, checkVALUHazardsHelper (Op, MRI));
1103
+ }
1043
1104
}
1044
1105
}
1045
1106
1107
+ if (ST.hasDstSelForwardingHazard ()) {
1108
+ const int Shift16DefWaitstates = 1 ;
1109
+
1110
+ auto IsShift16BitDefFn = [this , &IA](const MachineInstr &ProducerMI) {
1111
+ const MachineOperand *Dst = getDstSelForwardingOperand (ProducerMI, ST);
1112
+ // Assume inline asm reads the dst
1113
+ if (Dst)
1114
+ return IA->modifiesRegister (Dst->getReg (), &TRI) ||
1115
+ IA->readsRegister (Dst->getReg (), &TRI);
1116
+
1117
+ if (ProducerMI.isInlineAsm ()) {
1118
+ // If MI is inline asm, assume it has dst forwarding hazard
1119
+ for (auto &Def : ProducerMI.all_defs ()) {
1120
+ if (IA->modifiesRegister (Def.getReg (), &TRI) ||
1121
+ IA->readsRegister (Def.getReg (), &TRI)) {
1122
+ return true ;
1123
+ }
1124
+ }
1125
+ }
1126
+
1127
+ return false ;
1128
+ };
1129
+
1130
+ int WaitStatesNeededForDef =
1131
+ Shift16DefWaitstates -
1132
+ getWaitStatesSince (IsShift16BitDefFn, Shift16DefWaitstates);
1133
+ WaitStatesNeeded = std::max (WaitStatesNeeded, WaitStatesNeededForDef);
1134
+ }
1135
+
1046
1136
return WaitStatesNeeded;
1047
1137
}
1048
1138
0 commit comments