@@ -873,13 +873,78 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
873
873
return DataIdx >= 0 &&
874
874
TRI->regsOverlap (MI.getOperand (DataIdx).getReg (), Reg);
875
875
};
876
+
876
877
int WaitStatesNeededForDef =
877
878
VALUWaitStates - getWaitStatesSince (IsHazardFn, VALUWaitStates);
878
879
WaitStatesNeeded = std::max (WaitStatesNeeded, WaitStatesNeededForDef);
879
880
880
881
return WaitStatesNeeded;
881
882
}
882
883
884
+ // / Dest sel forwarding issue occurs if additional logic is needed to swizzle /
885
+ // / pack the computed value into correct bit position of the dest register. This
886
+ // / occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
887
+ // / dst_sel that is not aligned to the register. This function analayzes the \p
888
+ // / MI and \returns an operand with dst forwarding issue, or nullptr if
889
+ // / none exists.
890
+ static const MachineOperand *
891
+ getDstSelForwardingOperand (const MachineInstr &MI, const GCNSubtarget &ST) {
892
+ if (!SIInstrInfo::isVALU (MI))
893
+ return nullptr ;
894
+
895
+ const SIInstrInfo *TII = ST.getInstrInfo ();
896
+
897
+ unsigned Opcode = MI.getOpcode ();
898
+
899
+ // There are three different types of instructions
900
+ // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
901
+ // which write hi bits (e.g. op_sel[3] == 1), and 3. CVR_SR_FP8_F32 and
902
+ // CVT_SR_BF8_F32 with op_sel[3:2]
903
+ // != 0
904
+ if (SIInstrInfo::isSDWA (MI)) {
905
+ // Type 1: SDWA with dst_sel != DWORD
906
+ if (auto *DstSel = TII->getNamedOperand (MI, AMDGPU::OpName::dst_sel))
907
+ if (DstSel->getImm () == AMDGPU::SDWA::DWORD)
908
+ return nullptr ;
909
+ } else {
910
+ // Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
911
+ // CVT_SR_BF8_F32 with op_sel[3:2] != 0)
912
+ if (!AMDGPU::hasNamedOperand (Opcode, AMDGPU::OpName::op_sel) ||
913
+ !(TII->getNamedOperand (MI, AMDGPU::OpName::src0_modifiers)->getImm () &
914
+ SISrcMods::DST_OP_SEL ||
915
+ (AMDGPU::isFP8DstSelInst (Opcode) &&
916
+ (TII->getNamedOperand (MI, AMDGPU::OpName::src2_modifiers)->getImm () &
917
+ SISrcMods::OP_SEL_0))))
918
+ return nullptr ;
919
+ }
920
+
921
+ return TII->getNamedOperand (MI, AMDGPU::OpName::vdst);
922
+ }
923
+
924
+ // / Checks whether the provided \p MI "consumes" the operand with a Dest sel
925
+ // / fowarding issue \p Dst . We may "consume" the Dst via a standard explicit
926
+ // / RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
927
+ static bool consumesDstSelForwardingOperand (const MachineInstr *VALU,
928
+ const MachineOperand *Dst,
929
+ const SIRegisterInfo *TRI) {
930
+ // We must consider implicit reads of the VALU. SDWA with dst_sel and
931
+ // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
932
+ // and we must account for that hazard.
933
+ // We also must account for WAW hazards. In particular, WAW with dest
934
+ // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
935
+ // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
936
+ // check for ECC. Without accounting for this hazard, the ECC will be
937
+ // wrong.
938
+ // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
939
+ // complete zeroesHigh16BitsOfDest)
940
+ for (auto &Operand : VALU->operands ()) {
941
+ if (Operand.isReg () && TRI->regsOverlap (Dst->getReg (), Operand.getReg ())) {
942
+ return true ;
943
+ }
944
+ }
945
+ return false ;
946
+ }
947
+
883
948
int GCNHazardRecognizer::checkVALUHazards (MachineInstr *VALU) {
884
949
int WaitStatesNeeded = 0 ;
885
950
@@ -910,27 +975,18 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
910
975
if (ST.hasDstSelForwardingHazard ()) {
911
976
const int Shift16DefWaitstates = 1 ;
912
977
913
- auto IsShift16BitDefFn = [this , VALU](const MachineInstr &MI) {
914
- if (!SIInstrInfo::isVALU (MI))
915
- return false ;
916
- const SIInstrInfo *TII = ST.getInstrInfo ();
917
- if (SIInstrInfo::isSDWA (MI)) {
918
- if (auto *DstSel = TII->getNamedOperand (MI, AMDGPU::OpName::dst_sel))
919
- if (DstSel->getImm () == AMDGPU::SDWA::DWORD)
920
- return false ;
921
- } else {
922
- if (!AMDGPU::hasNamedOperand (MI.getOpcode (), AMDGPU::OpName::op_sel) ||
923
- !(TII->getNamedOperand (MI, AMDGPU::OpName::src0_modifiers)
924
- ->getImm () &
925
- SISrcMods::DST_OP_SEL))
926
- return false ;
927
- }
978
+ auto IsShift16BitDefFn = [this , VALU](const MachineInstr &ProducerMI) {
928
979
const SIRegisterInfo *TRI = ST.getRegisterInfo ();
929
- if (auto *Dst = TII->getNamedOperand (MI, AMDGPU::OpName::vdst)) {
930
- Register Def = Dst->getReg ();
980
+ const MachineOperand *ForwardedDst =
981
+ getDstSelForwardingOperand (ProducerMI, ST);
982
+ if (ForwardedDst) {
983
+ return consumesDstSelForwardingOperand (VALU, ForwardedDst, TRI);
984
+ }
931
985
932
- for (const MachineOperand &Use : VALU->explicit_uses ()) {
933
- if (Use.isReg () && TRI->regsOverlap (Def, Use.getReg ()))
986
+ if (ProducerMI.isInlineAsm ()) {
987
+ // Assume inline asm has dst forwarding hazard
988
+ for (auto &Def : ProducerMI.all_defs ()) {
989
+ if (consumesDstSelForwardingOperand (VALU, &Def, TRI))
934
990
return true ;
935
991
}
936
992
}
@@ -1027,7 +1083,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1027
1083
// problematic thus far.
1028
1084
1029
1085
// see checkVALUHazards()
1030
- if (!ST.has12DWordStoreHazard ())
1086
+ if (!ST.has12DWordStoreHazard () && !ST. hasDstSelForwardingHazard () )
1031
1087
return 0 ;
1032
1088
1033
1089
const MachineRegisterInfo &MRI = MF.getRegInfo ();
@@ -1036,11 +1092,45 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
1036
1092
for (const MachineOperand &Op :
1037
1093
llvm::drop_begin (IA->operands (), InlineAsm::MIOp_FirstOperand)) {
1038
1094
if (Op.isReg () && Op.isDef ()) {
1039
- WaitStatesNeeded =
1040
- std::max (WaitStatesNeeded, checkVALUHazardsHelper (Op, MRI));
1095
+ if (!TRI.isVectorRegister (MRI, Op.getReg ()))
1096
+ continue ;
1097
+
1098
+ if (ST.has12DWordStoreHazard ()) {
1099
+ WaitStatesNeeded =
1100
+ std::max (WaitStatesNeeded, checkVALUHazardsHelper (Op, MRI));
1101
+ }
1041
1102
}
1042
1103
}
1043
1104
1105
+ if (ST.hasDstSelForwardingHazard ()) {
1106
+ const int Shift16DefWaitstates = 1 ;
1107
+
1108
+ auto IsShift16BitDefFn = [this , &IA](const MachineInstr &ProducerMI) {
1109
+ const MachineOperand *Dst = getDstSelForwardingOperand (ProducerMI, ST);
1110
+ // Assume inline asm reads the dst
1111
+ if (Dst)
1112
+ return IA->modifiesRegister (Dst->getReg (), &TRI) ||
1113
+ IA->readsRegister (Dst->getReg (), &TRI);
1114
+
1115
+ if (ProducerMI.isInlineAsm ()) {
1116
+ // If MI is inline asm, assume it has dst forwarding hazard
1117
+ for (auto &Def : ProducerMI.all_defs ()) {
1118
+ if (IA->modifiesRegister (Def.getReg (), &TRI) ||
1119
+ IA->readsRegister (Def.getReg (), &TRI)) {
1120
+ return true ;
1121
+ }
1122
+ }
1123
+ }
1124
+
1125
+ return false ;
1126
+ };
1127
+
1128
+ int WaitStatesNeededForDef =
1129
+ Shift16DefWaitstates -
1130
+ getWaitStatesSince (IsShift16BitDefFn, Shift16DefWaitstates);
1131
+ WaitStatesNeeded = std::max (WaitStatesNeeded, WaitStatesNeededForDef);
1132
+ }
1133
+
1044
1134
return WaitStatesNeeded;
1045
1135
}
1046
1136
0 commit comments