-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Handle hazard in v_scalef32_sr_fp4_* conversions #118589
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Handle hazard in v_scalef32_sr_fp4_* conversions #118589
Conversation
@llvm/pr-subscribers-backend-amdgpu Author: Pravin Jagtap (pravinjagtap) Changescvt_scalef32_sr_pk_fp4_f32, Presently, compiler selectivelly adds nop when opsel != 0 i.e. only when partially writing to high bytes. Experiments in SWDEV-499733 and SWDEV-501347 suggest that we need nop for above cases irrespctive of opsel values. Note: We might need to add few others into the same table. Full diff: https://github.com/llvm/llvm-project/pull/118589.diff 7 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index ecf03b14143ee3..accfae676d1186 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -913,24 +913,31 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
// (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
// op_sel[3:2]
// != 0
- if (SIInstrInfo::isSDWA(MI)) {
+ if (SIInstrInfo::isSDWA(MI))
// Type 1: SDWA with dst_sel != DWORD
if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
- if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
- return nullptr;
- } else {
- // Type 2 && Type 3: (VOP3 which write the hi bits) || (FP8DstSelInst
- // with op_sel[3:2] != 0)
- if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
- !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
- SISrcMods::DST_OP_SEL ||
- (AMDGPU::isFP8DstSelInst(Opcode) &&
- (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
- SISrcMods::OP_SEL_0))))
- return nullptr;
- }
-
- return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
+ return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+ if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
+ // Type 2: VOP3 which write the hi bits
+ if (TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
+ SISrcMods::DST_OP_SEL)
+ return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+ // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
+ if (AMDGPU::isFP8DstSelInst(Opcode) &&
+ (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
+ SISrcMods::OP_SEL_0))
+ return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+ }
+
+ // Special case: nop is required for all the opsel values for fp4 sr variant
+ // cvt scale instructions
+ if (AMDGPU::isFP4DstSelInst(Opcode))
+ return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+ return nullptr;
}
/// Checks whether the provided \p MI "consumes" the operand with a Dest sel
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index d8eb9d155315a6..a374099e5885c3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2566,6 +2566,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
field bit IsFP8SrcByteSel = 0;
field bit IsFP8DstByteSel = 0;
field bit HasFP8DstByteSel = 0;
+ field bit HasFP4DstByteSel = 0;
field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel);
field bit HasDst = !ne(DstVT.Value, untyped.Value);
@@ -3257,6 +3258,15 @@ def FP8DstByteSelTable : GenericTable {
let PrimaryKeyName = "getFP8DstByteSelHelper";
}
+def FP4DstByteSelTable : GenericTable {
+ let FilterClass = "VOP3_Pseudo";
+ let CppTypeName = "FP4DstByteSelInfo";
+ let Fields = ["Opcode", "HasFP4DstByteSel"];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "getFP4DstByteSelHelper";
+}
+
def VOPDComponentTable : GenericTable {
let FilterClass = "VOPD_Component";
let CppTypeName = "VOPDComponentInfo";
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5a0e812748fbb7..bbb329746fc5ed 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -380,6 +380,8 @@ struct VOPTrue16Info {
#define GET_FP8DstByteSelTable_DECL
#define GET_FP8DstByteSelTable_IMPL
+#define GET_FP4DstByteSelTable_DECL
+#define GET_FP4DstByteSelTable_IMPL
struct DPMACCInstructionInfo {
uint16_t Opcode;
@@ -391,6 +393,11 @@ struct FP8DstByteSelInfo {
bool HasFP8DstByteSel;
};
+struct FP4DstByteSelInfo {
+ uint16_t Opcode;
+ bool HasFP4DstByteSel;
+};
+
#define GET_FP8DstByteSelTable_DECL
#define GET_FP8DstByteSelTable_IMPL
#define GET_MTBUFInfoTable_DECL
@@ -662,6 +669,11 @@ bool isFP8DstSelInst(unsigned Opc) {
return Info ? Info->HasFP8DstByteSel : false;
}
+bool isFP4DstSelInst(unsigned Opc) {
+ const FP4DstByteSelInfo *Info = getFP4DstByteSelHelper(Opc);
+ return Info ? Info->HasFP4DstByteSel : false;
+}
+
unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
return Info ? Info->Opcode3Addr : ~0u;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index ea497d7b239d7e..7648f52fa4a7a5 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -887,6 +887,9 @@ bool isTrue16Inst(unsigned Opc);
LLVM_READONLY
bool isFP8DstSelInst(unsigned Opc);
+LLVM_READONLY
+bool isFP4DstSelInst(unsigned Opc);
+
LLVM_READONLY
bool isInvalidSingleUseConsumerInst(unsigned Opc);
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index ff9376e635af96..2f1dd188c60ede 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1014,7 +1014,7 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
let HasExtVOP3DPP = 0;
let HasOpSel = 1;
let HasOMod = 0;
- let HasFP8DstByteSel = 1;
+ let HasFP4DstByteSel = 1;
}
def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
@@ -1026,7 +1026,7 @@ def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32
let HasExtVOP3DPP = 0;
let HasOpSel = 1;
let HasOMod = 0;
- let HasFP8DstByteSel = 1;
+ let HasFP4DstByteSel = 1;
}
class VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<ValueType DstTy> : VOP3_Profile<VOPProfile<[DstTy, i32, f32, untyped]>,
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 0e19696a32f86f..c38ec3ba897270 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -110,6 +110,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let IsSWMMAC = P.IsSWMMAC;
bit HasFP8DstByteSel = P.HasFP8DstByteSel;
+ bit HasFP4DstByteSel = P.HasFP4DstByteSel;
let AsmOperands = !if(!and(!not(P.IsTrue16), isVop3OpSel),
P.AsmVOP3OpSel,
diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
index 6a25e346c89447..22ff1e2b4cbf08 100644
--- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
@@ -485,6 +485,23 @@ body: |
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
...
+---
+# GCN-LABEL: test_scalef32_sr_pk_fp4_bf16_opsel0_hazard
+# GCN: V_CVT_SCALEF32_SR_PK_FP4_BF16_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_scalef32_sr_pk_fp4_bf16_opsel0_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+ S_WAITCNT 0
+ renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
---
# GCN-LABEL: test_scalef32_sr_pk_fp4_f32_hazard
# GCN: V_CVT_SCALEF32_SR_PK_FP4_F32_e64
@@ -502,6 +519,23 @@ body: |
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
...
+---
+# GCN-LABEL: test_scalef32_sr_pk_fp4_f32_opsel0_hazard
+# GCN: V_CVT_SCALEF32_SR_PK_FP4_F32_e64
+# GCN: S_NOP 0
+# GCN: V_ADD_U32_e32
+name: test_scalef32_sr_pk_fp4_f32_opsel0_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+ S_WAITCNT 0
+ renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+ S_WAITCNT 3952
+ early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 0, killed $vgpr2_vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec
+ renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
---
# GCN-LABEL: test_cvt_scalef32_fp4_f16_hazard
# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64
@@ -636,6 +670,24 @@ body: |
S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
...
+---
+# GCN-LABEL: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard
+# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64
+# GCN: S_NOP 0
+# GCN: V_CVT_SCALEF32_SR_PK_FP4_F16_e64
+# GCN: S_NOP 0
+# GCN: S_SETPC_B64_return
+name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard
+body: |
+ bb.0:
+ liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ S_WAITCNT 0
+ renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec
+ early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 0, killed $vgpr0, 0, killed $vgpr3, 0, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec
+ $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec
+ S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
---
# GCN-LABEL: test_cvt_scale_cvt_scale_waw_hazard
# GCN: V_CVT_SCALEF32_PK_FP4_F16_e64
|
Presently, compiler selectivelly adds nop when opsel != 0 i.e. only when partially writing to high bytes. Experiments in SWDEV-499733 and SWDEV-501347 suggest that we need nop for above cases irrespctive of opsel values. Note: We might need to add few others into the same table.
6ee546d
to
085035e
Compare
@@ -731,17 +732,18 @@ body: | | |||
... | |||
|
|||
--- | |||
name: test_scalef32_sr_pk_fp4_f32_neg_opsel0_hazard | |||
name: test_scalef32_sr_pk_fp4_f32_opsel0_hazard |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why rename here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This change adds nop
for opsel==0
as well now. So its not negative test anymore.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It was never a negative test though, it was pre-commited for this
|
||
// Special case: nop is required for all the opsel values for fp4 sr variant | ||
// cvt scale instructions | ||
if (AMDGPU::isFP4DstSelInst(Opcode)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you combine the fp4 and fp8 queries?
8e1563b
to
b34afa8
Compare
const FP8DstByteSelInfo *Info8 = getFP8DstByteSelHelper(Opc); | ||
if (Info8 && Info8->HasFP8DstByteSel) | ||
return FPType::FP8; | ||
|
||
const FP4DstByteSelInfo *Info4 = getFP4DstByteSelHelper(Opc); | ||
if (Info4 && Info4->HasFP4DstByteSel) | ||
return FPType::FP4; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
These can be handled with one generated lookup function
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We are generating two distinct searchable tables for FP4 and FP8 opcodes (one does not have info about the other).
I am not sure how can we handle above logic with just one (either getFP8DstByteSelHelper
or getFP4DstByteSelHelper
) lookup function. I think, I am missing something here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, I'm saying to generate one searchable table with the enum as one of the fields. See MFMA_F8F6F4_Info or MAIInstInfo for an example
if (Info && Info->HasFP8DstByteSel) | ||
return FPType::FP8; | ||
else if (Info && Info->HasFP4DstByteSel) | ||
return FPType::FP4; | ||
else | ||
return FPType::None; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
if (Info && Info->HasFP8DstByteSel) | |
return FPType::FP8; | |
else if (Info && Info->HasFP4DstByteSel) | |
return FPType::FP4; | |
else | |
return FPType::None; | |
if (!Info) | |
return FPType::None; | |
if (Info->HasFP8DstByteSel) | |
return FPType::FP8; | |
if (Info->HasFP4DstByteSel) | |
return FPType::FP4; | |
llvm_unreachable("not fp4 or fp8"); |
No else after return
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Need to use return None
instead of llvm_unreachable
. We can reach here for other instructions for handling dst forwarding hazard.
Ping @arsenm. |
This is fine but I would prefer that we have one searchable table instead of 2, as mentioned above. Can do as a follow up |
We have only one searchable table |
Local branch amd-gfx 4c17e1c Merged main:c00a708fc954 into amd-gfx:6f6131a974b2 Remote branch main 5e007af [AMDGPU] Handle hazard in v_scalef32_sr_fp4_* conversions (llvm#118589)
Presently, compiler selectivelly adds nop when opsel != 0 i.e. only when partially writing to high bytes. Experiments in SWDEV-499733 and SWDEV-501347 suggest that we need nop for above cases irrespective of opsel values. Note: We might need to add few others into the same table.
Presently, compiler selectivelly adds nop when opsel != 0 i.e. only when partially writing to high bytes.
Experiments in SWDEV-499733 and SWDEV-501347 suggest that we need nop for above cases irrespective of opsel values.
Note: We might need to add few others into the same table.