[AMDGPU] Handle hazard in v_scalef32_sr_fp4_* conversions #118589

pravinjagtap · 2024-12-04T04:56:45Z

Presently, compiler selectivelly adds nop when opsel != 0 i.e. only when partially writing to high bytes.
Experiments in SWDEV-499733 and SWDEV-501347 suggest that we need nop for above cases irrespective of opsel values.

Note: We might need to add few others into the same table.

llvmbot · 2024-12-04T04:57:21Z

@llvm/pr-subscribers-backend-amdgpu

Author: Pravin Jagtap (pravinjagtap)

Changes

cvt_scalef32_sr_pk_fp4_f32,
cvt_scalef32_sr_pk_fp4_[f|bf]16 of gfx950.

Presently, compiler selectivelly adds nop when opsel != 0 i.e. only when partially writing to high bytes. Experiments in SWDEV-499733 and SWDEV-501347 suggest that we need nop for above cases irrespctive of opsel values.

Note: We might need to add few others into the same table.

Full diff: https://github.com/llvm/llvm-project/pull/118589.diff

7 Files Affected:

(modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (+23-16)
(modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+10)
(modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp (+12)
(modified) llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h (+3)
(modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+2-2)
(modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+1)
(modified) llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir (+52)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index ecf03b14143ee3..accfae676d1186 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -913,24 +913,31 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) {
   // (instructions with dest byte sel, e.g. CVT_SR_BF8_F32) and
   // op_sel[3:2]
   // != 0
-  if (SIInstrInfo::isSDWA(MI)) {
+  if (SIInstrInfo::isSDWA(MI))
     // Type 1: SDWA with dst_sel != DWORD
     if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel))
-      if (DstSel->getImm() == AMDGPU::SDWA::DWORD)
-        return nullptr;
-  } else {
-    // Type 2 && Type 3: (VOP3 which write the hi bits) || (FP8DstSelInst
-    // with op_sel[3:2] != 0)
-    if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) ||
-        !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
-              SISrcMods::DST_OP_SEL ||
-          (AMDGPU::isFP8DstSelInst(Opcode) &&
-           (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
-            SISrcMods::OP_SEL_0))))
-      return nullptr;
-  }
-
-  return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+      if (DstSel->getImm() != AMDGPU::SDWA::DWORD)
+        return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+  if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) {
+    // Type 2: VOP3 which write the hi bits
+    if (TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() &
+        SISrcMods::DST_OP_SEL)
+      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+    // Type 3: FP8DstSelInst with op_sel[3:2] != 0)
+    if (AMDGPU::isFP8DstSelInst(Opcode) &&
+        (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() &
+         SISrcMods::OP_SEL_0))
+      return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+  }
+
+  // Special case: nop is required for all the opsel values for fp4 sr variant
+  // cvt scale instructions
+  if (AMDGPU::isFP4DstSelInst(Opcode))
+    return TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+  return nullptr;
 }
 
 /// Checks whether the provided \p MI "consumes" the operand with a Dest sel
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index d8eb9d155315a6..a374099e5885c3 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2566,6 +2566,7 @@ class VOPProfile <list<ValueType> _ArgVT, bit _EnableClamp = 0> {
   field bit IsFP8SrcByteSel = 0;
   field bit IsFP8DstByteSel = 0;
   field bit HasFP8DstByteSel = 0;
+  field bit HasFP4DstByteSel = 0;
   field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel);
 
   field bit HasDst = !ne(DstVT.Value, untyped.Value);
@@ -3257,6 +3258,15 @@ def FP8DstByteSelTable : GenericTable {
   let PrimaryKeyName = "getFP8DstByteSelHelper";
 }
 
+def FP4DstByteSelTable : GenericTable {
+  let FilterClass = "VOP3_Pseudo";
+  let CppTypeName = "FP4DstByteSelInfo";
+  let Fields = ["Opcode", "HasFP4DstByteSel"];
+
+  let PrimaryKey = ["Opcode"];
+  let PrimaryKeyName = "getFP4DstByteSelHelper";
+}
+
 def VOPDComponentTable : GenericTable {
   let FilterClass = "VOPD_Component";
   let CppTypeName = "VOPDComponentInfo";
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5a0e812748fbb7..bbb329746fc5ed 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -380,6 +380,8 @@ struct VOPTrue16Info {
 
 #define GET_FP8DstByteSelTable_DECL
 #define GET_FP8DstByteSelTable_IMPL
+#define GET_FP4DstByteSelTable_DECL
+#define GET_FP4DstByteSelTable_IMPL
 
 struct DPMACCInstructionInfo {
   uint16_t Opcode;
@@ -391,6 +393,11 @@ struct FP8DstByteSelInfo {
   bool HasFP8DstByteSel;
 };
 
+struct FP4DstByteSelInfo {
+  uint16_t Opcode;
+  bool HasFP4DstByteSel;
+};
+
 #define GET_FP8DstByteSelTable_DECL
 #define GET_FP8DstByteSelTable_IMPL
 #define GET_MTBUFInfoTable_DECL
@@ -662,6 +669,11 @@ bool isFP8DstSelInst(unsigned Opc) {
   return Info ? Info->HasFP8DstByteSel : false;
 }
 
+bool isFP4DstSelInst(unsigned Opc) {
+  const FP4DstByteSelInfo *Info = getFP4DstByteSelHelper(Opc);
+  return Info ? Info->HasFP4DstByteSel : false;
+}
+
 unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) {
   const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc);
   return Info ? Info->Opcode3Addr : ~0u;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index ea497d7b239d7e..7648f52fa4a7a5 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -887,6 +887,9 @@ bool isTrue16Inst(unsigned Opc);
 LLVM_READONLY
 bool isFP8DstSelInst(unsigned Opc);
 
+LLVM_READONLY
+bool isFP4DstSelInst(unsigned Opc);
+
 LLVM_READONLY
 bool isInvalidSingleUseConsumerInst(unsigned Opc);
 
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index ff9376e635af96..2f1dd188c60ede 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1014,7 +1014,7 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile<ValueType Src0Ty> :
   let HasExtVOP3DPP = 0;
   let HasOpSel = 1;
   let HasOMod = 0;
-  let HasFP8DstByteSel = 1;
+  let HasFP4DstByteSel = 1;
 }
 
 def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32, v2f32, i32, f32]>, VOP3_OPSEL> {
@@ -1026,7 +1026,7 @@ def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile<VOPProfile<[i32
   let HasExtVOP3DPP = 0;
   let HasOpSel = 1;
   let HasOMod = 0;
-  let HasFP8DstByteSel = 1;
+  let HasFP4DstByteSel = 1;
 }
 
 class VOP3_CVT_SCALE_PK_F16BF16F32_FP4FP8BF8_Profile<ValueType DstTy> : VOP3_Profile<VOPProfile<[DstTy, i32, f32, untyped]>,
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index 0e19696a32f86f..c38ec3ba897270 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -110,6 +110,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
   let IsSWMMAC = P.IsSWMMAC;
 
   bit HasFP8DstByteSel = P.HasFP8DstByteSel;
+  bit HasFP4DstByteSel = P.HasFP4DstByteSel;
 
   let AsmOperands = !if(!and(!not(P.IsTrue16), isVop3OpSel),
                         P.AsmVOP3OpSel,
diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
index 6a25e346c89447..22ff1e2b4cbf08 100644
--- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
+++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir
@@ -485,6 +485,23 @@ body:             |
     S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
 ...
 
+---
+# GCN-LABEL: test_scalef32_sr_pk_fp4_bf16_opsel0_hazard
+# GCN:      V_CVT_SCALEF32_SR_PK_FP4_BF16_e64
+# GCN:      S_NOP 0
+# GCN:      V_ADD_U32_e32
+name:            test_scalef32_sr_pk_fp4_bf16_opsel0_hazard
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4
+    S_WAITCNT 0
+    renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+    S_WAITCNT 3952
+    early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec
+    renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
+    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
 ---
 # GCN-LABEL: test_scalef32_sr_pk_fp4_f32_hazard
 # GCN:      V_CVT_SCALEF32_SR_PK_FP4_F32_e64
@@ -502,6 +519,23 @@ body:             |
     S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
 ...
 
+---
+# GCN-LABEL: test_scalef32_sr_pk_fp4_f32_opsel0_hazard
+# GCN:      V_CVT_SCALEF32_SR_PK_FP4_F32_e64
+# GCN:      S_NOP 0
+# GCN:      V_ADD_U32_e32
+name:            test_scalef32_sr_pk_fp4_f32_opsel0_hazard
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5
+    S_WAITCNT 0
+    renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec
+    S_WAITCNT 3952
+    early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 0, killed $vgpr2_vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec
+    renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec
+    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
 ---
 # GCN-LABEL: test_cvt_scalef32_fp4_f16_hazard
 # GCN:      V_CVT_SCALEF32_PK_FP4_F16_e64
@@ -636,6 +670,24 @@ body:             |
     S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
 ...
 
+---
+# GCN-LABEL: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard
+# GCN:      V_CVT_SCALEF32_PK_FP4_F16_e64
+# GCN:      S_NOP 0
+# GCN:      V_CVT_SCALEF32_SR_PK_FP4_F16_e64
+# GCN:      S_NOP 0
+# GCN:      S_SETPC_B64_return
+name:            test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard
+body:             |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+    S_WAITCNT 0
+    renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec
+    early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 0, killed $vgpr0, 0, killed $vgpr3, 0, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec
+    $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec
+    S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0
+...
+
 ---
 # GCN-LABEL: test_cvt_scale_cvt_scale_waw_hazard
 # GCN:      V_CVT_SCALEF32_PK_FP4_F16_e64

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

Presently, compiler selectivelly adds nop when opsel != 0 i.e. only when partially writing to high bytes. Experiments in SWDEV-499733 and SWDEV-501347 suggest that we need nop for above cases irrespctive of opsel values. Note: We might need to add few others into the same table.

arsenm · 2024-12-05T18:26:18Z

llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir

@@ -731,17 +732,18 @@ body:             |
 ...

 ---
-name:            test_scalef32_sr_pk_fp4_f32_neg_opsel0_hazard
+name:            test_scalef32_sr_pk_fp4_f32_opsel0_hazard


why rename here?

This change adds nop for opsel==0 as well now. So its not negative test anymore.

It was never a negative test though, it was pre-commited for this

arsenm · 2024-12-05T18:27:22Z

llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp

+
+  // Special case: nop is required for all the opsel values for fp4 sr variant
+  // cvt scale instructions
+  if (AMDGPU::isFP4DstSelInst(Opcode))


Can you combine the fp4 and fp8 queries?

arsenm · 2024-12-06T14:26:58Z

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

+  const FP8DstByteSelInfo *Info8 = getFP8DstByteSelHelper(Opc);
+  if (Info8 && Info8->HasFP8DstByteSel)
+    return FPType::FP8;
+
+  const FP4DstByteSelInfo *Info4 = getFP4DstByteSelHelper(Opc);
+  if (Info4 && Info4->HasFP4DstByteSel)
+    return FPType::FP4;


These can be handled with one generated lookup function

We are generating two distinct searchable tables for FP4 and FP8 opcodes (one does not have info about the other).
I am not sure how can we handle above logic with just one (either getFP8DstByteSelHelper or getFP4DstByteSelHelper) lookup function. I think, I am missing something here.

Yes, I'm saying to generate one searchable table with the enum as one of the fields. See MFMA_F8F6F4_Info or MAIInstInfo for an example

arsenm · 2024-12-08T14:48:00Z

llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp

+  if (Info && Info->HasFP8DstByteSel)
+    return FPType::FP8;
+  else if (Info && Info->HasFP4DstByteSel)
+    return FPType::FP4;
+  else
+    return FPType::None;


Suggested change

if (Info && Info->HasFP8DstByteSel)

return FPType::FP8;

else if (Info && Info->HasFP4DstByteSel)

return FPType::FP4;

else

return FPType::None;

if (!Info)

return FPType::None;

if (Info->HasFP8DstByteSel)

return FPType::FP8;

if (Info->HasFP4DstByteSel)

return FPType::FP4;

llvm_unreachable("not fp4 or fp8");

No else after return

Need to use return None instead of llvm_unreachable. We can reach here for other instructions for handling dst forwarding hazard.

pravinjagtap · 2024-12-11T07:12:52Z

Ping @arsenm.

arsenm · 2024-12-11T10:21:09Z

Ping @arsenm.

This is fine but I would prefer that we have one searchable table instead of 2, as mentioned above. Can do as a follow up

pravinjagtap · 2024-12-11T13:56:49Z

Ping @arsenm.

This is fine but I would prefer that we have one searchable table instead of 2, as mentioned above. Can do as a follow up

We have only one searchable table FP4FP8DstByteSelTable now after this patch. Which other you are referring to?

Local branch amd-gfx 4c17e1c Merged main:c00a708fc954 into amd-gfx:6f6131a974b2 Remote branch main 5e007af [AMDGPU] Handle hazard in v_scalef32_sr_fp4_* conversions (llvm#118589)

Presently, compiler selectivelly adds nop when opsel != 0 i.e. only when partially writing to high bytes. Experiments in SWDEV-499733 and SWDEV-501347 suggest that we need nop for above cases irrespective of opsel values. Note: We might need to add few others into the same table.

… (llvm#1627)

pravinjagtap requested review from arsenm and jrbyrnes December 4, 2024 04:56

llvmbot added the backend:AMDGPU label Dec 4, 2024

arsenm changed the title ~~[AMDGPU] Handle CvtScaleForwardingHazard using HasFP4DstByteSel for:~~ [AMDGPU] Handle hazard in v_scalef32_sr_fp4_* conversions Dec 4, 2024

arsenm reviewed Dec 4, 2024

View reviewed changes

pravinjagtap requested a review from shiltian December 5, 2024 13:03

pravinjagtap force-pushed the pravinjagtap/fix-fp4-sr-cvt-hazards branch from 6ee546d to 085035e Compare December 5, 2024 18:23

arsenm reviewed Dec 5, 2024

View reviewed changes

Combined the fp4 and fp8 queries in getFPDstSelType.

b34afa8

pravinjagtap force-pushed the pravinjagtap/fix-fp4-sr-cvt-hazards branch from 8e1563b to b34afa8 Compare December 6, 2024 05:43

arsenm reviewed Dec 6, 2024

View reviewed changes

Merged two search tables of fp4 and fp8

3cfa49c

arsenm reviewed Dec 8, 2024

View reviewed changes

addressed review comment.

c3dac82

arsenm approved these changes Dec 11, 2024

View reviewed changes

pravinjagtap merged commit 5e007af into llvm:main Dec 11, 2024
8 checks passed

searlmc1 pushed a commit to ROCm/llvm-project that referenced this pull request Apr 11, 2025

[AMDGPU] Handle hazard in v_scalef32_sr_fp4_* conversions (llvm#118589)…

7aa0aca

… (llvm#1627)

[AMDGPU] Handle hazard in v_scalef32_sr_fp4_* conversions #118589

[AMDGPU] Handle hazard in v_scalef32_sr_fp4_* conversions #118589

Uh oh!

Conversation

pravinjagtap commented Dec 4, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Dec 4, 2024

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

pravinjagtap commented Dec 11, 2024

Uh oh!

arsenm commented Dec 11, 2024

Uh oh!

Uh oh!

pravinjagtap commented Dec 11, 2024

Uh oh!

Uh oh!

pravinjagtap commented Dec 4, 2024 •

edited

Loading