[AMDGPU] Allow dpp in v_pk_fmac_f16 for GFX9 and GFX10 #144782

jwanggit86 · 2025-06-18T19:04:55Z

Allows dpp in v_pk_fmac-f16 for GFX9, and both dpp and dpp8 for GFX10.

llvmbot · 2025-06-18T19:05:18Z

@llvm/pr-subscribers-mc

@llvm/pr-subscribers-backend-amdgpu

Author: Jun Wang (jwanggit86)

Changes

Allows dpp in v_pk_fmac-f16 for GFX9, and both dpp and dpp8 for GFX10.

Full diff: https://github.com/llvm/llvm-project/pull/144782.diff

6 Files Affected:

(modified) llvm/lib/Target/AMDGPU/VOP2Instructions.td (+8)
(modified) llvm/test/MC/AMDGPU/gfx10_asm_vop2.s (+12)
(added) llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s (+13)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt (+7)
(modified) llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt (+3)
(added) llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt (+10)

diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 0c7e20fc1ebf3..c459c4df11e9e 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -2172,6 +2172,7 @@ defm V_LDEXP_F16       : VOP2_Real_gfx10<0x03b>;
 let IsSingle = 1 in {
   defm V_PK_FMAC_F16     : VOP2_Real_e32_gfx10<0x03c>;
 }
+defm V_PK_FMAC_F16     :  VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>;
 
 // VOP2 no carry-in, carry-out.
 defm V_ADD_NC_U32 :
@@ -2504,6 +2505,11 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
       VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
 }
 
+  multiclass VOP2_Real_dpp_gfx9<bits<6> op> {
+    if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
+      def _dpp_gfx9 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>;
+  }
+
 } // End DecoderNamespace = "GFX9"
 
 multiclass VOP2_Real_e32e64_vi <bits<6> op> :
@@ -2560,6 +2566,8 @@ defm V_SUBBREV_CO_U32     : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_s
 defm V_ADD_U32            : VOP2_Real_e32e64_gfx9 <0x34>;
 defm V_SUB_U32            : VOP2_Real_e32e64_gfx9 <0x35>;
 defm V_SUBREV_U32         : VOP2_Real_e32e64_gfx9 <0x36>;
+
+defm V_PK_FMAC_F16     : VOP2_Real_dpp_gfx9<0x03c>;
 } // End AssemblerPredicate = isGFX9Only
 
 defm V_BFM_B32            : VOP2_Real_e64only_vi <0x293>;
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
index 3dcf288bbbaa5..bbd36a9140b96 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
@@ -13185,3 +13185,15 @@ v_pk_fmac_f16 v5, -4.0, v2
 
 v_pk_fmac_f16 v5, v1, v255
 // GFX10: encoding: [0x01,0xff,0x0b,0x78]
+
+v_pk_fmac_f16 v5, v1, v2
+// GFX10: encoding: [0x01,0x05,0x0a,0x78]
+
+v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
+
+v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00]
+
+v_pk_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX10: encoding: [0xe9,0x04,0x0a,0x78,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s b/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s
new file mode 100644
index 0000000000000..f7dab2d0359dc
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s
@@ -0,0 +1,13 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx908 -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx90a -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx942 -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s
+
+v_pk_fmac_f16 v5, v1, v2
+// CHECK-MI: [0x01,0x05,0x0a,0x78]
+
+v_pk_fmac_f16 v5, v1, v2 quad_perm:[0,1,2,3]
+// CHECK-MI: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
+
+v_pk_fmac_f16 v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK-MI: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt
index 1774efa4a65c7..4a7471e6c6f98 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt
@@ -2476,3 +2476,10 @@
 # W32: v_cndmask_b32_dpp v5, -|v1|, -|v2|, vcc_lo quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff]
 # W64: v_cndmask_b32_dpp v5, -|v1|, -|v2|, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff]
 0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff
+
+# GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff
+
+# GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00]
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00
+
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt
index 40b8f24e4d72f..233f93a5b8e7d 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt
@@ -222,3 +222,6 @@
 # W32: v_cndmask_b32_dpp v0, v1, v2, vcc_lo  dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0xea,0x04,0x00,0x02,0x01,0x88,0xc6,0xfa]
 # W64: v_cndmask_b32_dpp v0, v1, v2, vcc  dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0xea,0x04,0x00,0x02,0x01,0x88,0xc6,0xfa]
 0xea,0x04,0x00,0x02,0x01,0x88,0xc6,0xfa
+
+# GFX10: v_pk_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x78,0x01,0x77,0x39,0x05]
+0xe9,0x04,0x0a,0x78,0x01,0x77,0x39,0x05
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt
new file mode 100644
index 0000000000000..ac1ef4baa9aa4
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx908 -disassemble -show-encoding < %s | FileCheck -check-prefix=CHECK-MI %s
+
+# CHECK-MI: v_pk_fmac_f16_e32 v5, v1, v2
+0x01,0x05,0x0a,0x78
+
+# CHECK-MI: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff
+
+# CHECK-MI: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00

rampitec · 2025-06-18T19:13:10Z

llvm/lib/Target/AMDGPU/VOP2Instructions.td

@@ -2172,6 +2172,7 @@ defm V_LDEXP_F16       : VOP2_Real_gfx10<0x03b>;
 let IsSingle = 1 in {
  defm V_PK_FMAC_F16     : VOP2_Real_e32_gfx10<0x03c>;
 }
+defm V_PK_FMAC_F16     :  VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>;


No need to define separate dpp real, VOP2_Real_gfx10 should do it for you.

No need to define separate dpp real, VOP2_Real_gfx10 should do it for you.

This would create some additional instructions though, i.e., for GFX9, V_PK_FMAC_F16_e32_gfx9, V_PK_FMAC_F16_e64_gfx9, V_PK_FMAC_F16_sdwa_gfx9, and for GFX10, V_PK_FMAC_F16_e64_gfx10, V_PK_FMAC_F16_sdwa_gfx10. Would this be a problem?

Also, here (Line 2173) VOP2_Real_e32_gfx10 is instantiated with a predicate on Line 2172, but VOP2_Real_gfx10 does not have this predicate.

Actually gfx9 does support SDWA for this instruction. Add few tests as your change has enabled it?

For gfx10 I think you need to add a test to the gfx10_unsupported_sdwa.s, because it is unsupported there.

If sdwa is not supported for gfx10, then the code should go back to the previous version, namely,
defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>;, instead of using VOP2_Real_gfx10. Right?

Still do it in a single def. If there is no suitable macro, create one.

Still do it in a single def. If there is no suitable macro, create one.

Done.

rampitec · 2025-06-18T19:14:22Z

llvm/lib/Target/AMDGPU/VOP2Instructions.td

@@ -2560,6 +2566,8 @@ defm V_SUBBREV_CO_U32     : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_s
 defm V_ADD_U32            : VOP2_Real_e32e64_gfx9 <0x34>;
 defm V_SUB_U32            : VOP2_Real_e32e64_gfx9 <0x35>;
 defm V_SUBREV_U32         : VOP2_Real_e32e64_gfx9 <0x36>;
+
+defm V_PK_FMAC_F16     : VOP2_Real_dpp_gfx9<0x03c>;


Same here, VOP2_Real_e32e64_gfx9 shall define all forms.

Code updated as suggested.

rampitec · 2025-06-19T18:40:13Z

llvm/lib/Target/AMDGPU/VOP2Instructions.td

-let IsSingle = 1 in {
-  defm V_PK_FMAC_F16     : VOP2_Real_e32_gfx10<0x03c>;
-}
+defm V_PK_FMAC_F16     :  VOP2_Real_gfx10<0x03c>;


nit: remove the blank line above.

Went back to previous commit since SDWA is not supported for this instruction in GFX10.

rampitec · 2025-06-19T18:40:44Z

llvm/lib/Target/AMDGPU/VOP2Instructions.td

@@ -2560,6 +2558,8 @@ defm V_SUBBREV_CO_U32     : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_s
 defm V_ADD_U32            : VOP2_Real_e32e64_gfx9 <0x34>;
 defm V_SUB_U32            : VOP2_Real_e32e64_gfx9 <0x35>;
 defm V_SUBREV_U32         : VOP2_Real_e32e64_gfx9 <0x36>;
+
+defm V_PK_FMAC_F16     : VOP2_Real_e32e64_gfx9<0x03c>;


nit: remove the blank line and align the column.

rampitec · 2025-06-19T18:46:35Z

llvm/lib/Target/AMDGPU/VOP2Instructions.td

@@ -2172,6 +2172,7 @@ defm V_LDEXP_F16       : VOP2_Real_gfx10<0x03b>;
 let IsSingle = 1 in {
  defm V_PK_FMAC_F16     : VOP2_Real_e32_gfx10<0x03c>;
 }
+defm V_PK_FMAC_F16     :  VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>;


Actually gfx9 does support SDWA for this instruction. Add few tests as your change has enabled it?

For gfx10 I think you need to add a test to the gfx10_unsupported_sdwa.s, because it is unsupported there.

rampitec · 2025-06-23T19:08:21Z

llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt

+# CHECK-MI: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff
+
+# CHECK-MI: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0


sdwa disasm tests needed.

rampitec · 2025-06-23T19:13:31Z

llvm/lib/Target/AMDGPU/VOP2Instructions.td

@@ -2172,6 +2172,7 @@ defm V_LDEXP_F16       : VOP2_Real_gfx10<0x03b>;
 let IsSingle = 1 in {
  defm V_PK_FMAC_F16     : VOP2_Real_e32_gfx10<0x03c>;
 }
+defm V_PK_FMAC_F16     :  VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>;


Still do it in a single def. If there is no suitable macro, create one.

rampitec · 2025-06-24T08:19:54Z

llvm/lib/Target/AMDGPU/VOP2Instructions.td

@@ -2172,6 +2175,7 @@ defm V_LDEXP_F16       : VOP2_Real_gfx10<0x03b>;
 let IsSingle = 1 in {
  defm V_PK_FMAC_F16     : VOP2_Real_e32_gfx10<0x03c>;
 }
+defm V_PK_FMAC_F16     :  VOP2_Real_dpp_dpp8_gfx10<0x03c>;


You still need to have a singe macro.

Pls see the new commit.

rampitec

LGTM. Not sure about that IsSingle though, if it is still needed. Please let Joe check it.

Sisyph

It should have [MC] in the commit title or message, since there is no codegen support.

IsSingle is fine, it suppresses the _e32 suffix on instructions with no VOP3 form, which is this one.

Allows dpp in v_pk_fmac-f16 for GFX9, and both dpp and dpp8 for GFX10.

This reverts commit f2d51a2.

verify.

Allows dpp in v_pk_fmac_f16 for GFX9, and both dpp and dpp8 for GFX10.

jwanggit86 requested review from Sisyph and rampitec June 18, 2025 19:04

jwanggit86 added the backend:AMDGPU label Jun 18, 2025

llvmbot added the mc Machine (object) code label Jun 18, 2025

rampitec reviewed Jun 18, 2025

View reviewed changes

jwanggit86 requested a review from rampitec June 19, 2025 18:35

rampitec reviewed Jun 19, 2025

View reviewed changes

rampitec reviewed Jun 23, 2025

View reviewed changes

jwanggit86 requested a review from rampitec June 23, 2025 20:52

rampitec reviewed Jun 24, 2025

View reviewed changes

jwanggit86 requested a review from rampitec June 24, 2025 17:35

rampitec approved these changes Jun 24, 2025

View reviewed changes

Sisyph approved these changes Jun 24, 2025

View reviewed changes

jwanggit86 added 7 commits June 24, 2025 12:26

[AMDGPU] Allow dpp in v_pk_fmac_f16 for GFX9 and GFX10

c51f487

Allows dpp in v_pk_fmac-f16 for GFX9, and both dpp and dpp8 for GFX10.

for gfx9, use existing VOP2_Real_e32e64_gfx9

76a36b2

for gfx10, use existing VOP2_Real_gfx10

e16fa23

Revert "for gfx10, use existing VOP2_Real_gfx10"

bccf7eb

This reverts commit f2d51a2.

GFX9 has SDWA instruction while GFX10 does not. Add tests to

e9acb6c

verify.

(1) A new multiclass for gfx10 (2) tests for disassembler

ff2c368

Combine two defs into one

39fcf97

jwanggit86 force-pushed the allow-dpp-in-v_pk_fmac_f16-for-gfx9-gfx10 branch from f447b3a to 39fcf97 Compare June 24, 2025 20:46

jwanggit86 merged commit 46d33b6 into llvm:main Jun 24, 2025
7 checks passed

anthonyhatran pushed a commit to anthonyhatran/llvm-project that referenced this pull request Jun 26, 2025

[AMDGPU][MC] Allow dpp in v_pk_fmac_f16 for GFX9 and GFX10 (llvm#144782)

94942ba

Allows dpp in v_pk_fmac_f16 for GFX9, and both dpp and dpp8 for GFX10.

[AMDGPU] Allow dpp in v_pk_fmac_f16 for GFX9 and GFX10 #144782

[AMDGPU] Allow dpp in v_pk_fmac_f16 for GFX9 and GFX10 #144782

Uh oh!

Conversation

jwanggit86 commented Jun 18, 2025

Uh oh!

llvmbot commented Jun 18, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

Choose a reason for hiding this comment

Uh oh!

rampitec left a comment

Choose a reason for hiding this comment

Uh oh!

Sisyph left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

llvmbot commented Jun 18, 2025 •

edited

Loading