-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU] Allow dpp in v_pk_fmac_f16 for GFX9 and GFX10 #144782
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Allow dpp in v_pk_fmac_f16 for GFX9 and GFX10 #144782
Conversation
@llvm/pr-subscribers-mc @llvm/pr-subscribers-backend-amdgpu Author: Jun Wang (jwanggit86) ChangesAllows dpp in v_pk_fmac-f16 for GFX9, and both dpp and dpp8 for GFX10. Full diff: https://github.com/llvm/llvm-project/pull/144782.diff 6 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 0c7e20fc1ebf3..c459c4df11e9e 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -2172,6 +2172,7 @@ defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>;
let IsSingle = 1 in {
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>;
}
+defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>;
// VOP2 no carry-in, carry-out.
defm V_ADD_NC_U32 :
@@ -2504,6 +2505,11 @@ multiclass VOP2_Real_e32e64_gfx9 <bits<6> op> {
VOP2_DPPe<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp")>;
}
+ multiclass VOP2_Real_dpp_gfx9<bits<6> op> {
+ if !cast<VOP2_Pseudo>(NAME#"_e32").Pfl.HasExtDPP then
+ def _dpp_gfx9 : VOP2_DPP16<op, !cast<VOP2_DPP_Pseudo>(NAME#"_dpp"), SIEncodingFamily.GFX9>;
+ }
+
} // End DecoderNamespace = "GFX9"
multiclass VOP2_Real_e32e64_vi <bits<6> op> :
@@ -2560,6 +2566,8 @@ defm V_SUBBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_s
defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>;
defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>;
defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>;
+
+defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx9<0x03c>;
} // End AssemblerPredicate = isGFX9Only
defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>;
diff --git a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
index 3dcf288bbbaa5..bbd36a9140b96 100644
--- a/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
+++ b/llvm/test/MC/AMDGPU/gfx10_asm_vop2.s
@@ -13185,3 +13185,15 @@ v_pk_fmac_f16 v5, -4.0, v2
v_pk_fmac_f16 v5, v1, v255
// GFX10: encoding: [0x01,0xff,0x0b,0x78]
+
+v_pk_fmac_f16 v5, v1, v2
+// GFX10: encoding: [0x01,0x05,0x0a,0x78]
+
+v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
+
+v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// GFX10: encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00]
+
+v_pk_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+// GFX10: encoding: [0xe9,0x04,0x0a,0x78,0x01,0x77,0x39,0x05]
diff --git a/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s b/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s
new file mode 100644
index 0000000000000..f7dab2d0359dc
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx9_asm_vop2_features.s
@@ -0,0 +1,13 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx908 -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx90a -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx942 -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=CHECK-MI %s
+
+v_pk_fmac_f16 v5, v1, v2
+// CHECK-MI: [0x01,0x05,0x0a,0x78]
+
+v_pk_fmac_f16 v5, v1, v2 quad_perm:[0,1,2,3]
+// CHECK-MI: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
+
+v_pk_fmac_f16 v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK-MI: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt
index 1774efa4a65c7..4a7471e6c6f98 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp16.txt
@@ -2476,3 +2476,10 @@
# W32: v_cndmask_b32_dpp v5, -|v1|, -|v2|, vcc_lo quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff]
# W64: v_cndmask_b32_dpp v5, -|v1|, -|v2|, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff]
0xfa,0x04,0x0a,0x02,0x01,0xe4,0xf0,0xff
+
+# GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff]
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff
+
+# GFX10: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 ; encoding: [0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00]
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00
+
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt
index 40b8f24e4d72f..233f93a5b8e7d 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx10_vop2_dpp8.txt
@@ -222,3 +222,6 @@
# W32: v_cndmask_b32_dpp v0, v1, v2, vcc_lo dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0xea,0x04,0x00,0x02,0x01,0x88,0xc6,0xfa]
# W64: v_cndmask_b32_dpp v0, v1, v2, vcc dpp8:[0,1,2,3,4,5,6,7] fi:1 ; encoding: [0xea,0x04,0x00,0x02,0x01,0x88,0xc6,0xfa]
0xea,0x04,0x00,0x02,0x01,0x88,0xc6,0xfa
+
+# GFX10: v_pk_fmac_f16_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x04,0x0a,0x78,0x01,0x77,0x39,0x05]
+0xe9,0x04,0x0a,0x78,0x01,0x77,0x39,0x05
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt
new file mode 100644
index 0000000000000..ac1ef4baa9aa4
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx9_vop2_features.txt
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -triple=amdgcn -mcpu=gfx908 -disassemble -show-encoding < %s | FileCheck -check-prefix=CHECK-MI %s
+
+# CHECK-MI: v_pk_fmac_f16_e32 v5, v1, v2
+0x01,0x05,0x0a,0x78
+
+# CHECK-MI: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff
+
+# CHECK-MI: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0x00
|
@@ -2172,6 +2172,7 @@ defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; | |||
let IsSingle = 1 in { | |||
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; | |||
} | |||
defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No need to define separate dpp real, VOP2_Real_gfx10 should do it for you.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No need to define separate dpp real, VOP2_Real_gfx10 should do it for you.
This would create some additional instructions though, i.e., for GFX9, V_PK_FMAC_F16_e32_gfx9
, V_PK_FMAC_F16_e64_gfx9
, V_PK_FMAC_F16_sdwa_gfx9
, and for GFX10, V_PK_FMAC_F16_e64_gfx10
, V_PK_FMAC_F16_sdwa_gfx10
. Would this be a problem?
Also, here (Line 2173) VOP2_Real_e32_gfx10
is instantiated with a predicate on Line 2172, but VOP2_Real_gfx10
does not have this predicate.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually gfx9 does support SDWA for this instruction. Add few tests as your change has enabled it?
For gfx10 I think you need to add a test to the gfx10_unsupported_sdwa.s, because it is unsupported there.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If sdwa is not supported for gfx10, then the code should go back to the previous version, namely,
defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>;
, instead of using VOP2_Real_gfx10
. Right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Still do it in a single def. If there is no suitable macro, create one.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Still do it in a single def. If there is no suitable macro, create one.
Done.
@@ -2560,6 +2566,8 @@ defm V_SUBBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_s | |||
defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>; | |||
defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>; | |||
defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>; | |||
|
|||
defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx9<0x03c>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Same here, VOP2_Real_e32e64_gfx9 shall define all forms.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Code updated as suggested.
let IsSingle = 1 in { | ||
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; | ||
} | ||
defm V_PK_FMAC_F16 : VOP2_Real_gfx10<0x03c>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: remove the blank line above.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Went back to previous commit since SDWA is not supported for this instruction in GFX10.
@@ -2560,6 +2558,8 @@ defm V_SUBBREV_CO_U32 : VOP2be_Real_e32e64_gfx9 <0x1e, "V_SUBBREV_U32", "v_s | |||
defm V_ADD_U32 : VOP2_Real_e32e64_gfx9 <0x34>; | |||
defm V_SUB_U32 : VOP2_Real_e32e64_gfx9 <0x35>; | |||
defm V_SUBREV_U32 : VOP2_Real_e32e64_gfx9 <0x36>; | |||
|
|||
defm V_PK_FMAC_F16 : VOP2_Real_e32e64_gfx9<0x03c>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: remove the blank line and align the column.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed.
@@ -2172,6 +2172,7 @@ defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; | |||
let IsSingle = 1 in { | |||
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; | |||
} | |||
defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually gfx9 does support SDWA for this instruction. Add few tests as your change has enabled it?
For gfx10 I think you need to add a test to the gfx10_unsupported_sdwa.s, because it is unsupported there.
# CHECK-MI: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf | ||
0xfa,0x04,0x0a,0x78,0x01,0xe4,0x00,0xff | ||
|
||
# CHECK-MI: v_pk_fmac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sdwa disasm tests needed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
@@ -2172,6 +2172,7 @@ defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; | |||
let IsSingle = 1 in { | |||
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; | |||
} | |||
defm V_PK_FMAC_F16 : VOP2_Real_dpp_gfx10<0x03c>, VOP2_Real_dpp8_gfx10<0x03c>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Still do it in a single def. If there is no suitable macro, create one.
@@ -2172,6 +2175,7 @@ defm V_LDEXP_F16 : VOP2_Real_gfx10<0x03b>; | |||
let IsSingle = 1 in { | |||
defm V_PK_FMAC_F16 : VOP2_Real_e32_gfx10<0x03c>; | |||
} | |||
defm V_PK_FMAC_F16 : VOP2_Real_dpp_dpp8_gfx10<0x03c>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You still need to have a singe macro.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Pls see the new commit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. Not sure about that IsSingle though, if it is still needed. Please let Joe check it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It should have [MC] in the commit title or message, since there is no codegen support.
IsSingle is fine, it suppresses the _e32 suffix on instructions with no VOP3 form, which is this one.
Allows dpp in v_pk_fmac-f16 for GFX9, and both dpp and dpp8 for GFX10.
This reverts commit f2d51a2.
f447b3a
to
39fcf97
Compare
Allows dpp in v_pk_fmac_f16 for GFX9, and both dpp and dpp8 for GFX10.
Allows dpp in v_pk_fmac-f16 for GFX9, and both dpp and dpp8 for GFX10.