Skip to content

Add mad support for v_pk_* 16-bit integer instructions #95104

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 13, 2024
Merged

Conversation

dstutt
Copy link
Collaborator

@dstutt dstutt commented Jun 11, 2024

No description provided.

Change-Id: I88c02fa15b56b26abb0315b1d447d51bb19f0843
@dstutt dstutt requested a review from Sisyph June 11, 2024 12:51
@llvmbot
Copy link
Member

llvmbot commented Jun 11, 2024

@llvm/pr-subscribers-backend-amdgpu

Author: David Stuttard (dstutt)

Changes

Change-Id: I88c02fa15b56b26abb0315b1d447d51bb19f0843


Patch is 67.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95104.diff

3 Files Affected:

  • (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+1-1)
  • (modified) llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll (+377-472)
  • (modified) llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll (+10-17)
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 4c78bd94458d2..9dc4daa1cef08 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -90,7 +90,7 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
 let isReMaterializable = 1 in {
 let isCommutable = 1 in {
 defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
-defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>, imad>;
 
 let FPDPRounding = 1 in {
 defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 526ee5a51745d..3dea5cb5c1423 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -477,42 +477,38 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
-; GFX9-NEXT:    v_pk_add_u16 v0, v2, v0
+; GFX9-NEXT:    v_pk_mad_u16 v2, v0, v1, v0
+; GFX9-NEXT:    v_pk_mul_lo_u16 v3, v2, v1
+; GFX9-NEXT:    v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0]
+; GFX9-NEXT:    v_pk_mad_u16 v1, v2, v1, 1 op_sel_hi:[1,1,0]
+; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v3, v0
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX9-NEXT:    v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0]
-; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX9-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: clpeak_imad_pat_v2i16:
 ; GFX10:       ; %bb.0: ; %entry
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX10-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
-; GFX10-NEXT:    v_pk_add_u16 v0, v2, v0
-; GFX10-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX10-NEXT:    v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0]
-; GFX10-NEXT:    v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0]
+; GFX10-NEXT:    v_pk_mad_u16 v2, v0, v1, v0
+; GFX10-NEXT:    v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0]
+; GFX10-NEXT:    v_pk_mul_lo_u16 v3, v2, v1
+; GFX10-NEXT:    v_pk_mad_u16 v1, v2, v1, 1 op_sel_hi:[1,1,0]
+; GFX10-NEXT:    v_pk_mul_lo_u16 v0, v3, v0
 ; GFX10-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX10-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: clpeak_imad_pat_v2i16:
 ; GFX11:       ; %bb.0: ; %entry
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_pk_mul_lo_u16 v2, v0, v1
-; GFX11-NEXT:    v_pk_add_u16 v0, v2, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-NEXT:    v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0]
-; GFX11-NEXT:    v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0]
+; GFX11-NEXT:    v_pk_mad_u16 v2, v0, v1, v0
+; GFX11-NEXT:    v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0]
+; GFX11-NEXT:    v_pk_mul_lo_u16 v3, v2, v1
+; GFX11-NEXT:    v_pk_mad_u16 v1, v2, v1, 1 op_sel_hi:[1,1,0]
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_pk_mul_lo_u16 v0, v3, v0
 ; GFX11-NEXT:    v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y18 = add <2 x i16> %x, <i16 1, i16 1>
@@ -691,22 +687,20 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX9-SDAG-LABEL: clpeak_imad_pat_v3i16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, 1
 ; GFX9-SDAG-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
-; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v5, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v5, v1
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v0, v4, v0
+; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, 1
+; GFX9-SDAG-NEXT:    v_pk_mad_u16 v4, v1, v3, v1
+; GFX9-SDAG-NEXT:    v_pk_mad_u16 v5, v0, v2, v0
+; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v6, v5, v2
+; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v7, v4, v3
+; GFX9-SDAG-NEXT:    v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX9-SDAG-NEXT:    v_pk_mad_u16 v1, v1, v3, 1
+; GFX9-SDAG-NEXT:    v_pk_mad_u16 v3, v4, v3, 1
+; GFX9-SDAG-NEXT:    v_pk_mad_u16 v2, v5, v2, 1 op_sel_hi:[1,1,0]
+; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v7, v1
+; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v6, v0
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v3, v5, 1
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v4, v1, 1
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
-; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v5
-; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v4
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: clpeak_imad_pat_v3i16:
@@ -714,41 +708,37 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, 1
-; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v5, v1, v3
-; GFX9-GISEL-NEXT:    v_pk_add_u16 v0, v4, v0
-; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v5, v1
+; GFX9-GISEL-NEXT:    v_pk_mad_u16 v4, v0, v2, v0
+; GFX9-GISEL-NEXT:    v_pk_mad_u16 v5, v1, v3, v1
+; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v6, v4, v2
+; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v7, v5, v3
+; GFX9-GISEL-NEXT:    v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX9-GISEL-NEXT:    v_pk_mad_u16 v1, v1, v3, 1
+; GFX9-GISEL-NEXT:    v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0]
+; GFX9-GISEL-NEXT:    v_pk_mad_u16 v3, v5, v3, 1
+; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v6, v0
+; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v7, v1
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX9-GISEL-NEXT:    v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0]
-; GFX9-GISEL-NEXT:    v_pk_add_u16 v3, v5, 1
-; GFX9-GISEL-NEXT:    v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0]
-; GFX9-GISEL-NEXT:    v_pk_add_u16 v5, v1, 1
-; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_v3i16:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_pk_add_u16 v1, v1, 1
 ; GFX10-SDAG-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v4, v1, v3
-; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v5, v0, v2
-; GFX10-SDAG-NEXT:    v_pk_add_u16 v1, v4, v1
-; GFX10-SDAG-NEXT:    v_pk_add_u16 v0, v5, v0
-; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
-; GFX10-SDAG-NEXT:    v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0]
-; GFX10-SDAG-NEXT:    v_pk_add_u16 v3, v4, 1
-; GFX10-SDAG-NEXT:    v_pk_add_u16 v4, v1, 1
-; GFX10-SDAG-NEXT:    v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0]
+; GFX10-SDAG-NEXT:    v_pk_add_u16 v1, v1, 1
+; GFX10-SDAG-NEXT:    v_pk_mad_u16 v4, v0, v2, v0
+; GFX10-SDAG-NEXT:    v_pk_mad_u16 v5, v1, v3, v1
+; GFX10-SDAG-NEXT:    v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX10-SDAG-NEXT:    v_pk_mad_u16 v1, v1, v3, 1
+; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v6, v4, v2
+; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v7, v5, v3
+; GFX10-SDAG-NEXT:    v_pk_mad_u16 v3, v5, v3, 1
+; GFX10-SDAG-NEXT:    v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0]
+; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v6, v0
+; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v7, v1
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v5
-; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v4
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_v3i16:
@@ -756,47 +746,41 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, v1, 1
-; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
-; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v5, v1, v3
-; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v4, v0
-; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, v5, v1
-; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
-; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX10-GISEL-NEXT:    v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0]
-; GFX10-GISEL-NEXT:    v_pk_add_u16 v3, v5, 1
-; GFX10-GISEL-NEXT:    v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0]
-; GFX10-GISEL-NEXT:    v_pk_add_u16 v5, v1, 1
+; GFX10-GISEL-NEXT:    v_pk_mad_u16 v4, v0, v2, v0
+; GFX10-GISEL-NEXT:    v_pk_mad_u16 v5, v1, v3, v1
+; GFX10-GISEL-NEXT:    v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX10-GISEL-NEXT:    v_pk_mad_u16 v1, v1, v3, 1
+; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v6, v4, v2
+; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v7, v5, v3
+; GFX10-GISEL-NEXT:    v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0]
+; GFX10-GISEL-NEXT:    v_pk_mad_u16 v3, v5, v3, 1
+; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v6, v0
+; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v7, v1
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
-; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-LABEL: clpeak_imad_pat_v3i16:
 ; GFX11-SDAG:       ; %bb.0: ; %entry
 ; GFX11-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v1, v1, 1
 ; GFX11-SDAG-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT:    v_pk_add_u16 v1, v1, 1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v4, v1, v3
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v5, v0, v2
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v1, v4, v1
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v0, v5, v0
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0]
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v3, v4, 1
+; GFX11-SDAG-NEXT:    v_pk_mad_u16 v4, v0, v2, v0
+; GFX11-SDAG-NEXT:    v_pk_mad_u16 v5, v1, v3, v1
+; GFX11-SDAG-NEXT:    v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX11-SDAG-NEXT:    v_pk_mad_u16 v1, v1, v3, 1
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v4, v1, 1
-; GFX11-SDAG-NEXT:    v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v6, v4, v2
+; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v7, v5, v3
+; GFX11-SDAG-NEXT:    v_pk_mad_u16 v3, v5, v3, 1
+; GFX11-SDAG-NEXT:    v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0]
 ; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v6, v0
+; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v7, v1
+; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX11-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v5
-; GFX11-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v4
 ; GFX11-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-GISEL-LABEL: clpeak_imad_pat_v3i16:
@@ -805,25 +789,21 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
 ; GFX11-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, v1, 1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
-; GFX11-GISEL-NEXT:    v_pk_mul_lo_u16 v5, v1, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_pk_add_u16 v0, v4, v0
-; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, v5, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
-; GFX11-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX11-GISEL-NEXT:    v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0]
-; GFX11-GISEL-NEXT:    v_pk_add_u16 v3, v5, 1
+; GFX11-GISEL-NEXT:    v_pk_mad_u16 v4, v0, v2, v0
+; GFX11-GISEL-NEXT:    v_pk_mad_u16 v5, v1, v3, v1
+; GFX11-GISEL-NEXT:    v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX11-GISEL-NEXT:    v_pk_mad_u16 v1, v1, v3, 1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0]
-; GFX11-GISEL-NEXT:    v_pk_add_u16 v5, v1, 1
+; GFX11-GISEL-NEXT:    v_pk_mul_lo_u16 v6, v4, v2
+; GFX11-GISEL-NEXT:    v_pk_mul_lo_u16 v7, v5, v3
+; GFX11-GISEL-NEXT:    v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0]
+; GFX11-GISEL-NEXT:    v_pk_mad_u16 v3, v5, v3, 1
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v6, v0
+; GFX11-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v7, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX11-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
-; GFX11-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %y48 = add <3 x i16> %x, <i16 1, i16 1, i16 1>
@@ -1088,22 +1068,20 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX9-SDAG-LABEL: clpeak_imad_pat_v4i16:
 ; GFX9-SDAG:       ; %bb.0: ; %entry
 ; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0]
 ; GFX9-SDAG-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
-; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v5, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v5, v1
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v0, v4, v0
+; GFX9-SDAG-NEXT:    v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT:    v_pk_mad_u16 v4, v1, v3, v1
+; GFX9-SDAG-NEXT:    v_pk_mad_u16 v5, v0, v2, v0
+; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v6, v5, v2
+; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v7, v4, v3
+; GFX9-SDAG-NEXT:    v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX9-SDAG-NEXT:    v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0]
+; GFX9-SDAG-NEXT:    v_pk_mad_u16 v3, v4, v3, 1 op_sel_hi:[1,1,0]
+; GFX9-SDAG-NEXT:    v_pk_mad_u16 v2, v5, v2, 1 op_sel_hi:[1,1,0]
+; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v7, v1
+; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v6, v0
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT:    v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
-; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v5
-; GFX9-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v4
 ; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-GISEL-LABEL: clpeak_imad_pat_v4i16:
@@ -1111,41 +1089,37 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0]
-; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v5, v1, v3
-; GFX9-GISEL-NEXT:    v_pk_add_u16 v0, v4, v0
-; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v5, v1
-; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
-; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX9-GISEL-NEXT:    v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0]
-; GFX9-GISEL-NEXT:    v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0]
-; GFX9-GISEL-NEXT:    v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0]
-; GFX9-GISEL-NEXT:    v_pk_add_u16 v5, v1, 1 op_sel_hi:[1,0]
+; GFX9-GISEL-NEXT:    v_pk_mad_u16 v4, v0, v2, v0
+; GFX9-GISEL-NEXT:    v_pk_mad_u16 v5, v1, v3, v1
+; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v6, v4, v2
+; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v7, v5, v3
+; GFX9-GISEL-NEXT:    v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX9-GISEL-NEXT:    v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0]
+; GFX9-GISEL-NEXT:    v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0]
+; GFX9-GISEL-NEXT:    v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0]
+; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v6, v0
+; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v7, v1
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v0, v0, v4
-; GFX9-GISEL-NEXT:    v_pk_mul_lo_u16 v1, v1, v5
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-SDAG-LABEL: clpeak_imad_pat_v4i16:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT:    v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0]
 ; GFX10-SDAG-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v4, v1, v3
-; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v5, v0, v2
-; GFX10-SDAG-NEXT:    v_pk_add_u16 v1, v4, v1
-; GFX10-SDAG-NEXT:    v_pk_add_u16 v0, v5, v0
-; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
-; GFX10-SDAG-NEXT:    v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0]
-; GFX10-SDAG-NEXT:    v_pk_add_u16 v3, v4, 1 op_sel_hi:[1,0]
-; GFX10-SDAG-NEXT:    v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0]
-; GFX10-SDAG-NEXT:    v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0]
+; GFX10-SDAG-NEXT:    v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0]
+; GFX10-SDAG-NEXT:    v_pk_mad_u16 v4, v0, v2, v0
+; GFX10-SDAG-NEXT:    v_pk_mad_u16 v5, v1, v3, v1
+; GFX10-SDAG-NEXT:    v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX10-SDAG-NEXT:    v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0]
+; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v6, v4, v2
+; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v7, v5, v3
+; GFX10-SDAG-NEXT:    v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0]
+; GFX10-SDAG-NEXT:    v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0]
+; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v6, v0
+; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v7, v1
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v2
 ; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v3
-; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v0, v0, v5
-; GFX10-SDAG-NEXT:    v_pk_mul_lo_u16 v1, v1, v4
 ; GFX10-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-GISEL-LABEL: clpeak_imad_pat_v4i16:
@@ -1153,47 +1127,41 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0]
-; GFX10-GISEL-NEXT:    v_pk_mul_lo_u16 v4, v0, v2
-; GFX10-GISEL-NEXT:    v_pk_mul_...
[truncated]

@dstutt dstutt requested a review from jayfoad June 11, 2024 12:55
Copy link
Contributor

@arsenm arsenm left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm surprised this was missed. Also, remove the Change-Id from the description before merging.

@dstutt
Copy link
Collaborator Author

dstutt commented Jun 13, 2024

> I'm surprised this was missed. Also remove the change-id from the description before merging.

Thanks for removing it.

@dstutt dstutt merged commit 1fb1fcf into llvm:main Jun 13, 2024
9 checks passed
@dstutt dstutt deleted the llvm-main branch June 13, 2024 15:46
EthanLuisMcDonough pushed a commit to EthanLuisMcDonough/llvm-project that referenced this pull request Aug 13, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

5 participants