-
Notifications
You must be signed in to change notification settings - Fork 14.3k
Add mad support for v_pk_* 16 bit integer #95104
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Change-Id: I88c02fa15b56b26abb0315b1d447d51bb19f0843
@llvm/pr-subscribers-backend-amdgpu
Author: David Stuttard (dstutt)
Changes
Change-Id: I88c02fa15b56b26abb0315b1d447d51bb19f0843
Patch is 67.67 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/95104.diff
3 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 4c78bd94458d2..9dc4daa1cef08 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -90,7 +90,7 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
let isReMaterializable = 1 in {
let isCommutable = 1 in {
defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
-defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
+defm V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>, imad>;
let FPDPRounding = 1 in {
defm V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3P_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 526ee5a51745d..3dea5cb5c1423 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -477,42 +477,38 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
; GFX9: ; %bb.0: ; %entry
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_mul_lo_u16 v2, v0, v1
-; GFX9-NEXT: v_pk_add_u16 v0, v2, v0
+; GFX9-NEXT: v_pk_mad_u16 v2, v0, v1, v0
+; GFX9-NEXT: v_pk_mul_lo_u16 v3, v2, v1
+; GFX9-NEXT: v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0]
+; GFX9-NEXT: v_pk_mad_u16 v1, v2, v1, 1 op_sel_hi:[1,1,0]
+; GFX9-NEXT: v_pk_mul_lo_u16 v0, v3, v0
; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX9-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0]
-; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX9-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: clpeak_imad_pat_v2i16:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_mul_lo_u16 v2, v0, v1
-; GFX10-NEXT: v_pk_add_u16 v0, v2, v0
-; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX10-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0]
-; GFX10-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0]
+; GFX10-NEXT: v_pk_mad_u16 v2, v0, v1, v0
+; GFX10-NEXT: v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0]
+; GFX10-NEXT: v_pk_mul_lo_u16 v3, v2, v1
+; GFX10-NEXT: v_pk_mad_u16 v1, v2, v1, 1 op_sel_hi:[1,1,0]
+; GFX10-NEXT: v_pk_mul_lo_u16 v0, v3, v0
; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX10-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: clpeak_imad_pat_v2i16:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_pk_mul_lo_u16 v2, v0, v1
-; GFX11-NEXT: v_pk_add_u16 v0, v2, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-NEXT: v_pk_add_u16 v1, v2, 1 op_sel_hi:[1,0]
-; GFX11-NEXT: v_pk_add_u16 v2, v0, 1 op_sel_hi:[1,0]
+; GFX11-NEXT: v_pk_mad_u16 v2, v0, v1, v0
+; GFX11-NEXT: v_pk_mad_u16 v0, v0, v1, 1 op_sel_hi:[1,1,0]
+; GFX11-NEXT: v_pk_mul_lo_u16 v3, v2, v1
+; GFX11-NEXT: v_pk_mad_u16 v1, v2, v1, 1 op_sel_hi:[1,1,0]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_pk_mul_lo_u16 v0, v3, v0
; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v1
-; GFX11-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%y18 = add <2 x i16> %x, <i16 1, i16 1>
@@ -691,22 +687,20 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
; GFX9-SDAG-LABEL: clpeak_imad_pat_v3i16:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1
; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v4, v0, v2
-; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v5, v1, v3
-; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v5, v1
-; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v4, v0
+; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1
+; GFX9-SDAG-NEXT: v_pk_mad_u16 v4, v1, v3, v1
+; GFX9-SDAG-NEXT: v_pk_mad_u16 v5, v0, v2, v0
+; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v6, v5, v2
+; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v7, v4, v3
+; GFX9-SDAG-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX9-SDAG-NEXT: v_pk_mad_u16 v1, v1, v3, 1
+; GFX9-SDAG-NEXT: v_pk_mad_u16 v3, v4, v3, 1
+; GFX9-SDAG-NEXT: v_pk_mad_u16 v2, v5, v2, 1 op_sel_hi:[1,1,0]
+; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v7, v1
+; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v6, v0
; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX9-SDAG-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT: v_pk_add_u16 v3, v5, 1
-; GFX9-SDAG-NEXT: v_pk_add_u16 v4, v1, 1
-; GFX9-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
-; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5
-; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: clpeak_imad_pat_v3i16:
@@ -714,41 +708,37 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, 1
-; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2
-; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3
-; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v4, v0
-; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v5, v1
+; GFX9-GISEL-NEXT: v_pk_mad_u16 v4, v0, v2, v0
+; GFX9-GISEL-NEXT: v_pk_mad_u16 v5, v1, v3, v1
+; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v6, v4, v2
+; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v7, v5, v3
+; GFX9-GISEL-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX9-GISEL-NEXT: v_pk_mad_u16 v1, v1, v3, 1
+; GFX9-GISEL-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0]
+; GFX9-GISEL-NEXT: v_pk_mad_u16 v3, v5, v3, 1
+; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v6, v0
+; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v7, v1
; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX9-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0]
-; GFX9-GISEL-NEXT: v_pk_add_u16 v3, v5, 1
-; GFX9-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0]
-; GFX9-GISEL-NEXT: v_pk_add_u16 v5, v1, 1
-; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2
-; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4
-; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: clpeak_imad_pat_v3i16:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1
; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3
-; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2
-; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v4, v1
-; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v5, v0
-; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
-; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0]
-; GFX10-SDAG-NEXT: v_pk_add_u16 v3, v4, 1
-; GFX10-SDAG-NEXT: v_pk_add_u16 v4, v1, 1
-; GFX10-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0]
+; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1
+; GFX10-SDAG-NEXT: v_pk_mad_u16 v4, v0, v2, v0
+; GFX10-SDAG-NEXT: v_pk_mad_u16 v5, v1, v3, v1
+; GFX10-SDAG-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX10-SDAG-NEXT: v_pk_mad_u16 v1, v1, v3, 1
+; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v6, v4, v2
+; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v7, v5, v3
+; GFX10-SDAG-NEXT: v_pk_mad_u16 v3, v5, v3, 1
+; GFX10-SDAG-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0]
+; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v6, v0
+; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v7, v1
; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5
-; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: clpeak_imad_pat_v3i16:
@@ -756,47 +746,41 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v1, 1
-; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2
-; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3
-; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v4, v0
-; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v5, v1
-; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2
-; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX10-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0]
-; GFX10-GISEL-NEXT: v_pk_add_u16 v3, v5, 1
-; GFX10-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0]
-; GFX10-GISEL-NEXT: v_pk_add_u16 v5, v1, 1
+; GFX10-GISEL-NEXT: v_pk_mad_u16 v4, v0, v2, v0
+; GFX10-GISEL-NEXT: v_pk_mad_u16 v5, v1, v3, v1
+; GFX10-GISEL-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX10-GISEL-NEXT: v_pk_mad_u16 v1, v1, v3, 1
+; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v6, v4, v2
+; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v7, v5, v3
+; GFX10-GISEL-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0]
+; GFX10-GISEL-NEXT: v_pk_mad_u16 v3, v5, v3, 1
+; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v6, v0
+; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v7, v1
; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4
-; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5
; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-LABEL: clpeak_imad_pat_v3i16:
; GFX11-SDAG: ; %bb.0: ; %entry
; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v1, 1
; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v1, 1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3
-; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_pk_add_u16 v1, v4, v1
-; GFX11-SDAG-NEXT: v_pk_add_u16 v0, v5, v0
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
-; GFX11-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0]
-; GFX11-SDAG-NEXT: v_pk_add_u16 v3, v4, 1
+; GFX11-SDAG-NEXT: v_pk_mad_u16 v4, v0, v2, v0
+; GFX11-SDAG-NEXT: v_pk_mad_u16 v5, v1, v3, v1
+; GFX11-SDAG-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX11-SDAG-NEXT: v_pk_mad_u16 v1, v1, v3, 1
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-NEXT: v_pk_add_u16 v4, v1, 1
-; GFX11-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0]
+; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v6, v4, v2
+; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v7, v5, v3
+; GFX11-SDAG-NEXT: v_pk_mad_u16 v3, v5, v3, 1
+; GFX11-SDAG-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0]
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v6, v0
+; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v7, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5
-; GFX11-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4
; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-GISEL-LABEL: clpeak_imad_pat_v3i16:
@@ -805,25 +789,21 @@ define <3 x i16> @clpeak_imad_pat_v3i16(<3 x i16> %x, <3 x i16> %y) {
; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v1, 1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2
-; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_pk_add_u16 v0, v4, v0
-; GFX11-GISEL-NEXT: v_pk_add_u16 v1, v5, v1
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2
-; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX11-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0]
-; GFX11-GISEL-NEXT: v_pk_add_u16 v3, v5, 1
+; GFX11-GISEL-NEXT: v_pk_mad_u16 v4, v0, v2, v0
+; GFX11-GISEL-NEXT: v_pk_mad_u16 v5, v1, v3, v1
+; GFX11-GISEL-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX11-GISEL-NEXT: v_pk_mad_u16 v1, v1, v3, 1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0]
-; GFX11-GISEL-NEXT: v_pk_add_u16 v5, v1, 1
+; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v6, v4, v2
+; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v7, v5, v3
+; GFX11-GISEL-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0]
+; GFX11-GISEL-NEXT: v_pk_mad_u16 v3, v5, v3, 1
; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v6, v0
+; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v7, v1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4
-; GFX11-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5
; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
%y48 = add <3 x i16> %x, <i16 1, i16 1, i16 1>
@@ -1088,22 +1068,20 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX9-SDAG-LABEL: clpeak_imad_pat_v4i16:
; GFX9-SDAG: ; %bb.0: ; %entry
; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0]
; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v4, v0, v2
-; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v5, v1, v3
-; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v5, v1
-; GFX9-SDAG-NEXT: v_pk_add_u16 v0, v4, v0
+; GFX9-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0]
+; GFX9-SDAG-NEXT: v_pk_mad_u16 v4, v1, v3, v1
+; GFX9-SDAG-NEXT: v_pk_mad_u16 v5, v0, v2, v0
+; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v6, v5, v2
+; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v7, v4, v3
+; GFX9-SDAG-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX9-SDAG-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0]
+; GFX9-SDAG-NEXT: v_pk_mad_u16 v3, v4, v3, 1 op_sel_hi:[1,1,0]
+; GFX9-SDAG-NEXT: v_pk_mad_u16 v2, v5, v2, 1 op_sel_hi:[1,1,0]
+; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v7, v1
+; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v6, v0
; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX9-SDAG-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT: v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0]
-; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
-; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5
-; GFX9-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4
; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-GISEL-LABEL: clpeak_imad_pat_v4i16:
@@ -1111,41 +1089,37 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0]
-; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2
-; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v5, v1, v3
-; GFX9-GISEL-NEXT: v_pk_add_u16 v0, v4, v0
-; GFX9-GISEL-NEXT: v_pk_add_u16 v1, v5, v1
-; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2
-; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX9-GISEL-NEXT: v_pk_add_u16 v2, v4, 1 op_sel_hi:[1,0]
-; GFX9-GISEL-NEXT: v_pk_add_u16 v3, v5, 1 op_sel_hi:[1,0]
-; GFX9-GISEL-NEXT: v_pk_add_u16 v4, v0, 1 op_sel_hi:[1,0]
-; GFX9-GISEL-NEXT: v_pk_add_u16 v5, v1, 1 op_sel_hi:[1,0]
+; GFX9-GISEL-NEXT: v_pk_mad_u16 v4, v0, v2, v0
+; GFX9-GISEL-NEXT: v_pk_mad_u16 v5, v1, v3, v1
+; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v6, v4, v2
+; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v7, v5, v3
+; GFX9-GISEL-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX9-GISEL-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0]
+; GFX9-GISEL-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0]
+; GFX9-GISEL-NEXT: v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0]
+; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v6, v0
+; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v7, v1
; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v0, v0, v4
-; GFX9-GISEL-NEXT: v_pk_mul_lo_u16 v1, v1, v5
; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-SDAG-LABEL: clpeak_imad_pat_v4i16:
; GFX10-SDAG: ; %bb.0: ; %entry
; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0]
; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
-; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v4, v1, v3
-; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v5, v0, v2
-; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v4, v1
-; GFX10-SDAG-NEXT: v_pk_add_u16 v0, v5, v0
-; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
-; GFX10-SDAG-NEXT: v_pk_add_u16 v2, v5, 1 op_sel_hi:[1,0]
-; GFX10-SDAG-NEXT: v_pk_add_u16 v3, v4, 1 op_sel_hi:[1,0]
-; GFX10-SDAG-NEXT: v_pk_add_u16 v4, v1, 1 op_sel_hi:[1,0]
-; GFX10-SDAG-NEXT: v_pk_add_u16 v5, v0, 1 op_sel_hi:[1,0]
+; GFX10-SDAG-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0]
+; GFX10-SDAG-NEXT: v_pk_mad_u16 v4, v0, v2, v0
+; GFX10-SDAG-NEXT: v_pk_mad_u16 v5, v1, v3, v1
+; GFX10-SDAG-NEXT: v_pk_mad_u16 v0, v0, v2, 1 op_sel_hi:[1,1,0]
+; GFX10-SDAG-NEXT: v_pk_mad_u16 v1, v1, v3, 1 op_sel_hi:[1,1,0]
+; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v6, v4, v2
+; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v7, v5, v3
+; GFX10-SDAG-NEXT: v_pk_mad_u16 v3, v5, v3, 1 op_sel_hi:[1,1,0]
+; GFX10-SDAG-NEXT: v_pk_mad_u16 v2, v4, v2, 1 op_sel_hi:[1,1,0]
+; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v6, v0
+; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v7, v1
; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v3
-; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v0, v0, v5
-; GFX10-SDAG-NEXT: v_pk_mul_lo_u16 v1, v1, v4
; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-GISEL-LABEL: clpeak_imad_pat_v4i16:
@@ -1153,47 +1127,41 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-GISEL-NEXT: v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
; GFX10-GISEL-NEXT: v_pk_add_u16 v1, v1, 1 op_sel_hi:[1,0]
-; GFX10-GISEL-NEXT: v_pk_mul_lo_u16 v4, v0, v2
-; GFX10-GISEL-NEXT: v_pk_mul_...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm surprised this was missed. Also remove the change-id from the description before merging.
Thanks for removing it.
No description provided.