[GlobalISel][AMDGPU] Import patterns with multiple defs #84171
Conversation
@llvm/pr-subscribers-backend-amdgpu @llvm/pr-subscribers-llvm-globalisel

Author: Pierre van Houtryve (Pierre-vh)

Changes: Fixes #63216

Patch is 124.98 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/84171.diff

7 Files Affected:
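For context, the IMAD32_Pats results changed below wrap an instruction that defines two results (the 64-bit vdst plus a carry-out sdst), which is the multi-def shape the GlobalISel pattern importer previously rejected. A minimal sketch of such a pattern with the multiclass parameter spelled out, assuming V_MAD_U64_U32_e64 is the instantiation argument (the instantiation itself is outside the hunks shown):

// Sketch only, not part of the diff: the first IMAD32_Pats pattern with the
// multiclass parameter substituted. The instruction defines both vdst and a
// carry-out sdst; only sub0 of vdst is consumed, so the importer has to
// tolerate an unused second def.
def : GCNPat <
  (ThreeOpFrag<mul, add> i32:$src0, i32:$src1, i32:$src2),
  (EXTRACT_SUBREG
    (V_MAD_U64_U32_e64 i32:$src0, i32:$src1,
      (REG_SEQUENCE SReg_64, $src2, sub0, (i32 (IMPLICIT_DEF)), sub1),
      0 /* clamp */),
    sub0)
>;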
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 334cfad478f151..b12c1c41b62b00 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -728,25 +728,34 @@ def : OpSelBinOpClampPat<saddsat, V_ADD_I16_e64>;
def : OpSelBinOpClampPat<ssubsat, V_SUB_I16_e64>;
} // End SubtargetPredicate = isGFX9Plus
-// FIXME: GlobalISel in general does not handle instructions with 2 results,
-// so it cannot use these patterns.
multiclass IMAD32_Pats <VOP3_Pseudo inst> {
def : GCNPat <
(ThreeOpFrag<mul, add> i32:$src0, i32:$src1, i32:$src2),
- (EXTRACT_SUBREG (inst $src0, $src1,
+ (EXTRACT_SUBREG (inst i32:$src0, i32:$src1,
(REG_SEQUENCE SReg_64, // Use scalar and let it be legalized
$src2, sub0,
(i32 (IMPLICIT_DEF)), sub1),
0 /* clamp */),
sub0)
>;
+
+ // GISel-specific pattern that avoids creating a SGPR->VGPR copy if
+ // $src2 is a VGPR.
+ def : GCNPat <
+ (ThreeOpFrag<mul, add> i32:$src0, i32:$src1, VGPR_32:$src2),
+ (EXTRACT_SUBREG (inst i32:$src0, i32:$src1,
+ (REG_SEQUENCE VReg_64,
+ $src2, sub0,
+ (i32 (IMPLICIT_DEF)), sub1),
+ 0 /* clamp */),
+ sub0)
+ >;
+
// Immediate src2 in the pattern above will not fold because it would be partially
// undef. Hence define specialized pattern for this case.
- // FIXME: GlobalISel pattern exporter fails to export a pattern like this and asserts,
- // make it SDAG only.
def : GCNPat <
- (ThreeOpFragSDAG<mul, add> i32:$src0, i32:$src1, (i32 imm:$src2)),
- (EXTRACT_SUBREG (inst $src0, $src1, (i64 (as_i64imm $src2)), 0 /* clamp */), sub0)
+ (ThreeOpFrag<mul, add> i32:$src0, i32:$src1, (i32 imm:$src2)),
+ (EXTRACT_SUBREG (inst i32:$src0, i32:$src1, (i64 (as_i64imm $src2)), 0 /* clamp */), sub0)
>;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
index d671a1d87b63df..1140ef88ac7f85 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul-known-bits.i64.ll
@@ -8,34 +8,35 @@ define amdgpu_kernel void @v_mul_i64_no_zext(ptr addrspace(1) %out, ptr addrspac
; GFX10-LABEL: v_mul_i64_no_zext:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x2c
-; GFX10-NEXT: v_lshlrev_b32_e32 v6, 3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v7, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v6, s[0:1]
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v6, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[0:1], v7, s[0:1]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v7, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v0, v2, 0
-; GFX10-NEXT: v_mul_lo_u32 v0, v0, v3
-; GFX10-NEXT: v_mul_lo_u32 v1, v1, v2
-; GFX10-NEXT: v_add3_u32 v5, v5, v0, v1
-; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[2:3]
+; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v0, v3, v[5:6]
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v2, v[5:6]
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: global_store_dwordx2 v7, v[4:5], s[2:3]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_no_zext:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x2c
-; GFX11-NEXT: v_lshlrev_b32_e32 v6, 3, v0
+; GFX11-NEXT: v_lshlrev_b32_e32 v9, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b64 v[0:1], v6, s[0:1]
-; GFX11-NEXT: global_load_b64 v[2:3], v6, s[2:3]
+; GFX11-NEXT: global_load_b64 v[0:1], v9, s[0:1]
+; GFX11-NEXT: global_load_b64 v[2:3], v9, s[2:3]
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v0, v2, 0
-; GFX11-NEXT: v_mul_lo_u32 v0, v0, v3
-; GFX11-NEXT: v_mul_lo_u32 v1, v1, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v0, v3, v[5:6]
+; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v1, v2, v[6:7]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v5, v5, v0, v1
-; GFX11-NEXT: global_store_b64 v6, v[4:5], s[2:3]
+; GFX11-NEXT: v_mov_b32_e32 v5, v7
+; GFX11-NEXT: global_store_b64 v9, v[4:5], s[2:3]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -64,8 +65,9 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
; GFX10-NEXT: global_load_dword v4, v3, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v0, v4, 0
-; GFX10-NEXT: v_mul_lo_u32 v0, v1, v4
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v1, v4, v[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
@@ -79,12 +81,13 @@ define amdgpu_kernel void @v_mul_i64_zext_src1(ptr addrspace(1) %out, ptr addrsp
; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: global_load_b64 v[0:1], v1, s[6:7]
-; GFX11-NEXT: global_load_b32 v4, v2, s[0:1]
+; GFX11-NEXT: global_load_b32 v5, v2, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v4, 0
-; GFX11-NEXT: v_mul_lo_u32 v0, v1, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v3, v3, v0
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v0, v5, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v1, v5, v[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -114,8 +117,9 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
; GFX10-NEXT: global_load_dwordx2 v[0:1], v3, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0
-; GFX10-NEXT: v_mul_lo_u32 v0, v4, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v4, v1, v[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
@@ -128,13 +132,14 @@ define amdgpu_kernel void @v_mul_i64_zext_src0(ptr addrspace(1) %out, ptr addrsp
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: global_load_b32 v4, v1, s[6:7]
+; GFX11-NEXT: global_load_b32 v5, v1, s[6:7]
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v4, v0, 0
-; GFX11-NEXT: v_mul_lo_u32 v0, v4, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v3, v3, v0
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -211,8 +216,9 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
; GFX10-NEXT: global_load_dwordx2 v[0:1], v2, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: v_mad_u64_u32 v[2:3], s0, v4, v0, 0
-; GFX10-NEXT: v_mul_lo_u32 v0, v4, v1
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v4, v1, v[0:1]
+; GFX10-NEXT: v_mov_b32_e32 v3, v0
; GFX10-NEXT: v_mov_b32_e32 v0, 0
; GFX10-NEXT: global_store_dwordx2 v0, v[2:3], s[4:5]
; GFX10-NEXT: s_endpgm
@@ -225,13 +231,14 @@ define amdgpu_kernel void @v_mul_i64_masked_src0_hi(ptr addrspace(1) %out, ptr a
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: global_load_b32 v4, v0, s[6:7]
+; GFX11-NEXT: global_load_b32 v5, v0, s[6:7]
; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v4, v0, 0
-; GFX11-NEXT: v_mul_lo_u32 v0, v4, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_add_nc_u32 v3, v3, v0
+; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v5, v0, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v5, v1, v[0:1]
+; GFX11-NEXT: v_mov_b32_e32 v0, 0
; GFX11-NEXT: global_store_b64 v0, v[2:3], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
@@ -390,15 +397,16 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[6:7]
; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[2:3]
; GFX10-NEXT: s_waitcnt vmcnt(1)
-; GFX10-NEXT: v_and_b32_e32 v4, 0xfff00000, v0
-; GFX10-NEXT: v_and_b32_e32 v5, 0xf00f, v1
+; GFX10-NEXT: v_and_b32_e32 v6, 0xfff00000, v0
; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v4, v2, 0
-; GFX10-NEXT: v_mul_lo_u32 v3, v4, v3
-; GFX10-NEXT: v_mul_lo_u32 v2, v5, v2
-; GFX10-NEXT: v_add3_u32 v1, v1, v3, v2
-; GFX10-NEXT: v_mov_b32_e32 v2, 0
-; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
+; GFX10-NEXT: v_mad_u64_u32 v[4:5], s0, v6, v2, 0
+; GFX10-NEXT: v_mov_b32_e32 v0, v5
+; GFX10-NEXT: v_mad_u64_u32 v[5:6], s0, v6, v3, v[0:1]
+; GFX10-NEXT: v_and_b32_e32 v0, 0xf00f, v1
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s0, v0, v2, v[5:6]
+; GFX10-NEXT: v_mov_b32_e32 v5, v0
+; GFX10-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-NEXT: global_store_dwordx2 v0, v[4:5], s[4:5]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: v_mul_i64_partially_masked_src0:
@@ -412,17 +420,18 @@ define amdgpu_kernel void @v_mul_i64_partially_masked_src0(ptr addrspace(1) %out
; GFX11-NEXT: global_load_b64 v[0:1], v2, s[6:7]
; GFX11-NEXT: global_load_b64 v[2:3], v2, s[0:1]
; GFX11-NEXT: s_waitcnt vmcnt(1)
-; GFX11-NEXT: v_and_b32_e32 v4, 0xfff00000, v0
-; GFX11-NEXT: v_and_b32_e32 v5, 0xf00f, v1
+; GFX11-NEXT: v_and_b32_e32 v7, 0xfff00000, v0
; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
-; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3
-; GFX11-NEXT: v_mul_lo_u32 v2, v5, v2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mad_u64_u32 v[4:5], null, v7, v2, 0
+; GFX11-NEXT: v_mov_b32_e32 v0, v5
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_mad_u64_u32 v[5:6], null, v7, v3, v[0:1]
+; GFX11-NEXT: v_and_b32_e32 v3, 0xf00f, v1
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, v[5:6]
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add3_u32 v1, v1, v3, v2
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
+; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v0, 0
+; GFX11-NEXT: global_store_b64 v0, v[4:5], s[4:5]
; GFX11-NEXT: s_nop 0
; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX11-NEXT: s_endpgm
@@ -491,27 +500,31 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24
; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34
-; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; GFX10-NEXT: v_lshlrev_b32_e32 v0, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: global_load_dwordx2 v[2:3], v4, s[6:7]
-; GFX10-NEXT: global_load_dwordx2 v[0:1], v4, s[2:3]
+; GFX10-NEXT: global_load_dwordx2 v[2:3], v0, s[6:7]
+; GFX10-NEXT: global_load_dwordx2 v[4:5], v0, s[2:3]
+; GFX10-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX10-NEXT: s_waitcnt vmcnt(1)
; GFX10-NEXT: v_cmp_ge_u64_e32 vcc_lo, 0, v[2:3]
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: v_mul_lo_u32 v1, v2, v1
; GFX10-NEXT: s_and_saveexec_b32 s0, vcc_lo
; GFX10-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX10-NEXT: s_cbranch_execz .LBB10_2
; GFX10-NEXT: ; %bb.1: ; %else
-; GFX10-NEXT: v_mad_u64_u32 v[2:3], s1, v2, v0, 0
-; GFX10-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX10-NEXT: v_mov_b32_e32 v0, v2
-; GFX10-NEXT: v_mov_b32_e32 v1, v3
-; GFX10-NEXT: ; %bb.2: ; %Flow
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mad_u64_u32 v[0:1], s1, v2, v4, 0
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s1, v2, v5, v[1:2]
+; GFX10-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX10-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX10-NEXT: .LBB10_2: ; %Flow
; GFX10-NEXT: s_andn2_saveexec_b32 s0, s0
+; GFX10-NEXT: s_cbranch_execz .LBB10_4
; GFX10-NEXT: ; %bb.3: ; %if
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: v_mul_lo_u32 v1, v2, v5
; GFX10-NEXT: v_mov_b32_e32 v0, 0
-; GFX10-NEXT: ; %bb.4: ; %endif
+; GFX10-NEXT: .LBB10_4: ; %endif
; GFX10-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -526,22 +539,29 @@ define amdgpu_kernel void @v_mul64_masked_before_and_in_branch(ptr addrspace(1)
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: global_load_b64 v[2:3], v0, s[6:7]
-; GFX11-NEXT: global_load_b64 v[0:1], v0, s[0:1]
+; GFX11-NEXT: global_load_b64 v[4:5], v0, s[0:1]
; GFX11-NEXT: s_mov_b32 s0, exec_lo
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: v_mul_lo_u32 v1, v2, v1
+; GFX11-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX11-NEXT: s_waitcnt vmcnt(1)
; GFX11-NEXT: v_cmpx_ge_u64_e32 0, v[2:3]
; GFX11-NEXT: s_xor_b32 s0, exec_lo, s0
+; GFX11-NEXT: s_cbranch_execz .LBB10_2
; GFX11-NEXT: ; %bb.1: ; %else
-; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v2, v0, 0
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v4, 0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_nc_u32_e32 v3, v3, v1
-; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
-; GFX11-NEXT: ; %bb.2: ; %Flow
+; GFX11-NEXT: v_mad_u64_u32 v[3:4], null, v2, v5, v[1:2]
+; GFX11-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX11-NEXT: v_mov_b32_e32 v1, v3
+; GFX11-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX11-NEXT: .LBB10_2: ; %Flow
; GFX11-NEXT: s_and_not1_saveexec_b32 s0, s0
+; GFX11-NEXT: s_cbranch_execz .LBB10_4
; GFX11-NEXT: ; %bb.3: ; %if
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_mul_lo_u32 v1, v2, v5
; GFX11-NEXT: v_mov_b32_e32 v0, 0
-; GFX11-NEXT: ; %bb.4: ; %endif
+; GFX11-NEXT: .LBB10_4: ; %endif
; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 4c1935d06517e5..2d81452f9ef38d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -483,19 +483,18 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
; GFX10-NEXT: v_mov_b32_e32 v4, v0
; GFX10-NEXT: v_mov_b32_e32 v5, v1
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v4, v2, 0
-; GFX10-NEXT: v_mul_lo_u32 v3, v4, v3
-; GFX10-NEXT: v_mul_lo_u32 v2, v5, v2
-; GFX10-NEXT: v_add3_u32 v1, v1, v3, v2
+; GFX10-NEXT: v_mad_u64_u32 v[3:4], s4, v4, v3, v[1:2]
+; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v5, v2, v[3:4]
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_mul_i64:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
-; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0
-; GFX11-NEXT: v_mul_lo_u32 v3, v4, v3
-; GFX11-NEXT: v_mul_lo_u32 v2, v5, v2
-; GFX11-NEXT: v_add3_u32 v1, v1, v3, v2
+; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v2
+; GFX11-NEXT: v_mov_b32_e32 v6, v1
+; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v5, 0
+; GFX11-NEXT: v_mad_u64_u32 v[7:8], null, v4, v3, v[1:2]
+; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v5, v[7:8]
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: v_mul_i64:
@@ -506,11 +505,10 @@ define i64 @v_mul_i64(i64 %num, i64 %den) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mul_hi_u32 v4, v0, v2
-; GFX12-NEXT: v_mul_lo_u32 v3, v0, v3
-; GFX12-NEXT: v_mul_lo_u32 v1, v1, v2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_mad_co_u64_u32 v[3:4], null, v0, v3, v[4:5]
; GFX12-NEXT: v_mul_lo_u32 v0, v0, v2
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12-NEXT: v_add3_u32 v1, v4, v3, v1
+; GFX12-NEXT: v_mad_co_u64_u32 v[1:2], null, v1, v2, v[3:4]
; GFX12-NEXT: s_setpc_b64 s[30:31]
%result = mul i64 %num, %den
ret i64 %result
@@ -653,11 +651,11 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v6, v0
; GFX10-NEXT: v_mov_b32_e32 v7, v1
-; GFX10-NEXT: v_mul_lo_u32 v2, v2, v3
-; GFX10-NEXT: v_mul_lo_u32 v5, v6, v5
-; GFX10-NEXT: v_mul_lo_u32 v8, v7, v4
+; GFX10-NEXT: v_mul_lo_u32 v0, v6, v5
+; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v7, v4, v[0:1]
; GFX10-NEXT: v_mad_u64_u32 v[0:1], s4, v6, v3, 0
-; GFX10-NEXT: v_add3_u32 v2, v5, v8, v2
+; GFX10-NEXT: v_mad_u64_u32 v[8:9], s4, v2, v3, v[8:9]
+; GFX10-NEXT: v_mov_b32_e32 v2, v8
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v6, v4, v[1:2]
; GFX10-NEXT: v_mad_u64_u32 v[1:2], s4, v7, v3, v[1:2]
; GFX10-NEXT: s_setpc_b64 s[30:31]
@@ -666,11 +664,11 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
-; GFX11-NEXT: v_mul_lo_u32 v2, v2, v3
-; GFX11-NEXT: v_mul_lo_u32 v5, v6, v5
-; GFX11-NEXT: v_mul_lo_u32 v8, v7, v4
+; GFX11-NEXT: v_mul_lo_u32 v0, v6, v5
+; GFX11-NEXT: v_mad_u64_u32 v[8:9], null, v7, v4, v[0:1]
; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v3, 0
-; GFX11-NEXT: v_add3_u32 v2, v5, v8, v2
+; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v2, v3, v[8:9]
+; GFX11-NEXT: v_mov_b32_e32 v2, v9
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[1:2]
; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v7, v3, v[1:2]
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -683,15 +681,15 @@ define i96 @v_mul_i96(i96 %num, i96 %den) {
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, v1
-; GFX12-NEXT: v_mul_lo_u32 v2, v2, v3
-; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_mul_lo_u32 v5, v6, v5
-; GFX12-NEXT: v_mul_lo_u32 v8, v7, v4
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mul_l...
[truncated]
// GISel-specific pattern that avoids creating a SGPR->VGPR copy if
// $src2 is a VGPR.
Can't we just add the VGPR_32 decoration to the original pattern?
Doesn't seem to work; we need two patterns.
(EXTRACT_SUBREG (inst i32:$src0, i32:$src1,
  (REG_SEQUENCE VReg_64,
    $src2, sub0,
    (i32 (IMPLICIT_DEF)), sub1),
  0 /* clamp */),
  sub0)
>;
Can we reduce duplication with OutFrags or something?
I didn't find a worthwhile way to do it; we save 2 lines in each pattern but add 3-4 lines to declare the fragment, so it's more or less the same.
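To make that trade-off concrete, a rough, untested sketch of what an OutPatFrag-based factoring could look like is below; the fragment name, the NAME-based cast, and whether the GlobalISel importer accepts OutPatFrag results at all are assumptions, not something the patch does.

// Hypothetical shared output fragment inside IMAD32_Pats (untested sketch).
// Only the REG_SEQUENCE register class differs between the two register
// patterns, so the lines saved per GCNPat are roughly offset by the
// declaration and the casting boilerplate below.
def _mad_sub0 : OutPatFrag<(ops node:$s0, node:$s1, node:$seq),
  (EXTRACT_SUBREG (inst $s0, $s1, $seq, 0 /* clamp */), sub0)>;

def : GCNPat <
  (ThreeOpFrag<mul, add> i32:$src0, i32:$src1, i32:$src2),
  (!cast<OutPatFrag>(NAME # "_mad_sub0") $src0, $src1,
    (REG_SEQUENCE SReg_64, $src2, sub0, (i32 (IMPLICIT_DEF)), sub1))
>;
// The VReg_64 and immediate variants would reuse the same fragment.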
Fixes #63216