-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[AMDGPU][True16][CodeGen] sext i16 inreg in true16 mode #144024
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU][True16][CodeGen] sext i16 inreg in true16 mode #144024
Conversation
e1b5438
to
c7fe296
Compare
@llvm/pr-subscribers-backend-amdgpu Author: Brox Chen (broxigarchen) Changes: Update the sext pattern in true16 mode, setting up proper vgpr16 register use. Patch is 179.96 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/144024.diff 8 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 1419f63202a7c..d8c986bcea972 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2574,6 +2574,8 @@ def : GCNPat<
(i32 (DivergentSextInreg<i1> i32:$src)),
(V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>;
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
def : GCNPat <
(i16 (DivergentSextInreg<i1> i16:$src)),
(V_BFE_I32_e64 $src, (i32 0), (i32 1))
@@ -2583,6 +2585,23 @@ def : GCNPat <
(i16 (DivergentSextInreg<i8> i16:$src)),
(V_BFE_I32_e64 $src, (i32 0), (i32 8))
>;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+ (i16 (DivergentSextInreg<i1> i16:$src)),
+ (V_BFE_I32_e64
+ (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16),
+ (i32 0), (i32 1))
+>;
+
+def : GCNPat <
+ (i16 (DivergentSextInreg<i8> i16:$src)),
+ (V_BFE_I32_e64
+ (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16),
+ (i32 0), (i32 8))
+>;
+}
def : GCNPat<
(i32 (DivergentSextInreg<i8> i32:$src)),
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index a005e0245b8ff..c213b69a423ae 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -319,11 +319,21 @@ let SchedRW = [Write64Bit] in {
} // End SchedRW = [Write64Bit]
} // End isReMaterializable = 1
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
def : GCNPat<
(i32 (DivergentUnaryFrag<sext> i16:$src)),
(i32 (V_BFE_I32_e64 i16:$src, (i32 0), (i32 0x10)))
>;
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat<
+ (i32 (DivergentUnaryFrag<sext> i16:$src)),
+ (i32 (V_BFE_I32_e64
+ (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16),
+ (i32 0), (i32 0x10)))
+>;
+
let isReMaterializable = 1 in {
let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
defm V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
@@ -423,6 +433,8 @@ def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32
} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
def : GCNPat<
(i64 (DivergentUnaryFrag<sext> i16:$src)),
(REG_SEQUENCE VReg_64,
@@ -432,6 +444,18 @@ def : GCNPat<
), VGPR_32)), sub1)
>;
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat<
+ (i64 (DivergentUnaryFrag<sext> i16:$src)),
+ (REG_SEQUENCE VReg_64,
+ (i32 (V_BFE_I32_e64
+ (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16),
+ (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10)))), sub0,
+ (i32 (COPY_TO_REGCLASS
+ (V_ASHRREV_I32_e32 (S_MOV_B32 (i32 0x1f)), (i32 (V_BFE_I32_e64 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
+ ), VGPR_32)), sub1)
+>;
+
let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus] in {
def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 9e7968f1acb84..ab38bd21994ec 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -1165,35 +1165,32 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-TRUE16-NEXT: s_clause 0x1
-; GFX11-DL-TRUE16-NEXT: global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT: global_load_b32 v1, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[2:3]
; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v3, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v4.l, v1.l
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v5.l, v2.l
-; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v6.h, 8, v2.l
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
-; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v8.h, 8, v1.l
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v9.l, v1.h
-; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v2.h, 8, v2.h
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v2, 0, 8
+; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v4.h, 8, v1.l
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v1.h
+; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v7.h, 8, v2.l
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v2.h
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v5.l
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v4.l, v6.l
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v1, 0, 8
; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v1.h, 8, v1.h
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v8.l, v4.l
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v5.l
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v9, 0, 8
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v7, 0, 8
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT: v_pk_mul_lo_u16 v6, v6, v8
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v2, 0, 8
+; GFX11-DL-TRUE16-NEXT: v_ashrrev_i16 v2.h, 8, v2.h
+; GFX11-DL-TRUE16-NEXT: v_pk_mul_lo_u16 v4, v4, v7
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v6.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.l
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v6.l, v0.l
+; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v4.l, v0.l
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-DL-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v2, v1
-; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v6.h
+; GFX11-DL-TRUE16-NEXT: v_pk_mul_lo_u16 v1, v1, v2
+; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v4.h
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.l
; GFX11-DL-TRUE16-NEXT: v_add_nc_u16 v0.l, v0.l, v1.h
@@ -3435,35 +3432,31 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[0:1]
; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[2:3]
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v2
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v1, v2, 0, 8
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v3
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v2.h
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 24, v2
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v0, 0, 8
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v5.l, v1.l
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l
-; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8
; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.h
+; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v5.l
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v6, 0, 8
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v5, 0, 8
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v7, 0, 8
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 24, v2
; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v3
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v1.l, v0.l
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v4.l, v2.l
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.h, v2.l, v0.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v3.l, v0.l
; GFX11-DL-TRUE16-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX11-DL-TRUE16-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index f995f426c6372..5e502882a2645 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -1669,40 +1669,38 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-LABEL: notdot4_mixedtypes:
; GFX11-DL-TRUE16: ; %bb.0: ; %entry
; GFX11-DL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-TRUE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
; GFX11-DL-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-TRUE16-NEXT: s_clause 0x1
-; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[2:3]
-; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v5, s[4:5]
+; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT: global_load_b32 v5, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v6, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v4
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v4
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.l
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.l
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v5
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v3, v4, 0, 8
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v5, 0, 8
; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v1.l
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v2.l
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v3.l
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v7, 0, 8
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v7.l
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v2.l, v0.l
-; GFX11-DL-TRUE16-NEXT: v_perm_b32 v1, v4, v4, 0xc0c0302
-; GFX11-DL-TRUE16-NEXT: v_perm_b32 v2, v3, v3, 0xc0c0302
+; GFX11-DL-TRUE16-NEXT: v_perm_b32 v1, v5, v5, 0xc0c0302
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v3.l, v0.l
+; GFX11-DL-TRUE16-NEXT: v_perm_b32 v2, v4, v4, 0xc0c0302
; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_dot4_u32_u8 v0, v2, v1, v0
-; GFX11-DL-TRUE16-NEXT: global_store_b16 v5, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT: global_store_b16 v6, v0, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_endpgm
;
; GFX11-DL-FAKE16-LABEL: notdot4_mixedtypes:
@@ -1964,44 +1962,41 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
; GFX11-DL-TRUE16-LABEL: notdot4_mixedtypes2:
; GFX11-DL-TRUE16: ; %bb.0: ; %entry
; GFX11-DL-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-TRUE16-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
; GFX11-DL-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-TRUE16-NEXT: v_mov_b32_e32 v4, 0
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0
; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-DL-TRUE16-NEXT: s_clause 0x1
-; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[2:3]
-; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v4, s[4:5]
+; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT: global_load_d16_b16 v0, v5, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(2)
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v3
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v5.l, v3.l
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 8, v3
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v3.h
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v3
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v2, 8, v4
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v6, v4, 0, 8
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v7.l, v4.h
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v5, 0, 8
-; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v6.l
-; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v2
-; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v2.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.l
+; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.l
; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v7, v7, 0, 8
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.l
+; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v6.l
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v3
; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0)
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l
-; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v5.l, v6.l
-; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v2.h
+; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v3.h
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v7.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v4
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v1.h, v0.l
-; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v5, 0, 8
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v0.h, v0.l
+; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l
-; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v3.l, v1.l, v0.l
-; GFX11-DL-TRUE16-NEXT: global_store_b16 v4, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT: global_store_b16 v5, v0, s[4:5]
; GFX11-DL-TRUE16-NEXT: s_endpgm
;
; GFX11-DL-FAKE16-LABEL: notdot4_mixedtypes2:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index f44faf4f7edba..3a4bf1c81ed58 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -424,15 +424,15 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
; GFX11-SDAG-TRUE16-LABEL: test_frexp_v2f16_v2i32:
; GFX11-SDAG-TRUE16: ; %bb.0:
; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT: v_frexp_exp_i16_f16_e32 v1.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: v_frexp_exp_i16_f16_e32 v2.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_frexp_mant_f16_e32 v0.h, v0.h
-; GFX11-SDAG-TRUE16-NEXT: v_frexp_mant_f16_e32 v0.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX11-SDAG-TRUE16-NEXT: v_frexp_mant_f16_e32 v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_frexp_mant_f16_e32 v1.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_frexp_exp_i16_f16_e32 v2.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT: v_frexp_exp_i16_f16_e32 v3.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v1.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v2, 0, 16
; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v3, 0, 16
; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-SDAG-FAKE16-LABEL: test_frexp_v2f16_v2i32:
@@ -457,15 +457,15 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT: v_frexp_exp_i16_f16_e32 v1.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: v_frexp_exp_i16_f16_e32 v2.l, v0.h
-; GFX12-SDAG-TRUE16-NEXT: v_frexp_mant_f16_e32 v0.h, v0.h
-; GFX12-SDAG-TRUE16-NEXT: v_frexp_mant_f16_e32 v0.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v2, 0, 16
+; GFX12-SDAG-TRUE16-NEXT: v_frexp_mant_f16_e32 v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_frexp_mant_f16_e32 v1.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_frexp_exp_i16_f16_e32 v2.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT: v_frexp_exp_i16_f16_e32 v3.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v1.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v1, v2, 0, 16
; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT: v_pack_b32_f16 v0, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT: v_bfe_i32 v2, v3, 0, 16
; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-SDAG-FAKE16-LABEL: test_frexp_v2f16_v2i32:
@@ -534,15 +534,15 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32:
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkm...
[truncated]
|
(V_BFE_I32_e64 | ||
(REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16), | ||
(i32 0), (i32 1)) | ||
>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This looks like it should work, but it wastes the upper half of the register. Is there an instruction with a 16-bit result suitable for doing a sext from n to 16 bits? I did not find one. @jayfoad
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually for 1 bit, you could probably generate cndmask_b16. For sext 8 to 16 bits I don't know which instruction can do it optimally.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't like this. AFAICS there is no single instruction that can do this, so really instead of this ugly pattern we should just say that i16 sext_inreg is not legal (when real true16 is enabled). Then it is the legalizer's job to legalize it, e.g. by promoting to i32, which is what this ugly pattern is doing anyway.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi Joe. The inst works, but it messes up the isel/combine/coalescer. Thanks Jay, let me try disabling this in true16.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hi, I tried looking into legalizeDAG and testing it a bit. It seems the legalizer might not work here.
We don't have an in-place sign_extend_inreg/sign_extend promote case in codegen. It also seems most of the promote code is implemented with sign_extend/zero_extend/fp_extend... I think the promote code comes back to these sext_inreg patterns in the end, unless there is another ISD opcode that can be used to do sign extension?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ping! This patch is required to unblock a downstream repo, so I might need some input on this urgently. Thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You can separately legalize the 64-bit SEXT_INREG operation to split it into a 32-bit sext_inreg + a sext. The main problem would be how much code bothers checking if SEXT_INREG is legal before introducing it in post-legalize combines
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you please add a reduced test case for this selection scenario? The affected tests are not very specific. Since we need a patch to fix the downstream regression, I am inclined to approve this approach after that, and suggest you continue working on a legalizer-based fix afterwards.
It seems to have mixed results on ISA quality.
c7fe296
to
fb0551d
Compare
Just added a test at the end of sext-in-reg.ll. I also posted the previous bad version of the machine code here so that it's easier to see the error.
The sequence of v1.h in the output is
The whole vgpr16 is taken while it should be an i8. An ISA quality drop is expected since this is fixing a correctness issue. We are reserving the top 16 bits in the register before we do v_bfe_i32, while previously we copied a vgpr16 into a vgpr32 without taking care of the top bits. Thus codegen could generate an additional copy to move the .h to another reg. ========================================= Hold on, there seems to be an error with this test, let me double check.
I missed a v_bfe_i32 here, and thus this machine code is correct. I might need to increase the dimension of the vector shuffle to hit the issue.
fb0551d
to
e8b1d31
Compare
Updated the test with dim 8 in the vector shuffle, and moved it to a separate test file. Here is the old machine code with the problem:
At the output, v0.h is the output i8 at index 1, and it's expected to be the i8 at index 6 from the input. However, the chain flow:
This is equivalent to v0.h = v2 & 0xff, which is the i8 at index 4 from the input.
e8b1d31
to
ccb669a
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. Please revisit the issue to look for a legalizer based solution when possible.
update sext pattern in true16, setting up proper vgpr16 reg use