[AMDGPU][True16][CodeGen] sext i16 inreg in true16 mode #144024

broxigarchen · 2025-06-13T06:52:12Z

update sext pattern in true16, setting up proper vgpr16 reg use

llvmbot · 2025-06-13T16:17:09Z

@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

Changes

update sext pattern in true16, setting up proper vgpr16 reg use

Patch is 179.96 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/144024.diff

8 Files Affected:

(modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+19)
(modified) llvm/lib/Target/AMDGPU/VOP3Instructions.td (+24)
(modified) llvm/test/CodeGen/AMDGPU/idot4s.ll (+31-38)
(modified) llvm/test/CodeGen/AMDGPU/idot4u.ll (+37-42)
(modified) llvm/test/CodeGen/AMDGPU/llvm.frexp.ll (+32-32)
(modified) llvm/test/CodeGen/AMDGPU/sext-in-reg.ll (+2249-273)
(modified) llvm/test/CodeGen/AMDGPU/vector-reduce-smax.ll (+146-114)
(modified) llvm/test/CodeGen/AMDGPU/vector-reduce-smin.ll (+146-114)

diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 1419f63202a7c..d8c986bcea972 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2574,6 +2574,8 @@ def : GCNPat<
   (i32 (DivergentSextInreg<i1> i32:$src)),
   (V_BFE_I32_e64 i32:$src, (i32 0), (i32 1))>;
 
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
 def : GCNPat <
   (i16 (DivergentSextInreg<i1> i16:$src)),
   (V_BFE_I32_e64 $src, (i32 0), (i32 1))
@@ -2583,6 +2585,23 @@ def : GCNPat <
   (i16 (DivergentSextInreg<i8> i16:$src)),
   (V_BFE_I32_e64 $src, (i32 0), (i32 8))
 >;
+}
+
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+  (i16 (DivergentSextInreg<i1> i16:$src)),
+  (V_BFE_I32_e64
+   (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16),
+   (i32 0), (i32 1))
+>;
+
+def : GCNPat <
+  (i16 (DivergentSextInreg<i8> i16:$src)),
+  (V_BFE_I32_e64
+   (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16),
+   (i32 0), (i32 8))
+>;
+}
 
 def : GCNPat<
   (i32 (DivergentSextInreg<i8> i32:$src)),
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index a005e0245b8ff..c213b69a423ae 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -319,11 +319,21 @@ let SchedRW = [Write64Bit] in {
 } // End SchedRW = [Write64Bit]
 } // End isReMaterializable = 1
 
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
 def : GCNPat<
   (i32 (DivergentUnaryFrag<sext> i16:$src)),
   (i32 (V_BFE_I32_e64 i16:$src, (i32 0), (i32 0x10)))
 >;
 
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat<
+  (i32 (DivergentUnaryFrag<sext> i16:$src)),
+  (i32 (V_BFE_I32_e64
+       (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16),
+       (i32 0), (i32 0x10)))
+>;
+
 let isReMaterializable = 1 in {
 let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
 defm V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
@@ -423,6 +433,8 @@ def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32
 
 } // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
 
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
 def : GCNPat<
   (i64 (DivergentUnaryFrag<sext> i16:$src)),
     (REG_SEQUENCE VReg_64,
@@ -432,6 +444,18 @@ def : GCNPat<
       ), VGPR_32)), sub1)
 >;
 
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat<
+  (i64 (DivergentUnaryFrag<sext> i16:$src)),
+    (REG_SEQUENCE VReg_64,
+      (i32 (V_BFE_I32_e64
+            (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16),
+            (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10)))), sub0,
+      (i32 (COPY_TO_REGCLASS
+         (V_ASHRREV_I32_e32 (S_MOV_B32 (i32 0x1f)), (i32 (V_BFE_I32_e64 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
+      ), VGPR_32)), sub1)
+>;
+
 let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC], OtherPredicates = [isNotGFX90APlus] in {
 def V_INTERP_P1_F32_e64  : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
 def V_INTERP_P2_F32_e64  : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll
index 9e7968f1acb84..ab38bd21994ec 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4s.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll
@@ -1165,35 +1165,32 @@ define amdgpu_kernel void @idot4_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v2, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v1, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v2, v0, s[2:3]
 ; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v3, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.l
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v6, v1, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v2.l
-; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v6.h, 8, v2.l
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v2.h
-; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v8.h, 8, v1.l
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v9.l, v1.h
-; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v2.h, 8, v2.h
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v2, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v4.h, 8, v1.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v1.h
+; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v7.h, 8, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v2.h
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v6.l
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v6, v1, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v1.h, 8, v1.h
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v4.l
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v5.l
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v9, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v7, 0, 8
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v6, v6, v8
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v2, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_ashrrev_i16 v2.h, 8, v2.h
+; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v6.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v5.l
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v6.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v4.l, v0.l
 ; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v1, v2, v1
-; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v6.h
+; GFX11-DL-TRUE16-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
+; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v4.h
 ; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.l
 ; GFX11-DL-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v1.h
@@ -3435,35 +3432,31 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1,
 ; GFX11-DL-TRUE16-NEXT:    global_load_b32 v2, v0, s[0:1]
 ; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[2:3]
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v2.l
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v1, v2, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v2.h
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v0, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v1.l
 ; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.l, 0xff, v3.l
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.l
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 8, v3
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v2.h
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v3.h
+; GFX11-DL-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v5.l
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v6, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v5, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v7, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
 ; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
 ; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-DL-TRUE16-NEXT:    v_mul_lo_u16 v0.l, v1.l, v0.l
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v2.l
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v5.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v0.h, v1.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v4, v4, 0, 8
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v1.h, v2.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v4.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v1.l, v3.l, v0.l
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; GFX11-DL-TRUE16-NEXT:    global_store_b32 v1, v0, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll
index f995f426c6372..5e502882a2645 100644
--- a/llvm/test/CodeGen/AMDGPU/idot4u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll
@@ -1669,40 +1669,38 @@ define amdgpu_kernel void @notdot4_mixedtypes(ptr addrspace(1) %src1,
 ; GFX11-DL-TRUE16-LABEL: notdot4_mixedtypes:
 ; GFX11-DL-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-DL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-TRUE16-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX11-DL-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-DL-TRUE16-NEXT:    v_mov_b32_e32 v6, 0
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v4, v0, s[2:3]
-; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v5, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v4, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v5, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v6, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v4
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v4
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v3.l
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.l
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v5
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v3, v4, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v7, v5, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v1.l
 ; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.l, 0xff, v2.l
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v2, v6, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v3.l
 ; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v6, v7, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v7.l
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v0.h, v1.l, v0.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v6.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v1.l, v2.l, v0.l
-; GFX11-DL-TRUE16-NEXT:    v_perm_b32 v1, v4, v4, 0xc0c0302
-; GFX11-DL-TRUE16-NEXT:    v_perm_b32 v2, v3, v3, 0xc0c0302
+; GFX11-DL-TRUE16-NEXT:    v_perm_b32 v1, v5, v5, 0xc0c0302
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v2.l, v3.l, v0.l
+; GFX11-DL-TRUE16-NEXT:    v_perm_b32 v2, v4, v4, 0xc0c0302
 ; GFX11-DL-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-TRUE16-NEXT:    v_dot4_u32_u8 v0, v2, v1, v0
-; GFX11-DL-TRUE16-NEXT:    global_store_b16 v5, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    global_store_b16 v6, v0, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DL-FAKE16-LABEL: notdot4_mixedtypes:
@@ -1964,44 +1962,41 @@ define amdgpu_kernel void @notdot4_mixedtypes2(ptr addrspace(1) %src1,
 ; GFX11-DL-TRUE16-LABEL: notdot4_mixedtypes2:
 ; GFX11-DL-TRUE16:       ; %bb.0: ; %entry
 ; GFX11-DL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
-; GFX11-DL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-DL-TRUE16-NEXT:    v_dual_mov_b32 v5, 0 :: v_dual_and_b32 v0, 0x3ff, v0
 ; GFX11-DL-TRUE16-NEXT:    s_load_b64 s[4:5], s[4:5], 0x34
-; GFX11-DL-TRUE16-NEXT:    v_mov_b32_e32 v4, 0
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-DL-TRUE16-NEXT:    s_clause 0x1
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v2, v0, s[2:3]
-; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[0:1]
-; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v4, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v3, v0, s[2:3]
+; GFX11-DL-TRUE16-NEXT:    global_load_b32 v4, v0, s[0:1]
+; GFX11-DL-TRUE16-NEXT:    global_load_d16_b16 v0, v5, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(2)
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v2
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 8, v3
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v3.l
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 8, v3
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v3.h
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v3
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v2, 8, v4
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v6, v4, 0, 8
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v4.h
 ; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v5, v5, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v6.l
-; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 24, v2
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v2.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.l
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v1.h, 0xff, v3.l
 ; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v7, v7, 0, 8
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v5.l
+; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v6.l
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
 ; GFX11-DL-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v0.h, v1.l, v0.l
-; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v5.l, v6.l
-; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v2.h
+; GFX11-DL-TRUE16-NEXT:    v_and_b16 v0.h, 0xff, v3.h
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v7.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-DL-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 24, v4
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v2.l, v1.h, v0.l
-; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v2, v5, 0, 8
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-DL-TRUE16-NEXT:    v_bfe_i32 v2, v6, 0, 8
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v1.l, v0.h, v0.l
+; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-DL-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v2.l
-; GFX11-DL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-DL-TRUE16-NEXT:    v_mad_u16 v0.l, v3.l, v1.l, v0.l
-; GFX11-DL-TRUE16-NEXT:    global_store_b16 v4, v0, s[4:5]
+; GFX11-DL-TRUE16-NEXT:    global_store_b16 v5, v0, s[4:5]
 ; GFX11-DL-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX11-DL-FAKE16-LABEL: notdot4_mixedtypes2:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index f44faf4f7edba..3a4bf1c81ed58 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -424,15 +424,15 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX11-SDAG-TRUE16-LABEL: test_frexp_v2f16_v2i32:
 ; GFX11-SDAG-TRUE16:       ; %bb.0:
 ; GFX11-SDAG-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.h
-; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
-; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
-; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.h, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.l
+; GFX11-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v3.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v1.h, v1.l
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v2, 0, 16
 ; GFX11-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX11-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 16
 ; GFX11-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-SDAG-FAKE16-LABEL: test_frexp_v2f16_v2i32:
@@ -457,15 +457,15 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_samplecnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_bvhcnt 0x0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
-; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v1.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.h
-; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.h, v0.h
-; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v0.l, v0.l
-; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v1, 0, 16
-; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 16
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_mant_f16_e32 v1.h, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v2.l, v0.l
+; GFX12-SDAG-TRUE16-NEXT:    v_frexp_exp_i16_f16_e32 v3.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX12-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v1.h, v1.l
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v1, v2, 0, 16
 ; GFX12-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX12-SDAG-TRUE16-NEXT:    v_pack_b32_f16 v0, v0.l, v0.h
+; GFX12-SDAG-TRUE16-NEXT:    v_bfe_i32 v2, v3, 0, 16
 ; GFX12-SDAG-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX12-SDAG-FAKE16-LABEL: test_frexp_v2f16_v2i32:
@@ -534,15 +534,15 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX11-GISEL-TRUE16-LABEL: test_frexp_v2f16_v2i32:
 ; GFX11-GISEL-TRUE16:       ; %bb.0:
 ; GFX11-GISEL-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkm...
[truncated]

Sisyph · 2025-06-13T19:05:07Z

llvm/lib/Target/AMDGPU/SIInstructions.td

+  (V_BFE_I32_e64
+   (REG_SEQUENCE VGPR_32, VGPR_16:$src, lo16, (i16 (IMPLICIT_DEF)), hi16),
+   (i32 0), (i32 1))
+>;


This looks like it should work, but it wastes the upper half of the register. Is there an instruction with a 16-bit result suitable for doing a sext from n to 16 bits? I did not find one. @jayfoad

Actually for 1 bit, you could probably generate cndmask_b16. For sext 8 to 16 bits I don't know which instruction can do it optimally.

I don't like this. AFAIK there is no single instruction that can do this, so really instead of this ugly pattern we should just say that i16 sext_inreg is not legal (when real true16 is enabled). Then it is the legalizer's job to legalize it, e.g. by promoting to i32, which is what this ugly pattern is doing anyway.

Hi Joe. The instruction works, but it messes up the isel/combine/coalescer. Thanks Jay, let me try disabling this in true16.

Hi, I looked into LegalizeDAG and tested it a bit. It seems the legalizer might not work here.

We don't have an in-place sign_extend_inreg/sign_extend promote case in codegen, and it seems most of the promote code is implemented with sign_extend/zero_extend/fp_extend... I think the promote code comes back to these sext_inreg patterns in the end, unless there is another ISD opcode that can be used to do sign extension?

Ping! This patch is required to unblock a downstream repo, so we might need some input on this urgently. Thanks!

You can separately legalize the 64-bit SEXT_INREG operation to split it into a 32-bit sext_inreg + a sext. The main problem would be how much code bothers checking if SEXT_INREG is legal before introducing it in post-legalize combines

Sisyph

Can you please add a reduced test case for this selection scenario? The affected tests are not very specific. Since we need a patch to fix the downstream regression, I am inclined to approve this approach after that, and suggest you continue working on a legalizer-based fix afterwards.

It seems to have mixed results on isa quality.

broxigarchen · 2025-06-17T22:07:01Z

Can you please add a reduced test case for this selection scenario? The affected tests are not very specific. Since we need a patch to fix the downstream regression, I am inclined to approve this approach after that, and suggest you continue working on a legalizer-based fix afterwards.

It seems to have mixed results on isa quality.

I just added a test at the end of sext-in-reg.ll. I am also posting the previous, bad version of the machine code here so that it's easier to see the error.

IR:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <{ [0 x i8] }>, ptr addrspace(1) %ptr, i64 0, i32 0, i32 %tid
  %load = load <4 x i8>, ptr addrspace(1) %in.gep
  %shuff = shufflevector <4 x i8> %load, <4 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  %cast = sitofp <4 x i8> %shuff to <4 x half>
  store <4 x half> %cast, ptr addrspace(1) %out

; GFX11-TRUE16-LABEL: v_sext_in_reg_i8_i16_shuffer_vector:
; GFX11-TRUE16:       ; %bb.0:
; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
; GFX11-TRUE16-NEXT:    s_waitcnt lgkmcnt(0)
; GFX11-TRUE16-NEXT:    global_load_b32 v1, v0, s[2:3]
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, v1.l
; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v1.h
; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v4, 24, v1
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
; GFX11-TRUE16-NEXT:    v_bfe_i32 v3, v0, 0, 8
; GFX11-TRUE16-NEXT:    v_bfe_i32 v2, v2, 0, 8
; GFX11-TRUE16-NEXT:    v_ashrrev_i16 v0.l, 8, v1.l
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v0.h, v4.l
; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v3.l
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v1.h, v2.l
; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v3, 0
; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v1.l, v1.l
; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v0.l, v1.l
; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v0.h, v1.h
; GFX11-TRUE16-NEXT:    global_store_b64 v3, v[1:2], s[0:1]
; GFX11-TRUE16-NEXT:    s_endpgm
;

The sequence of v1.h in the output is

v_mov_b16_e32 v2.l, v1.h
v_cvt_f16_i16_e32 v1.h, v2.l
v_pack_b32_f16 v1, v0.h, v1.h

The whole vgpr16 is taken, while it should be only an i8.

A drop in ISA quality is expected, since this is fixing a correctness issue. We now reserve the top 16 bits of the register before we do v_bfe_i32, whereas previously we copied a vgpr16 into a vgpr32 without taking care of the top bits. Thus codegen may generate an additional copy to move the .h half to another register.

=========================================

Hold on, there seems to be an error with this test, let me double check.

v_mov_b16_e32 v2.l, v1.h
v_bfe_i32 v2, v2, 0, 8
v_cvt_f16_i16_e32 v1.h, v2.l
v_pack_b32_f16 v1, v0.h, v1.h

I missed a v_bfe_i32 here, so this machine code is actually correct. I might need to increase the dimension of the vector shuffle to hit the issue.

broxigarchen · 2025-06-18T14:51:40Z

Updated the test to use dimension 8 in the vector shuffle, and moved it to a separate test file.

Here is the old machine code with the problem

IR:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %in.gep = getelementptr <{ [0 x i8] }>, ptr addrspace(1) %ptr, i64 0, i32 0, i32 %tid
  %load = load <8 x i8>, ptr addrspace(1) %in.gep
  %shuff = shufflevector <8 x i8> %load, <8 x i8> poison, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
  %cast = sitofp <8 x i8> %shuff to <8 x half>
  store <8 x half> %cast, ptr addrspace(1) %out

; GFX11-TRUE16-NEXT:    global_load_b64 v[1:2], v0, s[2:3]
; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
; GFX11-TRUE16-NEXT:    v_bfe_i32 v5, v2, 0, 8
; GFX11-TRUE16-NEXT:    v_bfe_i32 v4, v1, 0, 8
; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v6, 24, v2
; GFX11-TRUE16-NEXT:    v_ashrrev_i16 v0.l, 8, v1.l
; GFX11-TRUE16-NEXT:    v_bfe_i32 v7, v3, 0, 8
; GFX11-TRUE16-NEXT:    v_ashrrev_i16 v0.h, 8, v2.l
; GFX11-TRUE16-NEXT:    v_ashrrev_i32_e32 v2, 24, v1
; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, v5.l
; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v4.l
; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v7.l
; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v0.l, v0.l
; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v0.h, v0.h
; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v2.l, v2.l
; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v4.h, v6.l
; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v1.l, v1.l
; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v2.h, v3.l
; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v3.l, v4.l
; GFX11-TRUE16-NEXT:    v_cvt_f16_i16_e32 v4.l, v5.l
; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v5, 0
; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v1, v0.h, v1.l
; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v2, v2.l, v2.h
; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v3, v0.l, v3.l
; GFX11-TRUE16-NEXT:    v_pack_b32_f16 v0, v4.h, v4.l
; GFX11-TRUE16-NEXT:    global_store_b128 v5, v[0:3], s[0:1]

In the output, v0.h holds output i8 element 1, which is expected to be i8 element 6 of the input.

However, the instruction chain is:

v_bfe_i32 v5, v2, 0, 8
v_cvt_f16_i16_e32 v4.l, v5.l
v_pack_b32_f16 v0, v4.h, v4.l

This is equivalent to v0.h = v2 & 0xff, which is i8 element 4 of the input.

llvm/test/CodeGen/AMDGPU/sext-in-reg.ll

llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffer.ll

Sisyph

LGTM. Please revisit the issue to look for a legalizer based solution when possible.

broxigarchen changed the title ~~Main merge true16 fix bfe~~ [AMDGPU][True16][CodeGen] sext i16 inreg in true16 mode Jun 13, 2025

broxigarchen force-pushed the main-merge-true16-fix-bfe branch 2 times, most recently from e1b5438 to c7fe296 Compare June 13, 2025 14:35

broxigarchen marked this pull request as ready for review June 13, 2025 16:16

llvmbot added the backend:AMDGPU label Jun 13, 2025

broxigarchen requested review from arsenm, Sisyph and kosarev June 13, 2025 18:53

Sisyph reviewed Jun 13, 2025

View reviewed changes

Sisyph reviewed Jun 17, 2025

View reviewed changes

fix bfe for true16 mode

9d7007d

broxigarchen force-pushed the main-merge-true16-fix-bfe branch from c7fe296 to fb0551d Compare June 17, 2025 22:02

broxigarchen force-pushed the main-merge-true16-fix-bfe branch from fb0551d to e8b1d31 Compare June 18, 2025 14:46

Sisyph reviewed Jun 18, 2025

View reviewed changes

llvm/test/CodeGen/AMDGPU/sext-in-reg.ll Outdated Show resolved Hide resolved

llvm/test/CodeGen/AMDGPU/sext-in-reg-vector-shuffer.ll Outdated Show resolved Hide resolved

update test

ccb669a

broxigarchen force-pushed the main-merge-true16-fix-bfe branch from e8b1d31 to ccb669a Compare June 18, 2025 15:10

Sisyph approved these changes Jun 18, 2025

View reviewed changes

broxigarchen merged commit 9da9d32 into llvm:main Jun 18, 2025
6 of 7 checks passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AMDGPU][True16][CodeGen] sext i16 inreg in true16 mode #144024

[AMDGPU][True16][CodeGen] sext i16 inreg in true16 mode #144024

Uh oh!

broxigarchen commented Jun 13, 2025 •

edited

Loading

Uh oh!

llvmbot commented Jun 13, 2025

Uh oh!

Sisyph Jun 13, 2025

Uh oh!

Sisyph Jun 13, 2025

Uh oh!

jayfoad Jun 16, 2025

Uh oh!

broxigarchen Jun 16, 2025 •

edited

Loading

Uh oh!

broxigarchen Jun 16, 2025 •

edited

Loading

Uh oh!

broxigarchen Jun 17, 2025 •

edited

Loading

Uh oh!

arsenm Jun 19, 2025

Uh oh!

Sisyph left a comment

Uh oh!

broxigarchen commented Jun 17, 2025 •

edited

Loading

Uh oh!

broxigarchen commented Jun 18, 2025 •

edited

Loading

Uh oh!

Uh oh!

Uh oh!

Sisyph left a comment

Uh oh!

Uh oh!

Uh oh!

[AMDGPU][True16][CodeGen] sext i16 inreg in true16 mode #144024

[AMDGPU][True16][CodeGen] sext i16 inreg in true16 mode #144024

Uh oh!

Conversation

broxigarchen commented Jun 13, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Jun 13, 2025

Uh oh!

Sisyph Jun 13, 2025

Choose a reason for hiding this comment

Uh oh!

Sisyph Jun 13, 2025

Choose a reason for hiding this comment

Uh oh!

jayfoad Jun 16, 2025

Choose a reason for hiding this comment

Uh oh!

broxigarchen Jun 16, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

broxigarchen Jun 16, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

broxigarchen Jun 17, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

arsenm Jun 19, 2025

Choose a reason for hiding this comment

Uh oh!

Sisyph left a comment

Choose a reason for hiding this comment

Uh oh!

broxigarchen commented Jun 17, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

broxigarchen commented Jun 18, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Sisyph left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

broxigarchen commented Jun 13, 2025 •

edited

Loading

broxigarchen Jun 16, 2025 •

edited

Loading

broxigarchen Jun 16, 2025 •

edited

Loading

broxigarchen Jun 17, 2025 •

edited

Loading

broxigarchen commented Jun 17, 2025 •

edited

Loading

broxigarchen commented Jun 18, 2025 •

edited

Loading