[AMDGPU][True16][CodeGen] build_vector pattern in true16 #118904

broxigarchen · 2024-12-06T00:43:34Z

build_vector pattern in true16 SDAG

llvmbot · 2024-12-06T00:44:33Z

@llvm/pr-subscribers-backend-amdgpu

Author: Brox Chen (broxigarchen)

Changes

build_vector pattern in true16

Patch is 71.48 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/118904.diff

4 Files Affected:

(modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+14-1)
(modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+20)
(modified) llvm/test/CodeGen/AMDGPU/bf16.ll (+389-397)
(modified) llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll (+17-65)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 3be865f03df1fd..707ad0caa83042 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -700,9 +700,22 @@ bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR(MachineInstr &MI) const {
     return true;
 
   // TODO: This should probably be a combine somewhere
-  // (build_vector $src0, undef)  -> copy $src0
   MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
   if (Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
+    if (Subtarget->useRealTrue16Insts() && IsVector) {
+      // (vecTy (DivergentBinFrag<build_vector> Ty:$src0, (Ty undef))),
+      // -> (vecTy (INSERT_SUBREG (IMPLICIT_DEF), VGPR_16:$src0, lo16))
+      Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      BuildMI(*BB, &MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
+      BuildMI(*BB, &MI, DL, TII.get(TargetOpcode::INSERT_SUBREG), Dst)
+          .addReg(Undef)
+          .addReg(Src0)
+          .addImm(AMDGPU::lo16);
+      MI.eraseFromParent();
+      return RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) &&
+             RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_16RegClass, *MRI);
+    }
+    // (build_vector $src0, undef)  -> copy $src0
     MI.setDesc(TII.get(AMDGPU::COPY));
     MI.removeOperand(2);
     const auto &RC =
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bc25d75131cc35..c822a3ea0d8667 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3275,6 +3275,8 @@ def : GCNPat <
   (COPY_TO_REGCLASS SReg_32:$src0, SReg_32)
 >;
 
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in {
 def : GCNPat <
   (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$src0), (Ty undef))),
   (COPY_TO_REGCLASS VGPR_32:$src0, VGPR_32)
@@ -3284,6 +3286,7 @@ def : GCNPat <
   (vecTy (UniformBinFrag<build_vector> (Ty undef), (Ty SReg_32:$src1))),
   (S_LSHL_B32 SReg_32:$src1, (i32 16))
 >;
+}
 
 def : GCNPat <
   (vecTy (DivergentBinFrag<build_vector> (Ty undef), (Ty VGPR_32:$src1))),
@@ -3293,6 +3296,8 @@ def : GCNPat <
 }
 
 let SubtargetPredicate = HasVOP3PInsts in {
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
 def : GCNPat <
   (v2i16 (DivergentBinFrag<build_vector> (i16 VGPR_32:$src0), (i16 VGPR_32:$src1))),
   (v2i16 (V_LSHL_OR_B32_e64 $src1, (i32 16), (i32 (V_AND_B32_e64 (i32 (V_MOV_B32_e32 (i32 0xffff))), $src0))))
@@ -3322,12 +3327,25 @@ def : GCNPat <
   (S_PACK_LL_B32_B16 SReg_32:$src0, SReg_32:$src1)
 >;
 
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
 // Take the lower 16 bits from each VGPR_32 and concat them
 def : GCNPat <
   (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_32:$a), (Ty VGPR_32:$b))),
   (V_PERM_B32_e64 VGPR_32:$b, VGPR_32:$a, (S_MOV_B32 (i32 0x05040100)))
 >;
 
+let True16Predicate = UseRealTrue16Insts in {
+def : GCNPat <
+  (vecTy (DivergentBinFrag<build_vector> (Ty VGPR_16:$a), (Ty VGPR_16:$b))),
+  (REG_SEQUENCE VGPR_32, VGPR_16:$a, lo16, VGPR_16:$b, hi16)
+>;
+// GISel ignores this Pat, but the equivalent is done in selectG_BUILD_VECTOR
+def : GCNPat <
+  (vecTy (build_vector (Ty VGPR_16:$src0), (Ty undef))),
+  (REG_SEQUENCE VGPR_32, $src0, lo16, (IMPLICIT_DEF), hi16)
+>;
+}
 
 // Take the lower 16 bits from V[0] and the upper 16 bits from V[1]
 // Special case, can use V_BFI (0xffff literal likely more reusable than 0x70601000)
@@ -3353,6 +3371,8 @@ def : GCNPat <
 
 // Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
 // Special case, can use V_ALIGNBIT (always uses encoded literal)
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
 def : GCNPat <
   (vecTy (DivergentBinFrag<build_vector>
     (Ty !if(!eq(Ty, i16),
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 9a647f04d43da6..230a62f447938c 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -9341,7 +9341,7 @@ define <3 x bfloat> @v_fadd_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_fadd_v3bf16:
@@ -13109,7 +13109,7 @@ define <3 x bfloat> @v_fsub_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_fsub_v3bf16:
@@ -13778,7 +13778,7 @@ define <3 x bfloat> @v_fmul_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_fmul_v3bf16:
@@ -17856,7 +17856,7 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_minnum_v3bf16:
@@ -21475,7 +21475,7 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v6, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_maxnum_v3bf16:
@@ -28670,8 +28670,8 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16:
@@ -28774,9 +28774,9 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16:
@@ -28897,18 +28897,18 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 0xffff0000, v0
-; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v3, 0xffff0000, v1
 ; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX11TRUE16-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v2, v2
-; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v1, v1
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x5040100
-; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v1, 0x5040100
+; GFX11TRUE16-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v2.l
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.h, v3.l
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16:
@@ -30480,8 +30480,8 @@ define <3 x bfloat> @v_sitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_sitofp_v3i16_to_v3bf16:
@@ -31001,7 +31001,7 @@ define <3 x bfloat> @v_sitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v2, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_sitofp_v3i32_to_v3bf16:
@@ -31921,7 +31921,7 @@ define <3 x bfloat> @v_sitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_sitofp_v3i64_to_v3bf16:
@@ -32755,7 +32755,7 @@ define <3 x bfloat> @v_uitofp_v3i16_to_v3bf16(<3 x i16> %x) {
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v2, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_uitofp_v3i16_to_v3bf16:
@@ -33278,7 +33278,7 @@ define <3 x bfloat> @v_uitofp_v3i32_to_v3bf16(<3 x i32> %x) {
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v6, vcc_lo
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v2, 16
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_uitofp_v3i32_to_v3bf16:
@@ -34047,8 +34047,8 @@ define <3 x bfloat> @v_uitofp_v3i64_to_v3bf16(<3 x i64> %x) {
 ; GFX11TRUE16-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc_lo
 ; GFX11TRUE16-NEXT:    v_perm_b32 v0, v2, v0, 0x7060302
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_alignbit_b32 v1, v0, v1, 16
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_uitofp_v3i64_to_v3bf16:
@@ -34745,9 +34745,9 @@ define <2 x bfloat> @v_select_v2bf16(i1 %cond, <2 x bfloat> %a, <2 x bfloat> %b)
 ; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT:    v_dual_cndmask_b32 v0, v2, v1 :: v_dual_cndmask_b32 v1, v3, v4
+; GFX11TRUE16-NEXT:    v_dual_cndmask_b32 v3, v3, v4 :: v_dual_cndmask_b32 v0, v2, v1
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v3.l
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_select_v2bf16:
@@ -34851,13 +34851,14 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
 ; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
 ; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_2)
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11TRUE16-NEXT:    v_dual_cndmask_b32 v0, v3, v2 :: v_dual_and_b32 v1, 1, v1
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_vselect_v2bf16:
@@ -35034,14 +35035,14 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, s2
 ; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, s3
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, s1
 ; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s0
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11TRUE16-NEXT:    v_dual_cndmask_b32 v0, v1, v2 :: v_dual_cndmask_b32 v1, v3, v4
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.h, v0.l
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX11TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX11FAKE16-LABEL: s_select_v2bf16:
@@ -35148,19 +35149,19 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ;
 ; GFX11TRUE16-LABEL: s_vselect_v2bf16:
 ; GFX11TRUE16:       ; %bb.0:
-; GFX11TRUE16-NEXT:    s_lshr_b32 s2, s1, 16
-; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s0, 16
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, s2
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, s3
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s1
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, s0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.l, s1
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v3.l, s0
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v4, v5, vcc_lo
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
+; GFX11TRUE16-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s1, 16
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, s2
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s3
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v4, v5, vcc_lo
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11TRUE16-NEXT:    ; return to shader part epilog
 ;
@@ -36868,33 +36869,32 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
 ;
 ; GFX11TRUE16-LABEL: s_vselect_v4bf16:
 ; GFX11TRUE16:       ; %bb.0:
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v6.l, s3
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v7.l, s1
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX11TRUE16-NEXT:    s_lshr_b32 s4, s3, 16
 ; GFX11TRUE16-NEXT:    s_lshr_b32 s5, s1, 16
+; GFX11TRUE16-NEXT:    s_lshr_b32 s1, s2, 16
+; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.l, s1
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v9.l, s3
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v6, v7, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v10.l, s2
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v11.l, s0
 ; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s4
 ; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, s5
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v8, v9, vcc_lo
+; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v10, v11, vcc_lo
 ; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX11TRUE16-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v7.l, s2
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.l, s0
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v6.l, s4
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc_lo
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v4.l, s3
-; GFX11TRUE16-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v5.l, s3
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v9.l, s1
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v0, v7, v8, vcc_lo
-; GFX11TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v2, v4, v9, vcc_lo
-; GFX11TRUE16-NEXT:    v_perm_b32 v0, v1, v0, 0x5040100
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_perm_b32 v1, v3, v2, 0x5040100
 ; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11TRUE16-NEXT:    v_mov_b16_e32 v2.h, v3.l
+; GFX11TRUE16-NEXT:    v_readfirstlane_b32 s1, v2
 ; GFX11TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX11FAKE16-LABEL: s_vselect_v4bf16:
@@ -37070,29 +37070,28 @@ define <4 x bfloat> @v_vselect_v4bf16(<4 x i1> %cond, <4 x bfloat> %a, <4 x bflo
 ; GFX11TRUE16-LABEL: v_vselect_v4bf16:
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v8, 16, v7
+; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
 ; GFX11TRUE16-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v8.l, v7.l
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v9.l, v5.l
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
+; GFX11TR...
[truncated]

arsenm · 2024-12-06T14:53:31Z

llvm/lib/Target/AMDGPU/SIInstructions.td

+  (vecTy (build_vector (Ty VGPR_16:$src0), (Ty undef))),
+  (REG_SEQUENCE VGPR_32, $src0, lo16, (IMPLICIT_DEF), hi16)


I would hope these turn into scalar_to_vector? Or do we go the other way? I don't remember

Hi Matt. I think these turn two vgpr16 into a v2f16. Does it anwer your question?

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

llvm/lib/Target/AMDGPU/SIInstructions.td

broxigarchen · 2025-02-04T16:06:02Z

sorry for the delay. This PR should be ready for a another round of review. Thanks!

Sisyph · 2025-02-10T22:12:05Z

llvm/lib/Target/AMDGPU/SIInstructions.td

@@ -3424,6 +3441,8 @@ def : GCNPat <

 // Take the upper 16 bits from V[0] and the lower 16 bits from V[1]
 // Special case, can use V_ALIGNBIT (always uses encoded literal)
+foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
+let True16Predicate = p in
 def : GCNPat <
  (vecTy (DivergentBinFrag<build_vector>
    (Ty !if(!eq(Ty, i16),


Perhaps the PERM_B32 pat at new line 3456 should also be disabled in Real True16 mode? It seems redundant with the new REG_SEQUENCE pat.

Took a closer look at this. I think these vector patch all might be moving to just fake16/non-16 mode. probably get a patch up in downstream first

I checked the other pattern, for now just removing them from true16, the codegen does work and select the reg_sequence, but it generates additonal right shifts which is worse than the old pattern right now.

I think there exist better selections but it needs a bit more work. This might involving adding more
true16 pattern and some bitcast pattern.

We could merge this patch first and then follow up on this later

broxigarchen · 2025-02-20T18:10:38Z

ping!

broxigarchen · 2025-02-20T20:04:08Z

rebased

Sisyph

LGTM

llvm-ci · 2025-02-21T19:07:29Z

LLVM Buildbot has detected a new failure on builder lldb-x86_64-debian running on lldb-x86_64-debian while building llvm at step 6 "test".

Full details are available at: https://lab.llvm.org/buildbot/#/builders/162/builds/16653

Here is the relevant piece of the build log for the reference

Step 6 (test) failure: build (failure)
...
PASS: lldb-api :: functionalities/data-formatter/data-formatter-stl/generic/unordered/TestDataFormatterGenericUnordered.py (113 of 2786)
PASS: lldb-api :: tools/lldb-server/libraries-svr4/TestGdbRemoteLibrariesSvr4Support.py (114 of 2786)
PASS: lldb-api :: qemu/TestQemuLaunch.py (115 of 2786)
PASS: lldb-api :: functionalities/thread/step_until/TestStepUntilAPI.py (116 of 2786)
PASS: lldb-api :: functionalities/data-formatter/data-formatter-stl/generic/multiset/TestDataFormatterGenericMultiSet.py (117 of 2786)
PASS: lldb-api :: commands/expression/nested/TestNestedExpressions.py (118 of 2786)
PASS: lldb-api :: commands/expression/ir-interpreter/TestIRInterpreter.py (119 of 2786)
PASS: lldb-api :: symbol_ondemand/shared_library/TestSharedLibOnDemand.py (120 of 2786)
PASS: lldb-api :: tools/lldb-server/TestGdbRemoteRegisterState.py (121 of 2786)
PASS: lldb-api :: tools/lldb-server/TestGdbRemote_qThreadStopInfo.py (122 of 2786)
FAIL: lldb-api :: functionalities/thread/thread_specific_break_plus_condition/TestThreadSpecificBpPlusCondition.py (123 of 2786)
******************** TEST 'lldb-api :: functionalities/thread/thread_specific_break_plus_condition/TestThreadSpecificBpPlusCondition.py' FAILED ********************
Script:
--
/usr/bin/python3 /home/worker/2.0.1/lldb-x86_64-debian/llvm-project/lldb/test/API/dotest.py -u CXXFLAGS -u CFLAGS --env LLVM_LIBS_DIR=/home/worker/2.0.1/lldb-x86_64-debian/build/./lib --env LLVM_INCLUDE_DIR=/home/worker/2.0.1/lldb-x86_64-debian/build/include --env LLVM_TOOLS_DIR=/home/worker/2.0.1/lldb-x86_64-debian/build/./bin --arch x86_64 --build-dir /home/worker/2.0.1/lldb-x86_64-debian/build/lldb-test-build.noindex --lldb-module-cache-dir /home/worker/2.0.1/lldb-x86_64-debian/build/lldb-test-build.noindex/module-cache-lldb/lldb-api --clang-module-cache-dir /home/worker/2.0.1/lldb-x86_64-debian/build/lldb-test-build.noindex/module-cache-clang/lldb-api --executable /home/worker/2.0.1/lldb-x86_64-debian/build/./bin/lldb --compiler /home/worker/2.0.1/lldb-x86_64-debian/build/./bin/clang --dsymutil /home/worker/2.0.1/lldb-x86_64-debian/build/./bin/dsymutil --make /usr/bin/gmake --llvm-tools-dir /home/worker/2.0.1/lldb-x86_64-debian/build/./bin --lldb-obj-root /home/worker/2.0.1/lldb-x86_64-debian/build/tools/lldb --lldb-libs-dir /home/worker/2.0.1/lldb-x86_64-debian/build/./lib -t /home/worker/2.0.1/lldb-x86_64-debian/llvm-project/lldb/test/API/functionalities/thread/thread_specific_break_plus_condition -p TestThreadSpecificBpPlusCondition.py
--
Exit Code: 1

Command Output (stdout):
--
lldb version 21.0.0git (https://github.com/llvm/llvm-project.git revision c896f7bdaa0909f7dbc554c28f814a7ef76d34f0)
  clang revision c896f7bdaa0909f7dbc554c28f814a7ef76d34f0
  llvm revision c896f7bdaa0909f7dbc554c28f814a7ef76d34f0
Skipping the following test categories: ['libc++', 'dsym', 'gmodules', 'debugserver', 'objc']

--
Command Output (stderr):
--
Change dir to: /home/worker/2.0.1/lldb-x86_64-debian/llvm-project/lldb/test/API/functionalities/thread/thread_specific_break_plus_condition
UNSUPPORTED: LLDB (/home/worker/2.0.1/lldb-x86_64-debian/build/bin/clang-x86_64) :: test_python_dsym (TestThreadSpecificBpPlusCondition.ThreadSpecificBreakPlusConditionTestCase.test_python_dsym) (test case does not fall in any category of interest for this run) 
runCmd: settings clear -all

output: 

runCmd: settings set symbols.enable-external-lookup false

output: 

runCmd: settings set target.inherit-tcc true

output: 

runCmd: settings set target.disable-aslr false

output: 

runCmd: settings set target.detach-on-error false

output:

broxigarchen changed the title ~~[AMDGPU][True16][CodeGen] buildvector pattern in true16~~ [AMDGPU][True16][CodeGen] build_vector pattern in true16 Dec 6, 2024

broxigarchen marked this pull request as ready for review December 6, 2024 00:44

broxigarchen requested a review from rampitec December 6, 2024 00:44

llvmbot added the backend:AMDGPU label Dec 6, 2024

broxigarchen requested review from arsenm and Sisyph December 6, 2024 00:44

arsenm reviewed Dec 6, 2024

View reviewed changes

jayfoad reviewed Dec 10, 2024

View reviewed changes

llvm/lib/Target/AMDGPU/SIInstructions.td Show resolved Hide resolved

jayfoad reviewed Dec 10, 2024

View reviewed changes

llvm/lib/Target/AMDGPU/SIInstructions.td Outdated Show resolved Hide resolved

broxigarchen force-pushed the main-merge-true16-codegen-buildvector branch 2 times, most recently from acb65ba to b9bbb43 Compare February 3, 2025 23:23

broxigarchen requested review from jayfoad and arsenm February 3, 2025 23:36

Sisyph reviewed Feb 10, 2025

View reviewed changes

broxigarchen requested a review from Sisyph February 19, 2025 17:43

broxigarchen added 2 commits February 20, 2025 13:16

buildvector pattern in True16

fd13edf

patch2

18a457b

broxigarchen force-pushed the main-merge-true16-codegen-buildvector branch from b9bbb43 to 18a457b Compare February 20, 2025 20:03

update test with rebase

3466665

Sisyph approved these changes Feb 20, 2025

View reviewed changes

broxigarchen merged commit c896f7b into llvm:main Feb 21, 2025
11 checks passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[AMDGPU][True16][CodeGen] build_vector pattern in true16 #118904

[AMDGPU][True16][CodeGen] build_vector pattern in true16 #118904

Uh oh!

broxigarchen commented Dec 6, 2024 •

edited

Loading

Uh oh!

llvmbot commented Dec 6, 2024

Uh oh!

arsenm Dec 6, 2024

Uh oh!

broxigarchen Feb 20, 2025

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

broxigarchen commented Feb 4, 2025

Uh oh!

Sisyph Feb 10, 2025

Uh oh!

broxigarchen Feb 18, 2025

Uh oh!

broxigarchen Feb 19, 2025 •

edited

Loading

Uh oh!

broxigarchen commented Feb 20, 2025

Uh oh!

broxigarchen commented Feb 20, 2025

Uh oh!

Sisyph left a comment

Uh oh!

Uh oh!

llvm-ci commented Feb 21, 2025

Uh oh!

Uh oh!

		(vecTy (build_vector (Ty VGPR_16:$src0), (Ty undef))),
		(REG_SEQUENCE VGPR_32, $src0, lo16, (IMPLICIT_DEF), hi16)

[AMDGPU][True16][CodeGen] build_vector pattern in true16 #118904

[AMDGPU][True16][CodeGen] build_vector pattern in true16 #118904

Uh oh!

Conversation

broxigarchen commented Dec 6, 2024 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

llvmbot commented Dec 6, 2024

Uh oh!

arsenm Dec 6, 2024

Choose a reason for hiding this comment

Uh oh!

broxigarchen Feb 20, 2025

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

broxigarchen commented Feb 4, 2025

Uh oh!

Sisyph Feb 10, 2025

Choose a reason for hiding this comment

Uh oh!

broxigarchen Feb 18, 2025

Choose a reason for hiding this comment

Uh oh!

broxigarchen Feb 19, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Choose a reason for hiding this comment

Uh oh!

broxigarchen commented Feb 20, 2025

Uh oh!

broxigarchen commented Feb 20, 2025

Uh oh!

Sisyph left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

llvm-ci commented Feb 21, 2025

Uh oh!

Uh oh!

broxigarchen commented Dec 6, 2024 •

edited

Loading

broxigarchen Feb 19, 2025 •

edited

Loading