Skip to content

AMDGPU: Update LIT tests for i1-to-bf16 conversions (NFC) #130916

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 12, 2025

Conversation

changpeng
Copy link
Contributor

Add tests for a few other targets, and with SGPR input/output, based on requests from PR:
#130831

  Add tests for a few other targets, and with SGPR input/output,
based on requests from PR:
llvm#130831
@llvmbot
Copy link
Member

llvmbot commented Mar 12, 2025

@llvm/pr-subscribers-backend-amdgpu

Author: Changpeng Fang (changpeng)

Changes

Add tests for a few other targets, and with SGPR input/output, based on requests from PR:
#130831


Patch is 103.86 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/130916.diff

1 Files Affected:

  • (modified) llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll (+2068-251)
diff --git a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
index e52ea79857b48..59b0ba2469a20 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
@@ -1,292 +1,2109 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx942 | FileCheck %s -check-prefix=GFX942
+; RUN: llc < %s -mtriple=amdgcn -mcpu=hawaii | FileCheck %s -check-prefix=GFX7
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx942 | FileCheck %s -check-prefix=GFX9
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck %s -check-prefix=GFX11
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefix=GFX12
 
 define bfloat @v_uitofp_i1_to_bf16(i1 %num) {
-; GFX942-LABEL: v_uitofp_i1_to_bf16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    s_movk_i32 s0, 0x7fff
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
-; GFX942-NEXT:    v_bfe_u32 v1, v0, 16, 1
-; GFX942-NEXT:    v_add3_u32 v1, v1, v0, s0
-; GFX942-NEXT:    v_or_b32_e32 v2, 0x400000, v0
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_uitofp_i1_to_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uitofp_i1_to_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX9-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GFX9-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_i1_to_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX11-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_uitofp_i1_to_bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX12-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_add3_u32 v1, v1, v0, 0x7fff
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc_lo
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp i1 %num to bfloat
   ret bfloat %op
 }
 
+define amdgpu_ps i32 @s_uitofp_i1_to_bf16(i1 inreg %num) {
+; GFX7-LABEL: s_uitofp_i1_to_bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX7-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX7-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_uitofp_i1_to_bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_bfe_u32 s1, s0, 0x10010
+; GFX9-NEXT:    s_or_b32 s2, s0, 0x400000
+; GFX9-NEXT:    s_add_i32 s0, s1, s0
+; GFX9-NEXT:    s_add_i32 s3, s0, 0x7fff
+; GFX9-NEXT:    s_and_b64 s[0:1], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s0, s2, s3
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_uitofp_i1_to_bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    s_bfe_u32 s1, s0, 0x10010
+; GFX11-NEXT:    s_add_i32 s1, s1, s0
+; GFX11-NEXT:    s_bitset1_b32 s0, 22
+; GFX11-NEXT:    s_addk_i32 s1, 0x7fff
+; GFX11-NEXT:    s_and_b32 s2, vcc_lo, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, s0, s1
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: s_uitofp_i1_to_bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_bitcmp1_b32 s0, 0
+; GFX12-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s0
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    s_bfe_u32 s1, s0, 0x10010
+; GFX12-NEXT:    s_or_b32 s2, s0, 0x400000
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s0
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_addk_co_i32 s1, 0x7fff
+; GFX12-NEXT:    s_cmp_u_f32 s0, s0
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_cselect_b32 s0, s2, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    ; return to shader part epilog
+  %op = uitofp i1 %num to bfloat
+  %b16 = bitcast bfloat %op to i16
+  %b32 = zext i16 %b16 to i32
+  ret i32 %b32
+}
+
 define <2 x bfloat> @v_uitofp_v2i1_to_v2bf16(<2 x i1> %num) {
-; GFX942-LABEL: v_uitofp_v2i1_to_v2bf16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    s_movk_i32 s0, 0x7fff
-; GFX942-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
-; GFX942-NEXT:    v_bfe_u32 v2, v0, 16, 1
-; GFX942-NEXT:    v_add3_u32 v2, v2, v0, s0
-; GFX942-NEXT:    v_or_b32_e32 v3, 0x400000, v0
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, vcc
-; GFX942-NEXT:    v_bfe_u32 v2, v1, 16, 1
-; GFX942-NEXT:    v_add3_u32 v2, v2, v1, s0
-; GFX942-NEXT:    v_or_b32_e32 v3, 0x400000, v1
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX942-NEXT:    s_mov_b32 s0, 0x7060302
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
-; GFX942-NEXT:    v_perm_b32 v0, v1, v0, s0
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_uitofp_v2i1_to_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT:    v_and_b32_e32 v1, 0x7fff0000, v1
+; GFX7-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_uitofp_v2i1_to_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX9-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX9-NEXT:    v_add3_u32 v2, v2, v0, s0
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    s_nop 1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, vcc
+; GFX9-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX9-NEXT:    v_add3_u32 v2, v2, v1, s0
+; GFX9-NEXT:    v_or_b32_e32 v3, 0x400000, v1
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    s_mov_b32 s0, 0x7060302
+; GFX9-NEXT:    s_nop 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: v_uitofp_v2i1_to_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX11-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX11-NEXT:    v_or_b32_e32 v4, 0x400000, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX11-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX11-NEXT:    v_or_b32_e32 v5, 0x400000, v1
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX11-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX12-LABEL: v_uitofp_v2i1_to_v2bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT:    s_wait_expcnt 0x0
+; GFX12-NEXT:    s_wait_samplecnt 0x0
+; GFX12-NEXT:    s_wait_bvhcnt 0x0
+; GFX12-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo
+; GFX12-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX12-NEXT:    v_or_b32_e32 v4, 0x400000, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_add3_u32 v2, v2, v0, 0x7fff
+; GFX12-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX12-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, vcc_lo
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v0, v0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX12-NEXT:    v_bfe_u32 v3, v1, 16, 1
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc_lo
+; GFX12-NEXT:    v_or_b32_e32 v5, 0x400000, v1
+; GFX12-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX12-NEXT:    v_add3_u32 v3, v3, v1, 0x7fff
+; GFX12-NEXT:    s_wait_alu 0xfffd
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
+; GFX12-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
+; GFX12-NEXT:    s_setpc_b64 s[30:31]
   %op = uitofp <2 x i1> %num to <2 x bfloat>
   ret <2 x bfloat> %op
 }
 
+define amdgpu_ps <2 x i32> @s_uitofp_v2i1_to_v2bf16(<2 x i1> inreg %num) {
+; GFX7-LABEL: s_uitofp_v2i1_to_v2bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_and_b32 s2, 1, s0
+; GFX7-NEXT:    s_bitcmp1_b32 s1, 0
+; GFX7-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX7-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
+; GFX7-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX7-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
+; GFX7-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX7-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX7-NEXT:    ; return to shader part epilog
+;
+; GFX9-LABEL: s_uitofp_v2i1_to_v2bf16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_and_b32 s2, 1, s0
+; GFX9-NEXT:    s_bitcmp1_b32 s1, 0
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s[0:1]
+; GFX9-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[0:1]
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX9-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX9-NEXT:    s_bfe_u32 s1, s0, 0x10010
+; GFX9-NEXT:    s_or_b32 s2, s0, 0x400000
+; GFX9-NEXT:    s_add_i32 s0, s1, s0
+; GFX9-NEXT:    s_add_i32 s3, s0, 0x7fff
+; GFX9-NEXT:    s_and_b64 s[0:1], vcc, exec
+; GFX9-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX9-NEXT:    s_cselect_b32 s0, s2, s3
+; GFX9-NEXT:    s_bfe_u32 s2, s1, 0x10010
+; GFX9-NEXT:    s_or_b32 s4, s1, 0x400000
+; GFX9-NEXT:    s_add_i32 s1, s2, s1
+; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX9-NEXT:    s_addk_i32 s1, 0x7fff
+; GFX9-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX9-NEXT:    s_and_b64 s[2:3], vcc, exec
+; GFX9-NEXT:    s_cselect_b32 s1, s4, s1
+; GFX9-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX9-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: s_uitofp_v2i1_to_v2bf16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_and_b32 s0, 1, s0
+; GFX11-NEXT:    s_bitcmp1_b32 s1, 0
+; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX11-NEXT:    s_cmp_eq_u32 s0, 1
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s1
+; GFX11-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s0
+; GFX11-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX11-NEXT:    v_cmp_u_f32_e64 s1, v0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX11-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
+; GFX11-NEXT:    s_bfe_u32 s3, s0, 0x10010
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_add_i32 s3, s3, s0
+; GFX11-NEXT:    s_bitset1_b32 s0, 22
+; GFX11-NEXT:    s_addk_i32 s3, 0x7fff
+; GFX11-NEXT:    s_and_b32 s4, vcc_lo, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s0, s0, s3
+; GFX11-NEXT:    s_bfe_u32 s3, s2, 0x10010
+; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX11-NEXT:    s_add_i32 s3, s3, s2
+; GFX11-NEXT:    s_bitset1_b32 s2, 22
+; GFX11-NEXT:    s_addk_i32 s3, 0x7fff
+; GFX11-NEXT:    s_and_b32 s1, s1, exec_lo
+; GFX11-NEXT:    s_cselect_b32 s1, s2, s3
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX12-LABEL: s_uitofp_v2i1_to_v2bf16:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_and_b32 s0, 1, s0
+; GFX12-NEXT:    s_bitcmp1_b32 s1, 0
+; GFX12-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX12-NEXT:    s_cmp_eq_u32 s0, 1
+; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, s1
+; GFX12-NEXT:    s_cselect_b32 s0, -1, 0
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s0
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX12-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT:    s_bfe_u32 s1, s0, 0x10010
+; GFX12-NEXT:    s_or_b32 s3, s0, 0x400000
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s0
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_addk_co_i32 s1, 0x7fff
+; GFX12-NEXT:    s_cmp_u_f32 s0, s0
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_cselect_b32 s0, s3, s1
+; GFX12-NEXT:    s_bfe_u32 s1, s2, 0x10010
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX12-NEXT:    s_add_co_i32 s1, s1, s2
+; GFX12-NEXT:    s_or_b32 s3, s2, 0x400000
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_addk_co_i32 s1, 0x7fff
+; GFX12-NEXT:    s_cmp_u_f32 s2, s2
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX12-NEXT:    s_wait_alu 0xfffe
+; GFX12-NEXT:    ; return to shader part epilog
+  %op = uitofp <2 x i1> %num to <2 x bfloat>
+  %b16 = bitcast <2 x bfloat> %op to <2 x i16>
+  %b32 = zext <2 x i16> %b16 to <2 x i32>
+  ret <2 x i32> %b32
+}
+
 define <3 x bfloat> @v_uitofp_v3i1_to_v3bf16(<3 x i1> %num) {
-; GFX942-LABEL: v_uitofp_v3i1_to_v3bf16:
-; GFX942:       ; %bb.0:
-; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX942-NEXT:    s_movk_i32 s0, 0x7fff
-; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, vcc
-; GFX942-NEXT:    v_bfe_u32 v3, v2, 16, 1
-; GFX942-NEXT:    v_add3_u32 v3, v3, v2, s0
-; GFX942-NEXT:    v_or_b32_e32 v4, 0x400000, v2
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
-; GFX942-NEXT:    v_and_b32_e32 v1, 1, v1
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
-; GFX942-NEXT:    v_bfe_u32 v3, v0, 16, 1
-; GFX942-NEXT:    v_add3_u32 v3, v3, v0, s0
-; GFX942-NEXT:    v_or_b32_e32 v4, 0x400000, v0
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
-; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
-; GFX942-NEXT:    s_nop 1
-; GFX942-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, vcc
-; GFX942-NEXT:    v_bfe_u32 v3, v1, 16, 1
-; GFX942-NEXT:    v_add3_u32 v3, v3, v1, s0
-; GFX942-NEXT:    v_or_b32_e32 v4, 0x400000, v1
-; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
-; GFX942-NEXT:    s_mov_b32 s0, 0x7060302
-; GFX942-NEXT:    s_nop 0
-; GFX942-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX942-NEXT:    v_perm_b32 v0, v1, v0, s0
-; GFX942-NEXT:    v_alignbit_b32 v1, s0, v2, 16
-; GFX942-NEXT:    s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_uitofp_v3i1_to_v3bf16:
+; GFX7:       ; %bb.0:
+; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX7-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX7-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, vcc
+; GFX7-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7-NEXT:    v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX7-NEXT:    v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT:    v...
[truncated]

@changpeng changpeng merged commit c86d884 into llvm:main Mar 12, 2025
8 of 11 checks passed
@changpeng changpeng deleted the lit branch March 12, 2025 07:27
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants