-
Notifications
You must be signed in to change notification settings - Fork 14.3k
AMDGPU: Implement i1 to bfloat conversion #130831
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
We are using the same approach as the conversion of other integer types to bfloat: i1 --> f32 and f32 --> bf16. Refer to LowerUINT_TO_FP and LowerSINT_TO_FP in AMDGPUTargetLowering.cpp for details.
@llvm/pr-subscribers-backend-amdgpu Author: Changpeng Fang (changpeng) Changes: We are using the same approach as the conversion of other integer types to bfloat: i1 --> f32 and f32 --> bf16. Refer to LowerUINT_TO_FP and LowerSINT_TO_FP in AMDGPUTargetLowering.cpp for details. Full diff: https://github.com/llvm/llvm-project/pull/130831.diff 2 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 0bb830cdf36d6..9743320601ed4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -582,6 +582,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, MVT::i16, Custom);
setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i16, Custom);
+ setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i1, Custom);
setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, MVT::i32, Custom);
diff --git a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
new file mode 100644
index 0000000000000..e52ea79857b48
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
@@ -0,0 +1,292 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx942 | FileCheck %s -check-prefix=GFX942
+
+define bfloat @v_uitofp_i1_to_bf16(i1 %num) {
+; GFX942-LABEL: v_uitofp_i1_to_bf16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX942-NEXT: s_movk_i32 s0, 0x7fff
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX942-NEXT: v_add3_u32 v1, v1, v0, s0
+; GFX942-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+ %op = uitofp i1 %num to bfloat
+ ret bfloat %op
+}
+
+define <2 x bfloat> @v_uitofp_v2i1_to_v2bf16(<2 x i1> %num) {
+; GFX942-LABEL: v_uitofp_v2i1_to_v2bf16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX942-NEXT: s_movk_i32 s0, 0x7fff
+; GFX942-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX942-NEXT: v_add3_u32 v2, v2, v0, s0
+; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX942-NEXT: v_add3_u32 v2, v2, v1, s0
+; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX942-NEXT: s_mov_b32 s0, 0x7060302
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX942-NEXT: v_perm_b32 v0, v1, v0, s0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+ %op = uitofp <2 x i1> %num to <2 x bfloat>
+ ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_uitofp_v3i1_to_v3bf16(<3 x i1> %num) {
+; GFX942-LABEL: v_uitofp_v3i1_to_v3bf16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX942-NEXT: s_movk_i32 s0, 0x7fff
+; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX942-NEXT: v_add3_u32 v3, v3, v2, s0
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX942-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX942-NEXT: v_add3_u32 v3, v3, v0, s0
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX942-NEXT: v_add3_u32 v3, v3, v1, s0
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX942-NEXT: s_mov_b32 s0, 0x7060302
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX942-NEXT: v_perm_b32 v0, v1, v0, s0
+; GFX942-NEXT: v_alignbit_b32 v1, s0, v2, 16
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+ %op = uitofp <3 x i1> %num to <3 x bfloat>
+ ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_uitofp_v4i1_to_v4bf16(<4 x i1> %num) {
+; GFX942-LABEL: v_uitofp_v4i1_to_v4bf16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX942-NEXT: s_movk_i32 s0, 0x7fff
+; GFX942-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX942-NEXT: v_add3_u32 v4, v4, v2, s0
+; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX942-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e64 v3, 0, 1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX942-NEXT: v_add3_u32 v4, v4, v3, s0
+; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX942-NEXT: v_add3_u32 v4, v4, v0, s0
+; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e64 v1, 0, 1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX942-NEXT: v_add3_u32 v4, v4, v1, s0
+; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX942-NEXT: s_mov_b32 s0, 0x7060302
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
+; GFX942-NEXT: v_perm_b32 v0, v1, v0, s0
+; GFX942-NEXT: v_perm_b32 v1, v3, v2, s0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+ %op = uitofp <4 x i1> %num to <4 x bfloat>
+ ret <4 x bfloat> %op
+}
+
+define bfloat @v_sitofp_i1_to_bf16(i1 %num) {
+; GFX942-LABEL: v_sitofp_i1_to_bf16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX942-NEXT: s_movk_i32 s0, 0x7fff
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GFX942-NEXT: v_add3_u32 v1, v1, v0, s0
+; GFX942-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX942-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+ %op = sitofp i1 %num to bfloat
+ ret bfloat %op
+}
+
+define <2 x bfloat> @v_sitofp_v2i1_to_v2bf16(<2 x i1> %num) {
+; GFX942-LABEL: v_sitofp_v2i1_to_v2bf16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX942-NEXT: s_movk_i32 s0, 0x7fff
+; GFX942-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GFX942-NEXT: v_add3_u32 v2, v2, v0, s0
+; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GFX942-NEXT: v_add3_u32 v2, v2, v1, s0
+; GFX942-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX942-NEXT: s_mov_b32 s0, 0x7060302
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GFX942-NEXT: v_perm_b32 v0, v1, v0, s0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+ %op = sitofp <2 x i1> %num to <2 x bfloat>
+ ret <2 x bfloat> %op
+}
+
+define <3 x bfloat> @v_sitofp_v3i1_to_v3bf16(<3 x i1> %num) {
+; GFX942-LABEL: v_sitofp_v3i1_to_v3bf16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX942-NEXT: s_movk_i32 s0, 0x7fff
+; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GFX942-NEXT: v_add3_u32 v3, v3, v2, s0
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX942-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v3, v0, 16, 1
+; GFX942-NEXT: v_add3_u32 v3, v3, v0, s0
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v3, v1, 16, 1
+; GFX942-NEXT: v_add3_u32 v3, v3, v1, s0
+; GFX942-NEXT: v_or_b32_e32 v4, 0x400000, v1
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX942-NEXT: s_mov_b32 s0, 0x7060302
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX942-NEXT: v_perm_b32 v0, v1, v0, s0
+; GFX942-NEXT: v_alignbit_b32 v1, s0, v2, 16
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+ %op = sitofp <3 x i1> %num to <3 x bfloat>
+ ret <3 x bfloat> %op
+}
+
+define <4 x bfloat> @v_sitofp_v4i1_to_v4bf16(<4 x i1> %num) {
+; GFX942-LABEL: v_sitofp_v4i1_to_v4bf16:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX942-NEXT: s_movk_i32 s0, 0x7fff
+; GFX942-NEXT: v_and_b32_e32 v3, 1, v3
+; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, -1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v4, v2, 16, 1
+; GFX942-NEXT: v_add3_u32 v4, v4, v2, s0
+; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v2
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v2, v2
+; GFX942-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX942-NEXT: v_and_b32_e32 v1, 1, v1
+; GFX942-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e64 v3, 0, -1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v4, v3, 16, 1
+; GFX942-NEXT: v_add3_u32 v4, v4, v3, s0
+; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v3
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v4, v0, 16, 1
+; GFX942-NEXT: v_add3_u32 v4, v4, v0, s0
+; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v0
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GFX942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_cndmask_b32_e64 v1, 0, -1.0, vcc
+; GFX942-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GFX942-NEXT: v_add3_u32 v4, v4, v1, s0
+; GFX942-NEXT: v_or_b32_e32 v5, 0x400000, v1
+; GFX942-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GFX942-NEXT: s_mov_b32 s0, 0x7060302
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc
+; GFX942-NEXT: v_perm_b32 v0, v1, v0, s0
+; GFX942-NEXT: v_perm_b32 v1, v3, v2, s0
+; GFX942-NEXT: s_setpc_b64 s[30:31]
+ %op = sitofp <4 x i1> %num to <4 x bfloat>
+ ret <4 x bfloat> %op
+}
|
The change looks good to me, but I wonder what the motivation behind this is. |
This fixes a bug (SWDEV-511605), but I haven't yet checked the source regarding the i1 to bf16 conversion. |
@@ -0,0 +1,292 @@ | |||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | |||
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx942 | FileCheck %s -check-prefix=GFX942 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should also have tests on other targets, particularly the ones without legal bf16
; GFX942-NEXT: s_setpc_b64 s[30:31] | ||
%op = sitofp <4 x i1> %num to <4 x bfloat> | ||
ret <4 x bfloat> %op | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should also test cases with SGPR input / output
Add tests for a few other targets, and with SGPR input/output, based on requests from PR: llvm#130831
Add tests for a few other targets, and with SGPR input/output, based on requests from PR: #130831
…(#130916) Add tests for a few other targets, and with SGPR input/output, based on requests from PR: llvm/llvm-project#130831
We are using the same approach as the conversion of other integer types to bfloat: i1 --> f32 and f32 --> bf16. Refer to LowerUINT_TO_FP and LowerSINT_TO_FP in AMDGPUTargetLowering.cpp for details.
Fixes: SWDEV-511605