Skip to content

Commit ad2fdd8

Browse files
committed
AMDGPU: Handle vectors in copysign sign type combine
This avoids some ugly codegen on targets without native 16-bit instructions, which previously resulted from awkward f16 legalization effects. This also avoids regressions on newer targets in a future patch.
1 parent 4169270 commit ad2fdd8

File tree

3 files changed

+65
-220
lines changed

3 files changed

+65
-220
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 27 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -11737,9 +11737,10 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
1173711737
// lower half with a copy.
1173811738
// fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
1173911739
EVT MagVT = MagnitudeOp.getValueType();
11740-
if (MagVT.getScalarType() == MVT::f64) {
11741-
unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
1174211740

11741+
unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
11742+
11743+
if (MagVT.getScalarType() == MVT::f64) {
1174311744
EVT F32VT = MagVT.isVector()
1174411745
? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
1174511746
: MVT::v2f32;
@@ -11777,21 +11778,39 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
1177711778
return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
1177811779
}
1177911780

11780-
if (SignVT != MVT::f64)
11781+
if (SignVT.getScalarType() != MVT::f64)
1178111782
return SDValue();
1178211783

1178311784
// Reduce width of sign operand, we only need the highest bit.
1178411785
//
1178511786
// fcopysign f64:x, f64:y ->
1178611787
// fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
1178711788
// TODO: In some cases it might make sense to go all the way to f16.
11788-
SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
11789-
SDValue SignAsF32 =
11790-
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11791-
DAG.getConstant(1, DL, MVT::i32));
11789+
11790+
EVT F32VT = MagVT.isVector()
11791+
? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
11792+
: MVT::v2f32;
11793+
11794+
SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
11795+
11796+
SmallVector<SDValue, 8> F32Signs;
11797+
for (unsigned I = 0; I != NumElts; ++I) {
11798+
// Take sign from odd elements of cast vector
11799+
SDValue SignAsF32 =
11800+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
11801+
DAG.getConstant(2 * I + 1, DL, MVT::i32));
11802+
F32Signs.push_back(SignAsF32);
11803+
}
11804+
11805+
SDValue NewSign =
11806+
NumElts == 1
11807+
? F32Signs.back()
11808+
: DAG.getNode(ISD::BUILD_VECTOR, DL,
11809+
EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
11810+
F32Signs);
1179211811

1179311812
return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
11794-
SignAsF32);
11813+
NewSign);
1179511814
}
1179611815

1179711816
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)

llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll

Lines changed: 28 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -4677,37 +4677,33 @@ define <2 x bfloat> @v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> %m
46774677
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
46784678
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
46794679
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
4680-
; GCN-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
4681-
; GCN-NEXT: v_cvt_f32_f64_e32 v3, v[4:5]
4680+
; GCN-NEXT: v_and_b32_e32 v2, 0x80000000, v5
4681+
; GCN-NEXT: v_and_b32_e32 v3, 0x80000000, v3
46824682
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
4683-
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
46844683
; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15
4684+
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
46854685
; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
4686-
; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3
4687-
; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2
4688-
; GCN-NEXT: v_or_b32_e32 v1, v1, v3
4689-
; GCN-NEXT: v_or_b32_e32 v0, v0, v2
4690-
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
4686+
; GCN-NEXT: v_or_b32_e32 v1, v1, v2
4687+
; GCN-NEXT: v_or_b32_e32 v0, v0, v3
46914688
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
4689+
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
46924690
; GCN-NEXT: s_setpc_b64 s[30:31]
46934691
;
46944692
; GFX7-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64:
46954693
; GFX7: ; %bb.0:
46964694
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4697-
; GFX7-NEXT: v_cvt_f32_f64_e32 v2, v[2:3]
4698-
; GFX7-NEXT: v_cvt_f32_f64_e32 v3, v[4:5]
4699-
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
47004695
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
4696+
; GFX7-NEXT: v_and_b32_e32 v2, 0x80000000, v5
47014697
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
4702-
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
4703-
; GFX7-NEXT: v_and_b32_e32 v3, 0x8000, v3
47044698
; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15
4705-
; GFX7-NEXT: v_and_b32_e32 v2, 0x8000, v2
4699+
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
4700+
; GFX7-NEXT: v_or_b32_e32 v1, v1, v2
4701+
; GFX7-NEXT: v_and_b32_e32 v2, 0x80000000, v3
4702+
; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
47064703
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
4707-
; GFX7-NEXT: v_or_b32_e32 v1, v1, v3
47084704
; GFX7-NEXT: v_or_b32_e32 v0, v0, v2
4709-
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
47104705
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
4706+
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
47114707
; GFX7-NEXT: s_setpc_b64 s[30:31]
47124708
;
47134709
; GFX8-LABEL: v_copysign_out_v2bf16_mag_v2bf16_sign_v2f64:
@@ -5585,35 +5581,31 @@ define amdgpu_ps i32 @s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64(<2 x bfloat> i
55855581
; GCN: ; %bb.0:
55865582
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1
55875583
; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0
5588-
; GCN-NEXT: v_cvt_f32_f64_e32 v2, s[4:5]
5589-
; GCN-NEXT: v_cvt_f32_f64_e32 v3, s[2:3]
5590-
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
5591-
; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3
5584+
; GCN-NEXT: s_and_b32 s0, s3, 0x80000000
5585+
; GCN-NEXT: s_and_b32 s1, s5, 0x80000000
5586+
; GCN-NEXT: s_lshr_b32 s0, s0, 16
55925587
; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15
5588+
; GCN-NEXT: s_lshr_b32 s1, s1, 16
55935589
; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
5594-
; GCN-NEXT: v_and_b32_e32 v3, 0x8000, v3
5595-
; GCN-NEXT: v_and_b32_e32 v2, 0x8000, v2
5596-
; GCN-NEXT: v_or_b32_e32 v1, v1, v3
5597-
; GCN-NEXT: v_or_b32_e32 v0, v0, v2
5590+
; GCN-NEXT: v_or_b32_e32 v1, s0, v1
5591+
; GCN-NEXT: v_or_b32_e32 v0, s1, v0
55985592
; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
55995593
; GCN-NEXT: v_or_b32_e32 v0, v1, v0
56005594
; GCN-NEXT: v_readfirstlane_b32 s0, v0
56015595
; GCN-NEXT: ; return to shader part epilog
56025596
;
56035597
; GFX7-LABEL: s_copysign_out_v2bf16_mag_v2bf16_sign_v2f64:
56045598
; GFX7: ; %bb.0:
5605-
; GFX7-NEXT: v_cvt_f32_f64_e32 v0, s[4:5]
5606-
; GFX7-NEXT: v_cvt_f32_f64_e32 v1, s[2:3]
5607-
; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s1
5608-
; GFX7-NEXT: v_mul_f32_e64 v3, 1.0, s0
5609-
; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
5610-
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
5611-
; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0
5612-
; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 15
5613-
; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1
5614-
; GFX7-NEXT: v_bfe_u32 v3, v3, 16, 15
5615-
; GFX7-NEXT: v_or_b32_e32 v0, v2, v0
5616-
; GFX7-NEXT: v_or_b32_e32 v1, v3, v1
5599+
; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0
5600+
; GFX7-NEXT: s_and_b32 s0, s3, 0x80000000
5601+
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
5602+
; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15
5603+
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s1
5604+
; GFX7-NEXT: v_or_b32_e32 v1, s0, v1
5605+
; GFX7-NEXT: s_and_b32 s0, s5, 0x80000000
5606+
; GFX7-NEXT: s_lshr_b32 s0, s0, 16
5607+
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
5608+
; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
56175609
; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
56185610
; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
56195611
; GFX7-NEXT: v_readfirstlane_b32 s0, v0

llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll

Lines changed: 10 additions & 176 deletions
Original file line numberDiff line numberDiff line change
@@ -4013,96 +4013,13 @@ define <2 x half> @v_copysign_out_v2f16_mag_v2f16_sign_v2f64(<2 x half> %mag, <2
40134013
; SI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f64:
40144014
; SI: ; %bb.0:
40154015
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
4016-
; SI-NEXT: v_and_b32_e32 v7, 0x1ff, v5
4017-
; SI-NEXT: v_or_b32_e32 v4, v7, v4
4018-
; SI-NEXT: v_lshrrev_b32_e32 v6, 8, v5
4019-
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
4020-
; SI-NEXT: v_and_b32_e32 v6, 0xffe, v6
4021-
; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
4022-
; SI-NEXT: v_bfe_u32 v7, v5, 20, 11
4023-
; SI-NEXT: s_movk_i32 s4, 0x3f1
4024-
; SI-NEXT: v_or_b32_e32 v4, v6, v4
4025-
; SI-NEXT: v_sub_i32_e32 v8, vcc, s4, v7
4026-
; SI-NEXT: v_or_b32_e32 v6, 0x1000, v4
4027-
; SI-NEXT: v_med3_i32 v8, v8, 0, 13
4028-
; SI-NEXT: v_lshrrev_b32_e32 v9, v8, v6
4029-
; SI-NEXT: v_lshlrev_b32_e32 v8, v8, v9
4030-
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v8, v6
4031-
; SI-NEXT: s_movk_i32 s5, 0xfc10
4032-
; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
4033-
; SI-NEXT: v_add_i32_e32 v7, vcc, s5, v7
4034-
; SI-NEXT: v_lshlrev_b32_e32 v8, 12, v7
4035-
; SI-NEXT: v_or_b32_e32 v6, v9, v6
4036-
; SI-NEXT: v_or_b32_e32 v8, v4, v8
4037-
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v7
4038-
; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
4039-
; SI-NEXT: v_and_b32_e32 v8, 7, v6
4040-
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v8
4041-
; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
4042-
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v8
4043-
; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
4044-
; SI-NEXT: v_or_b32_e32 v8, v8, v9
4045-
; SI-NEXT: v_lshrrev_b32_e32 v6, 2, v6
4046-
; SI-NEXT: v_add_i32_e32 v6, vcc, v6, v8
4047-
; SI-NEXT: v_mov_b32_e32 v8, 0x7c00
4048-
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v7
4049-
; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc
4050-
; SI-NEXT: v_mov_b32_e32 v9, 0x7e00
4051-
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
4052-
; SI-NEXT: s_movk_i32 s6, 0x40f
4053-
; SI-NEXT: v_cndmask_b32_e32 v4, v8, v9, vcc
4054-
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7
4055-
; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
4056-
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v5
4057-
; SI-NEXT: v_and_b32_e32 v6, 0x1ff, v3
4058-
; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5
4059-
; SI-NEXT: v_or_b32_e32 v2, v6, v2
4060-
; SI-NEXT: v_or_b32_e32 v4, v5, v4
4061-
; SI-NEXT: v_lshrrev_b32_e32 v5, 8, v3
4062-
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
4063-
; SI-NEXT: v_and_b32_e32 v5, 0xffe, v5
4064-
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
4065-
; SI-NEXT: v_bfe_u32 v6, v3, 20, 11
4066-
; SI-NEXT: v_or_b32_e32 v2, v5, v2
4067-
; SI-NEXT: v_sub_i32_e32 v7, vcc, s4, v6
4068-
; SI-NEXT: v_or_b32_e32 v5, 0x1000, v2
4069-
; SI-NEXT: v_med3_i32 v7, v7, 0, 13
4070-
; SI-NEXT: v_lshrrev_b32_e32 v10, v7, v5
4071-
; SI-NEXT: v_lshlrev_b32_e32 v7, v7, v10
4072-
; SI-NEXT: v_cmp_ne_u32_e32 vcc, v7, v5
4073-
; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
4074-
; SI-NEXT: v_add_i32_e32 v6, vcc, s5, v6
4075-
; SI-NEXT: v_lshlrev_b32_e32 v7, 12, v6
4076-
; SI-NEXT: v_or_b32_e32 v5, v10, v5
4077-
; SI-NEXT: v_or_b32_e32 v7, v2, v7
4078-
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v6
4079-
; SI-NEXT: v_cndmask_b32_e32 v5, v7, v5, vcc
4080-
; SI-NEXT: v_and_b32_e32 v7, 7, v5
4081-
; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v7
4082-
; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
4083-
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v7
4084-
; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
4085-
; SI-NEXT: v_or_b32_e32 v7, v7, v10
4086-
; SI-NEXT: v_lshrrev_b32_e32 v5, 2, v5
4087-
; SI-NEXT: v_add_i32_e32 v5, vcc, v5, v7
4088-
; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v6
4089-
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
40904016
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
4091-
; SI-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
4092-
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
4093-
; SI-NEXT: v_cndmask_b32_e32 v2, v8, v9, vcc
4094-
; SI-NEXT: v_cmp_eq_u32_e32 vcc, s6, v6
4095-
; SI-NEXT: v_lshrrev_b32_e32 v3, 16, v3
4096-
; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
4097-
; SI-NEXT: v_and_b32_e32 v3, 0x8000, v3
4098-
; SI-NEXT: v_or_b32_e32 v2, v3, v2
4099-
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4100-
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
4101-
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
4102-
; SI-NEXT: v_cvt_f32_f16_e32 v3, v4
4017+
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
41034018
; SI-NEXT: s_brev_b32 s4, -2
4104-
; SI-NEXT: v_bfi_b32 v0, s4, v0, v2
4105-
; SI-NEXT: v_bfi_b32 v1, s4, v1, v3
4019+
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
4020+
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4021+
; SI-NEXT: v_bfi_b32 v0, s4, v0, v3
4022+
; SI-NEXT: v_bfi_b32 v1, s4, v1, v5
41064023
; SI-NEXT: s_setpc_b64 s[30:31]
41074024
;
41084025
; VI-LABEL: v_copysign_out_v2f16_mag_v2f16_sign_v2f64:
@@ -4900,99 +4817,16 @@ define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f32(<2 x half> inreg
49004817
define amdgpu_ps i32 @s_copysign_out_v2f16_mag_v2f16_sign_v2f64(<2 x half> inreg %mag, <2 x double> inreg %sign) {
49014818
; SI-LABEL: s_copysign_out_v2f16_mag_v2f16_sign_v2f64:
49024819
; SI: ; %bb.0:
4903-
; SI-NEXT: v_cvt_f16_f32_e32 v1, s0
4904-
; SI-NEXT: s_lshr_b32 s0, s3, 8
4905-
; SI-NEXT: s_and_b32 s6, s0, 0xffe
4906-
; SI-NEXT: s_and_b32 s0, s3, 0x1ff
4907-
; SI-NEXT: s_or_b32 s0, s0, s2
4908-
; SI-NEXT: s_cmp_lg_u32 s0, 0
49094820
; SI-NEXT: v_cvt_f16_f32_e32 v0, s1
4910-
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
4911-
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
4912-
; SI-NEXT: v_readfirstlane_b32 s0, v2
4913-
; SI-NEXT: s_bfe_u32 s2, s3, 0xb0014
4914-
; SI-NEXT: s_or_b32 s0, s6, s0
4915-
; SI-NEXT: s_sub_i32 s6, 0x3f1, s2
4916-
; SI-NEXT: v_med3_i32 v2, s6, 0, 13
4917-
; SI-NEXT: s_or_b32 s1, s0, 0x1000
4918-
; SI-NEXT: v_readfirstlane_b32 s6, v2
4919-
; SI-NEXT: s_lshr_b32 s7, s1, s6
4920-
; SI-NEXT: s_lshl_b32 s6, s7, s6
4921-
; SI-NEXT: s_cmp_lg_u32 s6, s1
4922-
; SI-NEXT: s_cselect_b32 s1, 1, 0
4923-
; SI-NEXT: s_addk_i32 s2, 0xfc10
4924-
; SI-NEXT: s_lshl_b32 s6, s2, 12
4925-
; SI-NEXT: s_or_b32 s1, s7, s1
4926-
; SI-NEXT: s_or_b32 s6, s0, s6
4927-
; SI-NEXT: s_cmp_lt_i32 s2, 1
4928-
; SI-NEXT: s_cselect_b32 s1, s1, s6
4929-
; SI-NEXT: s_and_b32 s6, s1, 7
4930-
; SI-NEXT: s_cmp_gt_i32 s6, 5
4931-
; SI-NEXT: s_cselect_b32 s7, 1, 0
4932-
; SI-NEXT: s_cmp_eq_u32 s6, 3
4933-
; SI-NEXT: s_cselect_b32 s6, 1, 0
4934-
; SI-NEXT: s_or_b32 s6, s6, s7
4935-
; SI-NEXT: s_lshr_b32 s1, s1, 2
4936-
; SI-NEXT: s_add_i32 s1, s1, s6
4937-
; SI-NEXT: s_cmp_lt_i32 s2, 31
4938-
; SI-NEXT: s_cselect_b32 s1, s1, 0x7c00
4939-
; SI-NEXT: s_cmp_lg_u32 s0, 0
4940-
; SI-NEXT: s_movk_i32 s6, 0x7e00
4941-
; SI-NEXT: s_cselect_b32 s0, s6, 0x7c00
4942-
; SI-NEXT: s_cmpk_eq_i32 s2, 0x40f
4943-
; SI-NEXT: s_cselect_b32 s0, s0, s1
4944-
; SI-NEXT: s_lshr_b32 s1, s3, 16
4945-
; SI-NEXT: s_and_b32 s1, s1, 0x8000
4946-
; SI-NEXT: s_or_b32 s2, s1, s0
4947-
; SI-NEXT: s_lshr_b32 s0, s5, 8
4948-
; SI-NEXT: s_and_b32 s3, s0, 0xffe
4949-
; SI-NEXT: s_and_b32 s0, s5, 0x1ff
4950-
; SI-NEXT: s_or_b32 s0, s0, s4
4951-
; SI-NEXT: s_cmp_lg_u32 s0, 0
4952-
; SI-NEXT: s_cselect_b64 s[0:1], -1, 0
4953-
; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1]
4954-
; SI-NEXT: v_readfirstlane_b32 s0, v2
4955-
; SI-NEXT: s_or_b32 s0, s3, s0
4956-
; SI-NEXT: s_bfe_u32 s3, s5, 0xb0014
4957-
; SI-NEXT: s_sub_i32 s4, 0x3f1, s3
4958-
; SI-NEXT: v_med3_i32 v2, s4, 0, 13
4959-
; SI-NEXT: s_or_b32 s1, s0, 0x1000
4960-
; SI-NEXT: v_readfirstlane_b32 s4, v2
4961-
; SI-NEXT: s_lshr_b32 s7, s1, s4
4962-
; SI-NEXT: s_lshl_b32 s4, s7, s4
4963-
; SI-NEXT: s_cmp_lg_u32 s4, s1
4964-
; SI-NEXT: s_cselect_b32 s1, 1, 0
4965-
; SI-NEXT: s_addk_i32 s3, 0xfc10
4966-
; SI-NEXT: s_lshl_b32 s4, s3, 12
4967-
; SI-NEXT: s_or_b32 s1, s7, s1
4968-
; SI-NEXT: s_or_b32 s4, s0, s4
4969-
; SI-NEXT: s_cmp_lt_i32 s3, 1
4970-
; SI-NEXT: s_cselect_b32 s1, s1, s4
4971-
; SI-NEXT: s_and_b32 s4, s1, 7
4972-
; SI-NEXT: s_cmp_gt_i32 s4, 5
4973-
; SI-NEXT: s_cselect_b32 s7, 1, 0
4974-
; SI-NEXT: s_cmp_eq_u32 s4, 3
4975-
; SI-NEXT: s_cselect_b32 s4, 1, 0
4976-
; SI-NEXT: s_or_b32 s4, s4, s7
4977-
; SI-NEXT: s_lshr_b32 s1, s1, 2
4978-
; SI-NEXT: s_add_i32 s1, s1, s4
4979-
; SI-NEXT: s_cmp_lt_i32 s3, 31
4980-
; SI-NEXT: s_cselect_b32 s1, s1, 0x7c00
4981-
; SI-NEXT: s_cmp_lg_u32 s0, 0
4982-
; SI-NEXT: s_cselect_b32 s0, s6, 0x7c00
4983-
; SI-NEXT: s_cmpk_eq_i32 s3, 0x40f
4984-
; SI-NEXT: s_cselect_b32 s0, s0, s1
4985-
; SI-NEXT: s_lshr_b32 s1, s5, 16
4986-
; SI-NEXT: s_and_b32 s1, s1, 0x8000
4987-
; SI-NEXT: s_or_b32 s0, s1, s0
4821+
; SI-NEXT: v_cvt_f16_f32_e32 v1, s0
4822+
; SI-NEXT: s_brev_b32 s0, -2
4823+
; SI-NEXT: v_mov_b32_e32 v2, s5
49884824
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
4989-
; SI-NEXT: v_cvt_f32_f16_e32 v2, s0
49904825
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
4991-
; SI-NEXT: v_cvt_f32_f16_e32 v3, s2
4992-
; SI-NEXT: s_brev_b32 s0, -2
49934826
; SI-NEXT: v_bfi_b32 v0, s0, v0, v2
4827+
; SI-NEXT: v_mov_b32_e32 v2, s3
49944828
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
4995-
; SI-NEXT: v_bfi_b32 v1, s0, v1, v3
4829+
; SI-NEXT: v_bfi_b32 v1, s0, v1, v2
49964830
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
49974831
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
49984832
; SI-NEXT: v_or_b32_e32 v0, v1, v0

0 commit comments

Comments
 (0)