@@ -37712,12 +37712,10 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
37712
37712
; GFX11TRUE16-LABEL: v_select_bf16:
37713
37713
; GFX11TRUE16: ; %bb.0:
37714
37714
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37715
- ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
37716
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
37717
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
37718
- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
37719
- ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37720
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
37715
+ ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37716
+ ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37717
+ ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37718
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v1.l, vcc_lo
37721
37719
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
37722
37720
;
37723
37721
; GFX11FAKE16-LABEL: v_select_bf16:
@@ -37785,14 +37783,11 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
37785
37783
; GFX11TRUE16-LABEL: v_select_fneg_lhs_bf16:
37786
37784
; GFX11TRUE16: ; %bb.0:
37787
37785
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37788
- ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
37789
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l
37790
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
37791
- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
37792
- ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37793
- ; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
37794
- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37795
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.h, v0.l, vcc_lo
37786
+ ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37787
+ ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
37788
+ ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37789
+ ; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l
37790
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v2.l, v0.l, vcc_lo
37796
37791
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
37797
37792
;
37798
37793
; GFX11FAKE16-LABEL: v_select_fneg_lhs_bf16:
@@ -37862,14 +37857,11 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
37862
37857
; GFX11TRUE16-LABEL: v_select_fneg_rhs_bf16:
37863
37858
; GFX11TRUE16: ; %bb.0:
37864
37859
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
37865
- ; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v0
37866
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.l, v2.l
37867
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
37868
- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
37869
- ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3
37870
- ; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v0.l
37871
- ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
37872
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v0.h, vcc_lo
37860
+ ; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
37861
+ ; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
37862
+ ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
37863
+ ; GFX11TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v2.l
37864
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, v1.l, vcc_lo
37873
37865
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
37874
37866
;
37875
37867
; GFX11FAKE16-LABEL: v_select_fneg_rhs_bf16:
@@ -42810,17 +42802,16 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
42810
42802
; GFX11TRUE16-NEXT: scratch_load_b32 v85, off, s32 offset:72
42811
42803
; GFX11TRUE16-NEXT: scratch_load_b32 v86, off, s32 offset:4
42812
42804
; GFX11TRUE16-NEXT: scratch_load_b32 v87, off, s32 offset:68
42813
- ; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
42814
42805
; GFX11TRUE16-NEXT: v_and_b32_e32 v0, 1, v0
42815
42806
; GFX11TRUE16-NEXT: v_and_b32_e32 v14, 1, v14
42807
+ ; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
42816
42808
; GFX11TRUE16-NEXT: v_and_b32_e32 v18, 1, v18
42817
42809
; GFX11TRUE16-NEXT: v_and_b32_e32 v20, 1, v20
42818
42810
; GFX11TRUE16-NEXT: v_and_b32_e32 v22, 1, v22
42819
42811
; GFX11TRUE16-NEXT: v_and_b32_e32 v24, 1, v24
42820
42812
; GFX11TRUE16-NEXT: v_and_b32_e32 v26, 1, v26
42821
42813
; GFX11TRUE16-NEXT: v_and_b32_e32 v28, 1, v28
42822
42814
; GFX11TRUE16-NEXT: v_and_b32_e32 v30, 1, v30
42823
- ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v16
42824
42815
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 1, v1
42825
42816
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 1, v2
42826
42817
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 1, v3
@@ -42844,6 +42835,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
42844
42835
; GFX11TRUE16-NEXT: v_and_b32_e32 v29, 1, v29
42845
42836
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
42846
42837
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s13, 1, v14
42838
+ ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s15, 1, v16
42847
42839
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s17, 1, v18
42848
42840
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s19, 1, v20
42849
42841
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s21, 1, v22
@@ -42873,45 +42865,44 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
42873
42865
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s26, 1, v27
42874
42866
; GFX11TRUE16-NEXT: v_cmp_eq_u32_e64 s29, 1, v29
42875
42867
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(32)
42876
- ; GFX11TRUE16-NEXT: v_mov_b16_e32 v16.l, v31.l
42868
+ ; GFX11TRUE16-NEXT: v_and_b32_e32 v31, 1, v31
42877
42869
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(31)
42878
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v32
42870
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v16, 16, v32
42879
42871
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(30)
42880
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v33
42872
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v17, 16, v33
42881
42873
; GFX11TRUE16-NEXT: v_cndmask_b16 v15.l, v33.l, v32.l, s28
42882
42874
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(29)
42883
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v34
42884
- ; GFX11TRUE16-NEXT: v_and_b32_e32 v16, 1, v16
42875
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v18, 16, v34
42885
42876
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(28)
42886
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v35
42877
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v19, 16, v35
42887
42878
; GFX11TRUE16-NEXT: v_cndmask_b16 v14.l, v35.l, v34.l, s27
42888
42879
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(27)
42889
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v36
42880
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v20, 16, v36
42890
42881
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(26)
42891
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v37
42882
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v21, 16, v37
42892
42883
; GFX11TRUE16-NEXT: v_cndmask_b16 v13.l, v37.l, v36.l, s25
42893
42884
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(25)
42894
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v38
42885
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v22, 16, v38
42895
42886
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(24)
42896
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v39
42887
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v23, 16, v39
42897
42888
; GFX11TRUE16-NEXT: v_cndmask_b16 v12.l, v39.l, v38.l, s23
42898
42889
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(23)
42899
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v48
42890
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v24, 16, v48
42900
42891
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(22)
42901
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v49
42892
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v25, 16, v49
42902
42893
; GFX11TRUE16-NEXT: v_cndmask_b16 v11.l, v49.l, v48.l, s21
42903
42894
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(21)
42904
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v50
42895
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v26, 16, v50
42905
42896
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(20)
42906
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v51
42897
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v27, 16, v51
42907
42898
; GFX11TRUE16-NEXT: v_cndmask_b16 v10.l, v51.l, v50.l, s19
42908
42899
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(19)
42909
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v52
42900
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v28, 16, v52
42910
42901
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(18)
42911
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v53
42902
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v29, 16, v53
42912
42903
; GFX11TRUE16-NEXT: v_cndmask_b16 v9.l, v53.l, v52.l, s17
42913
42904
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(17)
42914
- ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v31, 16, v54
42905
+ ; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v30, 16, v54
42915
42906
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(16)
42916
42907
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v32, 16, v55
42917
42908
; GFX11TRUE16-NEXT: v_cndmask_b16 v8.l, v55.l, v54.l, s15
@@ -42949,20 +42940,20 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
42949
42940
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0)
42950
42941
; GFX11TRUE16-NEXT: v_lshrrev_b32_e32 v64, 16, v87
42951
42942
; GFX11TRUE16-NEXT: v_cndmask_b16 v0.l, v87.l, v86.l, vcc_lo
42952
- ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v16
42943
+ ; GFX11TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v31
42953
42944
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.l, v67.l, v66.l, s11
42954
42945
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.l, v69.l, v68.l, s9
42955
42946
; GFX11TRUE16-NEXT: v_cndmask_b16 v4.l, v71.l, v70.l, s7
42956
42947
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.l, v81.l, v80.l, s5
42957
42948
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.l, v83.l, v82.l, s3
42958
42949
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.l, v85.l, v84.l, s1
42959
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v20.l, v19.l, s29
42960
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v22.l, v21.l, s26
42961
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v24.l, v23.l, s24
42962
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v26.l, v25.l, s22
42963
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v28.l, v27.l, s20
42964
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v30.l, v29.l, s18
42965
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v32.l, v31.l, s16
42950
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v14.h, v19.l, v18.l, s29
42951
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v13.h, v21.l, v20.l, s26
42952
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v12.h, v23.l, v22.l, s24
42953
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v11.h, v25.l, v24.l, s22
42954
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v10.h, v27.l, v26.l, s20
42955
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v9.h, v29.l, v28.l, s18
42956
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v8.h, v32.l, v30.l, s16
42966
42957
; GFX11TRUE16-NEXT: v_cndmask_b16 v7.h, v34.l, v33.l, s14
42967
42958
; GFX11TRUE16-NEXT: v_cndmask_b16 v6.h, v36.l, v35.l, s12
42968
42959
; GFX11TRUE16-NEXT: v_cndmask_b16 v5.h, v38.l, v37.l, s10
@@ -42971,7 +42962,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x
42971
42962
; GFX11TRUE16-NEXT: v_cndmask_b16 v1.h, v54.l, v53.l, s2
42972
42963
; GFX11TRUE16-NEXT: v_cndmask_b16 v2.h, v52.l, v51.l, s4
42973
42964
; GFX11TRUE16-NEXT: v_cndmask_b16 v3.h, v50.l, v49.l, s6
42974
- ; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v18.l, v17.l, vcc_lo
42965
+ ; GFX11TRUE16-NEXT: v_cndmask_b16 v15.h, v17.l, v16.l, vcc_lo
42975
42966
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
42976
42967
;
42977
42968
; GFX11FAKE16-LABEL: v_vselect_v32bf16:
0 commit comments