@@ -646,18 +646,18 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
646
646
; GFX9-NEXT: v_rcp_iflag_f32_e32 v1, v0
647
647
; GFX9-NEXT: .LBB4_1: ; %bb3
648
648
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
649
- ; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
650
- ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s2
651
- ; GFX9-NEXT: s_lshl_b32 s3, s2, 1
649
+ ; GFX9-NEXT: s_and_b32 s3, 0xffff, s2
650
+ ; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s3
652
651
; GFX9-NEXT: s_add_i32 s2, s2, 1
653
- ; GFX9-NEXT: v_mov_b32_e32 v3, s3
652
+ ; GFX9-NEXT: s_lshl_b32 s3, s3, 1
653
+ ; GFX9-NEXT: s_and_b32 s4, s2, 0xffff
654
654
; GFX9-NEXT: v_mul_f32_e32 v4, v2, v1
655
655
; GFX9-NEXT: v_trunc_f32_e32 v4, v4
656
656
; GFX9-NEXT: v_cvt_u32_f32_e32 v5, v4
657
657
; GFX9-NEXT: v_mad_f32 v2, -v4, v0, v2
658
- ; GFX9-NEXT: s_and_b32 s3, s2, 0xffff
659
658
; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0
660
- ; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
659
+ ; GFX9-NEXT: v_mov_b32_e32 v3, s3
660
+ ; GFX9-NEXT: s_cmpk_eq_i32 s4, 0x400
661
661
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v5, vcc
662
662
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
663
663
; GFX9-NEXT: global_store_short v3, v2, s[0:1]
@@ -677,10 +677,10 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
677
677
; GFX10-NEXT: v_rcp_iflag_f32_e32 v1, v0
678
678
; GFX10-NEXT: .LBB4_1: ; %bb3
679
679
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
680
- ; GFX10-NEXT: s_and_b32 s2, 0xffff, s2
681
- ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s2
682
- ; GFX10-NEXT: s_lshl_b32 s3, s2, 1
680
+ ; GFX10-NEXT: s_and_b32 s3, 0xffff, s2
683
681
; GFX10-NEXT: s_add_i32 s2, s2, 1
682
+ ; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s3
683
+ ; GFX10-NEXT: s_lshl_b32 s3, s3, 1
684
684
; GFX10-NEXT: v_mov_b32_e32 v4, s3
685
685
; GFX10-NEXT: s_and_b32 s3, s2, 0xffff
686
686
; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1
@@ -709,22 +709,22 @@ define amdgpu_kernel void @udiv16_invariant_denom(ptr addrspace(1) nocapture %ar
709
709
; GFX11-NEXT: .p2align 6
710
710
; GFX11-NEXT: .LBB4_1: ; %bb3
711
711
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
712
- ; GFX11-NEXT: s_and_b32 s2, 0xffff, s2
713
- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
714
- ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s2
715
- ; GFX11-NEXT: s_lshl_b32 s3, s2, 1
712
+ ; GFX11-NEXT: s_and_b32 s3, 0xffff, s2
716
713
; GFX11-NEXT: s_add_i32 s2, s2, 1
714
+ ; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s3
715
+ ; GFX11-NEXT: s_lshl_b32 s3, s3, 1
716
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
717
717
; GFX11-NEXT: v_mov_b32_e32 v4, s3
718
718
; GFX11-NEXT: s_and_b32 s3, s2, 0xffff
719
719
; GFX11-NEXT: s_waitcnt_depctr 0xfff
720
720
; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1
721
721
; GFX11-NEXT: s_cmpk_eq_i32 s3, 0x400
722
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
723
722
; GFX11-NEXT: v_trunc_f32_e32 v3, v3
723
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
724
724
; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
725
725
; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
726
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
727
726
; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
727
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
728
728
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
729
729
; GFX11-NEXT: global_store_b16 v4, v2, s[0:1]
730
730
; GFX11-NEXT: s_cbranch_scc0 .LBB4_1
@@ -763,8 +763,9 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
763
763
; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
764
764
; GFX9-NEXT: s_and_b32 s3, 0xffff, s2
765
765
; GFX9-NEXT: v_cvt_f32_u32_e32 v2, s3
766
+ ; GFX9-NEXT: s_add_i32 s2, s2, 1
766
767
; GFX9-NEXT: s_lshl_b32 s5, s3, 1
767
- ; GFX9-NEXT: s_add_i32 s2, s3, 1
768
+ ; GFX9-NEXT: s_and_b32 s6, s2, 0xffff
768
769
; GFX9-NEXT: v_mul_f32_e32 v3, v2, v1
769
770
; GFX9-NEXT: v_trunc_f32_e32 v3, v3
770
771
; GFX9-NEXT: v_cvt_u32_f32_e32 v4, v3
@@ -773,8 +774,7 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
773
774
; GFX9-NEXT: v_mov_b32_e32 v3, s5
774
775
; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v4, vcc
775
776
; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4
776
- ; GFX9-NEXT: s_and_b32 s5, s2, 0xffff
777
- ; GFX9-NEXT: s_cmpk_eq_i32 s5, 0x400
777
+ ; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400
778
778
; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2
779
779
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
780
780
; GFX9-NEXT: global_store_short v3, v2, s[0:1]
@@ -795,9 +795,9 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
795
795
; GFX10-NEXT: .LBB5_1: ; %bb3
796
796
; GFX10-NEXT: ; =>This Inner Loop Header: Depth=1
797
797
; GFX10-NEXT: s_and_b32 s4, 0xffff, s3
798
+ ; GFX10-NEXT: s_add_i32 s3, s3, 1
798
799
; GFX10-NEXT: v_cvt_f32_u32_e32 v2, s4
799
800
; GFX10-NEXT: s_lshl_b32 s5, s4, 1
800
- ; GFX10-NEXT: s_add_i32 s3, s4, 1
801
801
; GFX10-NEXT: v_mul_f32_e32 v3, v2, v1
802
802
; GFX10-NEXT: v_trunc_f32_e32 v3, v3
803
803
; GFX10-NEXT: v_mad_f32 v2, -v3, v0, v2
@@ -829,24 +829,24 @@ define amdgpu_kernel void @urem16_invariant_denom(ptr addrspace(1) nocapture %ar
829
829
; GFX11-NEXT: .LBB5_1: ; %bb3
830
830
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
831
831
; GFX11-NEXT: s_and_b32 s4, 0xffff, s3
832
- ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
832
+ ; GFX11-NEXT: s_add_i32 s3, s3, 1
833
833
; GFX11-NEXT: v_cvt_f32_u32_e32 v2, s4
834
834
; GFX11-NEXT: s_lshl_b32 s5, s4, 1
835
- ; GFX11-NEXT: s_add_i32 s3, s4, 1
836
835
; GFX11-NEXT: s_waitcnt_depctr 0xfff
837
836
; GFX11-NEXT: v_mul_f32_e32 v3, v2, v1
837
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
838
838
; GFX11-NEXT: v_trunc_f32_e32 v3, v3
839
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
840
839
; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
841
840
; GFX11-NEXT: v_cvt_u32_f32_e32 v3, v3
841
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
842
842
; GFX11-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, v0
843
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
844
843
; GFX11-NEXT: v_add_co_ci_u32_e32 v2, vcc_lo, 0, v3, vcc_lo
845
844
; GFX11-NEXT: v_mov_b32_e32 v3, s5
845
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
846
846
; GFX11-NEXT: v_mul_lo_u32 v2, v2, s2
847
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
848
847
; GFX11-NEXT: v_sub_nc_u32_e32 v2, s4, v2
849
848
; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
849
+ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
850
850
; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
851
851
; GFX11-NEXT: global_store_b16 v3, v2, s[0:1]
852
852
; GFX11-NEXT: s_cbranch_scc0 .LBB5_1
@@ -895,13 +895,13 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
895
895
; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v2|, |v0|
896
896
; GFX9-NEXT: s_and_b64 s[6:7], s[6:7], exec
897
897
; GFX9-NEXT: s_cselect_b32 s3, s3, 0
898
- ; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
899
- ; GFX9-NEXT: v_add_u32_e32 v2, s3, v3
900
- ; GFX9-NEXT: s_lshl_b32 s3, s2, 1
898
+ ; GFX9-NEXT: s_and_b32 s5, 0xffff, s2
901
899
; GFX9-NEXT: s_add_i32 s2, s2, 1
900
+ ; GFX9-NEXT: v_add_u32_e32 v2, s3, v3
901
+ ; GFX9-NEXT: s_lshl_b32 s3, s5, 1
902
+ ; GFX9-NEXT: s_and_b32 s5, s2, 0xffff
902
903
; GFX9-NEXT: v_mov_b32_e32 v3, s3
903
- ; GFX9-NEXT: s_and_b32 s3, s2, 0xffff
904
- ; GFX9-NEXT: s_cmpk_eq_i32 s3, 0x400
904
+ ; GFX9-NEXT: s_cmpk_eq_i32 s5, 0x400
905
905
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
906
906
; GFX9-NEXT: global_store_short v3, v2, s[0:1]
907
907
; GFX9-NEXT: s_cbranch_scc0 .LBB6_1
@@ -932,11 +932,11 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
932
932
; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v3
933
933
; GFX10-NEXT: s_and_b32 s5, s5, exec_lo
934
934
; GFX10-NEXT: s_cselect_b32 s4, s4, 0
935
- ; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
936
- ; GFX10-NEXT: v_add_nc_u32_e32 v2, s4, v2
937
- ; GFX10-NEXT: s_lshl_b32 s4, s3, 1
935
+ ; GFX10-NEXT: s_and_b32 s5, 0xffff, s3
938
936
; GFX10-NEXT: s_add_i32 s3, s3, 1
939
- ; GFX10-NEXT: v_mov_b32_e32 v3, s4
937
+ ; GFX10-NEXT: s_lshl_b32 s5, s5, 1
938
+ ; GFX10-NEXT: v_add_nc_u32_e32 v2, s4, v2
939
+ ; GFX10-NEXT: v_mov_b32_e32 v3, s5
940
940
; GFX10-NEXT: s_and_b32 s4, s3, 0xffff
941
941
; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400
942
942
; GFX10-NEXT: global_store_short v3, v2, s[0:1]
@@ -975,11 +975,11 @@ define amdgpu_kernel void @sdiv16_invariant_denom(ptr addrspace(1) nocapture %ar
975
975
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
976
976
; GFX11-NEXT: s_and_b32 s5, s5, exec_lo
977
977
; GFX11-NEXT: s_cselect_b32 s4, s4, 0
978
- ; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
978
+ ; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
979
979
; GFX11-NEXT: v_add_nc_u32_e32 v2, s4, v2
980
- ; GFX11-NEXT: s_lshl_b32 s4, s3, 1
980
+ ; GFX11-NEXT: s_lshl_b32 s5, s5, 1
981
981
; GFX11-NEXT: s_add_i32 s3, s3, 1
982
- ; GFX11-NEXT: v_mov_b32_e32 v3, s4
982
+ ; GFX11-NEXT: v_mov_b32_e32 v3, s5
983
983
; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
984
984
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
985
985
; GFX11-NEXT: s_cmpk_eq_i32 s4, 0x400
@@ -1032,12 +1032,12 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
1032
1032
; GFX9-NEXT: s_cselect_b32 s5, s5, 0
1033
1033
; GFX9-NEXT: v_add_u32_e32 v2, s5, v3
1034
1034
; GFX9-NEXT: v_mul_lo_u32 v2, v2, s4
1035
- ; GFX9-NEXT: s_and_b32 s2, 0xffff, s2
1036
- ; GFX9-NEXT: s_lshl_b32 s5, s2, 1
1035
+ ; GFX9-NEXT: s_and_b32 s6, 0xffff, s2
1037
1036
; GFX9-NEXT: s_add_i32 s2, s2, 1
1037
+ ; GFX9-NEXT: s_lshl_b32 s5, s6, 1
1038
+ ; GFX9-NEXT: s_and_b32 s6, s2, 0xffff
1038
1039
; GFX9-NEXT: v_mov_b32_e32 v3, s5
1039
- ; GFX9-NEXT: s_and_b32 s5, s2, 0xffff
1040
- ; GFX9-NEXT: s_cmpk_eq_i32 s5, 0x400
1040
+ ; GFX9-NEXT: s_cmpk_eq_i32 s6, 0x400
1041
1041
; GFX9-NEXT: v_sub_u32_e32 v2, s3, v2
1042
1042
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
1043
1043
; GFX9-NEXT: global_store_short v3, v2, s[0:1]
@@ -1069,12 +1069,12 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
1069
1069
; GFX10-NEXT: v_cvt_i32_f32_e32 v2, v3
1070
1070
; GFX10-NEXT: s_and_b32 s6, s6, exec_lo
1071
1071
; GFX10-NEXT: s_cselect_b32 s5, s5, 0
1072
- ; GFX10-NEXT: s_and_b32 s3, 0xffff, s3
1073
1072
; GFX10-NEXT: v_add_nc_u32_e32 v2, s5, v2
1074
- ; GFX10-NEXT: s_lshl_b32 s5, s3, 1
1073
+ ; GFX10-NEXT: s_and_b32 s5, 0xffff, s3
1075
1074
; GFX10-NEXT: s_add_i32 s3, s3, 1
1076
- ; GFX10-NEXT: v_mov_b32_e32 v3, s5
1075
+ ; GFX10-NEXT: s_lshl_b32 s5, s5, 1
1077
1076
; GFX10-NEXT: v_mul_lo_u32 v2, v2, s2
1077
+ ; GFX10-NEXT: v_mov_b32_e32 v3, s5
1078
1078
; GFX10-NEXT: v_sub_nc_u32_e32 v2, s4, v2
1079
1079
; GFX10-NEXT: s_and_b32 s4, s3, 0xffff
1080
1080
; GFX10-NEXT: s_cmpk_eq_i32 s4, 0x400
@@ -1111,16 +1111,17 @@ define amdgpu_kernel void @srem16_invariant_denom(ptr addrspace(1) nocapture %ar
1111
1111
; GFX11-NEXT: v_fma_f32 v2, -v3, v0, v2
1112
1112
; GFX11-NEXT: v_cmp_ge_f32_e64 s6, |v2|, |v0|
1113
1113
; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v3
1114
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
1114
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2)
1115
1115
; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
1116
1116
; GFX11-NEXT: s_cselect_b32 s5, s5, 0
1117
- ; GFX11-NEXT: s_and_b32 s3, 0xffff, s3
1117
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
1118
1118
; GFX11-NEXT: v_add_nc_u32_e32 v2, s5, v2
1119
- ; GFX11-NEXT: s_lshl_b32 s5, s3, 1
1119
+ ; GFX11-NEXT: s_and_b32 s5, 0xffff, s3
1120
1120
; GFX11-NEXT: s_add_i32 s3, s3, 1
1121
- ; GFX11-NEXT: v_mov_b32_e32 v3, s5
1122
- ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1121
+ ; GFX11-NEXT: s_lshl_b32 s5, s5, 1
1122
+ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1123
1123
; GFX11-NEXT: v_mul_lo_u32 v2, v2, s2
1124
+ ; GFX11-NEXT: v_mov_b32_e32 v3, s5
1124
1125
; GFX11-NEXT: v_sub_nc_u32_e32 v2, s4, v2
1125
1126
; GFX11-NEXT: s_and_b32 s4, s3, 0xffff
1126
1127
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
0 commit comments