@@ -818,32 +818,29 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
818
818
; GFX8-NEXT: s_mov_b32 s12, s6
819
819
; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
820
820
; GFX8-NEXT: v_mov_b32_e32 v0, s6
821
- ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
822
- ; GFX8-NEXT: s_mov_b32 s13, s7
823
- ; GFX8-NEXT: s_mul_i32 s7, s1, s6
824
- ; GFX8-NEXT: s_mul_i32 s6, s0, s6
821
+ ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0
822
+ ; GFX8-NEXT: s_mul_i32 s6, s1, s6
825
823
; GFX8-NEXT: s_mov_b32 s15, 0xf000
826
824
; GFX8-NEXT: s_mov_b32 s14, -1
827
- ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s7, v0
828
- ; GFX8-NEXT: v_mov_b32_e32 v0, s6
825
+ ; GFX8-NEXT: s_mov_b32 s13, s7
826
+ ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
829
827
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
830
828
; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc
831
829
; GFX8-NEXT: s_waitcnt vmcnt(0)
832
830
; GFX8-NEXT: buffer_wbinvl1_vol
833
831
; GFX8-NEXT: .LBB4_2:
834
832
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
835
- ; GFX8-NEXT: v_readfirstlane_b32 s2, v0
836
833
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
837
- ; GFX8-NEXT: v_mul_lo_u32 v0, s1, v2
838
- ; GFX8-NEXT: v_mul_hi_u32 v3, s0, v2
834
+ ; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2
835
+ ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0
836
+ ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
839
837
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
840
- ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v2
841
- ; GFX8-NEXT: s_mov_b32 s7, 0xf000
842
- ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0
838
+ ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4
843
839
; GFX8-NEXT: v_mov_b32_e32 v3, s1
844
- ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v1
840
+ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
841
+ ; GFX8-NEXT: s_mov_b32 s7, 0xf000
845
842
; GFX8-NEXT: s_mov_b32 s6, -1
846
- ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v2 , vcc
843
+ ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1 , vcc
847
844
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
848
845
; GFX8-NEXT: s_endpgm
849
846
;
@@ -878,17 +875,16 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
878
875
; GFX9-NEXT: .LBB4_2:
879
876
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
880
877
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
881
- ; GFX9-NEXT: v_mul_lo_u32 v3 , s3, v2
882
- ; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2
878
+ ; GFX9-NEXT: v_mul_lo_u32 v4 , s3, v2
879
+ ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
883
880
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
884
- ; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2
885
881
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
886
- ; GFX9-NEXT: v_add_u32_e32 v1, v4, v3
887
- ; GFX9-NEXT: v_mov_b32_e32 v2 , s1
888
- ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
882
+ ; GFX9-NEXT: v_add_u32_e32 v1, v3, v4
883
+ ; GFX9-NEXT: v_mov_b32_e32 v3 , s1
884
+ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v2
889
885
; GFX9-NEXT: s_mov_b32 s7, 0xf000
890
886
; GFX9-NEXT: s_mov_b32 s6, -1
891
- ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2 , v1, vcc
887
+ ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3 , v1, vcc
892
888
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
893
889
; GFX9-NEXT: s_endpgm
894
890
;
@@ -927,14 +923,13 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
927
923
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
928
924
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
929
925
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
930
- ; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2
931
- ; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v2
932
- ; GFX1064-NEXT: v_mul_lo_u32 v2, s2, v2
926
+ ; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2
927
+ ; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
933
928
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
934
929
; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
935
930
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
936
931
; GFX1064-NEXT: s_mov_b32 s6, -1
937
- ; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3
932
+ ; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4
938
933
; GFX1064-NEXT: v_add_co_u32 v0, vcc, s0, v2
939
934
; GFX1064-NEXT: v_add_co_ci_u32_e32 v1, vcc, s1, v1, vcc
940
935
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -974,14 +969,13 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
974
969
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
975
970
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
976
971
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
977
- ; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2
978
- ; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v2
979
- ; GFX1032-NEXT: v_mul_lo_u32 v2, s2, v2
972
+ ; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2
973
+ ; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s0, s2, v2, 0
980
974
; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
981
975
; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
982
976
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
983
977
; GFX1032-NEXT: s_mov_b32 s6, -1
984
- ; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3
978
+ ; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4
985
979
; GFX1032-NEXT: v_add_co_u32 v0, vcc_lo, s0, v2
986
980
; GFX1032-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
987
981
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -1955,32 +1949,29 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
1955
1949
; GFX8-NEXT: s_mov_b32 s12, s6
1956
1950
; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9]
1957
1951
; GFX8-NEXT: v_mov_b32_e32 v0, s6
1958
- ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0
1959
- ; GFX8-NEXT: s_mov_b32 s13, s7
1960
- ; GFX8-NEXT: s_mul_i32 s7, s1, s6
1961
- ; GFX8-NEXT: s_mul_i32 s6, s0, s6
1952
+ ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0
1953
+ ; GFX8-NEXT: s_mul_i32 s6, s1, s6
1962
1954
; GFX8-NEXT: s_mov_b32 s15, 0xf000
1963
1955
; GFX8-NEXT: s_mov_b32 s14, -1
1964
- ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s7, v0
1965
- ; GFX8-NEXT: v_mov_b32_e32 v0, s6
1956
+ ; GFX8-NEXT: s_mov_b32 s13, s7
1957
+ ; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1
1966
1958
; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
1967
1959
; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc
1968
1960
; GFX8-NEXT: s_waitcnt vmcnt(0)
1969
1961
; GFX8-NEXT: buffer_wbinvl1_vol
1970
1962
; GFX8-NEXT: .LBB10_2:
1971
1963
; GFX8-NEXT: s_or_b64 exec, exec, s[2:3]
1972
- ; GFX8-NEXT: v_readfirstlane_b32 s2, v0
1973
1964
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
1974
- ; GFX8-NEXT: v_mul_lo_u32 v0, s1, v2
1975
- ; GFX8-NEXT: v_mul_hi_u32 v3, s0, v2
1965
+ ; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2
1966
+ ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0
1967
+ ; GFX8-NEXT: v_readfirstlane_b32 s0, v0
1976
1968
; GFX8-NEXT: v_readfirstlane_b32 s1, v1
1977
- ; GFX8-NEXT: v_mul_lo_u32 v1, s0, v2
1978
- ; GFX8-NEXT: s_mov_b32 s7, 0xf000
1979
- ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v3, v0
1969
+ ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4
1980
1970
; GFX8-NEXT: v_mov_b32_e32 v3, s1
1981
- ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v1
1971
+ ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2
1972
+ ; GFX8-NEXT: s_mov_b32 s7, 0xf000
1982
1973
; GFX8-NEXT: s_mov_b32 s6, -1
1983
- ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v2 , vcc
1974
+ ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1 , vcc
1984
1975
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
1985
1976
; GFX8-NEXT: s_endpgm
1986
1977
;
@@ -2015,17 +2006,16 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
2015
2006
; GFX9-NEXT: .LBB10_2:
2016
2007
; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
2017
2008
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
2018
- ; GFX9-NEXT: v_mul_lo_u32 v3 , s3, v2
2019
- ; GFX9-NEXT: v_mul_hi_u32 v4, s2, v2
2009
+ ; GFX9-NEXT: v_mul_lo_u32 v4 , s3, v2
2010
+ ; GFX9-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
2020
2011
; GFX9-NEXT: v_readfirstlane_b32 s0, v0
2021
- ; GFX9-NEXT: v_mul_lo_u32 v0, s2, v2
2022
2012
; GFX9-NEXT: v_readfirstlane_b32 s1, v1
2023
- ; GFX9-NEXT: v_add_u32_e32 v1, v4, v3
2024
- ; GFX9-NEXT: v_mov_b32_e32 v2 , s1
2025
- ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v0
2013
+ ; GFX9-NEXT: v_add_u32_e32 v1, v3, v4
2014
+ ; GFX9-NEXT: v_mov_b32_e32 v3 , s1
2015
+ ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v2
2026
2016
; GFX9-NEXT: s_mov_b32 s7, 0xf000
2027
2017
; GFX9-NEXT: s_mov_b32 s6, -1
2028
- ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2 , v1, vcc
2018
+ ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v3 , v1, vcc
2029
2019
; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
2030
2020
; GFX9-NEXT: s_endpgm
2031
2021
;
@@ -2064,14 +2054,13 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
2064
2054
; GFX1064-NEXT: s_waitcnt_depctr 0xffe3
2065
2055
; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1]
2066
2056
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
2067
- ; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v2
2068
- ; GFX1064-NEXT: v_mul_hi_u32 v4, s2, v2
2069
- ; GFX1064-NEXT: v_mul_lo_u32 v2, s2, v2
2057
+ ; GFX1064-NEXT: v_mul_lo_u32 v4, s3, v2
2058
+ ; GFX1064-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0
2070
2059
; GFX1064-NEXT: v_readfirstlane_b32 s0, v0
2071
2060
; GFX1064-NEXT: v_readfirstlane_b32 s1, v1
2072
2061
; GFX1064-NEXT: s_mov_b32 s7, 0x31016000
2073
2062
; GFX1064-NEXT: s_mov_b32 s6, -1
2074
- ; GFX1064-NEXT: v_add_nc_u32_e32 v1, v4, v3
2063
+ ; GFX1064-NEXT: v_add_nc_u32_e32 v1, v3, v4
2075
2064
; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s0, v2
2076
2065
; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s1, v1, vcc
2077
2066
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -2111,14 +2100,13 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace
2111
2100
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
2112
2101
; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0
2113
2102
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
2114
- ; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v2
2115
- ; GFX1032-NEXT: v_mul_hi_u32 v4, s2, v2
2116
- ; GFX1032-NEXT: v_mul_lo_u32 v2, s2, v2
2103
+ ; GFX1032-NEXT: v_mul_lo_u32 v4, s3, v2
2104
+ ; GFX1032-NEXT: v_mad_u64_u32 v[2:3], s0, s2, v2, 0
2117
2105
; GFX1032-NEXT: v_readfirstlane_b32 s0, v0
2118
2106
; GFX1032-NEXT: v_readfirstlane_b32 s1, v1
2119
2107
; GFX1032-NEXT: s_mov_b32 s7, 0x31016000
2120
2108
; GFX1032-NEXT: s_mov_b32 s6, -1
2121
- ; GFX1032-NEXT: v_add_nc_u32_e32 v1, v4, v3
2109
+ ; GFX1032-NEXT: v_add_nc_u32_e32 v1, v3, v4
2122
2110
; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s0, v2
2123
2111
; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s1, v1, vcc_lo
2124
2112
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
0 commit comments