@@ -602,15 +602,13 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
602
602
; GFX12-NEXT: s_wait_bvhcnt 0x0
603
603
; GFX12-NEXT: s_wait_kmcnt 0x0
604
604
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
605
- ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
606
605
; GFX12-NEXT: s_mov_b32 s0, 0
607
606
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
608
607
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
609
608
; GFX12-NEXT: s_wait_loadcnt 0x0
610
609
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
611
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
612
- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
613
- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
610
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
611
+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
614
612
; GFX12-NEXT: s_wait_storecnt 0x0
615
613
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
616
614
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -757,21 +755,18 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
757
755
; GFX12-NEXT: s_wait_samplecnt 0x0
758
756
; GFX12-NEXT: s_wait_bvhcnt 0x0
759
757
; GFX12-NEXT: s_wait_kmcnt 0x0
760
- ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
761
- ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
758
+ ; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
762
759
; GFX12-NEXT: s_mov_b32 s0, 0
763
760
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
764
761
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
765
762
; GFX12-NEXT: s_wait_loadcnt 0x0
766
- ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
767
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
768
- ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
763
+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
769
764
; GFX12-NEXT: s_wait_storecnt 0x0
770
- ; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
765
+ ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
771
766
; GFX12-NEXT: s_wait_loadcnt 0x0
772
767
; GFX12-NEXT: global_inv scope:SCOPE_DEV
773
- ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
774
- ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
768
+ ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
769
+ ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
775
770
; GFX12-NEXT: s_wait_alu 0xfffe
776
771
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
777
772
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1188,15 +1183,13 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
1188
1183
; GFX12-NEXT: s_wait_bvhcnt 0x0
1189
1184
; GFX12-NEXT: s_wait_kmcnt 0x0
1190
1185
; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
1191
- ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
1192
1186
; GFX12-NEXT: s_mov_b32 s0, 0
1193
1187
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
1194
1188
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1195
1189
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1196
1190
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
1197
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1198
- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
1199
- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
1191
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1192
+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
1200
1193
; GFX12-NEXT: s_wait_storecnt 0x0
1201
1194
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1202
1195
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1341,21 +1334,18 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
1341
1334
; GFX12-NEXT: s_wait_samplecnt 0x0
1342
1335
; GFX12-NEXT: s_wait_bvhcnt 0x0
1343
1336
; GFX12-NEXT: s_wait_kmcnt 0x0
1344
- ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
1345
- ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
1337
+ ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
1346
1338
; GFX12-NEXT: s_mov_b32 s0, 0
1347
1339
; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
1348
1340
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1349
1341
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1350
- ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
1351
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1352
- ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
1342
+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
1353
1343
; GFX12-NEXT: s_wait_storecnt 0x0
1354
- ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1344
+ ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1355
1345
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1356
1346
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1357
- ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
1358
- ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
1347
+ ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
1348
+ ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
1359
1349
; GFX12-NEXT: s_wait_alu 0xfffe
1360
1350
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
1361
1351
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1799,19 +1789,16 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
1799
1789
; GFX12-NEXT: s_wait_bvhcnt 0x0
1800
1790
; GFX12-NEXT: s_wait_kmcnt 0x0
1801
1791
; GFX12-NEXT: v_mov_b32_e32 v6, s16
1802
- ; GFX12-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
1792
+ ; GFX12-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1
1803
1793
; GFX12-NEXT: s_mov_b32 s4, 0
1804
1794
; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen
1805
- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
1806
1795
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
1807
1796
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1808
1797
; GFX12-NEXT: s_wait_loadcnt 0x0
1809
1798
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1810
1799
; GFX12-NEXT: s_wait_storecnt 0x0
1811
1800
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1812
- ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
1813
- ; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
1814
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1801
+ ; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[9:10], v[4:5]
1815
1802
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
1816
1803
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
1817
1804
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -1971,23 +1958,21 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
1971
1958
; GFX12-NEXT: s_wait_bvhcnt 0x0
1972
1959
; GFX12-NEXT: s_wait_kmcnt 0x0
1973
1960
; GFX12-NEXT: v_mov_b32_e32 v6, s16
1974
- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
1975
1961
; GFX12-NEXT: s_mov_b32 s4, 0
1976
- ; GFX12-NEXT: buffer_load_b64 v[2:3], v6, s[0:3], null offen
1962
+ ; GFX12-NEXT: buffer_load_b64 v[4:5], v6, s[0:3], null offen
1977
1963
; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
1978
1964
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1979
1965
; GFX12-NEXT: s_wait_loadcnt 0x0
1980
- ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
1966
+ ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[0:1]
1967
+ ; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
1981
1968
; GFX12-NEXT: s_wait_storecnt 0x0
1982
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1983
- ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
1984
- ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
1985
- ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
1969
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
1970
+ ; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
1986
1971
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
1987
1972
; GFX12-NEXT: s_wait_loadcnt 0x0
1988
1973
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1989
- ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3]
1990
- ; GFX12-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v3, v8
1974
+ ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5]
1975
+ ; GFX12-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8
1991
1976
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
1992
1977
; GFX12-NEXT: s_wait_alu 0xfffe
1993
1978
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
0 commit comments