@@ -602,13 +602,15 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
602
602
; GFX12-NEXT: s_wait_bvhcnt 0x0
603
603
; GFX12-NEXT: s_wait_kmcnt 0x0
604
604
; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
605
+ ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
605
606
; GFX12-NEXT: s_mov_b32 s0, 0
606
607
; GFX12-NEXT: .LBB6_1: ; %atomicrmw.start
607
608
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
608
609
; GFX12-NEXT: s_wait_loadcnt 0x0
609
610
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
610
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
611
- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
611
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
612
+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
613
+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
612
614
; GFX12-NEXT: s_wait_storecnt 0x0
613
615
; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
614
616
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -755,18 +757,21 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
755
757
; GFX12-NEXT: s_wait_samplecnt 0x0
756
758
; GFX12-NEXT: s_wait_bvhcnt 0x0
757
759
; GFX12-NEXT: s_wait_kmcnt 0x0
758
- ; GFX12-NEXT: global_load_b64 v[6:7], v[0:1], off
760
+ ; GFX12-NEXT: global_load_b64 v[4:5], v[0:1], off
761
+ ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
759
762
; GFX12-NEXT: s_mov_b32 s0, 0
760
763
; GFX12-NEXT: .LBB7_1: ; %atomicrmw.start
761
764
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
762
765
; GFX12-NEXT: s_wait_loadcnt 0x0
763
- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
766
+ ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
767
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
768
+ ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
764
769
; GFX12-NEXT: s_wait_storecnt 0x0
765
- ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5 ], v[0:1], v[4:7 ], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
770
+ ; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3 ], v[0:1], v[2:5 ], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
766
771
; GFX12-NEXT: s_wait_loadcnt 0x0
767
772
; GFX12-NEXT: global_inv scope:SCOPE_DEV
768
- ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5 ], v[6:7 ]
769
- ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
773
+ ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3 ], v[4:5 ]
774
+ ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
770
775
; GFX12-NEXT: s_wait_alu 0xfffe
771
776
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
772
777
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1183,13 +1188,15 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
1183
1188
; GFX12-NEXT: s_wait_bvhcnt 0x0
1184
1189
; GFX12-NEXT: s_wait_kmcnt 0x0
1185
1190
; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
1191
+ ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
1186
1192
; GFX12-NEXT: s_mov_b32 s0, 0
1187
1193
; GFX12-NEXT: .LBB10_1: ; %atomicrmw.start
1188
1194
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1189
1195
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1190
1196
; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
1191
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1192
- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
1197
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1198
+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
1199
+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
1193
1200
; GFX12-NEXT: s_wait_storecnt 0x0
1194
1201
; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1195
1202
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
@@ -1334,18 +1341,21 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
1334
1341
; GFX12-NEXT: s_wait_samplecnt 0x0
1335
1342
; GFX12-NEXT: s_wait_bvhcnt 0x0
1336
1343
; GFX12-NEXT: s_wait_kmcnt 0x0
1337
- ; GFX12-NEXT: flat_load_b64 v[6:7], v[0:1]
1344
+ ; GFX12-NEXT: flat_load_b64 v[4:5], v[0:1]
1345
+ ; GFX12-NEXT: v_max_num_f64_e32 v[6:7], v[2:3], v[2:3]
1338
1346
; GFX12-NEXT: s_mov_b32 s0, 0
1339
1347
; GFX12-NEXT: .LBB11_1: ; %atomicrmw.start
1340
1348
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1341
1349
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1342
- ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[2:3]
1350
+ ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
1351
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1352
+ ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
1343
1353
; GFX12-NEXT: s_wait_storecnt 0x0
1344
- ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5 ], v[0:1], v[4:7 ] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1354
+ ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3 ], v[0:1], v[2:5 ] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1345
1355
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1346
1356
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1347
- ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5 ], v[6:7 ]
1348
- ; GFX12-NEXT: v_dual_mov_b32 v7, v5 :: v_dual_mov_b32 v6, v4
1357
+ ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3 ], v[4:5 ]
1358
+ ; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
1349
1359
; GFX12-NEXT: s_wait_alu 0xfffe
1350
1360
; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0
1351
1361
; GFX12-NEXT: s_wait_alu 0xfffe
@@ -1789,16 +1799,19 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
1789
1799
; GFX12-NEXT: s_wait_bvhcnt 0x0
1790
1800
; GFX12-NEXT: s_wait_kmcnt 0x0
1791
1801
; GFX12-NEXT: v_mov_b32_e32 v6, s16
1792
- ; GFX12-NEXT: v_dual_mov_b32 v4 , v0 :: v_dual_mov_b32 v5 , v1
1802
+ ; GFX12-NEXT: v_dual_mov_b32 v2 , v0 :: v_dual_mov_b32 v3 , v1
1793
1803
; GFX12-NEXT: s_mov_b32 s4, 0
1794
1804
; GFX12-NEXT: buffer_load_b64 v[0:1], v6, s[0:3], null offen
1805
+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
1795
1806
; GFX12-NEXT: .LBB14_1: ; %atomicrmw.start
1796
1807
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1797
1808
; GFX12-NEXT: s_wait_loadcnt 0x0
1798
1809
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
1799
1810
; GFX12-NEXT: s_wait_storecnt 0x0
1800
1811
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1801
- ; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[9:10], v[4:5]
1812
+ ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
1813
+ ; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
1814
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1802
1815
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
1803
1816
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
1804
1817
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -1958,21 +1971,23 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
1958
1971
; GFX12-NEXT: s_wait_bvhcnt 0x0
1959
1972
; GFX12-NEXT: s_wait_kmcnt 0x0
1960
1973
; GFX12-NEXT: v_mov_b32_e32 v6, s16
1974
+ ; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[0:1], v[0:1]
1961
1975
; GFX12-NEXT: s_mov_b32 s4, 0
1962
- ; GFX12-NEXT: buffer_load_b64 v[4:5 ], v6, s[0:3], null offen
1976
+ ; GFX12-NEXT: buffer_load_b64 v[2:3 ], v6, s[0:3], null offen
1963
1977
; GFX12-NEXT: .LBB15_1: ; %atomicrmw.start
1964
1978
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
1965
1979
; GFX12-NEXT: s_wait_loadcnt 0x0
1966
- ; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[0:1]
1967
- ; GFX12-NEXT: v_dual_mov_b32 v10, v5 :: v_dual_mov_b32 v9, v4
1980
+ ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
1968
1981
; GFX12-NEXT: s_wait_storecnt 0x0
1969
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
1970
- ; GFX12-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v7, v2
1982
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
1983
+ ; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
1984
+ ; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
1985
+ ; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
1971
1986
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
1972
1987
; GFX12-NEXT: s_wait_loadcnt 0x0
1973
1988
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1974
- ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[4:5 ]
1975
- ; GFX12-NEXT: v_dual_mov_b32 v4 , v7 :: v_dual_mov_b32 v5 , v8
1989
+ ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[7:8], v[2:3 ]
1990
+ ; GFX12-NEXT: v_dual_mov_b32 v2 , v7 :: v_dual_mov_b32 v3 , v8
1976
1991
; GFX12-NEXT: s_or_b32 s4, vcc_lo, s4
1977
1992
; GFX12-NEXT: s_wait_alu 0xfffe
1978
1993
; GFX12-NEXT: s_and_not1_b32 exec_lo, exec_lo, s4
0 commit comments