@@ -18,6 +18,7 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) {
18
18
; GFX12-NEXT: s_wait_samplecnt 0x0
19
19
; GFX12-NEXT: s_wait_bvhcnt 0x0
20
20
; GFX12-NEXT: s_wait_kmcnt 0x0
21
+ ; GFX12-NEXT: global_wb scope:SCOPE_SE
21
22
; GFX12-NEXT: s_wait_storecnt 0x0
22
23
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
23
24
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -90,6 +91,7 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr, float %val) {
90
91
; GFX12-NEXT: s_wait_samplecnt 0x0
91
92
; GFX12-NEXT: s_wait_bvhcnt 0x0
92
93
; GFX12-NEXT: s_wait_kmcnt 0x0
94
+ ; GFX12-NEXT: global_wb scope:SCOPE_SE
93
95
; GFX12-NEXT: s_wait_storecnt 0x0
94
96
; GFX12-NEXT: ds_max_num_f32 v0, v1
95
97
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -162,6 +164,7 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr, double %val) {
162
164
; GFX12-NEXT: s_wait_samplecnt 0x0
163
165
; GFX12-NEXT: s_wait_bvhcnt 0x0
164
166
; GFX12-NEXT: s_wait_kmcnt 0x0
167
+ ; GFX12-NEXT: global_wb scope:SCOPE_SE
165
168
; GFX12-NEXT: s_wait_storecnt 0x0
166
169
; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2]
167
170
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -238,6 +241,7 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr, double %val) {
238
241
; GFX12-NEXT: s_wait_samplecnt 0x0
239
242
; GFX12-NEXT: s_wait_bvhcnt 0x0
240
243
; GFX12-NEXT: s_wait_kmcnt 0x0
244
+ ; GFX12-NEXT: global_wb scope:SCOPE_SE
241
245
; GFX12-NEXT: s_wait_storecnt 0x0
242
246
; GFX12-NEXT: ds_max_num_f64 v0, v[1:2]
243
247
; GFX12-NEXT: s_wait_dscnt 0x0
@@ -324,8 +328,9 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
324
328
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
325
329
; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
326
330
; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
331
+ ; GFX12-NEXT: global_wb scope:SCOPE_DEV
327
332
; GFX12-NEXT: s_wait_storecnt 0x0
328
- ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN
333
+ ; GFX12-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
329
334
; GFX12-NEXT: s_wait_loadcnt 0x0
330
335
; GFX12-NEXT: global_inv scope:SCOPE_DEV
331
336
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
@@ -538,8 +543,9 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
538
543
; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
539
544
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
540
545
; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
546
+ ; GFX12-NEXT: global_wb scope:SCOPE_DEV
541
547
; GFX12-NEXT: s_wait_storecnt 0x0
542
- ; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN
548
+ ; GFX12-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
543
549
; GFX12-NEXT: s_wait_loadcnt 0x0
544
550
; GFX12-NEXT: global_inv scope:SCOPE_DEV
545
551
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
@@ -746,8 +752,9 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
746
752
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
747
753
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
748
754
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
755
+ ; GFX12-NEXT: global_wb scope:SCOPE_DEV
749
756
; GFX12-NEXT: s_wait_storecnt 0x0
750
- ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN
757
+ ; GFX12-NEXT: global_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
751
758
; GFX12-NEXT: s_wait_loadcnt 0x0
752
759
; GFX12-NEXT: global_inv scope:SCOPE_DEV
753
760
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
@@ -972,8 +979,9 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
972
979
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
973
980
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
974
981
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
982
+ ; GFX12-NEXT: global_wb scope:SCOPE_DEV
975
983
; GFX12-NEXT: s_wait_storecnt 0x0
976
- ; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN
984
+ ; GFX12-NEXT: global_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
977
985
; GFX12-NEXT: s_wait_loadcnt 0x0
978
986
; GFX12-NEXT: global_inv scope:SCOPE_DEV
979
987
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
@@ -1186,8 +1194,9 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
1186
1194
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1187
1195
; GFX12-NEXT: v_max_num_f32_e32 v3, v4, v4
1188
1196
; GFX12-NEXT: v_max_num_f32_e32 v3, v3, v2
1197
+ ; GFX12-NEXT: global_wb scope:SCOPE_DEV
1189
1198
; GFX12-NEXT: s_wait_storecnt 0x0
1190
- ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN
1199
+ ; GFX12-NEXT: flat_atomic_cmpswap_b32 v3, v[0:1], v[3:4] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1191
1200
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1192
1201
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1193
1202
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
@@ -1395,8 +1404,9 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
1395
1404
; GFX12-NEXT: v_max_num_f32_e32 v2, v3, v3
1396
1405
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1397
1406
; GFX12-NEXT: v_max_num_f32_e32 v2, v2, v4
1407
+ ; GFX12-NEXT: global_wb scope:SCOPE_DEV
1398
1408
; GFX12-NEXT: s_wait_storecnt 0x0
1399
- ; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN
1409
+ ; GFX12-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1400
1410
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1401
1411
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1402
1412
; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
@@ -1598,8 +1608,9 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
1598
1608
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
1599
1609
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[6:7], v[6:7]
1600
1610
; GFX12-NEXT: v_max_num_f64_e32 v[4:5], v[4:5], v[2:3]
1611
+ ; GFX12-NEXT: global_wb scope:SCOPE_DEV
1601
1612
; GFX12-NEXT: s_wait_storecnt 0x0
1602
- ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN
1613
+ ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:1], v[4:7] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1603
1614
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1604
1615
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1605
1616
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[4:5], v[6:7]
@@ -1823,8 +1834,9 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
1823
1834
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[4:5], v[4:5]
1824
1835
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
1825
1836
; GFX12-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[6:7]
1837
+ ; GFX12-NEXT: global_wb scope:SCOPE_DEV
1826
1838
; GFX12-NEXT: s_wait_storecnt 0x0
1827
- ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN
1839
+ ; GFX12-NEXT: flat_atomic_cmpswap_b64 v[2:3], v[0:1], v[2:5] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
1828
1840
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
1829
1841
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1830
1842
; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
@@ -2035,11 +2047,11 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
2035
2047
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
2036
2048
; GFX12-NEXT: s_wait_loadcnt 0x0
2037
2049
; GFX12-NEXT: v_mov_b32_e32 v5, v0
2050
+ ; GFX12-NEXT: global_wb scope:SCOPE_DEV
2038
2051
; GFX12-NEXT: s_wait_storecnt 0x0
2039
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2040
2052
; GFX12-NEXT: v_max_num_f32_e32 v0, v5, v5
2053
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2041
2054
; GFX12-NEXT: v_max_num_f32_e32 v4, v0, v3
2042
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2043
2055
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
2044
2056
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
2045
2057
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2285,9 +2297,10 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
2285
2297
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
2286
2298
; GFX12-NEXT: s_wait_loadcnt 0x0
2287
2299
; GFX12-NEXT: v_max_num_f32_e32 v0, v1, v1
2300
+ ; GFX12-NEXT: global_wb scope:SCOPE_DEV
2288
2301
; GFX12-NEXT: s_wait_storecnt 0x0
2289
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2290
2302
; GFX12-NEXT: v_max_num_f32_e32 v0, v0, v3
2303
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2291
2304
; GFX12-NEXT: v_dual_mov_b32 v5, v1 :: v_dual_mov_b32 v4, v0
2292
2305
; GFX12-NEXT: buffer_atomic_cmpswap_b32 v[4:5], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
2293
2306
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2527,11 +2540,11 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
2527
2540
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
2528
2541
; GFX12-NEXT: s_wait_loadcnt 0x0
2529
2542
; GFX12-NEXT: v_dual_mov_b32 v10, v1 :: v_dual_mov_b32 v9, v0
2543
+ ; GFX12-NEXT: global_wb scope:SCOPE_DEV
2530
2544
; GFX12-NEXT: s_wait_storecnt 0x0
2531
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2532
2545
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[9:10], v[9:10]
2546
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
2533
2547
; GFX12-NEXT: v_max_num_f64_e32 v[7:8], v[0:1], v[4:5]
2534
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
2535
2548
; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8
2536
2549
; GFX12-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10
2537
2550
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[0:3], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
@@ -2800,10 +2813,11 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
2800
2813
; GFX12-NEXT: ; =>This Inner Loop Header: Depth=1
2801
2814
; GFX12-NEXT: s_wait_loadcnt 0x0
2802
2815
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[2:3]
2816
+ ; GFX12-NEXT: global_wb scope:SCOPE_DEV
2803
2817
; GFX12-NEXT: s_wait_storecnt 0x0
2804
- ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
2805
2818
; GFX12-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[4:5]
2806
2819
; GFX12-NEXT: v_dual_mov_b32 v10, v3 :: v_dual_mov_b32 v9, v2
2820
+ ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
2807
2821
; GFX12-NEXT: v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v7, v0
2808
2822
; GFX12-NEXT: buffer_atomic_cmpswap_b64 v[7:10], v6, s[0:3], null offen th:TH_ATOMIC_RETURN
2809
2823
; GFX12-NEXT: s_wait_loadcnt 0x0
0 commit comments