Skip to content

Commit ec3a905

Browse files
authored
[AMDGPU][InsertWaitCnts] Track global_wb/inv/wbinv (#135340)
global_wb and global_wbinv use storecnt, while global_inv uses loadcnt. Track them as VMEM_WRITE_ACCESS and VMEM_READ_ACCESS, respectively, so that the SIInsertWaitcnts pass does not incorrectly eliminate the waitcnts after these instructions. Solves SWDEV-526604.
1 parent 47903e3 commit ec3a905

32 files changed

+954
-4
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -698,6 +698,16 @@ class SIInsertWaitcnts {
698698
// Return the appropriate VMEM_*_ACCESS type for Inst, which must be a VMEM or
699699
// FLAT instruction.
700700
WaitEventType getVmemWaitEventType(const MachineInstr &Inst) const {
701+
switch (Inst.getOpcode()) {
702+
case AMDGPU::GLOBAL_INV:
703+
return VMEM_READ_ACCESS; // tracked using loadcnt
704+
case AMDGPU::GLOBAL_WB:
705+
case AMDGPU::GLOBAL_WBINV:
706+
return VMEM_WRITE_ACCESS; // tracked using storecnt
707+
default:
708+
break;
709+
}
710+
701711
// Maps VMEM access types to their corresponding WaitEventType.
702712
static const WaitEventType VmemReadMapping[NUM_VMEM_TYPES] = {
703713
VMEM_READ_ACCESS, VMEM_SAMPLER_READ_ACCESS, VMEM_BVH_READ_ACCESS};
@@ -2049,7 +2059,7 @@ bool SIInsertWaitcnts::mayAccessScratchThroughFlat(
20492059
});
20502060
}
20512061

2052-
static bool isCacheInvOrWBInst(MachineInstr &Inst) {
2062+
static bool isGFX12CacheInvOrWBInst(MachineInstr &Inst) {
20532063
auto Opc = Inst.getOpcode();
20542064
return Opc == AMDGPU::GLOBAL_INV || Opc == AMDGPU::GLOBAL_WB ||
20552065
Opc == AMDGPU::GLOBAL_WBINV;
@@ -2130,9 +2140,11 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
21302140
ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
21312141
}
21322142
} else if (TII->isFLAT(Inst)) {
2133-
// TODO: Track this properly.
2134-
if (isCacheInvOrWBInst(Inst))
2143+
if (isGFX12CacheInvOrWBInst(Inst)) {
2144+
ScoreBrackets->updateByEvent(TII, TRI, MRI, getVmemWaitEventType(Inst),
2145+
Inst);
21352146
return;
2147+
}
21362148

21372149
assert(Inst.mayLoadOrStore());
21382150

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmax.ll

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ define float @local_atomic_fmax_ret_f32(ptr addrspace(3) %ptr, float %val) {
2222
; GFX12-NEXT: ds_max_num_rtn_f32 v0, v0, v1
2323
; GFX12-NEXT: s_wait_dscnt 0x0
2424
; GFX12-NEXT: global_inv scope:SCOPE_SE
25+
; GFX12-NEXT: s_wait_loadcnt 0x0
2526
; GFX12-NEXT: s_setpc_b64 s[30:31]
2627
;
2728
; GFX942-LABEL: local_atomic_fmax_ret_f32:
@@ -94,6 +95,7 @@ define void @local_atomic_fmax_noret_f32(ptr addrspace(3) %ptr, float %val) {
9495
; GFX12-NEXT: ds_max_num_f32 v0, v1
9596
; GFX12-NEXT: s_wait_dscnt 0x0
9697
; GFX12-NEXT: global_inv scope:SCOPE_SE
98+
; GFX12-NEXT: s_wait_loadcnt 0x0
9799
; GFX12-NEXT: s_setpc_b64 s[30:31]
98100
;
99101
; GFX942-LABEL: local_atomic_fmax_noret_f32:
@@ -166,6 +168,7 @@ define double @local_atomic_fmax_ret_f64(ptr addrspace(3) %ptr, double %val) {
166168
; GFX12-NEXT: ds_max_num_rtn_f64 v[0:1], v0, v[1:2]
167169
; GFX12-NEXT: s_wait_dscnt 0x0
168170
; GFX12-NEXT: global_inv scope:SCOPE_SE
171+
; GFX12-NEXT: s_wait_loadcnt 0x0
169172
; GFX12-NEXT: s_setpc_b64 s[30:31]
170173
;
171174
; GFX942-LABEL: local_atomic_fmax_ret_f64:
@@ -242,6 +245,7 @@ define void @local_atomic_fmax_noret_f64(ptr addrspace(3) %ptr, double %val) {
242245
; GFX12-NEXT: ds_max_num_f64 v0, v[1:2]
243246
; GFX12-NEXT: s_wait_dscnt 0x0
244247
; GFX12-NEXT: global_inv scope:SCOPE_SE
248+
; GFX12-NEXT: s_wait_loadcnt 0x0
245249
; GFX12-NEXT: s_setpc_b64 s[30:31]
246250
;
247251
; GFX942-LABEL: local_atomic_fmax_noret_f64:
@@ -318,6 +322,7 @@ define float @global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(pt
318322
; GFX12-NEXT: global_atomic_max_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
319323
; GFX12-NEXT: s_wait_loadcnt 0x0
320324
; GFX12-NEXT: global_inv scope:SCOPE_DEV
325+
; GFX12-NEXT: s_wait_loadcnt 0x0
321326
; GFX12-NEXT: s_setpc_b64 s[30:31]
322327
;
323328
; GFX942-LABEL: global_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -464,6 +469,7 @@ define void @global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(p
464469
; GFX12-NEXT: global_atomic_max_num_f32 v[0:1], v2, off scope:SCOPE_DEV
465470
; GFX12-NEXT: s_wait_storecnt 0x0
466471
; GFX12-NEXT: global_inv scope:SCOPE_DEV
472+
; GFX12-NEXT: s_wait_loadcnt 0x0
467473
; GFX12-NEXT: s_setpc_b64 s[30:31]
468474
;
469475
; GFX942-LABEL: global_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -624,6 +630,7 @@ define double @global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(p
624630
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
625631
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
626632
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
633+
; GFX12-NEXT: s_wait_loadcnt 0x0
627634
; GFX12-NEXT: s_setpc_b64 s[30:31]
628635
;
629636
; GFX942-LABEL: global_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -779,6 +786,7 @@ define void @global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(p
779786
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
780787
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
781788
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
789+
; GFX12-NEXT: s_wait_loadcnt 0x0
782790
; GFX12-NEXT: s_setpc_b64 s[30:31]
783791
;
784792
; GFX942-LABEL: global_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -909,6 +917,7 @@ define float @flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory(ptr
909917
; GFX12-NEXT: flat_atomic_max_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
910918
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
911919
; GFX12-NEXT: global_inv scope:SCOPE_DEV
920+
; GFX12-NEXT: s_wait_loadcnt 0x0
912921
; GFX12-NEXT: s_setpc_b64 s[30:31]
913922
;
914923
; GFX942-LABEL: flat_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1051,6 +1060,7 @@ define void @flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory(ptr
10511060
; GFX12-NEXT: flat_atomic_max_num_f32 v[0:1], v2 scope:SCOPE_DEV
10521061
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
10531062
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1063+
; GFX12-NEXT: s_wait_loadcnt 0x0
10541064
; GFX12-NEXT: s_setpc_b64 s[30:31]
10551065
;
10561066
; GFX942-LABEL: flat_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1210,6 +1220,7 @@ define double @flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory(ptr
12101220
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
12111221
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
12121222
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
1223+
; GFX12-NEXT: s_wait_loadcnt 0x0
12131224
; GFX12-NEXT: s_setpc_b64 s[30:31]
12141225
;
12151226
; GFX942-LABEL: flat_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1363,6 +1374,7 @@ define void @flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory(ptr
13631374
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
13641375
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
13651376
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
1377+
; GFX12-NEXT: s_wait_loadcnt 0x0
13661378
; GFX12-NEXT: s_setpc_b64 s[30:31]
13671379
;
13681380
; GFX942-LABEL: flat_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1495,6 +1507,7 @@ define float @buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_m
14951507
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
14961508
; GFX12-NEXT: s_wait_loadcnt 0x0
14971509
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1510+
; GFX12-NEXT: s_wait_loadcnt 0x0
14981511
; GFX12-NEXT: s_setpc_b64 s[30:31]
14991512
;
15001513
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1651,6 +1664,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_
16511664
; GFX12-NEXT: buffer_atomic_max_num_f32 v0, v1, s[0:3], null offen
16521665
; GFX12-NEXT: s_wait_storecnt 0x0
16531666
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1667+
; GFX12-NEXT: s_wait_loadcnt 0x0
16541668
; GFX12-NEXT: s_setpc_b64 s[30:31]
16551669
;
16561670
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1824,6 +1838,7 @@ define double @buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_
18241838
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
18251839
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
18261840
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
1841+
; GFX12-NEXT: s_wait_loadcnt 0x0
18271842
; GFX12-NEXT: s_setpc_b64 s[30:31]
18281843
;
18291844
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1994,6 +2009,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_
19942009
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
19952010
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
19962011
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
2012+
; GFX12-NEXT: s_wait_loadcnt 0x0
19972013
; GFX12-NEXT: s_setpc_b64 s[30:31]
19982014
;
19992015
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmax_noret_f64__amdgpu_no_fine_grained_memory:

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_fmin.ll

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ define float @local_atomic_fmin_ret_f32(ptr addrspace(3) %ptr, float %val) {
2222
; GFX12-NEXT: ds_min_num_rtn_f32 v0, v0, v1
2323
; GFX12-NEXT: s_wait_dscnt 0x0
2424
; GFX12-NEXT: global_inv scope:SCOPE_SE
25+
; GFX12-NEXT: s_wait_loadcnt 0x0
2526
; GFX12-NEXT: s_setpc_b64 s[30:31]
2627
;
2728
; GFX942-LABEL: local_atomic_fmin_ret_f32:
@@ -94,6 +95,7 @@ define void @local_atomic_fmin_noret_f32(ptr addrspace(3) %ptr, float %val) {
9495
; GFX12-NEXT: ds_min_num_f32 v0, v1
9596
; GFX12-NEXT: s_wait_dscnt 0x0
9697
; GFX12-NEXT: global_inv scope:SCOPE_SE
98+
; GFX12-NEXT: s_wait_loadcnt 0x0
9799
; GFX12-NEXT: s_setpc_b64 s[30:31]
98100
;
99101
; GFX942-LABEL: local_atomic_fmin_noret_f32:
@@ -166,6 +168,7 @@ define double @local_atomic_fmin_ret_f64(ptr addrspace(3) %ptr, double %val) {
166168
; GFX12-NEXT: ds_min_num_rtn_f64 v[0:1], v0, v[1:2]
167169
; GFX12-NEXT: s_wait_dscnt 0x0
168170
; GFX12-NEXT: global_inv scope:SCOPE_SE
171+
; GFX12-NEXT: s_wait_loadcnt 0x0
169172
; GFX12-NEXT: s_setpc_b64 s[30:31]
170173
;
171174
; GFX942-LABEL: local_atomic_fmin_ret_f64:
@@ -242,6 +245,7 @@ define void @local_atomic_fmin_noret_f64(ptr addrspace(3) %ptr, double %val) {
242245
; GFX12-NEXT: ds_min_num_f64 v0, v[1:2]
243246
; GFX12-NEXT: s_wait_dscnt 0x0
244247
; GFX12-NEXT: global_inv scope:SCOPE_SE
248+
; GFX12-NEXT: s_wait_loadcnt 0x0
245249
; GFX12-NEXT: s_setpc_b64 s[30:31]
246250
;
247251
; GFX942-LABEL: local_atomic_fmin_noret_f64:
@@ -318,6 +322,7 @@ define float @global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(pt
318322
; GFX12-NEXT: global_atomic_min_num_f32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
319323
; GFX12-NEXT: s_wait_loadcnt 0x0
320324
; GFX12-NEXT: global_inv scope:SCOPE_DEV
325+
; GFX12-NEXT: s_wait_loadcnt 0x0
321326
; GFX12-NEXT: s_setpc_b64 s[30:31]
322327
;
323328
; GFX942-LABEL: global_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -464,6 +469,7 @@ define void @global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(p
464469
; GFX12-NEXT: global_atomic_min_num_f32 v[0:1], v2, off scope:SCOPE_DEV
465470
; GFX12-NEXT: s_wait_storecnt 0x0
466471
; GFX12-NEXT: global_inv scope:SCOPE_DEV
472+
; GFX12-NEXT: s_wait_loadcnt 0x0
467473
; GFX12-NEXT: s_setpc_b64 s[30:31]
468474
;
469475
; GFX942-LABEL: global_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -624,6 +630,7 @@ define double @global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(p
624630
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
625631
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
626632
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
633+
; GFX12-NEXT: s_wait_loadcnt 0x0
627634
; GFX12-NEXT: s_setpc_b64 s[30:31]
628635
;
629636
; GFX942-LABEL: global_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -779,6 +786,7 @@ define void @global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(p
779786
; GFX12-NEXT: s_cbranch_execnz .LBB7_1
780787
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
781788
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
789+
; GFX12-NEXT: s_wait_loadcnt 0x0
782790
; GFX12-NEXT: s_setpc_b64 s[30:31]
783791
;
784792
; GFX942-LABEL: global_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -909,6 +917,7 @@ define float @flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory(ptr
909917
; GFX12-NEXT: flat_atomic_min_num_f32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
910918
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
911919
; GFX12-NEXT: global_inv scope:SCOPE_DEV
920+
; GFX12-NEXT: s_wait_loadcnt 0x0
912921
; GFX12-NEXT: s_setpc_b64 s[30:31]
913922
;
914923
; GFX942-LABEL: flat_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1051,6 +1060,7 @@ define void @flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory(ptr
10511060
; GFX12-NEXT: flat_atomic_min_num_f32 v[0:1], v2 scope:SCOPE_DEV
10521061
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
10531062
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1063+
; GFX12-NEXT: s_wait_loadcnt 0x0
10541064
; GFX12-NEXT: s_setpc_b64 s[30:31]
10551065
;
10561066
; GFX942-LABEL: flat_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1210,6 +1220,7 @@ define double @flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory(ptr
12101220
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
12111221
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
12121222
; GFX12-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v5
1223+
; GFX12-NEXT: s_wait_loadcnt 0x0
12131224
; GFX12-NEXT: s_setpc_b64 s[30:31]
12141225
;
12151226
; GFX942-LABEL: flat_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1363,6 +1374,7 @@ define void @flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory(ptr
13631374
; GFX12-NEXT: s_cbranch_execnz .LBB11_1
13641375
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
13651376
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s0
1377+
; GFX12-NEXT: s_wait_loadcnt 0x0
13661378
; GFX12-NEXT: s_setpc_b64 s[30:31]
13671379
;
13681380
; GFX942-LABEL: flat_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:
@@ -1495,6 +1507,7 @@ define float @buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_m
14951507
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen th:TH_ATOMIC_RETURN
14961508
; GFX12-NEXT: s_wait_loadcnt 0x0
14971509
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1510+
; GFX12-NEXT: s_wait_loadcnt 0x0
14981511
; GFX12-NEXT: s_setpc_b64 s[30:31]
14991512
;
15001513
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f32__amdgpu_no_fine_grained_memory:
@@ -1651,6 +1664,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_
16511664
; GFX12-NEXT: buffer_atomic_min_num_f32 v0, v1, s[0:3], null offen
16521665
; GFX12-NEXT: s_wait_storecnt 0x0
16531666
; GFX12-NEXT: global_inv scope:SCOPE_DEV
1667+
; GFX12-NEXT: s_wait_loadcnt 0x0
16541668
; GFX12-NEXT: s_setpc_b64 s[30:31]
16551669
;
16561670
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f32__amdgpu_no_fine_grained_memory:
@@ -1824,6 +1838,7 @@ define double @buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_
18241838
; GFX12-NEXT: s_cbranch_execnz .LBB14_1
18251839
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
18261840
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
1841+
; GFX12-NEXT: s_wait_loadcnt 0x0
18271842
; GFX12-NEXT: s_setpc_b64 s[30:31]
18281843
;
18291844
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f64__amdgpu_no_fine_grained_memory:
@@ -1994,6 +2009,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_
19942009
; GFX12-NEXT: s_cbranch_execnz .LBB15_1
19952010
; GFX12-NEXT: ; %bb.2: ; %atomicrmw.end
19962011
; GFX12-NEXT: s_or_b32 exec_lo, exec_lo, s4
2012+
; GFX12-NEXT: s_wait_loadcnt 0x0
19972013
; GFX12-NEXT: s_setpc_b64 s[30:31]
19982014
;
19992015
; GFX942-LABEL: buffer_fat_ptr_agent_atomic_fmin_noret_f64__amdgpu_no_fine_grained_memory:

llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -606,6 +606,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
606606
; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
607607
; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
608608
; GFX12-NEXT: global_wb scope:SCOPE_SYS
609+
; GFX12-NEXT: s_wait_storecnt 0x0
609610
; GFX12-NEXT: s_wait_kmcnt 0x0
610611
; GFX12-NEXT: global_atomic_inc_u32 v0, v1, v0, s[2:3] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
611612
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -853,6 +854,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
853854
; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
854855
; GFX12-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
855856
; GFX12-NEXT: global_wb scope:SCOPE_SYS
857+
; GFX12-NEXT: s_wait_storecnt 0x0
856858
; GFX12-NEXT: s_wait_kmcnt 0x0
857859
; GFX12-NEXT: global_atomic_inc_u32 v1, v0, s[0:1] offset:16 scope:SCOPE_SYS
858860
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -1817,6 +1819,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
18171819
; GFX12-NEXT: v_mov_b32_e32 v0, 42
18181820
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
18191821
; GFX12-NEXT: global_wb scope:SCOPE_SYS
1822+
; GFX12-NEXT: s_wait_storecnt 0x0
18201823
; GFX12-NEXT: s_wait_kmcnt 0x0
18211824
; GFX12-NEXT: global_atomic_inc_u64 v[0:1], v2, v[0:1], s[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
18221825
; GFX12-NEXT: s_wait_loadcnt 0x0
@@ -2082,6 +2085,7 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
20822085
; GFX12-NEXT: v_mov_b32_e32 v0, 42
20832086
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0
20842087
; GFX12-NEXT: global_wb scope:SCOPE_SYS
2088+
; GFX12-NEXT: s_wait_storecnt 0x0
20852089
; GFX12-NEXT: s_wait_kmcnt 0x0
20862090
; GFX12-NEXT: global_atomic_inc_u64 v2, v[0:1], s[0:1] offset:32 scope:SCOPE_SYS
20872091
; GFX12-NEXT: s_wait_storecnt 0x0
@@ -2628,6 +2632,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
26282632
; GFX12-NEXT: s_wait_kmcnt 0x0
26292633
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
26302634
; GFX12-NEXT: global_wb scope:SCOPE_SYS
2635+
; GFX12-NEXT: s_wait_storecnt 0x0
26312636
; GFX12-NEXT: flat_atomic_inc_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
26322637
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
26332638
; GFX12-NEXT: global_inv scope:SCOPE_SYS
@@ -2916,6 +2921,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
29162921
; GFX12-NEXT: s_wait_kmcnt 0x0
29172922
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
29182923
; GFX12-NEXT: global_wb scope:SCOPE_SYS
2924+
; GFX12-NEXT: s_wait_storecnt 0x0
29192925
; GFX12-NEXT: flat_atomic_inc_u32 v[0:1], v2 offset:16 scope:SCOPE_SYS
29202926
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
29212927
; GFX12-NEXT: global_inv scope:SCOPE_SYS
@@ -3687,6 +3693,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
36873693
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2
36883694
; GFX12-NEXT: v_mov_b32_e32 v3, s3
36893695
; GFX12-NEXT: global_wb scope:SCOPE_SYS
3696+
; GFX12-NEXT: s_wait_storecnt 0x0
36903697
; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
36913698
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
36923699
; GFX12-NEXT: global_inv scope:SCOPE_SYS
@@ -3993,6 +4000,7 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
39934000
; GFX12-NEXT: s_wait_kmcnt 0x0
39944001
; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
39954002
; GFX12-NEXT: global_wb scope:SCOPE_SYS
4003+
; GFX12-NEXT: s_wait_storecnt 0x0
39964004
; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS
39974005
; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
39984006
; GFX12-NEXT: global_inv scope:SCOPE_SYS
@@ -4402,6 +4410,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
44024410
; GFX12-NEXT: ds_inc_rtn_u32 v2, v0, v1
44034411
; GFX12-NEXT: s_wait_dscnt 0x0
44044412
; GFX12-NEXT: global_inv scope:SCOPE_SE
4413+
; GFX12-NEXT: s_wait_loadcnt 0x0
44054414
; GFX12-NEXT: ds_inc_rtn_u32 v0, v0, v1
44064415
; GFX12-NEXT: s_wait_dscnt 0x0
44074416
; GFX12-NEXT: global_inv scope:SCOPE_SE

0 commit comments

Comments
 (0)