Skip to content

Commit 3e35ba5

Browse files
AMDGPU/GFX12: Insert waitcnts before stores with scope_sys (#82996)
Insert waitcnts for loads and atomics before stores with system scope. Scope is a field in the instruction encoding and corresponds to the desired coherence level in the cache hierarchy. Intrinsic stores can set the scope in the cache policy operand. If the volatile keyword is used on generic stores, the memory legalizer will set the scope to system. Generic stores, by default, get the lowest scope level. Waitcnts are not required if it is guaranteed that memory is cached; for example, Vulkan shaders can guarantee this. TODO: implement a flag for frontends to give us a hint not to insert waits. The Vulkan flag is expected to be implemented as a vulkan:private MMRA.
1 parent cb6c0f1 commit 3e35ba5

11 files changed

+154
-1
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -949,6 +949,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
949949
return AMDGPU::S_WAIT_BVHCNT;
950950
case AMDGPU::S_WAIT_DSCNT_soft:
951951
return AMDGPU::S_WAIT_DSCNT;
952+
case AMDGPU::S_WAIT_KMCNT_soft:
953+
return AMDGPU::S_WAIT_KMCNT;
952954
default:
953955
return Opcode;
954956
}

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,10 @@ class SICacheControl {
312312
SIMemOp Op, bool IsVolatile,
313313
bool IsNonTemporal) const = 0;
314314

315+
/// Hook for cache-control implementations that must emit extra instructions
/// around a store whose cache-policy scope is system scope.
/// This default implementation inserts nothing and returns false, i.e. it
/// reports that \p MI and its surroundings were left unchanged.
virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
316+
return false;
317+
};
318+
315319
/// Inserts any necessary instructions at position \p Pos relative
316320
/// to instruction \p MI to ensure memory instructions before \p Pos of kind
317321
/// \p Op associated with address spaces \p AddrSpace have completed. Used
@@ -589,6 +593,15 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
589593
bool setScope(const MachineBasicBlock::iterator MI,
590594
AMDGPU::CPol::CPol Value) const;
591595

596+
// Stores with system scope (SCOPE_SYS) need to wait for:
597+
// - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
598+
// - non-returning-atomics - wait for STORECNT==0
599+
// TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
600+
// since it does not distinguish atomics-with-return from regular stores.
601+
// There is no need to wait if memory is cached (mtype != UC).
602+
bool
603+
insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
604+
592605
public:
593606
SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {}
594607

@@ -603,6 +616,8 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
603616
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
604617
bool IsVolatile,
605618
bool IsNonTemporal) const override;
619+
620+
bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
606621
};
607622

608623
class SIMemoryLegalizer final : public MachineFunctionPass {
@@ -2194,6 +2209,22 @@ bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
21942209
return false;
21952210
}
21962211

2212+
/// Builds five soft wait pseudo instructions (LOADCNT, SAMPLECNT, BVHCNT,
/// KMCNT and STORECNT), each with an immediate of 0, at the position of
/// \p MI in its parent block, reusing \p MI's debug location.
/// \returns true unconditionally, since instructions are always built.
bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2213+
const MachineBasicBlock::iterator MI) const {
2214+
// TODO: implement flag for frontend to give us a hint not to insert waits.
2215+
2216+
MachineBasicBlock &MBB = *MI->getParent();
2217+
const DebugLoc &DL = MI->getDebugLoc();
2218+
2219+
// One wait per counter; the "_soft" opcodes are the pseudo forms that
// SIInstrInfo maps to the corresponding real S_WAIT_* opcodes.
// NOTE(review): presumably "soft" lets SIInsertWaitcnts relax or drop
// waits it proves unnecessary - confirm against that pass.
BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2220+
BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2221+
BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2222+
BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2223+
BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2224+
2225+
return true;
2226+
}
2227+
21972228
bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
21982229
SIAtomicScope Scope,
21992230
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
@@ -2364,6 +2395,9 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
23642395
if (IsVolatile) {
23652396
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
23662397

2398+
if (Op == SIMemOp::STORE)
2399+
Changed |= insertWaitsBeforeSystemScopeStore(MI);
2400+
23672401
// Ensure operation has completed at system scope to cause all volatile
23682402
// operations to be visible outside the program in a global order. Do not
23692403
// request cross address space as only the global address space can be
@@ -2381,6 +2415,15 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
23812415
return Changed;
23822416
}
23832417

2418+
/// GFX12 override: if \p MI has a cpol operand whose scope bits equal
/// SCOPE_SYS, insert the system-scope-store waits in front of it.
/// \returns the result of insertWaitsBeforeSystemScopeStore() when waits are
/// inserted, false when \p MI has no cpol operand or a different scope.
bool SIGfx12CacheControl::expandSystemScopeStore(
2419+
MachineBasicBlock::iterator &MI) const {
2420+
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2421+
// Mask out everything but the scope field before comparing, so other
// cache-policy bits in the immediate do not affect the check.
if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS))
2422+
return insertWaitsBeforeSystemScopeStore(MI);
2423+
2424+
return false;
2425+
}
2426+
23842427
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
23852428
if (AtomicPseudoMIs.empty())
23862429
return false;
@@ -2467,6 +2510,10 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
24672510
Changed |= CC->enableVolatileAndOrNonTemporal(
24682511
MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
24692512
MOI.isNonTemporal());
2513+
2514+
// GFX12 specific, scope(desired coherence domain in cache hierarchy) is
2515+
// instruction field, do not confuse it with atomic scope.
2516+
Changed |= CC->expandSystemScopeStore(MI);
24702517
return Changed;
24712518
}
24722519

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1601,6 +1601,7 @@ let SubtargetPredicate = isGFX12Plus in {
16011601
def S_WAIT_SAMPLECNT_soft : SOPP_Pseudo <"s_soft_wait_samplecnt", (ins s16imm:$simm16), "$simm16">;
16021602
def S_WAIT_BVHCNT_soft : SOPP_Pseudo <"s_soft_wait_bvhcnt", (ins s16imm:$simm16), "$simm16">;
16031603
def S_WAIT_DSCNT_soft : SOPP_Pseudo <"s_soft_wait_dscnt", (ins s16imm:$simm16), "$simm16">;
1604+
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
16041605
}
16051606

16061607
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",

llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch.ll

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,6 +256,7 @@ define void @store_load_vindex_foo(i32 %idx) {
256256
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
257257
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
258258
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
259+
; GFX12-NEXT: s_wait_storecnt 0x0
259260
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS
260261
; GFX12-NEXT: s_wait_storecnt 0x0
261262
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
@@ -607,6 +608,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
607608
; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
608609
; GFX12-NEXT: s_wait_loadcnt 0x0
609610
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
611+
; GFX12-NEXT: s_wait_storecnt 0x0
610612
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS
611613
; GFX12-NEXT: s_wait_storecnt 0x0
612614
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
@@ -921,6 +923,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
921923
; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
922924
; GFX12-NEXT: s_wait_loadcnt 0x0
923925
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
926+
; GFX12-NEXT: s_wait_storecnt 0x0
924927
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS
925928
; GFX12-NEXT: s_wait_storecnt 0x0
926929
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
@@ -1089,6 +1092,7 @@ define void @store_load_large_imm_offset_foo() {
10891092
; GFX12-NEXT: s_wait_bvhcnt 0x0
10901093
; GFX12-NEXT: s_wait_kmcnt 0x0
10911094
; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
1095+
; GFX12-NEXT: s_wait_storecnt 0x0
10921096
; GFX12-NEXT: scratch_store_b32 off, v0, s32 scope:SCOPE_SYS
10931097
; GFX12-NEXT: s_wait_storecnt 0x0
10941098
; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS
@@ -1242,6 +1246,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
12421246
; GFX12-NEXT: s_wait_kmcnt 0x0
12431247
; GFX12-NEXT: v_mov_b32_e32 v1, 15
12441248
; GFX12-NEXT: v_mov_b32_e32 v2, 0
1249+
; GFX12-NEXT: s_wait_storecnt 0x0
12451250
; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
12461251
; GFX12-NEXT: s_wait_storecnt 0x0
12471252
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -1306,6 +1311,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
13061311
; GFX12-NEXT: s_wait_kmcnt 0x0
13071312
; GFX12-NEXT: v_mov_b32_e32 v1, 15
13081313
; GFX12-NEXT: v_mov_b32_e32 v2, 0
1314+
; GFX12-NEXT: s_wait_storecnt 0x0
13091315
; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
13101316
; GFX12-NEXT: s_wait_storecnt 0x0
13111317
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -1389,6 +1395,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
13891395
; GFX12-NEXT: s_mov_b32 s0, 1
13901396
; GFX12-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v2, s1
13911397
; GFX12-NEXT: v_mov_b32_e32 v1, s0
1398+
; GFX12-NEXT: s_wait_storecnt 0x0
13921399
; GFX12-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
13931400
; GFX12-NEXT: s_wait_storecnt 0x0
13941401
; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
@@ -1478,6 +1485,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
14781485
; GFX12-NEXT: s_mov_b32 s0, 1
14791486
; GFX12-NEXT: v_dual_mov_b32 v4, s3 :: v_dual_mov_b32 v3, s2
14801487
; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
1488+
; GFX12-NEXT: s_wait_storecnt 0x0
14811489
; GFX12-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
14821490
; GFX12-NEXT: s_wait_storecnt 0x0
14831491
; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS

llvm/test/CodeGen/AMDGPU/clamp.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,6 +525,7 @@ define amdgpu_kernel void @v_clamp_multi_use_max_f32(ptr addrspace(1) %out, ptr
525525
; GFX12-NEXT: v_max_num_f32_e32 v1, 0, v1
526526
; GFX12-NEXT: v_min_num_f32_e32 v2, 1.0, v1
527527
; GFX12-NEXT: global_store_b32 v0, v2, s[0:1]
528+
; GFX12-NEXT: s_wait_storecnt 0x0
528529
; GFX12-NEXT: global_store_b32 v[0:1], v1, off scope:SCOPE_SYS
529530
; GFX12-NEXT: s_wait_storecnt 0x0
530531
; GFX12-NEXT: s_nop 0

llvm/test/CodeGen/AMDGPU/flat-scratch.ll

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -893,6 +893,7 @@ define void @store_load_vindex_foo(i32 %idx) {
893893
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
894894
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2)
895895
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
896+
; GFX12-NEXT: s_wait_storecnt 0x0
896897
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS
897898
; GFX12-NEXT: s_wait_storecnt 0x0
898899
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
@@ -964,6 +965,7 @@ define void @store_load_vindex_foo(i32 %idx) {
964965
; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0
965966
; GFX12-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2)
966967
; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
968+
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
967969
; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, s32 scope:SCOPE_SYS
968970
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
969971
; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 scope:SCOPE_SYS
@@ -2137,6 +2139,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
21372139
; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
21382140
; GFX12-NEXT: s_wait_loadcnt 0x0
21392141
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
2142+
; GFX12-NEXT: s_wait_storecnt 0x0
21402143
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS
21412144
; GFX12-NEXT: s_wait_storecnt 0x0
21422145
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
@@ -2221,6 +2224,7 @@ define void @store_load_vindex_small_offset_foo(i32 %idx) {
22212224
; GFX12-PAL-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
22222225
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
22232226
; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
2227+
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
22242228
; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, s32 offset:256 scope:SCOPE_SYS
22252229
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
22262230
; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:256 scope:SCOPE_SYS
@@ -3382,6 +3386,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
33823386
; GFX12-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
33833387
; GFX12-NEXT: s_wait_loadcnt 0x0
33843388
; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v1
3389+
; GFX12-NEXT: s_wait_storecnt 0x0
33853390
; GFX12-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS
33863391
; GFX12-NEXT: s_wait_storecnt 0x0
33873392
; GFX12-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
@@ -3468,6 +3473,7 @@ define void @store_load_vindex_large_offset_foo(i32 %idx) {
34683473
; GFX12-PAL-NEXT: scratch_load_b32 v3, off, s32 scope:SCOPE_SYS
34693474
; GFX12-PAL-NEXT: s_wait_loadcnt 0x0
34703475
; GFX12-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1
3476+
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
34713477
; GFX12-PAL-NEXT: scratch_store_b32 v0, v2, s32 offset:16384 scope:SCOPE_SYS
34723478
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
34733479
; GFX12-PAL-NEXT: scratch_load_b32 v0, v1, s32 offset:16384 scope:SCOPE_SYS
@@ -3714,6 +3720,7 @@ define void @store_load_large_imm_offset_foo() {
37143720
; GFX12-NEXT: s_wait_bvhcnt 0x0
37153721
; GFX12-NEXT: s_wait_kmcnt 0x0
37163722
; GFX12-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
3723+
; GFX12-NEXT: s_wait_storecnt 0x0
37173724
; GFX12-NEXT: scratch_store_b32 off, v0, s32 scope:SCOPE_SYS
37183725
; GFX12-NEXT: s_wait_storecnt 0x0
37193726
; GFX12-NEXT: scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS
@@ -3789,6 +3796,7 @@ define void @store_load_large_imm_offset_foo() {
37893796
; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0
37903797
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
37913798
; GFX12-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 15
3799+
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
37923800
; GFX12-PAL-NEXT: scratch_store_b32 off, v0, s32 scope:SCOPE_SYS
37933801
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
37943802
; GFX12-PAL-NEXT: scratch_store_b32 off, v1, s32 offset:16000 scope:SCOPE_SYS
@@ -3998,6 +4006,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
39984006
; GFX12-NEXT: s_wait_bvhcnt 0x0
39994007
; GFX12-NEXT: s_wait_kmcnt 0x0
40004008
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
4009+
; GFX12-NEXT: s_wait_storecnt 0x0
40014010
; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
40024011
; GFX12-NEXT: s_wait_storecnt 0x0
40034012
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -4055,6 +4064,7 @@ define void @store_load_i64_aligned(ptr addrspace(5) nocapture %arg) {
40554064
; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0
40564065
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
40574066
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
4067+
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
40584068
; GFX12-PAL-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
40594069
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
40604070
; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -4107,6 +4117,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
41074117
; GFX12-NEXT: s_wait_bvhcnt 0x0
41084118
; GFX12-NEXT: s_wait_kmcnt 0x0
41094119
; GFX12-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
4120+
; GFX12-NEXT: s_wait_storecnt 0x0
41104121
; GFX12-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
41114122
; GFX12-NEXT: s_wait_storecnt 0x0
41124123
; GFX12-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -4164,6 +4175,7 @@ define void @store_load_i64_unaligned(ptr addrspace(5) nocapture %arg) {
41644175
; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0
41654176
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
41664177
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0
4178+
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
41674179
; GFX12-PAL-NEXT: scratch_store_b64 v0, v[1:2], off scope:SCOPE_SYS
41684180
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
41694181
; GFX12-PAL-NEXT: scratch_load_b64 v[0:1], v0, off scope:SCOPE_SYS
@@ -4220,6 +4232,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
42204232
; GFX12-NEXT: s_wait_kmcnt 0x0
42214233
; GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
42224234
; GFX12-NEXT: v_mov_b32_e32 v3, 3
4235+
; GFX12-NEXT: s_wait_storecnt 0x0
42234236
; GFX12-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
42244237
; GFX12-NEXT: s_wait_storecnt 0x0
42254238
; GFX12-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
@@ -4282,6 +4295,7 @@ define void @store_load_v3i32_unaligned(ptr addrspace(5) nocapture %arg) {
42824295
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
42834296
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
42844297
; GFX12-PAL-NEXT: v_mov_b32_e32 v3, 3
4298+
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
42854299
; GFX12-PAL-NEXT: scratch_store_b96 v0, v[1:3], off scope:SCOPE_SYS
42864300
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
42874301
; GFX12-PAL-NEXT: scratch_load_b96 v[0:2], v0, off scope:SCOPE_SYS
@@ -4340,6 +4354,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
43404354
; GFX12-NEXT: s_wait_kmcnt 0x0
43414355
; GFX12-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
43424356
; GFX12-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
4357+
; GFX12-NEXT: s_wait_storecnt 0x0
43434358
; GFX12-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
43444359
; GFX12-NEXT: s_wait_storecnt 0x0
43454360
; GFX12-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
@@ -4405,6 +4420,7 @@ define void @store_load_v4i32_unaligned(ptr addrspace(5) nocapture %arg) {
44054420
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
44064421
; GFX12-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2
44074422
; GFX12-PAL-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4
4423+
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
44084424
; GFX12-PAL-NEXT: scratch_store_b128 v0, v[1:4], off scope:SCOPE_SYS
44094425
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
44104426
; GFX12-PAL-NEXT: scratch_load_b128 v[0:3], v0, off scope:SCOPE_SYS
@@ -4456,6 +4472,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
44564472
; GFX12-NEXT: s_wait_bvhcnt 0x0
44574473
; GFX12-NEXT: s_wait_kmcnt 0x0
44584474
; GFX12-NEXT: v_mov_b32_e32 v1, 1
4475+
; GFX12-NEXT: s_wait_storecnt 0x0
44594476
; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS
44604477
; GFX12-NEXT: s_wait_storecnt 0x0
44614478
; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
@@ -4523,6 +4540,7 @@ define void @store_load_i32_negative_unaligned(ptr addrspace(5) nocapture %arg)
45234540
; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0
45244541
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
45254542
; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 1
4543+
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
45264544
; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-1 scope:SCOPE_SYS
45274545
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
45284546
; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-1 scope:SCOPE_SYS
@@ -4576,6 +4594,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
45764594
; GFX12-NEXT: s_wait_bvhcnt 0x0
45774595
; GFX12-NEXT: s_wait_kmcnt 0x0
45784596
; GFX12-NEXT: v_mov_b32_e32 v1, 1
4597+
; GFX12-NEXT: s_wait_storecnt 0x0
45794598
; GFX12-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS
45804599
; GFX12-NEXT: s_wait_storecnt 0x0
45814600
; GFX12-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS
@@ -4644,6 +4663,7 @@ define void @store_load_i32_large_negative_unaligned(ptr addrspace(5) nocapture
46444663
; GFX12-PAL-NEXT: s_wait_bvhcnt 0x0
46454664
; GFX12-PAL-NEXT: s_wait_kmcnt 0x0
46464665
; GFX12-PAL-NEXT: v_mov_b32_e32 v1, 1
4666+
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
46474667
; GFX12-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-4225 scope:SCOPE_SYS
46484668
; GFX12-PAL-NEXT: s_wait_storecnt 0x0
46494669
; GFX12-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-4225 scope:SCOPE_SYS

llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,7 @@ define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1)
269269
; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0
270270
; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16
271271
; GFX12-NEXT: v_mov_b32_e32 v8, s1
272+
; GFX12-NEXT: s_wait_storecnt 0x0
272273
; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS
273274
; GFX12-NEXT: s_wait_storecnt 0x0
274275
; GFX12-NEXT: global_store_b32 v[2:3], v7, off scope:SCOPE_SYS

llvm/test/CodeGen/AMDGPU/omod.ll

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -651,8 +651,8 @@ define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
651651
; GFX12-NEXT: v_add_f32_e32 v0, 1.0, v0
652652
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
653653
; GFX12-NEXT: v_mul_f32_e32 v1, 4.0, v0
654-
; GFX12-NEXT: s_clause 0x1
655654
; GFX12-NEXT: global_store_b32 v[0:1], v1, off
655+
; GFX12-NEXT: s_wait_storecnt 0x0
656656
; GFX12-NEXT: global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
657657
; GFX12-NEXT: s_wait_storecnt 0x0
658658
; GFX12-NEXT: s_nop 0

0 commit comments

Comments
 (0)