Skip to content

Commit b41acdc

Browse files
AMDGPU/GFX12: Insert waitcnts before stores with scope_sys
Insert waitcnts for loads and atomics before stores with system scope. Scope is a field in the instruction encoding and corresponds to the desired coherence level in the cache hierarchy. Only intrinsic stores can set the scope. Currently there is no reliable way to set the scope on generic stores; they default to the lowest scope level. Waitcnts are not required if it is guaranteed that the memory is cached. For example, Vulkan shaders can guarantee this. TODO: implement a flag for frontends to give us a hint not to insert waits. The Vulkan flag is expected to be implemented as the vulkan:private MMRA.
1 parent 749384c commit b41acdc

File tree

5 files changed

+78
-0
lines changed

5 files changed

+78
-0
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -949,6 +949,8 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
949949
return AMDGPU::S_WAIT_BVHCNT;
950950
case AMDGPU::S_WAIT_DSCNT_soft:
951951
return AMDGPU::S_WAIT_DSCNT;
952+
case AMDGPU::S_WAIT_KMCNT_soft:
953+
return AMDGPU::S_WAIT_KMCNT;
952954
default:
953955
return Opcode;
954956
}

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,10 @@ class SICacheControl {
312312
SIMemOp Op, bool IsVolatile,
313313
bool IsNonTemporal) const = 0;
314314

315+
virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
316+
return false;
317+
};
318+
315319
/// Inserts any necessary instructions at position \p Pos relative
316320
/// to instruction \p MI to ensure memory instructions before \p Pos of kind
317321
/// \p Op associated with address spaces \p AddrSpace have completed. Used
@@ -603,6 +607,8 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
603607
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
604608
bool IsVolatile,
605609
bool IsNonTemporal) const override;
610+
611+
bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
606612
};
607613

608614
class SIMemoryLegalizer final : public MachineFunctionPass {
@@ -2381,6 +2387,33 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
23812387
return Changed;
23822388
}
23832389

2390+
bool SIGfx12CacheControl::expandSystemScopeStore(
2391+
MachineBasicBlock::iterator &MI) const {
2392+
2393+
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2394+
if (!CPol || ((CPol->getImm() & CPol::SCOPE) != CPol::SCOPE_SYS))
2395+
return false;
2396+
2397+
// Stores with system scope (SCOPE_SYS) need to wait for:
2398+
// - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
2399+
// - non-returning-atomics - wait for STORECNT==0
2400+
// TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
2401+
// since it does not distinguish atomics-with-return from regular stores.
2402+
2403+
// There is no need to wait if memory is cached (mtype != UC).
2404+
// For example shader-visible memory is cached.
2405+
// TODO: implement flag for frontend to give us a hint not to insert waits.
2406+
MachineBasicBlock &MBB = *MI->getParent();
2407+
const DebugLoc &DL = MI->getDebugLoc();
2408+
2409+
BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2410+
BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2411+
BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2412+
BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2413+
BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2414+
return true;
2415+
}
2416+
23842417
bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
23852418
if (AtomicPseudoMIs.empty())
23862419
return false;
@@ -2467,6 +2500,10 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
24672500
Changed |= CC->enableVolatileAndOrNonTemporal(
24682501
MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
24692502
MOI.isNonTemporal());
2503+
2504+
// GFX12 specific, scope(desired coherence domain in cache hierarchy) is
2505+
// instruction field, do not confuse it with atomic scope.
2506+
Changed |= CC->expandSystemScopeStore(MI);
24702507
return Changed;
24712508
}
24722509

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1601,6 +1601,7 @@ let SubtargetPredicate = isGFX12Plus in {
16011601
def S_WAIT_SAMPLECNT_soft : SOPP_Pseudo <"s_soft_wait_samplecnt", (ins s16imm:$simm16), "$simm16">;
16021602
def S_WAIT_BVHCNT_soft : SOPP_Pseudo <"s_soft_wait_bvhcnt", (ins s16imm:$simm16), "$simm16">;
16031603
def S_WAIT_DSCNT_soft : SOPP_Pseudo <"s_soft_wait_dscnt", (ins s16imm:$simm16), "$simm16">;
1604+
def S_WAIT_KMCNT_soft : SOPP_Pseudo <"s_soft_wait_kmcnt", (ins s16imm:$simm16), "$simm16">;
16041605
}
16051606

16061607
def S_SETHALT : SOPP_Pseudo <"s_sethalt" , (ins i32imm:$simm16), "$simm16",
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2+
; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
3+
; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s
4+
5+
define amdgpu_ps void @intrinsic_store_system_scope(i32 %val, <4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
6+
; GFX12-LABEL: intrinsic_store_system_scope:
7+
; GFX12: ; %bb.0:
8+
; GFX12-NEXT: buffer_store_b32 v0, v[1:2], s[0:3], s4 idxen offen scope:SCOPE_SYS
9+
; GFX12-NEXT: s_nop 0
10+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
11+
; GFX12-NEXT: s_endpgm
12+
call void @llvm.amdgcn.struct.buffer.store.i32(i32 %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 24)
13+
ret void
14+
}
15+
16+
declare void @llvm.amdgcn.struct.buffer.store.i32(i32, <4 x i32>, i32, i32, i32, i32 immarg)
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=si-memory-legalizer %s -o - | FileCheck -check-prefix=GFX12 %s
3+
4+
---
5+
name: intrinsic_store_system_scope
6+
body: |
7+
bb.0:
8+
liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
9+
10+
; GFX12-LABEL: name: intrinsic_store_system_scope
11+
; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $vgpr0, $vgpr1, $vgpr2
12+
; GFX12-NEXT: {{ $}}
13+
; GFX12-NEXT: S_WAIT_LOADCNT_soft 0
14+
; GFX12-NEXT: S_WAIT_SAMPLECNT_soft 0
15+
; GFX12-NEXT: S_WAIT_BVHCNT_soft 0
16+
; GFX12-NEXT: S_WAIT_KMCNT_soft 0
17+
; GFX12-NEXT: S_WAIT_STORECNT_soft 0
18+
; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 24, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
19+
; GFX12-NEXT: S_ENDPGM 0
20+
BUFFER_STORE_DWORD_VBUFFER_BOTHEN_exact killed renamable $vgpr0, killed renamable $vgpr1_vgpr2, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $sgpr4, 0, 24, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 8)
21+
S_ENDPGM 0
22+
...

0 commit comments

Comments
 (0)