-
Notifications
You must be signed in to change notification settings - Fork 14.3k
AMDGPU/GFX12: Insert waitcnts before stores with scope_sys #82996
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -312,6 +312,10 @@ class SICacheControl { | |
SIMemOp Op, bool IsVolatile, | ||
bool IsNonTemporal) const = 0; | ||
|
||
virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const { | ||
return false; | ||
}; | ||
|
||
/// Inserts any necessary instructions at position \p Pos relative | ||
/// to instruction \p MI to ensure memory instructions before \p Pos of kind | ||
/// \p Op associated with address spaces \p AddrSpace have completed. Used | ||
|
@@ -589,6 +593,15 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { | |
bool setScope(const MachineBasicBlock::iterator MI, | ||
AMDGPU::CPol::CPol Value) const; | ||
|
||
// Stores with system scope (SCOPE_SYS) need to wait for: | ||
// - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0 | ||
// - non-returning-atomics - wait for STORECNT==0 | ||
// TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits | ||
// since it does not distinguish atomics-with-return from regular stores. | ||
// There is no need to wait if memory is cached (mtype != UC). | ||
bool | ||
insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const; | ||
|
||
public: | ||
SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {} | ||
|
||
|
@@ -603,6 +616,8 @@ class SIGfx12CacheControl : public SIGfx11CacheControl { | |
SIAtomicAddrSpace AddrSpace, SIMemOp Op, | ||
bool IsVolatile, | ||
bool IsNonTemporal) const override; | ||
|
||
bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override; | ||
}; | ||
|
||
class SIMemoryLegalizer final : public MachineFunctionPass { | ||
|
@@ -2194,6 +2209,22 @@ bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI, | |
return false; | ||
} | ||
|
||
bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore( | ||
const MachineBasicBlock::iterator MI) const { | ||
// TODO: implement flag for frontend to give us a hint not to insert waits. | ||
|
||
MachineBasicBlock &MBB = *MI->getParent(); | ||
const DebugLoc &DL = MI->getDebugLoc(); | ||
|
||
BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0); | ||
BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0); | ||
BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0); | ||
BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0); | ||
BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0); | ||
|
||
return true; | ||
} | ||
|
||
bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI, | ||
SIAtomicScope Scope, | ||
SIAtomicAddrSpace AddrSpace, SIMemOp Op, | ||
|
@@ -2364,6 +2395,9 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( | |
if (IsVolatile) { | ||
Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS); | ||
|
||
if (Op == SIMemOp::STORE) | ||
Changed |= insertWaitsBeforeSystemScopeStore(MI); | ||
|
||
// Ensure operation has completed at system scope to cause all volatile | ||
// operations to be visible outside the program in a global order. Do not | ||
// request cross address space as only the global address space can be | ||
|
@@ -2381,6 +2415,15 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal( | |
return Changed; | ||
} | ||
|
||
bool SIGfx12CacheControl::expandSystemScopeStore( | ||
MachineBasicBlock::iterator &MI) const { | ||
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol); | ||
if (CPol && ((CPol->getImm() & CPol::SCOPE) == CPol::SCOPE_SYS)) | ||
return insertWaitsBeforeSystemScopeStore(MI); | ||
|
||
return false; | ||
} | ||
|
||
bool SIMemoryLegalizer::removeAtomicPseudoMIs() { | ||
if (AtomicPseudoMIs.empty()) | ||
return false; | ||
|
@@ -2467,6 +2510,10 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI, | |
Changed |= CC->enableVolatileAndOrNonTemporal( | ||
MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(), | ||
MOI.isNonTemporal()); | ||
|
||
// GFX12 specific, scope(desired coherence domain in cache hierarchy) is | ||
// instruction field, do not confuse it with atomic scope. | ||
Changed |= CC->expandSystemScopeStore(MI); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this also needed for atomic stores? They returned early on line 2495 so they won't hit this code. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. As far as I know no, needed only non-atomic stores There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't see why it wouldn't be needed on a store release? store release already waits, but it doesn't (explicitly) wait for all the counters we need to wait on for |
||
return Changed; | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is a bit messy that we need this extra call to insertWaitsBeforeSystemScopeStore here, because the call to insertWait below modifies MI so it no longer refers to the store. But I guess it is OK.