Skip to content

Commit 2f499b9

Browse files
committed
[AMDGPU] Add volatile support to SIMemoryLegalizer
Treat a non-atomic volatile load and store as a relaxed atomic at system scope for the address spaces accessed. This will ensure all relevant caches will be bypassed. A volatile atomic is not changed and still only bypasses caches up to the level specified by the SyncScope operand. Differential Revision: https://reviews.llvm.org/D94214
1 parent aab25fa commit 2f499b9

File tree

87 files changed

+3559
-1093
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

87 files changed

+3559
-1093
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 68 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4736,18 +4736,48 @@ in table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx6-gfx9-table`.
47364736

47374737
1. buffer/global/flat_load
47384738
glc=1
4739-
4740-
- nontemporal
4739+
2. s_waitcnt vmcnt(0)
4740+
4741+
- Must happen before
4742+
any following volatile
4743+
global/generic
4744+
load/store.
4745+
- Ensures that
4746+
volatile
4747+
operations to
4748+
different
4749+
addresses will not
4750+
be reordered by
4751+
hardware.
4752+
4753+
- !volatile & nontemporal
47414754

47424755
1. buffer/global/flat_load
47434756
glc=1 slc=1
47444757

47454758
load *none* *none* - local 1. ds_load
4746-
store *none* *none* - global - !nontemporal
4759+
store *none* *none* - global - !volatile & !nontemporal
47474760
- generic
47484761
- private 1. buffer/global/flat_store
47494762
- constant
4750-
- nontemporal
4763+
- volatile & !nontemporal
4764+
4765+
1. buffer/global/flat_store
4766+
2. s_waitcnt vmcnt(0)
4767+
4768+
- Must happen before
4769+
any following volatile
4770+
global/generic
4771+
load/store.
4772+
- Ensures that
4773+
volatile
4774+
operations to
4775+
different
4776+
addresses will not
4777+
be reordered by
4778+
hardware.
4779+
4780+
- !volatile & nontemporal
47514781

47524782
1. buffer/global/flat_store
47534783
glc=1 slc=1
@@ -6008,18 +6038,48 @@ table :ref:`amdgpu-amdhsa-memory-model-code-sequences-gfx10-table`.
60086038

60096039
1. buffer/global/flat_load
60106040
glc=1 dlc=1
6011-
6012-
- nontemporal
6041+
2. s_waitcnt vmcnt(0)
6042+
6043+
- Must happen before
6044+
any following volatile
6045+
global/generic
6046+
load/store.
6047+
- Ensures that
6048+
volatile
6049+
operations to
6050+
different
6051+
addresses will not
6052+
be reordered by
6053+
hardware.
6054+
6055+
- !volatile & nontemporal
60136056

60146057
1. buffer/global/flat_load
60156058
slc=1
60166059

60176060
load *none* *none* - local 1. ds_load
6018-
store *none* *none* - global - !nontemporal
6061+
store *none* *none* - global - !volatile & !nontemporal
60196062
- generic
60206063
- private 1. buffer/global/flat_store
60216064
- constant
6022-
- nontemporal
6065+
- volatile & !nontemporal
6066+
6067+
1. buffer/global/flat_store
6068+
2. s_waitcnt vscnt(0)
6069+
6070+
- Must happen before
6071+
any following volatile
6072+
global/generic
6073+
load/store.
6074+
- Ensures that
6075+
volatile
6076+
operations to
6077+
different
6078+
addresses will not
6079+
be reordered by
6080+
hardware.
6081+
6082+
- !volatile & nontemporal
60236083

60246084
1. buffer/global/flat_store
60256085
slc=1

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 115 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -110,6 +110,7 @@ class SIMemOpInfo final {
110110
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
111111
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
112112
bool IsCrossAddressSpaceOrdering = false;
113+
bool IsVolatile = false;
113114
bool IsNonTemporal = false;
114115

115116
SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
@@ -119,11 +120,13 @@ class SIMemOpInfo final {
119120
bool IsCrossAddressSpaceOrdering = true,
120121
AtomicOrdering FailureOrdering =
121122
AtomicOrdering::SequentiallyConsistent,
123+
bool IsVolatile = false,
122124
bool IsNonTemporal = false)
123125
: Ordering(Ordering), FailureOrdering(FailureOrdering),
124126
Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
125127
InstrAddrSpace(InstrAddrSpace),
126128
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
129+
IsVolatile(IsVolatile),
127130
IsNonTemporal(IsNonTemporal) {
128131
// There is also no cross address space ordering if the ordering
129132
// address space is the same as the instruction address space and
@@ -171,7 +174,13 @@ class SIMemOpInfo final {
171174
}
172175

173176
/// \returns True if memory access of the machine instruction used to
174-
/// create this SIMemOpInfo is non-temporal, false otherwise.
177+
/// create this SIMemOpInfo is volatile, false otherwise.
178+
bool isVolatile() const {
179+
return IsVolatile;
180+
}
181+
182+
/// \returns True if memory access of the machine instruction used to
183+
/// create this SIMemOpInfo is nontemporal, false otherwise.
175184
bool isNonTemporal() const {
176185
return IsNonTemporal;
177186
}
@@ -259,10 +268,13 @@ class SICacheControl {
259268
SIAtomicScope Scope,
260269
SIAtomicAddrSpace AddrSpace) const = 0;
261270

262-
/// Update \p MI memory instruction to indicate it is
263-
/// nontemporal. Return true iff the instruction was modified.
264-
virtual bool enableNonTemporal(const MachineBasicBlock::iterator &MI)
265-
const = 0;
271+
/// Update \p MI memory instruction of kind \p Op associated with address
272+
/// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
273+
/// true iff the instruction was modified.
274+
virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
275+
SIAtomicAddrSpace AddrSpace,
276+
SIMemOp Op, bool IsVolatile,
277+
bool IsNonTemporal) const = 0;
266278

267279
/// Inserts any necessary instructions at position \p Pos relative
268280
/// to instruction \p MI to ensure memory instructions before \p Pos of kind
@@ -328,7 +340,10 @@ class SIGfx6CacheControl : public SICacheControl {
328340
SIAtomicScope Scope,
329341
SIAtomicAddrSpace AddrSpace) const override;
330342

331-
bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
343+
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
344+
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
345+
bool IsVolatile,
346+
bool IsNonTemporal) const override;
332347

333348
bool insertWait(MachineBasicBlock::iterator &MI,
334349
SIAtomicScope Scope,
@@ -378,7 +393,10 @@ class SIGfx10CacheControl : public SIGfx7CacheControl {
378393
SIAtomicScope Scope,
379394
SIAtomicAddrSpace AddrSpace) const override;
380395

381-
bool enableNonTemporal(const MachineBasicBlock::iterator &MI) const override;
396+
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
397+
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
398+
bool IsVolatile,
399+
bool IsNonTemporal) const override;
382400

383401
bool insertWait(MachineBasicBlock::iterator &MI,
384402
SIAtomicScope Scope,
@@ -529,11 +547,13 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
529547
AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
530548
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
531549
bool IsNonTemporal = true;
550+
bool IsVolatile = false;
532551

533552
// Validator should check whether or not MMOs cover the entire set of
534553
// locations accessed by the memory instruction.
535554
for (const auto &MMO : MI->memoperands()) {
536555
IsNonTemporal &= MMO->isNonTemporal();
556+
IsVolatile |= MMO->isVolatile();
537557
InstrAddrSpace |=
538558
toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
539559
AtomicOrdering OpOrdering = MMO->getOrdering();
@@ -576,7 +596,8 @@ Optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
576596
}
577597
}
578598
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
579-
IsCrossAddressSpaceOrdering, FailureOrdering, IsNonTemporal);
599+
IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
600+
IsNonTemporal);
580601
}
581602

582603
Optional<SIMemOpInfo> SIMemOpAccess::getLoadInfo(
@@ -703,14 +724,43 @@ bool SIGfx6CacheControl::enableLoadCacheBypass(
703724
return Changed;
704725
}
705726

706-
bool SIGfx6CacheControl::enableNonTemporal(
707-
const MachineBasicBlock::iterator &MI) const {
727+
bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
728+
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
729+
bool IsVolatile, bool IsNonTemporal) const {
730+
// Only handle load and store, not atomic read-modify-write instructions. The
731+
// latter use glc to indicate if the atomic returns a result and so must not
732+
// be used for cache control.
708733
assert(MI->mayLoad() ^ MI->mayStore());
734+
735+
// Only update load and store, not LLVM IR atomic read-modify-write
736+
// instructions. The latter are always marked as volatile, so honoring
737+
// volatile on them would pessimize all atomics. Also they do not support
738+
// the nontemporal attribute.
739+
assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
740+
709741
bool Changed = false;
710742

711-
/// TODO: Do not enableGLCBit if rmw atomic.
712-
Changed |= enableGLCBit(MI);
713-
Changed |= enableSLCBit(MI);
743+
if (IsVolatile) {
744+
if (Op == SIMemOp::LOAD)
745+
Changed |= enableGLCBit(MI);
746+
747+
// Ensure operation has completed at system scope to cause all volatile
748+
// operations to be visible outside the program in a global order. Do not
749+
// request cross address space as only the global address space can be
750+
// observable outside the program, so no need to cause a waitcnt for LDS
751+
// address space operations.
752+
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
753+
Position::AFTER);
754+
755+
return Changed;
756+
}
757+
758+
if (IsNonTemporal) {
759+
// Request L1 MISS_EVICT and L2 STREAM for load and store instructions.
760+
Changed |= enableGLCBit(MI);
761+
Changed |= enableSLCBit(MI);
762+
return Changed;
763+
}
714764

715765
return Changed;
716766
}
@@ -732,7 +782,8 @@ bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
732782
bool VMCnt = false;
733783
bool LGKMCnt = false;
734784

735-
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
785+
if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
786+
SIAtomicAddrSpace::NONE) {
736787
switch (Scope) {
737788
case SIAtomicScope::SYSTEM:
738789
case SIAtomicScope::AGENT:
@@ -959,13 +1010,45 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
9591010
return Changed;
9601011
}
9611012

962-
bool SIGfx10CacheControl::enableNonTemporal(
963-
const MachineBasicBlock::iterator &MI) const {
1013+
bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1014+
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1015+
bool IsVolatile, bool IsNonTemporal) const {
1016+
1017+
// Only handle load and store, not atomic read-modify-write instructions. The
1018+
// latter use glc to indicate if the atomic returns a result and so must not
1019+
// be used for cache control.
9641020
assert(MI->mayLoad() ^ MI->mayStore());
1021+
1022+
// Only update load and store, not LLVM IR atomic read-modify-write
1023+
// instructions. The latter are always marked as volatile, so honoring
1024+
// volatile on them would pessimize all atomics. Also they do not support
1025+
// the nontemporal attribute.
1026+
assert( Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1027+
9651028
bool Changed = false;
9661029

967-
Changed |= enableSLCBit(MI);
968-
/// TODO for store (non-rmw atomic) instructions also enableGLCBit(MI)
1030+
if (IsVolatile) {
1031+
1032+
if (Op == SIMemOp::LOAD) {
1033+
Changed |= enableGLCBit(MI);
1034+
Changed |= enableDLCBit(MI);
1035+
}
1036+
1037+
// Ensure operation has completed at system scope to cause all volatile
1038+
// operations to be visible outside the program in a global order. Do not
1039+
// request cross address space as only the global address space can be
1040+
// observable outside the program, so no need to cause a waitcnt for LDS
1041+
// address space operations.
1042+
Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1043+
Position::AFTER);
1044+
return Changed;
1045+
}
1046+
1047+
if (IsNonTemporal) {
1048+
// Request L0/L1 HIT_EVICT and L2 STREAM for load and store instructions.
1049+
Changed |= enableSLCBit(MI);
1050+
return Changed;
1051+
}
9691052

9701053
return Changed;
9711054
}
@@ -988,7 +1071,8 @@ bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
9881071
bool VSCnt = false;
9891072
bool LGKMCnt = false;
9901073

991-
if ((AddrSpace & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE) {
1074+
if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1075+
SIAtomicAddrSpace::NONE) {
9921076
switch (Scope) {
9931077
case SIAtomicScope::SYSTEM:
9941078
case SIAtomicScope::AGENT:
@@ -1191,12 +1275,12 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
11911275
return Changed;
11921276
}
11931277

1194-
// Atomic instructions do not have the nontemporal attribute.
1195-
if (MOI.isNonTemporal()) {
1196-
Changed |= CC->enableNonTemporal(MI);
1197-
return Changed;
1198-
}
1199-
1278+
// Atomic instructions already bypass caches to the scope specified by the
1279+
// SyncScope operand. Only non-atomic volatile and nontemporal instructions
1280+
// need additional treatment.
1281+
Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
1282+
SIMemOp::LOAD, MOI.isVolatile(),
1283+
MOI.isNonTemporal());
12001284
return Changed;
12011285
}
12021286

@@ -1217,12 +1301,12 @@ bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
12171301
return Changed;
12181302
}
12191303

1220-
// Atomic instructions do not have the nontemporal attribute.
1221-
if (MOI.isNonTemporal()) {
1222-
Changed |= CC->enableNonTemporal(MI);
1223-
return Changed;
1224-
}
1225-
1304+
// Atomic instructions already bypass caches to the scope specified by the
1305+
// SyncScope operand. Only non-atomic volatile and nontemporal instructions
1306+
// need additional treatment.
1307+
Changed |= CC->enableVolatileAndOrNonTemporal(
1308+
MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
1309+
MOI.isNonTemporal());
12261310
return Changed;
12271311
}
12281312

llvm/test/CodeGen/AMDGPU/GlobalISel/bool-legalization.ll

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,11 @@ define amdgpu_kernel void @sgpr_trunc_brcond(i32 %cond) {
5959
; GCN-NEXT: ; %bb.1: ; %bb0
6060
; GCN-NEXT: v_mov_b32_e32 v0, 0
6161
; GCN-NEXT: flat_store_dword v[0:1], v0
62+
; GCN-NEXT: s_waitcnt vmcnt(0)
6263
; GCN-NEXT: BB3_2: ; %bb1
6364
; GCN-NEXT: v_mov_b32_e32 v0, 1
6465
; GCN-NEXT: flat_store_dword v[0:1], v0
66+
; GCN-NEXT: s_waitcnt vmcnt(0)
6567
entry:
6668
%trunc = trunc i32 %cond to i1
6769
br i1 %trunc, label %bb0, label %bb1
@@ -88,9 +90,11 @@ define amdgpu_kernel void @brcond_sgpr_trunc_and(i32 %cond0, i32 %cond1) {
8890
; GCN-NEXT: ; %bb.1: ; %bb0
8991
; GCN-NEXT: v_mov_b32_e32 v0, 0
9092
; GCN-NEXT: flat_store_dword v[0:1], v0
93+
; GCN-NEXT: s_waitcnt vmcnt(0)
9194
; GCN-NEXT: BB4_2: ; %bb1
9295
; GCN-NEXT: v_mov_b32_e32 v0, 1
9396
; GCN-NEXT: flat_store_dword v[0:1], v0
97+
; GCN-NEXT: s_waitcnt vmcnt(0)
9498
entry:
9599
%trunc0 = trunc i32 %cond0 to i1
96100
%trunc1 = trunc i32 %cond1 to i1

0 commit comments

Comments
 (0)