Skip to content

Commit 1fd1f4c

Browse files
authored
[AMDGPU] Handle amdgpu.last.use metadata (#83816)
Convert !amdgpu.last.use metadata into MachineMemOperand for last use and handle it in SIMemoryLegalizer similar to nontemporal and volatile.
1 parent a87dc23 commit 1fd1f4c

File tree

6 files changed

+332
-46
lines changed

6 files changed

+332
-46
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1306,6 +1306,21 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
13061306

13071307
List AMDGPU intrinsics.
13081308

1309+
LLVM IR Metadata
1310+
------------------
1311+
1312+
The AMDGPU backend implements the following LLVM IR metadata.
1313+
1314+
.. table:: AMDGPU LLVM IR Metadata
1315+
:name: amdgpu-llvm-ir-metadata-table
1316+
1317+
============================================== ==========================================================
1318+
LLVM IR Metadata Description
1319+
============================================== ==========================================================
1320+
!amdgpu.last.use Sets TH_LOAD_LU temporal hint on load instructions that support it.
1321+
Takes priority over nontemporal hint (TH_LOAD_NT).
1322+
============================================== ==========================================================
1323+
13091324
LLVM IR Attributes
13101325
------------------
13111326

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16251,9 +16251,12 @@ bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
1625116251
MachineMemOperand::Flags
1625216252
SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
1625316253
// Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
16254+
MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
1625416255
if (I.getMetadata("amdgpu.noclobber"))
16255-
return MONoClobber;
16256-
return MachineMemOperand::MONone;
16256+
Flags |= MONoClobber;
16257+
if (I.getMetadata("amdgpu.last.use"))
16258+
Flags |= MOLastUse;
16259+
return Flags;
1625716260
}
1625816261

1625916262
bool SITargetLowering::checkForPhysRegDependency(

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 55 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -98,22 +98,22 @@ class SIMemOpInfo final {
9898
bool IsCrossAddressSpaceOrdering = false;
9999
bool IsVolatile = false;
100100
bool IsNonTemporal = false;
101-
102-
SIMemOpInfo(AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
103-
SIAtomicScope Scope = SIAtomicScope::SYSTEM,
104-
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
105-
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
106-
bool IsCrossAddressSpaceOrdering = true,
107-
AtomicOrdering FailureOrdering =
108-
AtomicOrdering::SequentiallyConsistent,
109-
bool IsVolatile = false,
110-
bool IsNonTemporal = false)
111-
: Ordering(Ordering), FailureOrdering(FailureOrdering),
112-
Scope(Scope), OrderingAddrSpace(OrderingAddrSpace),
113-
InstrAddrSpace(InstrAddrSpace),
114-
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
115-
IsVolatile(IsVolatile),
116-
IsNonTemporal(IsNonTemporal) {
101+
bool IsLastUse = false;
102+
103+
SIMemOpInfo(
104+
AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
105+
SIAtomicScope Scope = SIAtomicScope::SYSTEM,
106+
SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
107+
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
108+
bool IsCrossAddressSpaceOrdering = true,
109+
AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
110+
bool IsVolatile = false, bool IsNonTemporal = false,
111+
bool IsLastUse = false)
112+
: Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
113+
OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
114+
IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
115+
IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
116+
IsLastUse(IsLastUse) {
117117

118118
if (Ordering == AtomicOrdering::NotAtomic) {
119119
assert(Scope == SIAtomicScope::NONE &&
@@ -201,6 +201,10 @@ class SIMemOpInfo final {
201201
return IsNonTemporal;
202202
}
203203

204+
/// \returns True if memory access of the machine instruction used to
205+
/// create this SIMemOpInfo is last use, false otherwise.
206+
bool isLastUse() const { return IsLastUse; }
207+
204208
/// \returns True if ordering constraint of the machine instruction used to
205209
/// create this SIMemOpInfo is unordered or higher, false otherwise.
206210
bool isAtomic() const {
@@ -305,12 +309,13 @@ class SICacheControl {
305309
SIAtomicAddrSpace AddrSpace) const = 0;
306310

307311
/// Update \p MI memory instruction of kind \p Op associated with address
308-
/// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
309-
/// true iff the instruction was modified.
312+
/// spaces \p AddrSpace to indicate it is volatile and/or
313+
/// nontemporal/last-use. Return true iff the instruction was modified.
310314
virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
311315
SIAtomicAddrSpace AddrSpace,
312316
SIMemOp Op, bool IsVolatile,
313-
bool IsNonTemporal) const = 0;
317+
bool IsNonTemporal,
318+
bool IsLastUse = false) const = 0;
314319

315320
virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
316321
return false;
@@ -394,8 +399,8 @@ class SIGfx6CacheControl : public SICacheControl {
394399

395400
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
396401
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
397-
bool IsVolatile,
398-
bool IsNonTemporal) const override;
402+
bool IsVolatile, bool IsNonTemporal,
403+
bool IsLastUse) const override;
399404

400405
bool insertWait(MachineBasicBlock::iterator &MI,
401406
SIAtomicScope Scope,
@@ -447,8 +452,8 @@ class SIGfx90ACacheControl : public SIGfx7CacheControl {
447452

448453
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
449454
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
450-
bool IsVolatile,
451-
bool IsNonTemporal) const override;
455+
bool IsVolatile, bool IsNonTemporal,
456+
bool IsLastUse) const override;
452457

453458
bool insertWait(MachineBasicBlock::iterator &MI,
454459
SIAtomicScope Scope,
@@ -508,8 +513,8 @@ class SIGfx940CacheControl : public SIGfx90ACacheControl {
508513

509514
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
510515
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
511-
bool IsVolatile,
512-
bool IsNonTemporal) const override;
516+
bool IsVolatile, bool IsNonTemporal,
517+
bool IsLastUse) const override;
513518

514519
bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
515520
SIAtomicAddrSpace AddrSpace, Position Pos) const override;
@@ -552,8 +557,8 @@ class SIGfx10CacheControl : public SIGfx7CacheControl {
552557

553558
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
554559
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
555-
bool IsVolatile,
556-
bool IsNonTemporal) const override;
560+
bool IsVolatile, bool IsNonTemporal,
561+
bool IsLastUse) const override;
557562

558563
bool insertWait(MachineBasicBlock::iterator &MI,
559564
SIAtomicScope Scope,
@@ -578,8 +583,8 @@ class SIGfx11CacheControl : public SIGfx10CacheControl {
578583

579584
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
580585
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
581-
bool IsVolatile,
582-
bool IsNonTemporal) const override;
586+
bool IsVolatile, bool IsNonTemporal,
587+
bool IsLastUse) const override;
583588
};
584589

585590
class SIGfx12CacheControl : public SIGfx11CacheControl {
@@ -614,8 +619,8 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
614619

615620
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
616621
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
617-
bool IsVolatile,
618-
bool IsNonTemporal) const override;
622+
bool IsVolatile, bool IsNonTemporal,
623+
bool IsLastUse) const override;
619624

620625
bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
621626
};
@@ -745,12 +750,14 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
745750
SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
746751
bool IsNonTemporal = true;
747752
bool IsVolatile = false;
753+
bool IsLastUse = false;
748754

749755
// Validator should check whether or not MMOs cover the entire set of
750756
// locations accessed by the memory instruction.
751757
for (const auto &MMO : MI->memoperands()) {
752758
IsNonTemporal &= MMO->isNonTemporal();
753759
IsVolatile |= MMO->isVolatile();
760+
IsLastUse |= MMO->getFlags() & MOLastUse;
754761
InstrAddrSpace |=
755762
toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
756763
AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
@@ -792,7 +799,7 @@ std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
792799
}
793800
return SIMemOpInfo(Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
794801
IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
795-
IsNonTemporal);
802+
IsNonTemporal, IsLastUse);
796803
}
797804

798805
std::optional<SIMemOpInfo>
@@ -969,7 +976,7 @@ bool SIGfx6CacheControl::enableRMWCacheBypass(
969976

970977
bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
971978
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
972-
bool IsVolatile, bool IsNonTemporal) const {
979+
bool IsVolatile, bool IsNonTemporal, bool IsLastUse) const {
973980
// Only handle load and store, not atomic read-modify-write instructions. The
974981
// latter use glc to indicate if the atomic returns a result and so must not
975982
// be used for cache control.
@@ -1322,7 +1329,7 @@ bool SIGfx90ACacheControl::enableRMWCacheBypass(
13221329

13231330
bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
13241331
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1325-
bool IsVolatile, bool IsNonTemporal) const {
1332+
bool IsVolatile, bool IsNonTemporal, bool IsLastUse) const {
13261333
// Only handle load and store, not atomic read-modify-write instructions. The
13271334
// latter use glc to indicate if the atomic returns a result and so must not
13281335
// be used for cache control.
@@ -1624,7 +1631,7 @@ bool SIGfx940CacheControl::enableRMWCacheBypass(
16241631

16251632
bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
16261633
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1627-
bool IsVolatile, bool IsNonTemporal) const {
1634+
bool IsVolatile, bool IsNonTemporal, bool IsLastUse) const {
16281635
// Only handle load and store, not atomic read-modify-write instructions. The
16291636
// latter use glc to indicate if the atomic returns a result and so must not
16301637
// be used for cache control.
@@ -1856,7 +1863,7 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
18561863

18571864
bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
18581865
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1859-
bool IsVolatile, bool IsNonTemporal) const {
1866+
bool IsVolatile, bool IsNonTemporal, bool IsLastUse) const {
18601867

18611868
// Only handle load and store, not atomic read-modify-write insructions. The
18621869
// latter use glc to indicate if the atomic returns a result and so must not
@@ -2127,7 +2134,7 @@ bool SIGfx11CacheControl::enableLoadCacheBypass(
21272134

21282135
bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
21292136
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2130-
bool IsVolatile, bool IsNonTemporal) const {
2137+
bool IsVolatile, bool IsNonTemporal, bool IsLastUse) const {
21312138

21322139
// Only handle load and store, not atomic read-modify-write instructions. The
21332140
// latter use glc to indicate if the atomic returns a result and so must not
@@ -2379,7 +2386,7 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
23792386

23802387
bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
23812388
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2382-
bool IsVolatile, bool IsNonTemporal) const {
2389+
bool IsVolatile, bool IsNonTemporal, bool IsLastUse) const {
23832390

23842391
// Only handle load and store, not atomic read-modify-write instructions.
23852392
assert(MI->mayLoad() ^ MI->mayStore());
@@ -2392,7 +2399,10 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
23922399

23932400
bool Changed = false;
23942401

2395-
if (IsNonTemporal) {
2402+
if (IsLastUse) {
2403+
// Set last-use hint.
2404+
Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2405+
} else if (IsNonTemporal) {
23962406
// Set non-temporal hint for all cache levels.
23972407
Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
23982408
}
@@ -2472,11 +2482,12 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
24722482
}
24732483

24742484
// Atomic instructions already bypass caches to the scope specified by the
2475-
// SyncScope operand. Only non-atomic volatile and nontemporal instructions
2476-
// need additional treatment.
2477-
Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2478-
SIMemOp::LOAD, MOI.isVolatile(),
2479-
MOI.isNonTemporal());
2485+
// SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2486+
// instructions need additional treatment.
2487+
Changed |= CC->enableVolatileAndOrNonTemporal(
2488+
MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2489+
MOI.isNonTemporal(), MOI.isLastUse());
2490+
24802491
return Changed;
24812492
}
24822493

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s
3+
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s
4+
5+
define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) {
6+
; GFX12-LABEL: flat_last_use_load_0:
7+
; GFX12: ; %bb.0: ; %entry
8+
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
9+
; GFX12-NEXT: s_wait_kmcnt 0x0
10+
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
11+
; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_LU
12+
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
13+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
14+
; GFX12-NEXT: flat_store_b32 v[0:1], v2
15+
; GFX12-NEXT: s_endpgm
16+
entry:
17+
%val = load i32, ptr %in, align 4, !amdgpu.last.use !{}
18+
store i32 %val, ptr %out
19+
ret void
20+
}
21+
22+
define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) {
23+
; GFX12-LABEL: flat_last_use_load_1:
24+
; GFX12: ; %bb.0: ; %entry
25+
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
26+
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0
27+
; GFX12-NEXT: s_wait_kmcnt 0x0
28+
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
29+
; GFX12-NEXT: v_add_co_u32 v0, s0, s0, v0
30+
; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0
31+
; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_LU
32+
; GFX12-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2
33+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
34+
; GFX12-NEXT: flat_store_b32 v[0:1], v2
35+
; GFX12-NEXT: s_endpgm
36+
entry:
37+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
38+
%val.gep = getelementptr inbounds i32, ptr %in, i32 %tid
39+
%val = load i32, ptr %val.gep, align 4, !amdgpu.last.use !{}
40+
store i32 %val, ptr %out
41+
ret void
42+
}
43+
44+
define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
45+
; GFX12-LABEL: flat_last_use_and_volatile_load:
46+
; GFX12: ; %bb.0: ; %entry
47+
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
48+
; GFX12-NEXT: s_wait_kmcnt 0x0
49+
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
50+
; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS
51+
; GFX12-NEXT: s_wait_loadcnt 0x0
52+
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
53+
; GFX12-NEXT: s_wait_dscnt 0x0
54+
; GFX12-NEXT: flat_store_b32 v[0:1], v2
55+
; GFX12-NEXT: s_endpgm
56+
entry:
57+
%val = load volatile i32, ptr %in, align 4, !amdgpu.last.use !{}
58+
store i32 %val, ptr %out
59+
ret void
60+
}
61+
62+
define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out) {
63+
; GFX12-LABEL: flat_last_use_and_nontemporal_load:
64+
; GFX12: ; %bb.0: ; %entry
65+
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
66+
; GFX12-NEXT: s_wait_kmcnt 0x0
67+
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
68+
; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_LU
69+
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
70+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
71+
; GFX12-NEXT: flat_store_b32 v[0:1], v2
72+
; GFX12-NEXT: s_endpgm
73+
entry:
74+
%val = load i32, ptr %in, align 4, !amdgpu.last.use !{}, !nontemporal !0
75+
store i32 %val, ptr %out
76+
ret void
77+
}
78+
79+
!0 = !{i32 1}
80+
declare i32 @llvm.amdgcn.workitem.id.x()
81+
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
82+
; GFX12-CU: {{.*}}
83+
; GFX12-WGP: {{.*}}

0 commit comments

Comments
 (0)