Skip to content

Commit af0c8df

Browse files
committed
Docs; new tests + renames; refactor enableLastUse
1 parent e301287 commit af0c8df

File tree

5 files changed

+112
-57
lines changed

5 files changed

+112
-57
lines changed

llvm/docs/AMDGPUUsage.rst

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1299,6 +1299,21 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
12991299

13001300
List AMDGPU intrinsics.
13011301

1302+
LLVM IR Metadata
1303+
------------------
1304+
1305+
The AMDGPU backend implements the following LLVM IR metadata.
1306+
1307+
.. table:: AMDGPU LLVM IR Metadata
1308+
:name: amdgpu-llvm-ir-metadata-table
1309+
1310+
============================================== ==========================================================
1311+
LLVM IR Metadata Description
1312+
============================================== ==========================================================
1313+
!amdgpu.last.use Sets TH_LOAD_LU temporal hint on load instructions that support it.
1314+
Takes priority over nontemporal hint (TH_LOAD_NT).
1315+
============================================== ==========================================================
1316+
13021317
LLVM IR Attributes
13031318
------------------
13041319

llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp

Lines changed: 31 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -309,23 +309,18 @@ class SICacheControl {
309309
SIAtomicAddrSpace AddrSpace) const = 0;
310310

311311
/// Update \p MI memory instruction of kind \p Op associated with address
312-
/// spaces \p AddrSpace to indicate it is volatile and/or nontemporal. Return
313-
/// true iff the instruction was modified.
312+
/// spaces \p AddrSpace to indicate it is volatile and/or
313+
/// nontemporal/last-use. Return true iff the instruction was modified.
314314
virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
315315
SIAtomicAddrSpace AddrSpace,
316316
SIMemOp Op, bool IsVolatile,
317-
bool IsNonTemporal) const = 0;
317+
bool IsNonTemporal,
318+
bool IsLastUse = false) const = 0;
318319

319320
virtual bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const {
320321
return false;
321322
};
322323

323-
/// Update \p MI memory instruction to indicate it is a last use. Return true
324-
/// iff the instruction was modified.
325-
virtual bool enableLastUse(MachineInstr &MI, bool IsLastUse) const {
326-
return false;
327-
}
328-
329324
/// Inserts any necessary instructions at position \p Pos relative
330325
/// to instruction \p MI to ensure memory instructions before \p Pos of kind
331326
/// \p Op associated with address spaces \p AddrSpace have completed. Used
@@ -404,8 +399,8 @@ class SIGfx6CacheControl : public SICacheControl {
404399

405400
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
406401
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
407-
bool IsVolatile,
408-
bool IsNonTemporal) const override;
402+
bool IsVolatile, bool IsNonTemporal,
403+
bool IsLastUse) const override;
409404

410405
bool insertWait(MachineBasicBlock::iterator &MI,
411406
SIAtomicScope Scope,
@@ -457,8 +452,8 @@ class SIGfx90ACacheControl : public SIGfx7CacheControl {
457452

458453
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
459454
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
460-
bool IsVolatile,
461-
bool IsNonTemporal) const override;
455+
bool IsVolatile, bool IsNonTemporal,
456+
bool IsLastUse) const override;
462457

463458
bool insertWait(MachineBasicBlock::iterator &MI,
464459
SIAtomicScope Scope,
@@ -518,8 +513,8 @@ class SIGfx940CacheControl : public SIGfx90ACacheControl {
518513

519514
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
520515
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
521-
bool IsVolatile,
522-
bool IsNonTemporal) const override;
516+
bool IsVolatile, bool IsNonTemporal,
517+
bool IsLastUse) const override;
523518

524519
bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
525520
SIAtomicAddrSpace AddrSpace, Position Pos) const override;
@@ -562,8 +557,8 @@ class SIGfx10CacheControl : public SIGfx7CacheControl {
562557

563558
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
564559
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
565-
bool IsVolatile,
566-
bool IsNonTemporal) const override;
560+
bool IsVolatile, bool IsNonTemporal,
561+
bool IsLastUse) const override;
567562

568563
bool insertWait(MachineBasicBlock::iterator &MI,
569564
SIAtomicScope Scope,
@@ -588,8 +583,8 @@ class SIGfx11CacheControl : public SIGfx10CacheControl {
588583

589584
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
590585
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
591-
bool IsVolatile,
592-
bool IsNonTemporal) const override;
586+
bool IsVolatile, bool IsNonTemporal,
587+
bool IsLastUse) const override;
593588
};
594589

595590
class SIGfx12CacheControl : public SIGfx11CacheControl {
@@ -624,12 +619,10 @@ class SIGfx12CacheControl : public SIGfx11CacheControl {
624619

625620
bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
626621
SIAtomicAddrSpace AddrSpace, SIMemOp Op,
627-
bool IsVolatile,
628-
bool IsNonTemporal) const override;
622+
bool IsVolatile, bool IsNonTemporal,
623+
bool IsLastUse) const override;
629624

630625
bool expandSystemScopeStore(MachineBasicBlock::iterator &MI) const override;
631-
632-
bool enableLastUse(MachineInstr &MI, bool IsLastUse) const override;
633626
};
634627

635628
class SIMemoryLegalizer final : public MachineFunctionPass {
@@ -983,7 +976,7 @@ bool SIGfx6CacheControl::enableRMWCacheBypass(
983976

984977
bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
985978
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
986-
bool IsVolatile, bool IsNonTemporal) const {
979+
bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
987980
// Only handle load and store, not atomic read-modify-write instructions. The
988981
// latter use glc to indicate if the atomic returns a result and so must not
989982
// be used for cache control.
@@ -1336,7 +1329,7 @@ bool SIGfx90ACacheControl::enableRMWCacheBypass(
13361329

13371330
bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
13381331
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1339-
bool IsVolatile, bool IsNonTemporal) const {
1332+
bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
13401333
// Only handle load and store, not atomic read-modify-write instructions. The
13411334
// latter use glc to indicate if the atomic returns a result and so must not
13421335
// be used for cache control.
@@ -1638,7 +1631,7 @@ bool SIGfx940CacheControl::enableRMWCacheBypass(
16381631

16391632
bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
16401633
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1641-
bool IsVolatile, bool IsNonTemporal) const {
1634+
bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
16421635
// Only handle load and store, not atomic read-modify-write instructions. The
16431636
// latter use glc to indicate if the atomic returns a result and so must not
16441637
// be used for cache control.
@@ -1870,7 +1863,7 @@ bool SIGfx10CacheControl::enableLoadCacheBypass(
18701863

18711864
bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
18721865
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1873-
bool IsVolatile, bool IsNonTemporal) const {
1866+
bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
18741867

18751868
// Only handle load and store, not atomic read-modify-write instructions. The
18761869
// latter use glc to indicate if the atomic returns a result and so must not
@@ -2141,7 +2134,7 @@ bool SIGfx11CacheControl::enableLoadCacheBypass(
21412134

21422135
bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
21432136
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2144-
bool IsVolatile, bool IsNonTemporal) const {
2137+
bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
21452138

21462139
// Only handle load and store, not atomic read-modify-write instructions. The
21472140
// latter use glc to indicate if the atomic returns a result and so must not
@@ -2393,7 +2386,7 @@ bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
23932386

23942387
bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
23952388
MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2396-
bool IsVolatile, bool IsNonTemporal) const {
2389+
bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
23972390

23982391
// Only handle load and store, not atomic read-modify-write instructions.
23992392
assert(MI->mayLoad() ^ MI->mayStore());
@@ -2406,7 +2399,10 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
24062399

24072400
bool Changed = false;
24082401

2409-
if (IsNonTemporal) {
2402+
if (IsLastUse) {
2403+
// Set last-use hint.
2404+
Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2405+
} else if (IsNonTemporal) {
24102406
// Set non-temporal hint for all cache levels.
24112407
Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
24122408
}
@@ -2429,12 +2425,6 @@ bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
24292425
return Changed;
24302426
}
24312427

2432-
bool SIGfx12CacheControl::enableLastUse(MachineInstr &MI,
2433-
bool IsLastUse) const {
2434-
assert(MI.mayLoad() && !MI.mayStore());
2435-
return IsLastUse ? setTH(MI, AMDGPU::CPol::TH_LU) : false;
2436-
}
2437-
24382428
bool SIGfx12CacheControl::expandSystemScopeStore(
24392429
MachineBasicBlock::iterator &MI) const {
24402430
MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
@@ -2491,18 +2481,12 @@ bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
24912481
return Changed;
24922482
}
24932483

2494-
// enableVolatileAndOrNonTemporal can insert instructions and advance iterator
2495-
// MI and we need original instruction for enabling last use.
2496-
MachineInstr &Inst = *MI;
2497-
24982484
// Atomic instructions already bypass caches to the scope specified by the
2499-
// SyncScope operand. Only non-atomic volatile and nontemporal instructions
2500-
// need additional treatment.
2501-
Changed |= CC->enableVolatileAndOrNonTemporal(MI, MOI.getInstrAddrSpace(),
2502-
SIMemOp::LOAD, MOI.isVolatile(),
2503-
MOI.isNonTemporal());
2504-
2505-
Changed |= CC->enableLastUse(Inst, MOI.isLastUse());
2485+
// SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2486+
// instructions need additional treatment.
2487+
Changed |= CC->enableVolatileAndOrNonTemporal(
2488+
MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2489+
MOI.isNonTemporal(), MOI.isLastUse());
25062490

25072491
return Changed;
25082492
}

llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,8 @@ entry:
4141
ret void
4242
}
4343

44-
define amdgpu_kernel void @flat_last_use_volatile_load(ptr %in, ptr %out) {
45-
; GFX12-LABEL: flat_last_use_volatile_load:
44+
define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
45+
; GFX12-LABEL: flat_last_use_and_volatile_load:
4646
; GFX12: ; %bb.0: ; %entry
4747
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
4848
; GFX12-NEXT: s_wait_kmcnt 0x0
@@ -59,6 +59,23 @@ entry:
5959
ret void
6060
}
6161

62+
define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out) {
63+
; GFX12-LABEL: flat_last_use_and_nontemporal_load:
64+
; GFX12: ; %bb.0: ; %entry
65+
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
66+
; GFX12-NEXT: s_wait_kmcnt 0x0
67+
; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
68+
; GFX12-NEXT: flat_load_b32 v2, v[0:1] th:TH_LOAD_LU
69+
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
70+
; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
71+
; GFX12-NEXT: flat_store_b32 v[0:1], v2
72+
; GFX12-NEXT: s_endpgm
73+
entry:
74+
%val = load i32, ptr %in, align 4, !amdgpu.last.use !{}, !nontemporal !0
75+
store i32 %val, ptr %out
76+
ret void
77+
}
78+
6279
!0 = !{i32 1}
6380
declare i32 @llvm.amdgcn.workitem.id.x()
6481
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:

llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ entry:
4040
ret void
4141
}
4242

43-
define amdgpu_kernel void @global_last_use_volatile_load(ptr addrspace(1) %in, ptr addrspace(1) %out) {
44-
; GFX12-LABEL: global_last_use_volatile_load:
43+
define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %in, ptr addrspace(1) %out) {
44+
; GFX12-LABEL: global_last_use_and_volatile_load:
4545
; GFX12: ; %bb.0: ; %entry
4646
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
4747
; GFX12-NEXT: v_mov_b32_e32 v0, 0
@@ -58,6 +58,25 @@ entry:
5858
ret void
5959
}
6060

61+
define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1) %in, ptr addrspace(1) %out) {
62+
; GFX12-LABEL: global_last_use_and_nontemporal_load:
63+
; GFX12: ; %bb.0: ; %entry
64+
; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0
65+
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
66+
; GFX12-NEXT: s_wait_kmcnt 0x0
67+
; GFX12-NEXT: global_load_b32 v0, v0, s[0:1] th:TH_LOAD_LU
68+
; GFX12-NEXT: s_wait_loadcnt 0x0
69+
; GFX12-NEXT: global_store_b32 v1, v0, s[2:3]
70+
; GFX12-NEXT: s_nop 0
71+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
72+
; GFX12-NEXT: s_endpgm
73+
entry:
74+
%tid = call i32 @llvm.amdgcn.workitem.id.x()
75+
%val.gep = getelementptr inbounds i32, ptr addrspace(1) %in, i32 %tid
76+
%val = load i32, ptr addrspace(1) %val.gep, align 4, !amdgpu.last.use !{}, !nontemporal !0
77+
store i32 %val, ptr addrspace(1) %out
78+
ret void
79+
}
6180
!0 = !{i32 1}
6281
declare i32 @llvm.amdgcn.workitem.id.x()
6382
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:

llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,8 +2,8 @@
22
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12,GFX12-WGP %s
33
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12,GFX12-CU %s
44

5-
define amdgpu_kernel void @private_nontemporal_load_0(ptr addrspace(5) %in, ptr addrspace(1) %out) {
6-
; GFX12-LABEL: private_nontemporal_load_0:
5+
define amdgpu_kernel void @private_last_use_load_0(ptr addrspace(5) %in, ptr addrspace(1) %out) {
6+
; GFX12-LABEL: private_last_use_load_0:
77
; GFX12: ; %bb.0: ; %entry
88
; GFX12-NEXT: s_clause 0x1
99
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0
@@ -22,8 +22,8 @@ entry:
2222
ret void
2323
}
2424

25-
define amdgpu_kernel void @private_nontemporal_load_1(ptr addrspace(5) %in, ptr addrspace(1) %out) {
26-
; GFX12-LABEL: private_nontemporal_load_1:
25+
define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr addrspace(1) %out) {
26+
; GFX12-LABEL: private_last_use_load_1:
2727
; GFX12: ; %bb.0: ; %entry
2828
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0
2929
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
@@ -43,8 +43,8 @@ entry:
4343
ret void
4444
}
4545

46-
define amdgpu_kernel void @private_nontemporal_volatile_load(ptr addrspace(5) %in, ptr addrspace(1) %out) {
47-
; GFX12-LABEL: private_nontemporal_volatile_load:
46+
define amdgpu_kernel void @private_last_use_and_volatile_load(ptr addrspace(5) %in, ptr addrspace(1) %out) {
47+
; GFX12-LABEL: private_last_use_and_volatile_load:
4848
; GFX12: ; %bb.0: ; %entry
4949
; GFX12-NEXT: s_clause 0x1
5050
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0
@@ -63,6 +63,26 @@ entry:
6363
ret void
6464
}
6565

66+
define amdgpu_kernel void @private_last_use_and_nontemporal_load(ptr addrspace(5) %in, ptr addrspace(1) %out) {
67+
; GFX12-LABEL: private_last_use_and_nontemporal_load:
68+
; GFX12: ; %bb.0: ; %entry
69+
; GFX12-NEXT: s_clause 0x1
70+
; GFX12-NEXT: s_load_b32 s2, s[0:1], 0x0
71+
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x8
72+
; GFX12-NEXT: v_mov_b32_e32 v1, 0
73+
; GFX12-NEXT: s_wait_kmcnt 0x0
74+
; GFX12-NEXT: scratch_load_b32 v0, off, s2 th:TH_LOAD_LU
75+
; GFX12-NEXT: s_wait_loadcnt 0x0
76+
; GFX12-NEXT: global_store_b32 v1, v0, s[0:1]
77+
; GFX12-NEXT: s_nop 0
78+
; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
79+
; GFX12-NEXT: s_endpgm
80+
entry:
81+
%val = load i32, ptr addrspace(5) %in, align 4, !amdgpu.last.use !{}, !nontemporal !0
82+
store i32 %val, ptr addrspace(1) %out
83+
ret void
84+
}
85+
6686
!0 = !{i32 1}
6787
declare i32 @llvm.amdgcn.workitem.id.x()
6888
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:

0 commit comments

Comments
 (0)