Skip to content

Commit 8a20c64

Browse files
authored
[AMDGPU] Create new option for force flush load counter (#124974)
In certain situations it is beneficial to wait for all outstanding loads, regardless of which specific load's data is needed. This may reduce the number of cache requests. Fixes: SWDEV-511507
1 parent 345512c commit 8a20c64

File tree

2 files changed

+56
-0
lines changed

2 files changed

+56
-0
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,11 @@ static cl::opt<bool>
5353
"s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
5454
cl::init(false), cl::Hidden);
5555

56+
static cl::opt<bool> ForceEmitZeroLoadFlag(
57+
"amdgpu-waitcnt-load-forcezero",
58+
cl::desc("Force all waitcnt load counters to wait until 0"),
59+
cl::init(false), cl::Hidden);
60+
5661
namespace {
5762
// Class of object that encapsulates latest instruction counter score
5863
// associated with the operand. Used for determining whether
@@ -1850,6 +1855,9 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
18501855
Wait.BvhCnt = 0;
18511856
}
18521857

1858+
if (ForceEmitZeroLoadFlag && Wait.LoadCnt != ~0u)
1859+
Wait.LoadCnt = 0;
1860+
18531861
return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
18541862
OldWaitcntInstr);
18551863
}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -march=amdgcn -mcpu=gfx1100 < %s | FileCheck --check-prefixes=DEFAULT %s
3+
; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-waitcnt-load-forcezero < %s | FileCheck --check-prefixes=LDZERO %s
4+
5+
define amdgpu_kernel void @copy(ptr addrspace(1) noalias nocapture readonly %src1, ptr addrspace(1) noalias nocapture readonly %src2, ptr addrspace(1) noalias nocapture writeonly %dst1, ptr addrspace(1) noalias nocapture writeonly %dst2) {
6+
; DEFAULT-LABEL: copy:
7+
; DEFAULT: ; %bb.0:
8+
; DEFAULT-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
9+
; DEFAULT-NEXT: v_and_b32_e32 v0, 0x3ff, v0
10+
; DEFAULT-NEXT: s_delay_alu instid0(VALU_DEP_1)
11+
; DEFAULT-NEXT: v_lshlrev_b32_e32 v0, 2, v0
12+
; DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
13+
; DEFAULT-NEXT: s_clause 0x1
14+
; DEFAULT-NEXT: global_load_b32 v1, v0, s[0:1]
15+
; DEFAULT-NEXT: global_load_b32 v2, v0, s[2:3]
16+
; DEFAULT-NEXT: s_waitcnt vmcnt(1)
17+
; DEFAULT-NEXT: global_store_b32 v0, v1, s[4:5]
18+
; DEFAULT-NEXT: s_waitcnt vmcnt(0)
19+
; DEFAULT-NEXT: global_store_b32 v0, v2, s[6:7]
20+
; DEFAULT-NEXT: s_endpgm
21+
;
22+
; LDZERO-LABEL: copy:
23+
; LDZERO: ; %bb.0:
24+
; LDZERO-NEXT: s_load_b256 s[0:7], s[4:5], 0x24
25+
; LDZERO-NEXT: v_and_b32_e32 v0, 0x3ff, v0
26+
; LDZERO-NEXT: s_delay_alu instid0(VALU_DEP_1)
27+
; LDZERO-NEXT: v_lshlrev_b32_e32 v0, 2, v0
28+
; LDZERO-NEXT: s_waitcnt lgkmcnt(0)
29+
; LDZERO-NEXT: s_clause 0x1
30+
; LDZERO-NEXT: global_load_b32 v1, v0, s[0:1]
31+
; LDZERO-NEXT: global_load_b32 v2, v0, s[2:3]
32+
; LDZERO-NEXT: s_waitcnt vmcnt(0)
33+
; LDZERO-NEXT: s_clause 0x1
34+
; LDZERO-NEXT: global_store_b32 v0, v1, s[4:5]
35+
; LDZERO-NEXT: global_store_b32 v0, v2, s[6:7]
36+
; LDZERO-NEXT: s_endpgm
37+
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
38+
%idx = zext i32 %id to i64
39+
%gep.ld1 = getelementptr inbounds nuw float, ptr addrspace(1) %src1, i64 %idx
40+
%v1 = load float, ptr addrspace(1) %gep.ld1, align 4
41+
%gep.ld2 = getelementptr inbounds nuw float, ptr addrspace(1) %src2, i64 %idx
42+
%v2 = load float, ptr addrspace(1) %gep.ld2, align 4
43+
%gep.st1 = getelementptr inbounds nuw float, ptr addrspace(1) %dst1, i64 %idx
44+
store float %v1, ptr addrspace(1) %gep.st1, align 4
45+
%gep.st2 = getelementptr inbounds nuw float, ptr addrspace(1) %dst2, i64 %idx
46+
store float %v2, ptr addrspace(1) %gep.st2, align 4
47+
ret void
48+
}

0 commit comments

Comments
 (0)