Skip to content

Commit ba44b44

Browse files
committed
[AMDGPU] Relax lds dma waitcnt with no aliasing pair
If we cannot find any lds DMA instruction that is aliased by some load from lds, we will still insert vmcnt(0). This is overly cautious since handling inter-thread dependences is normally managed by the memory model instead of the waitcnt pass, so this change updates the behavior to be more in line with how other types of memory events are handled.
1 parent 2e39533 commit ba44b44

File tree

2 files changed

+152
-1
lines changed

2 files changed

+152
-1
lines changed

llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ static cl::opt<bool> ForceEmitZeroLoadFlag(
5858
cl::desc("Force all waitcnt load counters to wait until 0"),
5959
cl::init(false), cl::Hidden);
6060

61+
static cl::opt<bool> RelaxLDSDMA(
62+
"amdgpu-relax-lds-dma-waitcnt",
63+
cl::desc("Relax the waitcnt for LDS DMA instructions that do not alias"),
64+
cl::init(false), cl::ReallyHidden);
65+
6166
namespace {
6267
// Class of object that encapsulates latest instruction counter score
6368
// associated with the operand. Used for determining whether
@@ -1748,7 +1753,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
17481753
}
17491754
}
17501755
}
1751-
if (!FoundAliasingStore)
1756+
if (!FoundAliasingStore && !RelaxLDSDMA)
17521757
ScoreBrackets.determineWait(LOAD_CNT, RegNo, Wait);
17531758
if (Memop->isStore()) {
17541759
ScoreBrackets.determineWait(EXP_CNT, RegNo, Wait);
Lines changed: 146 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefix=DEFAULT
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-relax-lds-dma-waitcnt < %s | FileCheck %s --check-prefix=RELAXED
4+
5+
; In relaxed mode don't wait on vmcnt(0) if the global_load_lds and ds_reads do not alias
6+
7+
define amdgpu_kernel void @global_load_lds_no_alias_ds_read(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
8+
; DEFAULT-LABEL: global_load_lds_no_alias_ds_read:
9+
; DEFAULT: ; %bb.0: ; %main_body
10+
; DEFAULT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
11+
; DEFAULT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
12+
; DEFAULT-NEXT: v_mov_b32_e32 v2, 0
13+
; DEFAULT-NEXT: s_mov_b32 m0, 0
14+
; DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
15+
; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1]
16+
; DEFAULT-NEXT: s_movk_i32 m0, 0x100
17+
; DEFAULT-NEXT: s_nop 0
18+
; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1] offset:4
19+
; DEFAULT-NEXT: s_lshl_b32 s0, s2, 2
20+
; DEFAULT-NEXT: v_mov_b32_e32 v0, s0
21+
; DEFAULT-NEXT: s_lshl_b32 s0, s3, 2
22+
; DEFAULT-NEXT: v_mov_b32_e32 v1, s0
23+
; DEFAULT-NEXT: s_waitcnt vmcnt(1)
24+
; DEFAULT-NEXT: s_barrier
25+
; DEFAULT-NEXT: s_waitcnt vmcnt(0)
26+
; DEFAULT-NEXT: ds_read_b32 v0, v0 offset:512
27+
; DEFAULT-NEXT: s_waitcnt vmcnt(0)
28+
; DEFAULT-NEXT: s_barrier
29+
; DEFAULT-NEXT: ds_read_b32 v1, v1 offset:768
30+
; DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
31+
; DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
32+
; DEFAULT-NEXT: s_endpgm
33+
;
34+
; RELAXED-LABEL: global_load_lds_no_alias_ds_read:
35+
; RELAXED: ; %bb.0: ; %main_body
36+
; RELAXED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
37+
; RELAXED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
38+
; RELAXED-NEXT: v_mov_b32_e32 v2, 0
39+
; RELAXED-NEXT: s_mov_b32 m0, 0
40+
; RELAXED-NEXT: s_waitcnt lgkmcnt(0)
41+
; RELAXED-NEXT: global_load_lds_dword v2, s[0:1]
42+
; RELAXED-NEXT: s_movk_i32 m0, 0x100
43+
; RELAXED-NEXT: s_nop 0
44+
; RELAXED-NEXT: global_load_lds_dword v2, s[0:1] offset:4
45+
; RELAXED-NEXT: s_lshl_b32 s0, s2, 2
46+
; RELAXED-NEXT: v_mov_b32_e32 v0, s0
47+
; RELAXED-NEXT: s_lshl_b32 s0, s3, 2
48+
; RELAXED-NEXT: v_mov_b32_e32 v1, s0
49+
; RELAXED-NEXT: s_waitcnt vmcnt(1)
50+
; RELAXED-NEXT: s_barrier
51+
; RELAXED-NEXT: ds_read_b32 v0, v0 offset:512
52+
; RELAXED-NEXT: s_waitcnt vmcnt(0)
53+
; RELAXED-NEXT: s_barrier
54+
; RELAXED-NEXT: ds_read_b32 v1, v1 offset:768
55+
; RELAXED-NEXT: s_waitcnt lgkmcnt(0)
56+
; RELAXED-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
57+
; RELAXED-NEXT: s_endpgm
58+
main_body:
59+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
60+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 4, i32 0)
61+
call void @llvm.amdgcn.s.waitcnt(i32 3953)
62+
call void @llvm.amdgcn.s.barrier()
63+
%gep.0 = getelementptr float, ptr addrspace(3) @lds.2, i32 %i1
64+
%val.0 = load float, ptr addrspace(3) %gep.0, align 4
65+
call void @llvm.amdgcn.s.waitcnt(i32 3952)
66+
call void @llvm.amdgcn.s.barrier()
67+
%gep.1 = getelementptr float, ptr addrspace(3) @lds.3, i32 %i2
68+
%val.1 = load float, ptr addrspace(3) %gep.1, align 4
69+
%tmp = insertelement <2 x float> poison, float %val.0, i32 0
70+
%res = insertelement <2 x float> %tmp, float %val.1, i32 1
71+
store <2 x float> %res, ptr addrspace(1) %out
72+
ret void
73+
}
74+
75+
; Always wait on vmcnt(0) if the global_load_lds and ds_reads alias
76+
77+
define amdgpu_kernel void @global_load_lds_dword_2_arrays(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
78+
; DEFAULT-LABEL: global_load_lds_dword_2_arrays:
79+
; DEFAULT: ; %bb.0: ; %main_body
80+
; DEFAULT-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
81+
; DEFAULT-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
82+
; DEFAULT-NEXT: v_mov_b32_e32 v2, 0
83+
; DEFAULT-NEXT: s_mov_b32 m0, 0
84+
; DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
85+
; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1]
86+
; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1] offset:4
87+
; DEFAULT-NEXT: s_movk_i32 m0, 0x100
88+
; DEFAULT-NEXT: s_nop 0
89+
; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1] offset:8
90+
; DEFAULT-NEXT: global_load_lds_dword v2, s[0:1] offset:12
91+
; DEFAULT-NEXT: s_lshl_b32 s0, s2, 2
92+
; DEFAULT-NEXT: s_lshl_b32 s1, s3, 2
93+
; DEFAULT-NEXT: v_mov_b32_e32 v0, s0
94+
; DEFAULT-NEXT: v_mov_b32_e32 v1, s1
95+
; DEFAULT-NEXT: s_waitcnt vmcnt(0)
96+
; DEFAULT-NEXT: ds_read_b32 v0, v0
97+
; DEFAULT-NEXT: ; wave barrier
98+
; DEFAULT-NEXT: ds_read_b32 v1, v1 offset:256
99+
; DEFAULT-NEXT: s_waitcnt lgkmcnt(0)
100+
; DEFAULT-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
101+
; DEFAULT-NEXT: s_endpgm
102+
;
103+
; RELAXED-LABEL: global_load_lds_dword_2_arrays:
104+
; RELAXED: ; %bb.0: ; %main_body
105+
; RELAXED-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
106+
; RELAXED-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
107+
; RELAXED-NEXT: v_mov_b32_e32 v2, 0
108+
; RELAXED-NEXT: s_mov_b32 m0, 0
109+
; RELAXED-NEXT: s_waitcnt lgkmcnt(0)
110+
; RELAXED-NEXT: global_load_lds_dword v2, s[0:1]
111+
; RELAXED-NEXT: global_load_lds_dword v2, s[0:1] offset:4
112+
; RELAXED-NEXT: s_movk_i32 m0, 0x100
113+
; RELAXED-NEXT: s_nop 0
114+
; RELAXED-NEXT: global_load_lds_dword v2, s[0:1] offset:8
115+
; RELAXED-NEXT: global_load_lds_dword v2, s[0:1] offset:12
116+
; RELAXED-NEXT: s_lshl_b32 s0, s2, 2
117+
; RELAXED-NEXT: s_lshl_b32 s1, s3, 2
118+
; RELAXED-NEXT: v_mov_b32_e32 v0, s0
119+
; RELAXED-NEXT: v_mov_b32_e32 v1, s1
120+
; RELAXED-NEXT: s_waitcnt vmcnt(0)
121+
; RELAXED-NEXT: ds_read_b32 v0, v0
122+
; RELAXED-NEXT: ; wave barrier
123+
; RELAXED-NEXT: ds_read_b32 v1, v1 offset:256
124+
; RELAXED-NEXT: s_waitcnt lgkmcnt(0)
125+
; RELAXED-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
126+
; RELAXED-NEXT: s_endpgm
127+
main_body:
128+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
129+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0)
130+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 8, i32 0)
131+
call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 12, i32 0)
132+
%gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
133+
%gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
134+
%val.0 = load float, ptr addrspace(3) %gep.0, align 4
135+
call void @llvm.amdgcn.wave.barrier()
136+
%val.1 = load float, ptr addrspace(3) %gep.1, align 4
137+
%tmp.0 = insertelement <2 x float> poison, float %val.0, i32 0
138+
%res = insertelement <2 x float> %tmp.0, float %val.1, i32 1
139+
store <2 x float> %res, ptr addrspace(1) %out
140+
ret void
141+
}
142+
143+
@lds.0 = internal addrspace(3) global [64 x float] poison, align 16
144+
@lds.1 = internal addrspace(3) global [64 x float] poison, align 16
145+
@lds.2 = internal addrspace(3) global [64 x float] poison, align 16
146+
@lds.3 = internal addrspace(3) global [64 x float] poison, align 16

0 commit comments

Comments
 (0)