Skip to content

Commit ce0c001

Browse files
committed
AMDGPU: If a store defines (must alias) a load, it clobbers the load.
Summary: If a store defines (must alias) a load, it clobbers the load. Fixes: SWDEV-258915. Reviewers: arsenm. Differential Revision: https://reviews.llvm.org/D92951
1 parent eed0b9a commit ce0c001

File tree

3 files changed

+51
-4
lines changed

3 files changed

+51
-4
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -110,7 +110,9 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) {
110110
BasicBlock::iterator(Load) : BB->end();
111111
auto Q = MDR->getPointerDependencyFrom(
112112
MemoryLocation::getBeforeOrAfter(Ptr), true, StartIt, BB, Load);
113-
if (Q.isClobber() || Q.isUnknown())
113+
if (Q.isClobber() || Q.isUnknown() ||
114+
// Store defines the load and thus clobbers it.
115+
(Q.isDef() && Q.getInst()->mayWriteToMemory()))
114116
return true;
115117
}
116118
return false;
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
; RUN: opt -S --amdgpu-annotate-uniform < %s | FileCheck -check-prefix=OPT %s
2+
target datalayout = "A5"
3+
4+
; "load vaddr" depends on the store, so we should not mark vaddr as amdgpu.noclobber.
5+
6+
; OPT-LABEL: @store_clobbers_load(
7+
; OPT: %vaddr = bitcast [4 x i32] addrspace(5)* %alloca to <4 x i32> addrspace(5)*, !amdgpu.uniform !0
8+
; OPT-NEXT: %zero = load <4 x i32>, <4 x i32> addrspace(5)* %vaddr, align 16
9+
define amdgpu_kernel void @store_clobbers_load(i32 addrspace(1)* %out, i32 %index) {
10+
entry:
11+
%alloca = alloca [4 x i32], addrspace(5)
12+
%addr0 = bitcast [4 x i32] addrspace(5)* %alloca to i32 addrspace(5)*
13+
store i32 0, i32 addrspace(5)* %addr0
14+
%vaddr = bitcast [4 x i32] addrspace(5)* %alloca to <4 x i32> addrspace(5)*
15+
%zero = load <4 x i32>, <4 x i32> addrspace(5)* %vaddr, align 16
16+
%one = insertelement <4 x i32> %zero, i32 1, i32 1
17+
%two = insertelement <4 x i32> %one, i32 2, i32 2
18+
%three = insertelement <4 x i32> %two, i32 3, i32 3
19+
store <4 x i32> %three, <4 x i32> addrspace(5)* %vaddr, align 16
20+
%rslt = extractelement <4 x i32> %three, i32 %index
21+
store i32 %rslt, i32 addrspace(1)* %out, align 4
22+
ret void
23+
}
24+
25+
declare i32 @llvm.amdgcn.workitem.id.x()
26+
@lds0 = addrspace(3) global [512 x i32] undef, align 4
27+
28+
; Check that %arrayidx0 is not marked as amdgpu.noclobber: the atomicrmw writes to the same address that the load reads.
29+
30+
; OPT-LABEL: @atomicrmw_clobbers_load(
31+
; OPT: %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0, !amdgpu.uniform !0
32+
; OPT-NEXT: %val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
33+
34+
define amdgpu_kernel void @atomicrmw_clobbers_load(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1) {
35+
%tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
36+
%idx.0 = add nsw i32 %tid.x, 2
37+
%arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
38+
%val = atomicrmw xchg i32 addrspace(3)* %arrayidx0, i32 3 seq_cst
39+
%load = load i32, i32 addrspace(3)* %arrayidx0, align 4
40+
store i32 %val, i32 addrspace(1)* %out0, align 4
41+
store i32 %load, i32 addrspace(1)* %out1, align 4
42+
ret void
43+
}

llvm/test/CodeGen/AMDGPU/wave32.ll

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -231,9 +231,9 @@ bb13:
231231
; GCN: ; %bb.{{[0-9]+}}: ; %.preheader
232232
; GCN: BB{{.*}}:
233233

234+
; GCN: global_store_dword
234235
; GFX1032: s_or_b32 [[MASK0:s[0-9]+]], [[MASK0]], vcc_lo
235236
; GFX1064: s_or_b64 [[MASK0:s\[[0-9:]+\]]], [[MASK0]], vcc
236-
; GCN: global_store_dword
237237
; GFX1032: s_andn2_b32 [[MASK1:s[0-9]+]], [[MASK1]], exec_lo
238238
; GFX1064: s_andn2_b64 [[MASK1:s\[[0-9:]+\]]], [[MASK1]], exec
239239
; GFX1032: s_and_b32 [[MASK0]], [[MASK0]], exec_lo
@@ -249,10 +249,12 @@ bb13:
249249
; GFX1064: s_andn2_b64 exec, exec, [[ACC]]
250250
; GCN: s_cbranch_execz
251251
; GCN: BB{{.*}}:
252-
; GCN: s_load_dword [[LOAD:s[0-9]+]]
252+
253253
; GFX1032: s_or_b32 [[MASK1]], [[MASK1]], exec_lo
254254
; GFX1064: s_or_b64 [[MASK1]], [[MASK1]], exec
255-
; GCN: s_cmp_lt_i32 [[LOAD]], 11
255+
; GCN: global_load_dword [[LOAD:v[0-9]+]]
256+
; GFX1032: v_cmp_gt_i32_e32 vcc_lo, 11, [[LOAD]]
257+
; GFX1064: v_cmp_gt_i32_e32 vcc, 11, [[LOAD]]
256258
define amdgpu_kernel void @test_loop_with_if_else_break(i32 addrspace(1)* %arg) #0 {
257259
bb:
258260
%tmp = tail call i32 @llvm.amdgcn.workitem.id.x()

0 commit comments

Comments
 (0)