Skip to content

Commit 5d016fc

Browse files
author
git apple-llvm automerger
committed
Merge commit '7f54070194f5' from llvm.org/main into next
2 parents 74c5f6e + 7f54070 commit 5d016fc

File tree

1 file changed

+78
-0
lines changed

1 file changed

+78
-0
lines changed
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
; RUN: llc -march=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN,GFX9
2+
; RUN: llc -march=amdgcn -mcpu=gfx1030 < %s | FileCheck %s --check-prefixes=GCN,GFX10
3+
4+
; First of two distinct 64 x float arrays in addrspace(3); passed as the
; addrspace(3) pointer operand of the first two *.load.lds calls in each kernel.
@lds.0 = internal addrspace(3) global [64 x float] poison, align 16
5+
; Second 64 x float array in addrspace(3); passed as the addrspace(3) pointer
; operand of the last two *.load.lds calls in each kernel.
@lds.1 = internal addrspace(3) global [64 x float] poison, align 16
6+
7+
declare void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) nocapture, i32 %size, i32 %voffset, i32 %soffset, i32 %offset, i32 %aux)
8+
declare void @llvm.amdgcn.global.load.lds(ptr addrspace(1) nocapture %gptr, ptr addrspace(3) nocapture %lptr, i32 %size, i32 %offset, i32 %aux)
9+
10+
; FIXME: vmcnt(0) is too strong; it should use vmcnt(2) before the first
11+
; ds_read_b32 and vmcnt(0) before the second.
12+
; FIXME: GFX10 does not get a waitcount at all.
13+
14+
; GCN-LABEL: {{^}}buffer_load_lds_dword_2_arrays:
15+
; GCN-COUNT-4: buffer_load_dword
16+
; GFX9: s_waitcnt vmcnt(0)
17+
18+
; FIXME: GFX10 is missing the s_waitcnt that should precede the first ds_read_b32.
19+
; GFX10-NOT: s_waitcnt
20+
21+
; GCN: ds_read_b32
22+
23+
; FIXME: ideally an s_waitcnt vmcnt(0) would appear here, before the second ds_read_b32.
24+
; GCN-NOT: s_waitcnt
25+
26+
; GCN: ds_read_b32
27+
; Test kernel: issues four raw.buffer.load.lds calls (size operand 4, voffset
; 0/4/8/12), the first two with @lds.0 and the last two with @lds.1 as the
; addrspace(3) pointer, then reads one float from each array with a wave
; barrier between the reads and stores both values to %out as a <2 x float>.
define amdgpu_kernel void @buffer_load_lds_dword_2_arrays(<4 x i32> %rsrc, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
main_body:
  ; Operands after the LDS pointer follow the declaration order:
  ; size, voffset, soffset, offset, aux.
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 8, i32 0, i32 0, i32 0)
  call void @llvm.amdgcn.raw.buffer.load.lds(<4 x i32> %rsrc, ptr addrspace(3) @lds.1, i32 4, i32 12, i32 0, i32 0, i32 0)
  ; Each array is indexed by its own kernel argument, so the two reads below
  ; touch different arrays at runtime-chosen elements.
  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
  ; The wave barrier sits between the read of @lds.0 and the read of @lds.1.
  call void @llvm.amdgcn.wave.barrier()
  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
  ; Seed the vector with poison rather than undef: both lanes are overwritten
  ; below, and this matches the poison initializers used for the globals.
  %tmp.0 = insertelement <2 x float> poison, float %val.0, i32 0
  %res = insertelement <2 x float> %tmp.0, float %val.1, i32 1
  store <2 x float> %res, ptr addrspace(1) %out
  ret void
}
43+
44+
; On gfx9 if there is a pending FLAT operation, and this is a VMem or LGKM
45+
; waitcnt and the target can report early completion, then we need to force a waitcnt 0.
46+
47+
; GCN-LABEL: {{^}}global_load_lds_dword_2_arrays:
48+
; GCN-COUNT-4: global_load_dword
49+
; GFX9: s_waitcnt vmcnt(0)
50+
; GFX9-COUNT-2: ds_read_b32
51+
52+
; FIXME: GFX10 is missing the s_waitcnt that should precede the first ds_read_b32.
53+
; GFX10-NOT: s_waitcnt
54+
55+
; GFX10: ds_read_b32
56+
57+
; FIXME: GFX10 is missing the s_waitcnt that should precede the second ds_read_b32.
58+
; GFX10-NOT: s_waitcnt
59+
60+
; GFX10: ds_read_b32
61+
; Test kernel: issues four global.load.lds calls from %gptr (size operand 4,
; offset 0/4/8/12), the first two with @lds.0 and the last two with @lds.1 as
; the addrspace(3) pointer, then reads one float from each array with a wave
; barrier between the reads and stores both values to %out as a <2 x float>.
define amdgpu_kernel void @global_load_lds_dword_2_arrays(ptr addrspace(1) nocapture %gptr, i32 %i1, i32 %i2, ptr addrspace(1) %out) {
main_body:
  ; Operands after the two pointers follow the declaration order:
  ; size, offset, aux.
  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 0, i32 0)
  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.0, i32 4, i32 4, i32 0)
  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 8, i32 0)
  call void @llvm.amdgcn.global.load.lds(ptr addrspace(1) %gptr, ptr addrspace(3) @lds.1, i32 4, i32 12, i32 0)
  ; Each array is indexed by its own kernel argument, so the two reads below
  ; touch different arrays at runtime-chosen elements.
  %gep.0 = getelementptr float, ptr addrspace(3) @lds.0, i32 %i1
  %gep.1 = getelementptr float, ptr addrspace(3) @lds.1, i32 %i2
  %val.0 = load float, ptr addrspace(3) %gep.0, align 4
  ; The wave barrier sits between the read of @lds.0 and the read of @lds.1.
  call void @llvm.amdgcn.wave.barrier()
  %val.1 = load float, ptr addrspace(3) %gep.1, align 4
  ; Seed the vector with poison rather than undef: both lanes are overwritten
  ; below, and this matches the poison initializers used for the globals.
  %tmp.0 = insertelement <2 x float> poison, float %val.0, i32 0
  %res = insertelement <2 x float> %tmp.0, float %val.1, i32 1
  store <2 x float> %res, ptr addrspace(1) %out
  ret void
}
77+
78+
declare void @llvm.amdgcn.wave.barrier()

0 commit comments

Comments
 (0)