3
3
4
4
@lds.0 = internal addrspace (3 ) global [64 x float ] poison, align 16
5
5
@lds.1 = internal addrspace (3 ) global [64 x float ] poison, align 16
6
+ @lds.2 = internal addrspace (3 ) global [64 x float ] poison, align 16
7
+ @lds.3 = internal addrspace (3 ) global [64 x float ] poison, align 16
8
+ @lds.4 = internal addrspace (3 ) global [64 x float ] poison, align 16
9
+ @lds.5 = internal addrspace (3 ) global [64 x float ] poison, align 16
10
+ @lds.6 = internal addrspace (3 ) global [64 x float ] poison, align 16
11
+ @lds.7 = internal addrspace (3 ) global [64 x float ] poison, align 16
12
+ @lds.8 = internal addrspace (3 ) global [64 x float ] poison, align 16
13
+ @lds.9 = internal addrspace (3 ) global [64 x float ] poison, align 16
6
14
7
15
declare void @llvm.amdgcn.raw.buffer.load.lds (<4 x i32 > %rsrc , ptr addrspace (3 ) nocapture , i32 %size , i32 %voffset , i32 %soffset , i32 %offset , i32 %aux )
8
16
declare void @llvm.amdgcn.global.load.lds (ptr addrspace (1 ) nocapture %gptr , ptr addrspace (3 ) nocapture %lptr , i32 %size , i32 %offset , i32 %aux )
9
17
10
- ; FIXME: vmcnt(0) is too strong, it shall use vmcnt(2) before the first
11
- ; ds_read_b32 and vmcnt(0) before the second.
12
-
13
18
; GCN-LABEL: {{^}}buffer_load_lds_dword_2_arrays:
14
19
; GCN-COUNT-4: buffer_load_dword
15
- ; GCN: s_waitcnt vmcnt(0 )
20
+ ; GCN: s_waitcnt vmcnt(2 )
16
21
; GCN: ds_read_b32
17
-
18
- ; FIXME:
19
- ; GCN-NOT: s_waitcnt
20
-
22
+ ; GCN: s_waitcnt vmcnt(0)
21
23
; GCN: ds_read_b32
22
24
define amdgpu_kernel void @buffer_load_lds_dword_2_arrays (<4 x i32 > %rsrc , i32 %i1 , i32 %i2 , ptr addrspace (1 ) %out ) {
23
25
main_body:
@@ -43,15 +45,9 @@ main_body:
43
45
; GCN-COUNT-4: global_load_dword
44
46
; GFX9: s_waitcnt vmcnt(0)
45
47
; GFX9-COUNT-2: ds_read_b32
46
-
47
- ; FIXME: can be vmcnt(2)
48
-
49
- ; GFX10: s_waitcnt vmcnt(0)
48
+ ; GFX10: s_waitcnt vmcnt(2)
50
49
; GFX10: ds_read_b32
51
-
52
- ; FIXME:
53
- ; GFX10-NOT: s_waitcnt
54
-
50
+ ; GFX10: s_waitcnt vmcnt(0)
55
51
; GFX10: ds_read_b32
56
52
define amdgpu_kernel void @global_load_lds_dword_2_arrays (ptr addrspace (1 ) nocapture %gptr , i32 %i1 , i32 %i2 , ptr addrspace (1 ) %out ) {
57
53
main_body:
@@ -70,4 +66,89 @@ main_body:
70
66
ret void
71
67
}
72
68
69
+ ; There are 8 pseudo registers defined to track LDS DMA dependencies.
70
+ ; When exhausted we default to vmcnt(0).
71
+
72
+ ; GCN-LABEL: {{^}}buffer_load_lds_dword_10_arrays:
73
+ ; GCN-COUNT-10: buffer_load_dword
74
+ ; GCN: s_waitcnt vmcnt(8)
75
+ ; GCN: ds_read_b32
76
+ ; GCN: s_waitcnt vmcnt(7)
77
+ ; GCN: ds_read_b32
78
+ ; GCN: s_waitcnt vmcnt(6)
79
+ ; GCN: ds_read_b32
80
+ ; GCN: s_waitcnt vmcnt(5)
81
+ ; GCN: ds_read_b32
82
+ ; GCN: s_waitcnt vmcnt(4)
83
+ ; GCN: ds_read_b32
84
+ ; GCN: s_waitcnt vmcnt(3)
85
+ ; GCN: ds_read_b32
86
+ ; GCN: s_waitcnt vmcnt(2)
87
+ ; GCN-NOT: s_waitcnt vmcnt
88
+ ; GCN: ds_read_b32
89
+ ; GCN: s_waitcnt vmcnt(0)
90
+ ; GCN: ds_read_b32
91
+ define amdgpu_kernel void @buffer_load_lds_dword_10_arrays (<4 x i32 > %rsrc , i32 %i1 , i32 %i2 , i32 %i3 , i32 %i4 , i32 %i5 , i32 %i6 , i32 %i7 , i32 %i8 , i32 %i9 , ptr addrspace (1 ) %out ) {
92
+ main_body:
93
+ call void @llvm.amdgcn.raw.buffer.load.lds (<4 x i32 > %rsrc , ptr addrspace (3 ) @lds.0 , i32 4 , i32 0 , i32 0 , i32 0 , i32 0 )
94
+ call void @llvm.amdgcn.raw.buffer.load.lds (<4 x i32 > %rsrc , ptr addrspace (3 ) @lds.1 , i32 4 , i32 0 , i32 0 , i32 0 , i32 0 )
95
+ call void @llvm.amdgcn.raw.buffer.load.lds (<4 x i32 > %rsrc , ptr addrspace (3 ) @lds.2 , i32 4 , i32 0 , i32 0 , i32 0 , i32 0 )
96
+ call void @llvm.amdgcn.raw.buffer.load.lds (<4 x i32 > %rsrc , ptr addrspace (3 ) @lds.3 , i32 4 , i32 0 , i32 0 , i32 0 , i32 0 )
97
+ call void @llvm.amdgcn.raw.buffer.load.lds (<4 x i32 > %rsrc , ptr addrspace (3 ) @lds.4 , i32 4 , i32 0 , i32 0 , i32 0 , i32 0 )
98
+ call void @llvm.amdgcn.raw.buffer.load.lds (<4 x i32 > %rsrc , ptr addrspace (3 ) @lds.5 , i32 4 , i32 0 , i32 0 , i32 0 , i32 0 )
99
+ call void @llvm.amdgcn.raw.buffer.load.lds (<4 x i32 > %rsrc , ptr addrspace (3 ) @lds.6 , i32 4 , i32 0 , i32 0 , i32 0 , i32 0 )
100
+ call void @llvm.amdgcn.raw.buffer.load.lds (<4 x i32 > %rsrc , ptr addrspace (3 ) @lds.7 , i32 4 , i32 0 , i32 0 , i32 0 , i32 0 )
101
+ call void @llvm.amdgcn.raw.buffer.load.lds (<4 x i32 > %rsrc , ptr addrspace (3 ) @lds.8 , i32 4 , i32 0 , i32 0 , i32 0 , i32 0 )
102
+ call void @llvm.amdgcn.raw.buffer.load.lds (<4 x i32 > %rsrc , ptr addrspace (3 ) @lds.9 , i32 4 , i32 0 , i32 0 , i32 0 , i32 0 )
103
+ %gep.0 = getelementptr float , ptr addrspace (3 ) @lds.0 , i32 %i1
104
+ %gep.1 = getelementptr float , ptr addrspace (3 ) @lds.1 , i32 %i2
105
+ %gep.2 = getelementptr float , ptr addrspace (3 ) @lds.2 , i32 %i2
106
+ %gep.3 = getelementptr float , ptr addrspace (3 ) @lds.3 , i32 %i2
107
+ %gep.4 = getelementptr float , ptr addrspace (3 ) @lds.4 , i32 %i2
108
+ %gep.5 = getelementptr float , ptr addrspace (3 ) @lds.5 , i32 %i2
109
+ %gep.6 = getelementptr float , ptr addrspace (3 ) @lds.6 , i32 %i2
110
+ %gep.7 = getelementptr float , ptr addrspace (3 ) @lds.7 , i32 %i2
111
+ %gep.8 = getelementptr float , ptr addrspace (3 ) @lds.8 , i32 %i2
112
+ %gep.9 = getelementptr float , ptr addrspace (3 ) @lds.9 , i32 %i2
113
+ %val.0 = load float , ptr addrspace (3 ) %gep.0 , align 4
114
+ call void @llvm.amdgcn.wave.barrier ()
115
+ %val.1 = load float , ptr addrspace (3 ) %gep.1 , align 4
116
+ call void @llvm.amdgcn.wave.barrier ()
117
+ %val.2 = load float , ptr addrspace (3 ) %gep.2 , align 4
118
+ call void @llvm.amdgcn.wave.barrier ()
119
+ %val.3 = load float , ptr addrspace (3 ) %gep.3 , align 4
120
+ call void @llvm.amdgcn.wave.barrier ()
121
+ %val.4 = load float , ptr addrspace (3 ) %gep.4 , align 4
122
+ call void @llvm.amdgcn.wave.barrier ()
123
+ %val.5 = load float , ptr addrspace (3 ) %gep.5 , align 4
124
+ call void @llvm.amdgcn.wave.barrier ()
125
+ %val.6 = load float , ptr addrspace (3 ) %gep.6 , align 4
126
+ call void @llvm.amdgcn.wave.barrier ()
127
+ %val.7 = load float , ptr addrspace (3 ) %gep.7 , align 4
128
+ call void @llvm.amdgcn.wave.barrier ()
129
+ %val.8 = load float , ptr addrspace (3 ) %gep.8 , align 4
130
+ call void @llvm.amdgcn.wave.barrier ()
131
+ %val.9 = load float , ptr addrspace (3 ) %gep.9 , align 4
132
+ %out.gep.1 = getelementptr float , ptr addrspace (1 ) %out , i32 1
133
+ %out.gep.2 = getelementptr float , ptr addrspace (1 ) %out , i32 2
134
+ %out.gep.3 = getelementptr float , ptr addrspace (1 ) %out , i32 3
135
+ %out.gep.4 = getelementptr float , ptr addrspace (1 ) %out , i32 4
136
+ %out.gep.5 = getelementptr float , ptr addrspace (1 ) %out , i32 5
137
+ %out.gep.6 = getelementptr float , ptr addrspace (1 ) %out , i32 6
138
+ %out.gep.7 = getelementptr float , ptr addrspace (1 ) %out , i32 7
139
+ %out.gep.8 = getelementptr float , ptr addrspace (1 ) %out , i32 8
140
+ %out.gep.9 = getelementptr float , ptr addrspace (1 ) %out , i32 9
141
+ store float %val.0 , ptr addrspace (1 ) %out
142
+ store float %val.1 , ptr addrspace (1 ) %out.gep.1
143
+ store float %val.2 , ptr addrspace (1 ) %out.gep.2
144
+ store float %val.3 , ptr addrspace (1 ) %out.gep.3
145
+ store float %val.4 , ptr addrspace (1 ) %out.gep.4
146
+ store float %val.5 , ptr addrspace (1 ) %out.gep.5
147
+ store float %val.6 , ptr addrspace (1 ) %out.gep.6
148
+ store float %val.7 , ptr addrspace (1 ) %out.gep.7
149
+ store float %val.8 , ptr addrspace (1 ) %out.gep.8
150
+ store float %val.9 , ptr addrspace (1 ) %out.gep.9
151
+ ret void
152
+ }
153
+
73
154
declare void @llvm.amdgcn.wave.barrier ()
0 commit comments