|
14 | 14 | #include "DeviceUtils.h"
|
15 | 15 | #include "Interface.h"
|
16 | 16 | #include "State.h"
|
| 17 | +#include "gpuintrin.h" |
17 | 18 |
|
18 | 19 | #include "llvm/Frontend/OpenMP/OMPGridValues.h"
|
19 | 20 |
|
20 | 21 | using namespace ompx;
|
21 | 22 |
|
22 |
| -namespace ompx { |
23 |
| -namespace impl { |
24 |
| - |
25 |
| -/// AMDGCN Implementation |
26 |
| -/// |
27 |
| -///{ |
28 |
| -#ifdef __AMDGPU__ |
29 |
| - |
30 |
| -uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); } |
31 |
| - |
32 |
| -uint32_t getNumberOfThreadsInBlock(int32_t Dim) { |
33 |
| - switch (Dim) { |
34 |
| - case 0: |
35 |
| - return __builtin_amdgcn_workgroup_size_x(); |
36 |
| - case 1: |
37 |
| - return __builtin_amdgcn_workgroup_size_y(); |
38 |
| - case 2: |
39 |
| - return __builtin_amdgcn_workgroup_size_z(); |
40 |
| - }; |
41 |
| - UNREACHABLE("Dim outside range!"); |
42 |
| -} |
43 |
| - |
44 |
| -LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); } |
45 |
| - |
46 |
| -LaneMaskTy lanemaskLT() { |
47 |
| - uint32_t Lane = mapping::getThreadIdInWarp(); |
48 |
| - int64_t Ballot = mapping::activemask(); |
49 |
| - uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1; |
50 |
| - return Mask & Ballot; |
51 |
| -} |
52 |
| - |
53 |
| -LaneMaskTy lanemaskGT() { |
54 |
| - uint32_t Lane = mapping::getThreadIdInWarp(); |
55 |
| - if (Lane == (mapping::getWarpSize() - 1)) |
56 |
| - return 0; |
57 |
| - int64_t Ballot = mapping::activemask(); |
58 |
| - uint64_t Mask = (~((uint64_t)0)) << (Lane + 1); |
59 |
| - return Mask & Ballot; |
60 |
| -} |
61 |
| - |
62 |
| -uint32_t getThreadIdInWarp() { |
63 |
| - return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); |
64 |
| -} |
65 |
| - |
66 |
| -uint32_t getThreadIdInBlock(int32_t Dim) { |
67 |
| - switch (Dim) { |
68 |
| - case 0: |
69 |
| - return __builtin_amdgcn_workitem_id_x(); |
70 |
| - case 1: |
71 |
| - return __builtin_amdgcn_workitem_id_y(); |
72 |
| - case 2: |
73 |
| - return __builtin_amdgcn_workitem_id_z(); |
74 |
| - }; |
75 |
| - UNREACHABLE("Dim outside range!"); |
76 |
| -} |
77 |
| - |
78 |
| -uint32_t getNumberOfThreadsInKernel() { |
79 |
| - return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() * |
80 |
| - __builtin_amdgcn_grid_size_z(); |
81 |
| -} |
82 |
| - |
83 |
| -uint32_t getBlockIdInKernel(int32_t Dim) { |
84 |
| - switch (Dim) { |
85 |
| - case 0: |
86 |
| - return __builtin_amdgcn_workgroup_id_x(); |
87 |
| - case 1: |
88 |
| - return __builtin_amdgcn_workgroup_id_y(); |
89 |
| - case 2: |
90 |
| - return __builtin_amdgcn_workgroup_id_z(); |
91 |
| - }; |
92 |
| - UNREACHABLE("Dim outside range!"); |
93 |
| -} |
94 |
| - |
95 |
| -uint32_t getNumberOfBlocksInKernel(int32_t Dim) { |
96 |
| - switch (Dim) { |
97 |
| - case 0: |
98 |
| - return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x(); |
99 |
| - case 1: |
100 |
| - return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y(); |
101 |
| - case 2: |
102 |
| - return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z(); |
103 |
| - }; |
104 |
| - UNREACHABLE("Dim outside range!"); |
105 |
| -} |
106 |
| - |
107 |
| -uint32_t getWarpIdInBlock() { |
108 |
| - return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize(); |
109 |
| -} |
110 |
| - |
111 |
| -uint32_t getNumberOfWarpsInBlock() { |
112 |
| - return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize(); |
113 |
| -} |
114 |
| - |
115 |
| -#endif |
116 |
| -///} |
117 |
| - |
118 |
| -/// NVPTX Implementation |
119 |
| -/// |
120 |
| -///{ |
121 |
| -#ifdef __NVPTX__ |
122 |
| - |
123 |
| -uint32_t getNumberOfThreadsInBlock(int32_t Dim) { |
124 |
| - switch (Dim) { |
125 |
| - case 0: |
126 |
| - return __nvvm_read_ptx_sreg_ntid_x(); |
127 |
| - case 1: |
128 |
| - return __nvvm_read_ptx_sreg_ntid_y(); |
129 |
| - case 2: |
130 |
| - return __nvvm_read_ptx_sreg_ntid_z(); |
131 |
| - }; |
132 |
| - UNREACHABLE("Dim outside range!"); |
133 |
| -} |
134 |
| - |
135 |
| -uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); } |
136 |
| - |
137 |
| -LaneMaskTy activemask() { return __nvvm_activemask(); } |
138 |
| - |
139 |
| -LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); } |
140 |
| - |
141 |
| -LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); } |
142 |
| - |
143 |
| -uint32_t getThreadIdInBlock(int32_t Dim) { |
144 |
| - switch (Dim) { |
145 |
| - case 0: |
146 |
| - return __nvvm_read_ptx_sreg_tid_x(); |
147 |
| - case 1: |
148 |
| - return __nvvm_read_ptx_sreg_tid_y(); |
149 |
| - case 2: |
150 |
| - return __nvvm_read_ptx_sreg_tid_z(); |
151 |
| - }; |
152 |
| - UNREACHABLE("Dim outside range!"); |
153 |
| -} |
154 |
| - |
155 |
| -uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); } |
156 |
| - |
157 |
| -uint32_t getBlockIdInKernel(int32_t Dim) { |
158 |
| - switch (Dim) { |
159 |
| - case 0: |
160 |
| - return __nvvm_read_ptx_sreg_ctaid_x(); |
161 |
| - case 1: |
162 |
| - return __nvvm_read_ptx_sreg_ctaid_y(); |
163 |
| - case 2: |
164 |
| - return __nvvm_read_ptx_sreg_ctaid_z(); |
165 |
| - }; |
166 |
| - UNREACHABLE("Dim outside range!"); |
167 |
| -} |
168 |
| - |
169 |
| -uint32_t getNumberOfBlocksInKernel(int32_t Dim) { |
170 |
| - switch (Dim) { |
171 |
| - case 0: |
172 |
| - return __nvvm_read_ptx_sreg_nctaid_x(); |
173 |
| - case 1: |
174 |
| - return __nvvm_read_ptx_sreg_nctaid_y(); |
175 |
| - case 2: |
176 |
| - return __nvvm_read_ptx_sreg_nctaid_z(); |
177 |
| - }; |
178 |
| - UNREACHABLE("Dim outside range!"); |
179 |
| -} |
180 |
| - |
181 |
| -uint32_t getNumberOfThreadsInKernel() { |
182 |
| - return impl::getNumberOfThreadsInBlock(0) * |
183 |
| - impl::getNumberOfBlocksInKernel(0) * |
184 |
| - impl::getNumberOfThreadsInBlock(1) * |
185 |
| - impl::getNumberOfBlocksInKernel(1) * |
186 |
| - impl::getNumberOfThreadsInBlock(2) * |
187 |
| - impl::getNumberOfBlocksInKernel(2); |
188 |
| -} |
189 |
| - |
190 |
| -uint32_t getWarpIdInBlock() { |
191 |
| - return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize(); |
192 |
| -} |
193 |
| - |
194 |
| -uint32_t getNumberOfWarpsInBlock() { |
195 |
| - return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) / |
196 |
| - mapping::getWarpSize(); |
197 |
| -} |
198 |
| - |
199 |
| -#endif |
200 |
| -///} |
201 |
| - |
202 |
| -} // namespace impl |
203 |
| -} // namespace ompx |
204 |
| - |
205 |
| -/// We have to be deliberate about the distinction of `mapping::` and `impl::` |
206 |
| -/// below to avoid repeating assumptions or including irrelevant ones. |
207 |
| -///{ |
208 |
| - |
209 | 23 | static bool isInLastWarp() {
|
210 | 24 | uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
|
211 | 25 | ~(mapping::getWarpSize() - 1);
|
@@ -236,64 +50,87 @@ bool mapping::isLeaderInWarp() {
|
236 | 50 | return utils::popc(Active & LaneMaskLT) == 0;
|
237 | 51 | }
|
238 | 52 |
|
239 |
| -LaneMaskTy mapping::activemask() { return impl::activemask(); } |
| 53 | +LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); } |
240 | 54 |
|
241 |
| -LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); } |
| 55 | +LaneMaskTy mapping::lanemaskLT() { |
| 56 | +#ifdef __NVPTX__ |
| 57 | + return __nvvm_read_ptx_sreg_lanemask_lt(); |
| 58 | +#else |
| 59 | + uint32_t Lane = mapping::getThreadIdInWarp(); |
| 60 | + int64_t Ballot = mapping::activemask(); |
| 61 | + uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1; |
| 62 | + return Mask & Ballot; |
| 63 | +#endif |
| 64 | +} |
242 | 65 |
|
243 |
| -LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); } |
| 66 | +LaneMaskTy mapping::lanemaskGT() { |
| 67 | +#ifdef __NVPTX__ |
| 68 | + return __nvvm_read_ptx_sreg_lanemask_gt(); |
| 69 | +#else |
| 70 | + uint32_t Lane = mapping::getThreadIdInWarp(); |
| 71 | + if (Lane == (mapping::getWarpSize() - 1)) |
| 72 | + return 0; |
| 73 | + int64_t Ballot = mapping::activemask(); |
| 74 | + uint64_t Mask = (~((uint64_t)0)) << (Lane + 1); |
| 75 | + return Mask & Ballot; |
| 76 | +#endif |
| 77 | +} |
244 | 78 |
|
245 | 79 | uint32_t mapping::getThreadIdInWarp() {
|
246 |
| - uint32_t ThreadIdInWarp = impl::getThreadIdInWarp(); |
247 |
| - ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr); |
| 80 | + uint32_t ThreadIdInWarp = __gpu_lane_id(); |
| 81 | + ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr); |
248 | 82 | return ThreadIdInWarp;
|
249 | 83 | }
|
250 | 84 |
|
251 | 85 | uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
|
252 |
| - uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim); |
| 86 | + uint32_t ThreadIdInBlock = __gpu_thread_id(Dim); |
253 | 87 | return ThreadIdInBlock;
|
254 | 88 | }
|
255 | 89 |
|
256 |
| -uint32_t mapping::getWarpSize() { return impl::getWarpSize(); } |
| 90 | +uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); } |
257 | 91 |
|
258 | 92 | uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
|
259 | 93 | uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
|
260 | 94 | // If we are in SPMD mode, remove one warp.
|
261 |
| - return BlockSize - (!IsSPMD * impl::getWarpSize()); |
| 95 | + return BlockSize - (!IsSPMD * mapping::getWarpSize()); |
262 | 96 | }
|
263 | 97 | uint32_t mapping::getMaxTeamThreads() {
|
264 | 98 | return mapping::getMaxTeamThreads(mapping::isSPMDMode());
|
265 | 99 | }
|
266 | 100 |
|
267 | 101 | uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
|
268 |
| - return impl::getNumberOfThreadsInBlock(Dim); |
| 102 | + return __gpu_num_threads(Dim); |
269 | 103 | }
|
270 | 104 |
|
271 | 105 | uint32_t mapping::getNumberOfThreadsInKernel() {
|
272 |
| - return impl::getNumberOfThreadsInKernel(); |
| 106 | + return mapping::getNumberOfThreadsInBlock(0) * |
| 107 | + mapping::getNumberOfBlocksInKernel(0) * |
| 108 | + mapping::getNumberOfThreadsInBlock(1) * |
| 109 | + mapping::getNumberOfBlocksInKernel(1) * |
| 110 | + mapping::getNumberOfThreadsInBlock(2) * |
| 111 | + mapping::getNumberOfBlocksInKernel(2); |
273 | 112 | }
|
274 | 113 |
|
275 | 114 | uint32_t mapping::getWarpIdInBlock() {
|
276 |
| - uint32_t WarpID = impl::getWarpIdInBlock(); |
277 |
| - ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr); |
| 115 | + uint32_t WarpID = |
| 116 | + mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize(); |
| 117 | + ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr); |
278 | 118 | return WarpID;
|
279 | 119 | }
|
280 | 120 |
|
281 | 121 | uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
|
282 |
| - uint32_t BlockId = impl::getBlockIdInKernel(Dim); |
283 |
| - ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr); |
| 122 | + uint32_t BlockId = __gpu_block_id(Dim); |
| 123 | + ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr); |
284 | 124 | return BlockId;
|
285 | 125 | }
|
286 | 126 |
|
287 | 127 | uint32_t mapping::getNumberOfWarpsInBlock() {
|
288 |
| - uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock(); |
289 |
| - ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr); |
290 |
| - return NumberOfWarpsInBlocks; |
| 128 | + return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) / |
| 129 | + mapping::getWarpSize(); |
291 | 130 | }
|
292 | 131 |
|
293 | 132 | uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
|
294 |
| - uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim); |
295 |
| - ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr); |
296 |
| - return NumberOfBlocks; |
| 133 | + return __gpu_num_blocks(Dim); |
297 | 134 | }
|
298 | 135 |
|
299 | 136 | uint32_t mapping::getNumberOfProcessorElements() {
|
@@ -326,11 +163,11 @@ extern "C" {
|
326 | 163 | }
|
327 | 164 |
|
328 | 165 | [[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
|
329 |
| - return impl::getNumberOfThreadsInBlock(mapping::DIM_X); |
| 166 | + return mapping::getNumberOfThreadsInBlock(mapping::DIM_X); |
330 | 167 | }
|
331 | 168 |
|
332 | 169 | [[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
|
333 |
| - return impl::getWarpSize(); |
| 170 | + return mapping::getWarpSize(); |
334 | 171 | }
|
335 | 172 | }
|
336 | 173 |
|
|
0 commit comments