Skip to content

Commit 206f78d

Browse files
authored
[OpenMP] Use 'gpuintrin.h' definitions for simple block identifiers (#131631)
Summary: This patch ports the runtime to use `gpuintrin.h` instead of calling the builtins for most things. The `lanemask_gt` stuff was left for now with a fallback. AMD version for Ron https://gist.github.com/jhuber6/42014d635b9a8158727640876bf47226.
1 parent c02b935 commit 206f78d

File tree

2 files changed

+49
-212
lines changed

2 files changed

+49
-212
lines changed

offload/DeviceRTL/include/Mapping.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,9 @@ namespace ompx {
1919
namespace mapping {
2020

2121
enum {
22-
DIM_X = 0,
23-
DIM_Y = 1,
24-
DIM_Z = 2,
22+
DIM_X = __GPU_X_DIM,
23+
DIM_Y = __GPU_Y_DIM,
24+
DIM_Z = __GPU_Z_DIM,
2525
};
2626

2727
inline constexpr uint32_t MaxThreadsPerTeam = 1024;

offload/DeviceRTL/src/Mapping.cpp

Lines changed: 46 additions & 209 deletions
Original file line numberDiff line numberDiff line change
@@ -14,198 +14,12 @@
1414
#include "DeviceUtils.h"
1515
#include "Interface.h"
1616
#include "State.h"
17+
#include "gpuintrin.h"
1718

1819
#include "llvm/Frontend/OpenMP/OMPGridValues.h"
1920

2021
using namespace ompx;
2122

22-
namespace ompx {
23-
namespace impl {
24-
25-
/// AMDGCN Implementation
26-
///
27-
///{
28-
#ifdef __AMDGPU__
29-
30-
uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
31-
32-
uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
33-
switch (Dim) {
34-
case 0:
35-
return __builtin_amdgcn_workgroup_size_x();
36-
case 1:
37-
return __builtin_amdgcn_workgroup_size_y();
38-
case 2:
39-
return __builtin_amdgcn_workgroup_size_z();
40-
};
41-
UNREACHABLE("Dim outside range!");
42-
}
43-
44-
LaneMaskTy activemask() { return __builtin_amdgcn_read_exec(); }
45-
46-
LaneMaskTy lanemaskLT() {
47-
uint32_t Lane = mapping::getThreadIdInWarp();
48-
int64_t Ballot = mapping::activemask();
49-
uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
50-
return Mask & Ballot;
51-
}
52-
53-
LaneMaskTy lanemaskGT() {
54-
uint32_t Lane = mapping::getThreadIdInWarp();
55-
if (Lane == (mapping::getWarpSize() - 1))
56-
return 0;
57-
int64_t Ballot = mapping::activemask();
58-
uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
59-
return Mask & Ballot;
60-
}
61-
62-
uint32_t getThreadIdInWarp() {
63-
return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u));
64-
}
65-
66-
uint32_t getThreadIdInBlock(int32_t Dim) {
67-
switch (Dim) {
68-
case 0:
69-
return __builtin_amdgcn_workitem_id_x();
70-
case 1:
71-
return __builtin_amdgcn_workitem_id_y();
72-
case 2:
73-
return __builtin_amdgcn_workitem_id_z();
74-
};
75-
UNREACHABLE("Dim outside range!");
76-
}
77-
78-
uint32_t getNumberOfThreadsInKernel() {
79-
return __builtin_amdgcn_grid_size_x() * __builtin_amdgcn_grid_size_y() *
80-
__builtin_amdgcn_grid_size_z();
81-
}
82-
83-
uint32_t getBlockIdInKernel(int32_t Dim) {
84-
switch (Dim) {
85-
case 0:
86-
return __builtin_amdgcn_workgroup_id_x();
87-
case 1:
88-
return __builtin_amdgcn_workgroup_id_y();
89-
case 2:
90-
return __builtin_amdgcn_workgroup_id_z();
91-
};
92-
UNREACHABLE("Dim outside range!");
93-
}
94-
95-
uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
96-
switch (Dim) {
97-
case 0:
98-
return __builtin_amdgcn_grid_size_x() / __builtin_amdgcn_workgroup_size_x();
99-
case 1:
100-
return __builtin_amdgcn_grid_size_y() / __builtin_amdgcn_workgroup_size_y();
101-
case 2:
102-
return __builtin_amdgcn_grid_size_z() / __builtin_amdgcn_workgroup_size_z();
103-
};
104-
UNREACHABLE("Dim outside range!");
105-
}
106-
107-
uint32_t getWarpIdInBlock() {
108-
return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
109-
}
110-
111-
uint32_t getNumberOfWarpsInBlock() {
112-
return mapping::getNumberOfThreadsInBlock() / mapping::getWarpSize();
113-
}
114-
115-
#endif
116-
///}
117-
118-
/// NVPTX Implementation
119-
///
120-
///{
121-
#ifdef __NVPTX__
122-
123-
uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
124-
switch (Dim) {
125-
case 0:
126-
return __nvvm_read_ptx_sreg_ntid_x();
127-
case 1:
128-
return __nvvm_read_ptx_sreg_ntid_y();
129-
case 2:
130-
return __nvvm_read_ptx_sreg_ntid_z();
131-
};
132-
UNREACHABLE("Dim outside range!");
133-
}
134-
135-
uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
136-
137-
LaneMaskTy activemask() { return __nvvm_activemask(); }
138-
139-
LaneMaskTy lanemaskLT() { return __nvvm_read_ptx_sreg_lanemask_lt(); }
140-
141-
LaneMaskTy lanemaskGT() { return __nvvm_read_ptx_sreg_lanemask_gt(); }
142-
143-
uint32_t getThreadIdInBlock(int32_t Dim) {
144-
switch (Dim) {
145-
case 0:
146-
return __nvvm_read_ptx_sreg_tid_x();
147-
case 1:
148-
return __nvvm_read_ptx_sreg_tid_y();
149-
case 2:
150-
return __nvvm_read_ptx_sreg_tid_z();
151-
};
152-
UNREACHABLE("Dim outside range!");
153-
}
154-
155-
uint32_t getThreadIdInWarp() { return __nvvm_read_ptx_sreg_laneid(); }
156-
157-
uint32_t getBlockIdInKernel(int32_t Dim) {
158-
switch (Dim) {
159-
case 0:
160-
return __nvvm_read_ptx_sreg_ctaid_x();
161-
case 1:
162-
return __nvvm_read_ptx_sreg_ctaid_y();
163-
case 2:
164-
return __nvvm_read_ptx_sreg_ctaid_z();
165-
};
166-
UNREACHABLE("Dim outside range!");
167-
}
168-
169-
uint32_t getNumberOfBlocksInKernel(int32_t Dim) {
170-
switch (Dim) {
171-
case 0:
172-
return __nvvm_read_ptx_sreg_nctaid_x();
173-
case 1:
174-
return __nvvm_read_ptx_sreg_nctaid_y();
175-
case 2:
176-
return __nvvm_read_ptx_sreg_nctaid_z();
177-
};
178-
UNREACHABLE("Dim outside range!");
179-
}
180-
181-
uint32_t getNumberOfThreadsInKernel() {
182-
return impl::getNumberOfThreadsInBlock(0) *
183-
impl::getNumberOfBlocksInKernel(0) *
184-
impl::getNumberOfThreadsInBlock(1) *
185-
impl::getNumberOfBlocksInKernel(1) *
186-
impl::getNumberOfThreadsInBlock(2) *
187-
impl::getNumberOfBlocksInKernel(2);
188-
}
189-
190-
uint32_t getWarpIdInBlock() {
191-
return impl::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
192-
}
193-
194-
uint32_t getNumberOfWarpsInBlock() {
195-
return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
196-
mapping::getWarpSize();
197-
}
198-
199-
#endif
200-
///}
201-
202-
} // namespace impl
203-
} // namespace ompx
204-
205-
/// We have to be deliberate about the distinction of `mapping::` and `impl::`
206-
/// below to avoid repeating assumptions or including irrelevant ones.
207-
///{
208-
20923
static bool isInLastWarp() {
21024
uint32_t MainTId = (mapping::getNumberOfThreadsInBlock() - 1) &
21125
~(mapping::getWarpSize() - 1);
@@ -236,64 +50,87 @@ bool mapping::isLeaderInWarp() {
23650
return utils::popc(Active & LaneMaskLT) == 0;
23751
}
23852

239-
LaneMaskTy mapping::activemask() { return impl::activemask(); }
53+
LaneMaskTy mapping::activemask() { return __gpu_lane_mask(); }
24054

241-
LaneMaskTy mapping::lanemaskLT() { return impl::lanemaskLT(); }
55+
LaneMaskTy mapping::lanemaskLT() {
56+
#ifdef __NVPTX__
57+
return __nvvm_read_ptx_sreg_lanemask_lt();
58+
#else
59+
uint32_t Lane = mapping::getThreadIdInWarp();
60+
int64_t Ballot = mapping::activemask();
61+
uint64_t Mask = ((uint64_t)1 << Lane) - (uint64_t)1;
62+
return Mask & Ballot;
63+
#endif
64+
}
24265

243-
LaneMaskTy mapping::lanemaskGT() { return impl::lanemaskGT(); }
66+
LaneMaskTy mapping::lanemaskGT() {
67+
#ifdef __NVPTX__
68+
return __nvvm_read_ptx_sreg_lanemask_gt();
69+
#else
70+
uint32_t Lane = mapping::getThreadIdInWarp();
71+
if (Lane == (mapping::getWarpSize() - 1))
72+
return 0;
73+
int64_t Ballot = mapping::activemask();
74+
uint64_t Mask = (~((uint64_t)0)) << (Lane + 1);
75+
return Mask & Ballot;
76+
#endif
77+
}
24478

24579
uint32_t mapping::getThreadIdInWarp() {
246-
uint32_t ThreadIdInWarp = impl::getThreadIdInWarp();
247-
ASSERT(ThreadIdInWarp < impl::getWarpSize(), nullptr);
80+
uint32_t ThreadIdInWarp = __gpu_lane_id();
81+
ASSERT(ThreadIdInWarp < mapping::getWarpSize(), nullptr);
24882
return ThreadIdInWarp;
24983
}
25084

25185
uint32_t mapping::getThreadIdInBlock(int32_t Dim) {
252-
uint32_t ThreadIdInBlock = impl::getThreadIdInBlock(Dim);
86+
uint32_t ThreadIdInBlock = __gpu_thread_id(Dim);
25387
return ThreadIdInBlock;
25488
}
25589

256-
uint32_t mapping::getWarpSize() { return impl::getWarpSize(); }
90+
uint32_t mapping::getWarpSize() { return __gpu_num_lanes(); }
25791

25892
uint32_t mapping::getMaxTeamThreads(bool IsSPMD) {
25993
uint32_t BlockSize = mapping::getNumberOfThreadsInBlock();
26094
// If we are in SPMD mode, remove one warp.
261-
return BlockSize - (!IsSPMD * impl::getWarpSize());
95+
return BlockSize - (!IsSPMD * mapping::getWarpSize());
26296
}
26397
uint32_t mapping::getMaxTeamThreads() {
26498
return mapping::getMaxTeamThreads(mapping::isSPMDMode());
26599
}
266100

267101
uint32_t mapping::getNumberOfThreadsInBlock(int32_t Dim) {
268-
return impl::getNumberOfThreadsInBlock(Dim);
102+
return __gpu_num_threads(Dim);
269103
}
270104

271105
uint32_t mapping::getNumberOfThreadsInKernel() {
272-
return impl::getNumberOfThreadsInKernel();
106+
return mapping::getNumberOfThreadsInBlock(0) *
107+
mapping::getNumberOfBlocksInKernel(0) *
108+
mapping::getNumberOfThreadsInBlock(1) *
109+
mapping::getNumberOfBlocksInKernel(1) *
110+
mapping::getNumberOfThreadsInBlock(2) *
111+
mapping::getNumberOfBlocksInKernel(2);
273112
}
274113

275114
uint32_t mapping::getWarpIdInBlock() {
276-
uint32_t WarpID = impl::getWarpIdInBlock();
277-
ASSERT(WarpID < impl::getNumberOfWarpsInBlock(), nullptr);
115+
uint32_t WarpID =
116+
mapping::getThreadIdInBlock(mapping::DIM_X) / mapping::getWarpSize();
117+
ASSERT(WarpID < mapping::getNumberOfWarpsInBlock(), nullptr);
278118
return WarpID;
279119
}
280120

281121
uint32_t mapping::getBlockIdInKernel(int32_t Dim) {
282-
uint32_t BlockId = impl::getBlockIdInKernel(Dim);
283-
ASSERT(BlockId < impl::getNumberOfBlocksInKernel(Dim), nullptr);
122+
uint32_t BlockId = __gpu_block_id(Dim);
123+
ASSERT(BlockId < mapping::getNumberOfBlocksInKernel(Dim), nullptr);
284124
return BlockId;
285125
}
286126

287127
uint32_t mapping::getNumberOfWarpsInBlock() {
288-
uint32_t NumberOfWarpsInBlocks = impl::getNumberOfWarpsInBlock();
289-
ASSERT(impl::getWarpIdInBlock() < NumberOfWarpsInBlocks, nullptr);
290-
return NumberOfWarpsInBlocks;
128+
return (mapping::getNumberOfThreadsInBlock() + mapping::getWarpSize() - 1) /
129+
mapping::getWarpSize();
291130
}
292131

293132
uint32_t mapping::getNumberOfBlocksInKernel(int32_t Dim) {
294-
uint32_t NumberOfBlocks = impl::getNumberOfBlocksInKernel(Dim);
295-
ASSERT(impl::getBlockIdInKernel(Dim) < NumberOfBlocks, nullptr);
296-
return NumberOfBlocks;
133+
return __gpu_num_blocks(Dim);
297134
}
298135

299136
uint32_t mapping::getNumberOfProcessorElements() {
@@ -326,11 +163,11 @@ extern "C" {
326163
}
327164

328165
[[gnu::noinline]] uint32_t __kmpc_get_hardware_num_threads_in_block() {
329-
return impl::getNumberOfThreadsInBlock(mapping::DIM_X);
166+
return mapping::getNumberOfThreadsInBlock(mapping::DIM_X);
330167
}
331168

332169
[[gnu::noinline]] uint32_t __kmpc_get_warp_size() {
333-
return impl::getWarpSize();
170+
return mapping::getWarpSize();
334171
}
335172
}
336173

0 commit comments

Comments
 (0)