Skip to content

Commit 506ca19

Browse files
authored
[OpenMP] Remove use of '__AMDGCN_WAVEFRONT_SIZE' (#113156)
Summary: The `__AMDGCN_WAVEFRONT_SIZE` macro is going to be deprecated in #112849. This patch ports its uses to the builtin instead. The builtin isn't a compile-time constant, so this could slightly negatively affect codegen. There really should be an IR pass to turn it into a constant if the function has known attributes. Using the builtin is correct when it is only used to query the size, as we do here. Obviously, guarding w32/w64-specific code with this check would be broken.
1 parent 52755ac commit 506ca19

File tree

3 files changed

+34
-19
lines changed

3 files changed

+34
-19
lines changed

offload/DeviceRTL/src/Mapping.cpp

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,6 @@ namespace ompx {
2525
namespace impl {
2626

2727
// Forward declarations to be defined for AMDGCN and NVPTX.
28-
const llvm::omp::GV &getGridValue();
2928
LaneMaskTy activemask();
3029
LaneMaskTy lanemaskLT();
3130
LaneMaskTy lanemaskGT();
@@ -37,15 +36,14 @@ uint32_t getBlockIdInKernel(int32_t Dim);
3736
uint32_t getNumberOfBlocksInKernel(int32_t Dim);
3837
uint32_t getWarpIdInBlock();
3938
uint32_t getNumberOfWarpsInBlock();
39+
uint32_t getWarpSize();
4040

4141
/// AMDGCN Implementation
4242
///
4343
///{
4444
#pragma omp begin declare variant match(device = {arch(amdgcn)})
4545

46-
const llvm::omp::GV &getGridValue() {
47-
return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
48-
}
46+
/// AMDGCN variant: report the wavefront size by querying the
/// `__builtin_amdgcn_wavefrontsize` builtin rather than a preprocessor macro.
uint32_t getWarpSize() {
  const uint32_t WavefrontSize = __builtin_amdgcn_wavefrontsize();
  return WavefrontSize;
}
4947

5048
uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
5149
switch (Dim) {
@@ -152,7 +150,7 @@ uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
152150
UNREACHABLE("Dim outside range!");
153151
}
154152

155-
const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
153+
/// Report the warp size as read through the
/// `__nvvm_read_ptx_sreg_warpsize` builtin.
uint32_t getWarpSize() {
  const uint32_t WarpSize = __nvvm_read_ptx_sreg_warpsize();
  return WarpSize;
}
156154

157155
LaneMaskTy activemask() { return __nvvm_activemask(); }
158156

@@ -219,8 +217,6 @@ uint32_t getNumberOfWarpsInBlock() {
219217
#pragma omp end declare variant
220218
///}
221219

222-
uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
223-
224220
} // namespace impl
225221
} // namespace ompx
226222

offload/test/offloading/ompx_bare_ballot_sync.c

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -8,22 +8,33 @@
88
#include <stdio.h>
99
#include <stdlib.h>
1010

11+
#pragma omp begin declare variant match(device = {arch(amdgcn)})
12+
// amdgcn variant: the warp size is whatever the wavefront-size builtin reports.
unsigned get_warp_size() {
  const unsigned wavefront_size = __builtin_amdgcn_wavefrontsize();
  return wavefront_size;
}
13+
#pragma omp end declare variant
14+
15+
#pragma omp begin declare variant match(device = {arch(nvptx64)})
16+
// nvptx64 variant: the warp size is read through the PTX warpsize builtin.
unsigned get_warp_size() {
  const unsigned warp_size = __nvvm_read_ptx_sreg_warpsize();
  return warp_size;
}
17+
#pragma omp end declare variant
18+
19+
#pragma omp begin declare variant match(device = {kind(cpu)})
20+
// cpu variant: always reports a warp size of 1.
unsigned get_warp_size() {
  const unsigned cpu_warp_size = 1;
  return cpu_warp_size;
}
21+
#pragma omp end declare variant
22+
1123
int main(int argc, char *argv[]) {
1224
const int num_blocks = 1;
1325
const int block_size = 256;
1426
const int N = num_blocks * block_size;
1527
int *res = (int *)malloc(N * sizeof(int));
1628

17-
#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) \
18-
map(from: res[0:N])
29+
#pragma omp target teams ompx_bare num_teams(num_blocks) \
30+
thread_limit(block_size) map(from : res[0 : N])
1931
{
2032
int tid = ompx_thread_id_x();
2133
uint64_t mask = ompx_ballot_sync(~0LU, tid & 0x1);
22-
#if defined __AMDGCN_WAVEFRONT_SIZE && __AMDGCN_WAVEFRONT_SIZE == 64
23-
res[tid] = mask == 0xaaaaaaaaaaaaaaaa;
24-
#else
25-
res[tid] = mask == 0xaaaaaaaa;
26-
#endif
34+
if (get_warp_size() == 64)
35+
res[tid] = mask == 0xaaaaaaaaaaaaaaaa;
36+
else
37+
res[tid] = mask == 0xaaaaaaaa;
2738
}
2839

2940
for (int i = 0; i < N; ++i)

offload/test/offloading/ompx_bare_shfl_down_sync.cpp

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,18 @@
1010
#include <ompx.h>
1111
#include <type_traits>
1212

13+
#pragma omp begin declare variant match(device = {arch(amdgcn)})
14+
// amdgcn variant: the warp size is whatever the wavefront-size builtin reports.
unsigned get_warp_size() {
  const unsigned wavefront_size = __builtin_amdgcn_wavefrontsize();
  return wavefront_size;
}
15+
#pragma omp end declare variant
16+
17+
#pragma omp begin declare variant match(device = {arch(nvptx64)})
18+
// nvptx64 variant: the warp size is read through the PTX warpsize builtin.
unsigned get_warp_size() {
  const unsigned warp_size = __nvvm_read_ptx_sreg_warpsize();
  return warp_size;
}
19+
#pragma omp end declare variant
20+
21+
#pragma omp begin declare variant match(device = {kind(cpu)})
22+
// cpu variant: always reports a warp size of 1.
unsigned get_warp_size() {
  const unsigned cpu_warp_size = 1;
  return cpu_warp_size;
}
23+
#pragma omp end declare variant
24+
1325
template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
1426
bool equal(T LHS, T RHS) {
1527
return LHS == RHS;
@@ -32,11 +44,7 @@ template <typename T> void test() {
3244
{
3345
int tid = ompx_thread_id_x();
3446
T val = ompx::shfl_down_sync(~0U, static_cast<T>(tid), 1);
35-
#ifdef __AMDGCN_WAVEFRONT_SIZE
36-
int warp_size = __AMDGCN_WAVEFRONT_SIZE;
37-
#else
38-
int warp_size = 32;
39-
#endif
47+
int warp_size = get_warp_size();
4048
if ((tid & (warp_size - 1)) != warp_size - 1)
4149
res[tid] = equal(val, static_cast<T>(tid + 1));
4250
else

0 commit comments

Comments
 (0)