-
Notifications
You must be signed in to change notification settings - Fork 14.3k
[OpenMP] Remove use of '__AMDGCN_WAVEFRONT_SIZE' #113156
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
Summary: This is going to be deprecated in llvm#112849. This patch ports it to use the builtin instead. This isn't a compile constant, so it could slightly negatively affect codegen. There really should be an IR pass to turn it into a constant if the function has known attributes. Using the builtin is correct when we just do it for knowing the size like we do here. Obviously guarding w32/w64 code with this check would be broken.
@llvm/pr-subscribers-offload Author: Joseph Huber (jhuber6) ChangesSummary: Using the builtin is correct when we just do it for knowing the size Full diff: https://github.com/llvm/llvm-project/pull/113156.diff 3 Files Affected:
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 3aefcff68e1956..881bd12f034051 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -25,7 +25,6 @@ namespace ompx {
namespace impl {
// Forward declarations defined to be defined for AMDGCN and NVPTX.
-const llvm::omp::GV &getGridValue();
LaneMaskTy activemask();
LaneMaskTy lanemaskLT();
LaneMaskTy lanemaskGT();
@@ -37,15 +36,14 @@ uint32_t getBlockIdInKernel(int32_t Dim);
uint32_t getNumberOfBlocksInKernel(int32_t Dim);
uint32_t getWarpIdInBlock();
uint32_t getNumberOfWarpsInBlock();
+uint32_t getWarpSize();
/// AMDGCN Implementation
///
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})
-const llvm::omp::GV &getGridValue() {
- return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
-}
+uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
switch (Dim) {
@@ -152,7 +150,7 @@ uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
UNREACHABLE("Dim outside range!");
}
-const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
+uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
LaneMaskTy activemask() { return __nvvm_activemask(); }
@@ -219,8 +217,6 @@ uint32_t getNumberOfWarpsInBlock() {
#pragma omp end declare variant
///}
-uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
-
} // namespace impl
} // namespace ompx
diff --git a/offload/test/offloading/ompx_bare_ballot_sync.c b/offload/test/offloading/ompx_bare_ballot_sync.c
index 101d1255f0d670..b810fb404b58f6 100644
--- a/offload/test/offloading/ompx_bare_ballot_sync.c
+++ b/offload/test/offloading/ompx_bare_ballot_sync.c
@@ -8,22 +8,33 @@
#include <stdio.h>
#include <stdlib.h>
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+unsigned get_warp_size() { return __builtin_amdgcn_wavefrontsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+unsigned get_warp_size() { return __nvvm_read_ptx_sreg_warpsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(cpu)})
+unsigned get_warp_size() { return 1; }
+#pragma omp end declare variant
+
int main(int argc, char *argv[]) {
const int num_blocks = 1;
const int block_size = 256;
const int N = num_blocks * block_size;
int *res = (int *)malloc(N * sizeof(int));
-#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) \
- map(from: res[0:N])
+#pragma omp target teams ompx_bare num_teams(num_blocks) \
+ thread_limit(block_size) map(from : res[0 : N])
{
int tid = ompx_thread_id_x();
uint64_t mask = ompx_ballot_sync(~0LU, tid & 0x1);
-#if defined __AMDGCN_WAVEFRONT_SIZE && __AMDGCN_WAVEFRONT_SIZE == 64
- res[tid] = mask == 0xaaaaaaaaaaaaaaaa;
-#else
- res[tid] = mask == 0xaaaaaaaa;
-#endif
+ if (get_warp_size() == 64)
+ res[tid] = mask == 0xaaaaaaaaaaaaaaaa;
+ else
+ res[tid] = mask == 0xaaaaaaaa;
}
for (int i = 0; i < N; ++i)
diff --git a/offload/test/offloading/ompx_bare_shfl_down_sync.cpp b/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
index 9b0e66e25f68c9..311999918de857 100644
--- a/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
+++ b/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
@@ -10,6 +10,18 @@
#include <ompx.h>
#include <type_traits>
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+unsigned get_warp_size() { return __builtin_amdgcn_wavefrontsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+unsigned get_warp_size() { return __nvvm_read_ptx_sreg_warpsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(cpu)})
+unsigned get_warp_size() { return 1; }
+#pragma omp end declare variant
+
template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
bool equal(T LHS, T RHS) {
return LHS == RHS;
@@ -32,11 +44,7 @@ template <typename T> void test() {
{
int tid = ompx_thread_id_x();
T val = ompx::shfl_down_sync(~0U, static_cast<T>(tid), 1);
-#ifdef __AMDGCN_WAVEFRONT_SIZE
- int warp_size = __AMDGCN_WAVEFRONT_SIZE;
-#else
- int warp_size = 32;
-#endif
+ int warp_size = get_warp_size();
if ((tid & (warp_size - 1)) != warp_size - 1)
res[tid] = equal(val, static_cast<T>(tid + 1));
else
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a device only context where you might as well just use the raw constant
Summary:
This is going to be deprecated in
#112849. This patch ports it to
use the builtin instead. This isn't a compile constant, so it could
slightly negatively affect codegen. There really should be an IR pass to
turn it into a constant if the function has known attributes.
Using the builtin is correct when we just do it for knowing the size
like we do here. Obviously guarding w32/w64 code with this check would
be broken.