[OpenMP] Remove use of '__AMDGCN_WAVEFRONT_SIZE' (#113156)

jhuber6 · web-flow · commit 506ca19dc9d6 · 2024-11-25T07:38:28.000-06:00
Summary: This is going to be deprecated in #112849. This patch ports it to use the builtin instead. This isn't a compile constant, so it could slightly negatively affect codegen. There really should be an IR pass to turn it into a constant if the function has known attributes. Using the builtin is correct when we just do it for knowing the size like we do here. Obviously guarding w32/w64 code with this check would be broken.
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
@@ -25,7 +25,6 @@ namespace ompx {
 namespace impl {
 
 // Forward declarations defined to be defined for AMDGCN and NVPTX.
-const llvm::omp::GV &getGridValue();
 LaneMaskTy activemask();
 LaneMaskTy lanemaskLT();
 LaneMaskTy lanemaskGT();
@@ -37,15 +36,14 @@ uint32_t getBlockIdInKernel(int32_t Dim);
 uint32_t getNumberOfBlocksInKernel(int32_t Dim);
 uint32_t getWarpIdInBlock();
 uint32_t getNumberOfWarpsInBlock();
+uint32_t getWarpSize();
 
 /// AMDGCN Implementation
 ///
 ///{
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
-const llvm::omp::GV &getGridValue() {
-  return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
-}
+uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
 
 uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
   switch (Dim) {
@@ -152,7 +150,7 @@ uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
   UNREACHABLE("Dim outside range!");
 }
 
-const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
+uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
 
 LaneMaskTy activemask() { return __nvvm_activemask(); }
 
@@ -219,8 +217,6 @@ uint32_t getNumberOfWarpsInBlock() {
 #pragma omp end declare variant
 ///}
 
-uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
-
 } // namespace impl
 } // namespace ompx
 
diff --git a/offload/test/offloading/ompx_bare_ballot_sync.c b/offload/test/offloading/ompx_bare_ballot_sync.c
@@ -8,22 +8,33 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+unsigned get_warp_size() { return __builtin_amdgcn_wavefrontsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+unsigned get_warp_size() { return __nvvm_read_ptx_sreg_warpsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(cpu)})
+unsigned get_warp_size() { return 1; }
+#pragma omp end declare variant
+
 int main(int argc, char *argv[]) {
   const int num_blocks = 1;
   const int block_size = 256;
   const int N = num_blocks * block_size;
   int *res = (int *)malloc(N * sizeof(int));
 
-#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) \
-        map(from: res[0:N])
+#pragma omp target teams ompx_bare num_teams(num_blocks)                       \
+    thread_limit(block_size) map(from : res[0 : N])
   {
     int tid = ompx_thread_id_x();
     uint64_t mask = ompx_ballot_sync(~0LU, tid & 0x1);
-#if defined __AMDGCN_WAVEFRONT_SIZE && __AMDGCN_WAVEFRONT_SIZE == 64
-    res[tid] = mask == 0xaaaaaaaaaaaaaaaa;
-#else
-    res[tid] = mask == 0xaaaaaaaa;
-#endif
+    if (get_warp_size() == 64)
+      res[tid] = mask == 0xaaaaaaaaaaaaaaaa;
+    else
+      res[tid] = mask == 0xaaaaaaaa;
   }
 
   for (int i = 0; i < N; ++i)
diff --git a/offload/test/offloading/ompx_bare_shfl_down_sync.cpp b/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
@@ -10,6 +10,18 @@
 #include <ompx.h>
 #include <type_traits>
 
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+unsigned get_warp_size() { return __builtin_amdgcn_wavefrontsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+unsigned get_warp_size() { return __nvvm_read_ptx_sreg_warpsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(cpu)})
+unsigned get_warp_size() { return 1; }
+#pragma omp end declare variant
+
 template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
 bool equal(T LHS, T RHS) {
   return LHS == RHS;
@@ -32,11 +44,7 @@ template <typename T> void test() {
   {
     int tid = ompx_thread_id_x();
     T val = ompx::shfl_down_sync(~0U, static_cast<T>(tid), 1);
-#ifdef __AMDGCN_WAVEFRONT_SIZE
-    int warp_size = __AMDGCN_WAVEFRONT_SIZE;
-#else
-    int warp_size = 32;
-#endif
+    int warp_size = get_warp_size();
     if ((tid & (warp_size - 1)) != warp_size - 1)
       res[tid] = equal(val, static_cast<T>(tid + 1));
     else