Skip to content

[OpenMP] Remove use of '__AMDGCN_WAVEFRONT_SIZE' #113156

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Nov 25, 2024

Conversation

jhuber6
Copy link
Contributor

@jhuber6 jhuber6 commented Oct 21, 2024

Summary:
This is going to be deprecated in
#112849. This patch ports it to
use the builtin instead. This isn't a compile constant, so it could
slightly negatively affect codegen. There really should be an IR pass to
turn it into a constant if the function has known attributes.

Using the builtin is correct when we just do it for knowing the size
like we do here. Obviously guarding w32/w64 code with this check would
be broken.

Summary:
This is going to be deprecated in
llvm#112849. This patch ports it to
use the builtin instead. This isn't a compile constant, so it could
slightly negatively affect codegen. There really should be an IR pass to
turn it into a constant if the function has known attributes.

Using the builtin is correct when we just do it for knowing the size
like we do here. Obviously guarding w32/w64 code with this check would
be broken.
@llvmbot
Copy link
Member

llvmbot commented Oct 21, 2024

@llvm/pr-subscribers-offload

Author: Joseph Huber (jhuber6)

Changes

Summary:
This is going to be deprecated in
#112849. This patch ports it to
use the builtin instead. This isn't a compile constant, so it could
slightly negatively affect codegen. There really should be an IR pass to
turn it into a constant if the function has known attributes.

Using the builtin is correct when we just do it for knowing the size
like we do here. Obviously guarding w32/w64 code with this check would
be broken.


Full diff: https://github.com/llvm/llvm-project/pull/113156.diff

3 Files Affected:

  • (modified) offload/DeviceRTL/src/Mapping.cpp (+3-7)
  • (modified) offload/test/offloading/ompx_bare_ballot_sync.c (+18-7)
  • (modified) offload/test/offloading/ompx_bare_shfl_down_sync.cpp (+13-5)
diff --git a/offload/DeviceRTL/src/Mapping.cpp b/offload/DeviceRTL/src/Mapping.cpp
index 3aefcff68e1956..881bd12f034051 100644
--- a/offload/DeviceRTL/src/Mapping.cpp
+++ b/offload/DeviceRTL/src/Mapping.cpp
@@ -25,7 +25,6 @@ namespace ompx {
 namespace impl {
 
 // Forward declarations defined to be defined for AMDGCN and NVPTX.
-const llvm::omp::GV &getGridValue();
 LaneMaskTy activemask();
 LaneMaskTy lanemaskLT();
 LaneMaskTy lanemaskGT();
@@ -37,15 +36,14 @@ uint32_t getBlockIdInKernel(int32_t Dim);
 uint32_t getNumberOfBlocksInKernel(int32_t Dim);
 uint32_t getWarpIdInBlock();
 uint32_t getNumberOfWarpsInBlock();
+uint32_t getWarpSize();
 
 /// AMDGCN Implementation
 ///
 ///{
 #pragma omp begin declare variant match(device = {arch(amdgcn)})
 
-const llvm::omp::GV &getGridValue() {
-  return llvm::omp::getAMDGPUGridValues<__AMDGCN_WAVEFRONT_SIZE>();
-}
+uint32_t getWarpSize() { return __builtin_amdgcn_wavefrontsize(); }
 
 uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
   switch (Dim) {
@@ -152,7 +150,7 @@ uint32_t getNumberOfThreadsInBlock(int32_t Dim) {
   UNREACHABLE("Dim outside range!");
 }
 
-const llvm::omp::GV &getGridValue() { return llvm::omp::NVPTXGridValues; }
+uint32_t getWarpSize() { return __nvvm_read_ptx_sreg_warpsize(); }
 
 LaneMaskTy activemask() { return __nvvm_activemask(); }
 
@@ -219,8 +217,6 @@ uint32_t getNumberOfWarpsInBlock() {
 #pragma omp end declare variant
 ///}
 
-uint32_t getWarpSize() { return getGridValue().GV_Warp_Size; }
-
 } // namespace impl
 } // namespace ompx
 
diff --git a/offload/test/offloading/ompx_bare_ballot_sync.c b/offload/test/offloading/ompx_bare_ballot_sync.c
index 101d1255f0d670..b810fb404b58f6 100644
--- a/offload/test/offloading/ompx_bare_ballot_sync.c
+++ b/offload/test/offloading/ompx_bare_ballot_sync.c
@@ -8,22 +8,33 @@
 #include <stdio.h>
 #include <stdlib.h>
 
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+unsigned get_warp_size() { return __builtin_amdgcn_wavefrontsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+unsigned get_warp_size() { return __nvvm_read_ptx_sreg_warpsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(cpu)})
+unsigned get_warp_size() { return 1; }
+#pragma omp end declare variant
+
 int main(int argc, char *argv[]) {
   const int num_blocks = 1;
   const int block_size = 256;
   const int N = num_blocks * block_size;
   int *res = (int *)malloc(N * sizeof(int));
 
-#pragma omp target teams ompx_bare num_teams(num_blocks) thread_limit(block_size) \
-        map(from: res[0:N])
+#pragma omp target teams ompx_bare num_teams(num_blocks)                       \
+    thread_limit(block_size) map(from : res[0 : N])
   {
     int tid = ompx_thread_id_x();
     uint64_t mask = ompx_ballot_sync(~0LU, tid & 0x1);
-#if defined __AMDGCN_WAVEFRONT_SIZE && __AMDGCN_WAVEFRONT_SIZE == 64
-    res[tid] = mask == 0xaaaaaaaaaaaaaaaa;
-#else
-    res[tid] = mask == 0xaaaaaaaa;
-#endif
+    if (get_warp_size() == 64)
+      res[tid] = mask == 0xaaaaaaaaaaaaaaaa;
+    else
+      res[tid] = mask == 0xaaaaaaaa;
   }
 
   for (int i = 0; i < N; ++i)
diff --git a/offload/test/offloading/ompx_bare_shfl_down_sync.cpp b/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
index 9b0e66e25f68c9..311999918de857 100644
--- a/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
+++ b/offload/test/offloading/ompx_bare_shfl_down_sync.cpp
@@ -10,6 +10,18 @@
 #include <ompx.h>
 #include <type_traits>
 
+#pragma omp begin declare variant match(device = {arch(amdgcn)})
+unsigned get_warp_size() { return __builtin_amdgcn_wavefrontsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {arch(nvptx64)})
+unsigned get_warp_size() { return __nvvm_read_ptx_sreg_warpsize(); }
+#pragma omp end declare variant
+
+#pragma omp begin declare variant match(device = {kind(cpu)})
+unsigned get_warp_size() { return 1; }
+#pragma omp end declare variant
+
 template <typename T, std::enable_if_t<std::is_integral<T>::value, bool> = true>
 bool equal(T LHS, T RHS) {
   return LHS == RHS;
@@ -32,11 +44,7 @@ template <typename T> void test() {
   {
     int tid = ompx_thread_id_x();
     T val = ompx::shfl_down_sync(~0U, static_cast<T>(tid), 1);
-#ifdef __AMDGCN_WAVEFRONT_SIZE
-    int warp_size = __AMDGCN_WAVEFRONT_SIZE;
-#else
-    int warp_size = 32;
-#endif
+    int warp_size = get_warp_size();
     if ((tid & (warp_size - 1)) != warp_size - 1)
       res[tid] = equal(val, static_cast<T>(tid + 1));
     else

Copy link
Contributor

@arsenm arsenm left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a device only context where you might as well just use the raw constant

@jhuber6 jhuber6 merged commit 506ca19 into llvm:main Nov 25, 2024
8 checks passed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

Successfully merging this pull request may close these issues.

5 participants