Skip to content

Commit 27cc4a8

Browse files
committed
[OpenMP][NVPTX] Rewrite CUDA intrinsics with NVVM intrinsics
This patch prepares for dropping CUDA when compiling `deviceRTLs`. CUDA intrinsics are replaced by NVVM intrinsics, with reference to the code in `__clang_cuda_intrinsics.h`. We don't want to include that header directly because in the near future we're going to switch to OpenMP, and by then the header can no longer be used. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D95327
1 parent f05b492 commit 27cc4a8

File tree

1 file changed

+11
-22
lines changed

1 file changed

+11
-22
lines changed

openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu

Lines changed: 11 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -16,20 +16,6 @@
1616

1717
#include <cuda.h>
1818

19-
// Forward declaration of CUDA primitives which will be eventually transformed
20-
// into LLVM intrinsics.
21-
extern "C" {
22-
unsigned int __activemask();
23-
unsigned int __ballot(unsigned);
24-
// The default argument here is based on NVIDIA's website
25-
// https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/
26-
int __shfl_sync(unsigned mask, int val, int src_line, int width = WARPSIZE);
27-
int __shfl(int val, int src_line, int width = WARPSIZE);
28-
int __shfl_down(int var, unsigned detla, int width);
29-
int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width);
30-
void __syncwarp(int mask);
31-
}
32-
3319
DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
3420
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
3521
}
@@ -71,38 +57,41 @@ DEVICE double __kmpc_impl_get_wtime() {
7157

7258
// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
7359
DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
74-
#if CUDA_VERSION >= 9000
75-
return __activemask();
60+
#if CUDA_VERSION < 9020
61+
return __nvvm_vote_ballot(1);
7662
#else
77-
return __ballot(1);
63+
unsigned int Mask;
64+
asm volatile("activemask.b32 %0;" : "=r"(Mask));
65+
return Mask;
7866
#endif
7967
}
8068

8169
// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
8270
DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
8371
int32_t SrcLane) {
8472
#if CUDA_VERSION >= 9000
85-
return __shfl_sync(Mask, Var, SrcLane);
73+
return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f);
8674
#else
87-
return __shfl(Var, SrcLane);
75+
return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f);
8876
#endif // CUDA_VERSION
8977
}
9078

9179
DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
9280
int32_t Var, uint32_t Delta,
9381
int32_t Width) {
82+
int32_t T = ((WARPSIZE - Width) << 8) | 0x1f;
9483
#if CUDA_VERSION >= 9000
95-
return __shfl_down_sync(Mask, Var, Delta, Width);
84+
return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T);
9685
#else
97-
return __shfl_down(Var, Delta, Width);
86+
return __nvvm_shfl_down_i32(Var, Delta, T);
9887
#endif // CUDA_VERSION
9988
}
10089

10190
DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
10291

10392
DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
10493
#if CUDA_VERSION >= 9000
105-
__syncwarp(Mask);
94+
__nvvm_bar_warp_sync(Mask);
10695
#else
10796
// In Cuda < 9.0 no need to sync threads in warps.
10897
#endif // CUDA_VERSION

0 commit comments

Comments
 (0)