|
16 | 16 |
|
17 | 17 | #include <cuda.h>
|
18 | 18 |
|
19 |
| -// Forward declaration of CUDA primitives which will be evetually transformed |
20 |
| -// into LLVM intrinsics. |
21 |
| -extern "C" { |
22 |
| -unsigned int __activemask(); |
23 |
| -unsigned int __ballot(unsigned); |
24 |
| -// The default argument here is based on NVIDIA's website |
25 |
| -// https://developer.nvidia.com/blog/using-cuda-warp-level-primitives/ |
26 |
| -int __shfl_sync(unsigned mask, int val, int src_line, int width = WARPSIZE); |
27 |
| -int __shfl(int val, int src_line, int width = WARPSIZE); |
28 |
| -int __shfl_down(int var, unsigned detla, int width); |
29 |
| -int __shfl_down_sync(unsigned mask, int var, unsigned detla, int width); |
30 |
| -void __syncwarp(int mask); |
31 |
| -} |
32 |
| - |
33 | 19 | DEVICE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
|
34 | 20 | asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
|
35 | 21 | }
|
@@ -71,38 +57,41 @@ DEVICE double __kmpc_impl_get_wtime() {
|
71 | 57 |
|
72 | 58 | // In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask().
|
73 | 59 | DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() {
|
74 |
| -#if CUDA_VERSION >= 9000 |
75 |
| - return __activemask(); |
| 60 | +#if CUDA_VERSION < 9020 |
| 61 | + return __nvvm_vote_ballot(1); |
76 | 62 | #else
|
77 |
| - return __ballot(1); |
| 63 | + unsigned int Mask; |
| 64 | + asm volatile("activemask.b32 %0;" : "=r"(Mask)); |
| 65 | + return Mask; |
78 | 66 | #endif
|
79 | 67 | }
|
80 | 68 |
|
81 | 69 | // In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
|
82 | 70 | DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var,
|
83 | 71 | int32_t SrcLane) {
|
84 | 72 | #if CUDA_VERSION >= 9000
|
85 |
| - return __shfl_sync(Mask, Var, SrcLane); |
| 73 | + return __nvvm_shfl_sync_idx_i32(Mask, Var, SrcLane, 0x1f); |
86 | 74 | #else
|
87 |
| - return __shfl(Var, SrcLane); |
| 75 | + return __nvvm_shfl_idx_i32(Var, SrcLane, 0x1f); |
88 | 76 | #endif // CUDA_VERSION
|
89 | 77 | }
|
90 | 78 |
|
91 | 79 | DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask,
|
92 | 80 | int32_t Var, uint32_t Delta,
|
93 | 81 | int32_t Width) {
|
| 82 | + int32_t T = ((WARPSIZE - Width) << 8) | 0x1f; |
94 | 83 | #if CUDA_VERSION >= 9000
|
95 |
| - return __shfl_down_sync(Mask, Var, Delta, Width); |
| 84 | + return __nvvm_shfl_sync_down_i32(Mask, Var, Delta, T); |
96 | 85 | #else
|
97 |
| - return __shfl_down(Var, Delta, Width); |
| 86 | + return __nvvm_shfl_down_i32(Var, Delta, T); |
98 | 87 | #endif // CUDA_VERSION
|
99 | 88 | }
|
100 | 89 |
|
101 | 90 | DEVICE void __kmpc_impl_syncthreads() { __syncthreads(); }
|
102 | 91 |
|
103 | 92 | DEVICE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
|
104 | 93 | #if CUDA_VERSION >= 9000
|
105 |
| - __syncwarp(Mask); |
| 94 | + __nvvm_bar_warp_sync(Mask); |
106 | 95 | #else
|
107 | 96 | // In Cuda < 9.0 no need to sync threads in warps.
|
108 | 97 | #endif // CUDA_VERSION
|
|
0 commit comments