Commit 8b6cd15

[libomptarget][amdgcn] Implement partial barrier
named_sync is used to coordinate non-spmd kernels. This uses bar.sync on nvptx. There is no corresponding ISA support on amdgcn, so this is implemented using shared memory: one word, initialized to zero. Each wave increments the variable by one. Whichever wave is last is responsible for resetting the variable to zero, at which point it and the others continue.

The race condition of a wave reaching the barrier before another wave has noticed that it has been released is handled with a generation counter, packed into the same word.

Uses a shared variable that is not needed on nvptx. Introduces a new hook, kmpc_impl_target_init, to allow different targets to do extra initialization.

Reviewed By: jdoerfert

Differential Revision: https://reviews.llvm.org/D88602
1 parent 81ead8a commit 8b6cd15
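
The packed-counter scheme is easy to prototype off-device. Below is a minimal host-side sketch, not part of this commit: std::atomic and std::thread stand in for the shared-memory word and the waves, acquire/release orderings on the atomic operations replace the separate thread fences used in the device code, and the names barrier_word, named_sync, and kNumWaves are illustrative only.

#include <atomic>
#include <cassert>
#include <cstdint>
#include <thread>
#include <vector>

// One 32-bit word: low 16 bits count arrivals, high 16 bits count how
// many times the barrier has been passed (the generation).
static std::atomic<uint32_t> barrier_word{0};

// Host-side analogue of __kmpc_impl_named_sync; here each std::thread
// plays the role of one wave's lowest active lane.
void named_sync(uint32_t num_waves) {
  assert(num_waves != 0 && num_waves < 0xffffu);

  uint32_t load = barrier_word.fetch_add(1, std::memory_order_acq_rel);
  uint32_t generation = load & 0xffff0000u;

  if ((load & 0x0000ffffu) == num_waves - 1) {
    // Last arrival: bump the generation, zero the arrival count, and
    // release the spinners. 16-bit wraparound of the generation is
    // harmless because waiters only test for inequality.
    load += 0x00010000u;
    load &= 0xffff0000u;
    barrier_word.store(load, std::memory_order_release);
  } else {
    // Spin until the generation changes, i.e. the barrier is passed.
    do {
      load = barrier_word.load(std::memory_order_acquire);
    } while ((load & 0xffff0000u) == generation);
  }
}

int main() {
  const uint32_t kNumWaves = 4; // stand-in for num_threads / WARPSIZE
  std::vector<std::thread> waves;
  for (uint32_t i = 0; i < kNumWaves; ++i)
    waves.emplace_back([&] {
      for (int round = 0; round < 3; ++round)
        named_sync(kNumWaves); // all four threads must meet each round
    });
  for (auto &t : waves)
    t.join();
  return 0;
}

The generation counter is what lets the barrier be reused immediately: a thread racing ahead into the next round only increments the low bits, so threads still spinning on the previous generation are not released early by it.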

File tree

4 files changed, +64 -5 lines changed

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h

Lines changed: 5 additions & 5 deletions
@@ -109,11 +109,11 @@ INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) {
   // AMDGCN doesn't need to sync threads in a warp
 }
 
-INLINE void __kmpc_impl_named_sync(uint32_t num_threads) {
-  (void)num_threads;
-  // TODO: Implement on top of __SHARED__
-  __builtin_amdgcn_s_barrier();
-}
+// AMDGCN specific kernel initialization
+DEVICE void __kmpc_impl_target_init();
+
+// Equivalent to ptx bar.sync 1. Barrier until num_threads arrive.
+DEVICE void __kmpc_impl_named_sync(uint32_t num_threads);
 
 INLINE void __kmpc_impl_threadfence() {
   __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");

openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip

Lines changed: 53 additions & 0 deletions
@@ -62,6 +62,59 @@ DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var,
   return __builtin_amdgcn_ds_bpermute(index << 2, var);
 }
 
+static DEVICE SHARED uint32_t L1_Barrier;
+
+DEVICE void __kmpc_impl_target_init() {
+  // Don't have global ctors, and shared memory is not zero init
+  __atomic_store_n(&L1_Barrier, 0u, __ATOMIC_RELEASE);
+}
+
+DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
+  __atomic_thread_fence(__ATOMIC_ACQUIRE);
+
+  uint32_t num_waves = num_threads / WARPSIZE;
+
+  // Partial barrier implementation for amdgcn.
+  // Uses two 16 bit unsigned counters. One for the number of waves to have
+  // reached the barrier, and one to count how many times the barrier has been
+  // passed. These are packed in a single atomically accessed 32 bit integer.
+  // Low bits for the number of waves, assumed zero before this call.
+  // High bits to count the number of times the barrier has been passed.
+
+  assert(num_waves != 0);
+  assert(num_waves * WARPSIZE == num_threads);
+  assert(num_waves < 0xffffu);
+
+  // Increment the low 16 bits once, using the lowest active thread.
+  uint64_t lowestActiveThread = __kmpc_impl_ffs(__kmpc_impl_activemask()) - 1;
+  bool isLowest = GetLaneId() == lowestActiveThread;
+
+  if (isLowest) {
+    uint32_t load =
+        __atomic_fetch_add(&L1_Barrier, 1, __ATOMIC_RELAXED); // commutative
+
+    // Record the number of times the barrier has been passed
+    uint32_t generation = load & 0xffff0000u;
+
+    if ((load & 0x0000ffffu) == (num_waves - 1)) {
+      // Reached num_waves in low bits so this is the last wave.
+      // Set low bits to zero and increment high bits
+      load += 0x00010000u; // wrap is safe
+      load &= 0xffff0000u; // because bits zeroed second
+
+      // Reset the wave counter and release the waiting waves
+      __atomic_store_n(&L1_Barrier, load, __ATOMIC_RELAXED);
+    } else {
+      // more waves still to go, spin until generation counter changes
+      do {
+        __builtin_amdgcn_s_sleep(0);
+        load = __atomic_load_n(&L1_Barrier, __ATOMIC_RELAXED);
+      } while ((load & 0xffff0000u) == generation);
+    }
+  }
+  __atomic_thread_fence(__ATOMIC_RELEASE);
+}
+
 EXTERN uint64_t __ockl_get_local_size(uint32_t);
 EXTERN uint64_t __ockl_get_num_groups(uint32_t);
 DEVICE int GetNumberOfBlocksInKernel() { return __ockl_get_num_groups(0); }
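
To make the bit packing concrete, here is a worked trace, not taken from the commit, of num_waves = 3 passing the barrier once, starting from a zeroed word:

wave A arrives: fetch_add returns 0x00000000, word becomes 0x00000001; low bits 0 != 2, so A spins
wave B arrives: fetch_add returns 0x00000001, word becomes 0x00000002; low bits 1 != 2, so B spins
wave C arrives: fetch_add returns 0x00000002, word becomes 0x00000003; low bits 2 == num_waves - 1, so C is the last wave
C computes 0x00000002 + 0x00010000 = 0x00010002, then 0x00010002 & 0xffff0000 = 0x00010000
C stores 0x00010000: arrival count reset to zero, generation now 1
A and B load the word, see high bits 0x0001 != their saved generation 0x0000, and exit the spin loop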

openmp/libomptarget/deviceRTLs/common/src/omptarget.cu

Lines changed: 1 addition & 0 deletions
@@ -63,6 +63,7 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
       omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
   nThreads = GetNumberOfThreadsInBlock();
   threadLimit = ThreadLimit;
+  __kmpc_impl_target_init();
 }
 
 EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) {

openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h

Lines changed: 5 additions & 0 deletions
@@ -183,6 +183,11 @@ INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) {
 #endif // CUDA_VERSION
 }
 
+// NVPTX specific kernel initialization
+INLINE void __kmpc_impl_target_init() { /* nvptx needs no extra setup */
+}
+
+// Barrier until num_threads arrive.
 INLINE void __kmpc_impl_named_sync(uint32_t num_threads) {
   // The named barrier for active parallel threads of a team in an L1 parallel
   // region to synchronize with each other.
