Skip to content

Commit b113965

Browse files
committed
[OpenMP] Introduce more atomic operations into the runtime
We should use OpenMP atomics but they don't take variable orderings. Maybe we should expose all of this in the header but that solves only part of the problem anyway. Differential Revision: https://reviews.llvm.org/D135036
1 parent f85c1f3 commit b113965

File tree

3 files changed

+199
-46
lines changed

3 files changed

+199
-46
lines changed

openmp/libomptarget/DeviceRTL/include/Synchronization.h

Lines changed: 52 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -54,20 +54,60 @@ enum OrderingTy {
5454
seq_cst = __ATOMIC_SEQ_CST,
5555
};
5656

57-
/// Atomically load \p Addr with \p Ordering semantics.
58-
uint32_t load(uint32_t *Addr, atomic::OrderingTy Ordering);
59-
60-
/// Atomically store \p V to \p Addr with \p Ordering semantics.
61-
void store(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering);
62-
6357
/// Atomically increment \p *Addr and wrap at \p V with \p Ordering semantics.
64-
uint32_t inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering);
58+
uint32_t inc(uint32_t *Addr, uint32_t V, OrderingTy Ordering);
59+
60+
/// Atomically perform <op> on \p V and \p *Addr with \p Ordering semantics. The
61+
/// result is stored in \p *Addr;
62+
/// {
63+
64+
#define ATOMIC_COMMON_OP(TY) \
65+
TY add(TY *Addr, TY V, OrderingTy Ordering); \
66+
TY mul(TY *Addr, TY V, OrderingTy Ordering); \
67+
TY load(TY *Addr, OrderingTy Ordering); \
68+
void store(TY *Addr, TY V, OrderingTy Ordering); \
69+
bool cas(TY *Addr, TY ExpectedV, TY DesiredV, OrderingTy OrderingSucc, \
70+
OrderingTy OrderingFail);
71+
72+
#define ATOMIC_FP_ONLY_OP(TY) \
73+
TY min(TY *Addr, TY V, OrderingTy Ordering); \
74+
TY max(TY *Addr, TY V, OrderingTy Ordering);
75+
76+
#define ATOMIC_INT_ONLY_OP(TY) \
77+
TY min(TY *Addr, TY V, OrderingTy Ordering); \
78+
TY max(TY *Addr, TY V, OrderingTy Ordering); \
79+
TY bit_or(TY *Addr, TY V, OrderingTy Ordering); \
80+
TY bit_and(TY *Addr, TY V, OrderingTy Ordering); \
81+
TY bit_xor(TY *Addr, TY V, OrderingTy Ordering);
82+
83+
#define ATOMIC_FP_OP(TY) \
84+
ATOMIC_FP_ONLY_OP(TY) \
85+
ATOMIC_COMMON_OP(TY)
86+
87+
#define ATOMIC_INT_OP(TY) \
88+
ATOMIC_INT_ONLY_OP(TY) \
89+
ATOMIC_COMMON_OP(TY)
90+
91+
// This needs to be kept in sync with the header. Also the reason we don't use
92+
// templates here.
93+
ATOMIC_INT_OP(int8_t)
94+
ATOMIC_INT_OP(int16_t)
95+
ATOMIC_INT_OP(int32_t)
96+
ATOMIC_INT_OP(int64_t)
97+
ATOMIC_INT_OP(uint8_t)
98+
ATOMIC_INT_OP(uint16_t)
99+
ATOMIC_INT_OP(uint32_t)
100+
ATOMIC_INT_OP(uint64_t)
101+
ATOMIC_FP_OP(float)
102+
ATOMIC_FP_OP(double)
103+
104+
#undef ATOMIC_INT_ONLY_OP
105+
#undef ATOMIC_FP_ONLY_OP
106+
#undef ATOMIC_COMMON_OP
107+
#undef ATOMIC_INT_OP
108+
#undef ATOMIC_FP_OP
65109

66-
/// Atomically add \p V to \p *Addr with \p Ordering semantics.
67-
uint32_t add(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering);
68-
69-
/// Atomically add \p V to \p *Addr with \p Ordering semantics.
70-
uint64_t add(uint64_t *Addr, uint64_t V, atomic::OrderingTy Ordering);
110+
///}
71111

72112
} // namespace atomic
73113

openmp/libomptarget/DeviceRTL/include/Utils.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,11 @@ template <typename Ty1, typename Ty2> inline Ty1 align_down(Ty1 V, Ty2 Align) {
7777
/// Return true iff \p Ptr is pointing into shared (local) memory (AS(3)).
7878
bool isSharedMemPtr(void *Ptr);
7979

80+
/// Return \p V type punned as \p DstTy, i.e., a \p DstTy value that has the
/// same object representation (bytes) as \p V. No numeric conversion is
/// performed. \p DstTy and \p SrcTy must have the same size and \p DstTy must
/// be default constructible and trivially copyable.
template <typename DstTy, typename SrcTy> inline DstTy convertViaPun(SrcTy V) {
  static_assert(sizeof(DstTy) == sizeof(SrcTy),
                "type punning requires types of identical size");
  // Copy the bytes instead of dereferencing a casted pointer; the latter
  // accesses the object through an incompatible type (strict aliasing
  // violation). The builtin is used because no library headers are assumed.
  DstTy Result;
  __builtin_memcpy(&Result, &V, sizeof(DstTy));
  return Result;
}
84+
8085
/// A pointer variable that has by design an `undef` value. Use with care.
8186
__attribute__((loader_uninitialized)) static void *const UndefPtr;
8287

openmp/libomptarget/DeviceRTL/src/Synchronization.cpp

Lines changed: 142 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -32,40 +32,87 @@ namespace impl {
3232
uint32_t atomicInc(uint32_t *Address, uint32_t Val,
3333
atomic::OrderingTy Ordering);
3434

35-
uint32_t atomicLoad(uint32_t *Address, atomic::OrderingTy Ordering) {
36-
return __atomic_fetch_add(Address, 0U, Ordering);
35+
template <typename Ty>
36+
Ty atomicAdd(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
37+
return __atomic_fetch_add(Address, Val, Ordering);
38+
}
39+
40+
template <typename Ty>
41+
Ty atomicMul(Ty *Address, Ty V, atomic::OrderingTy Ordering) {
42+
Ty TypedCurrentVal, TypedResultVal, TypedNewVal;
43+
bool Success;
44+
do {
45+
TypedCurrentVal = atomic::load(Address, Ordering);
46+
TypedNewVal = TypedCurrentVal * V;
47+
Success = atomic::cas(Address, TypedCurrentVal, TypedNewVal, Ordering,
48+
atomic::relaxed);
49+
} while (!Success);
50+
return TypedResultVal;
51+
}
52+
53+
template <typename Ty> Ty atomicLoad(Ty *Address, atomic::OrderingTy Ordering) {
54+
return atomicAdd(Address, Ty(0), Ordering);
3755
}
3856

39-
void atomicStore(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering) {
57+
template <typename Ty>
58+
void atomicStore(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
4059
__atomic_store_n(Address, Val, Ordering);
4160
}
4261

43-
uint32_t atomicAdd(uint32_t *Address, uint32_t Val,
44-
atomic::OrderingTy Ordering) {
45-
return __atomic_fetch_add(Address, Val, Ordering);
62+
template <typename Ty>
63+
bool atomicCAS(Ty *Address, Ty ExpectedV, Ty DesiredV,
64+
atomic::OrderingTy OrderingSucc,
65+
atomic::OrderingTy OrderingFail) {
66+
return __atomic_compare_exchange(Address, &ExpectedV, &DesiredV, false,
67+
OrderingSucc, OrderingFail);
4668
}
47-
uint32_t atomicMax(uint32_t *Address, uint32_t Val,
48-
atomic::OrderingTy Ordering) {
69+
70+
template <typename Ty>
71+
Ty atomicMin(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
72+
return __atomic_fetch_min(Address, Val, Ordering);
73+
}
74+
75+
template <typename Ty>
76+
Ty atomicMax(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
4977
return __atomic_fetch_max(Address, Val, Ordering);
5078
}
5179

80+
// TODO: Implement this with __atomic_fetch_max and remove the duplication.
81+
template <typename Ty, typename STy, typename UTy>
82+
Ty atomicMinFP(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
83+
if (Val >= 0)
84+
return atomicMin((STy *)Address, utils::convertViaPun<STy>(Val), Ordering);
85+
return atomicMax((UTy *)Address, utils::convertViaPun<UTy>(Val), Ordering);
86+
}
87+
88+
template <typename Ty, typename STy, typename UTy>
89+
Ty atomicMaxFP(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
90+
if (Val >= 0)
91+
return atomicMax((STy *)Address, utils::convertViaPun<STy>(Val), Ordering);
92+
return atomicMin((UTy *)Address, utils::convertViaPun<UTy>(Val), Ordering);
93+
}
94+
95+
template <typename Ty>
96+
Ty atomicOr(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
97+
return __atomic_fetch_or(Address, Val, Ordering);
98+
}
99+
100+
template <typename Ty>
101+
Ty atomicAnd(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
102+
return __atomic_fetch_and(Address, Val, Ordering);
103+
}
104+
105+
template <typename Ty>
106+
Ty atomicXOr(Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
107+
return __atomic_fetch_xor(Address, Val, Ordering);
108+
}
109+
52110
uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
53111
atomic::OrderingTy Ordering) {
54112
uint32_t R;
55113
__atomic_exchange(Address, &Val, &R, Ordering);
56114
return R;
57115
}
58-
uint32_t atomicCAS(uint32_t *Address, uint32_t Compare, uint32_t Val,
59-
atomic::OrderingTy Ordering) {
60-
(void)__atomic_compare_exchange(Address, &Compare, &Val, false, Ordering,
61-
Ordering);
62-
return Compare;
63-
}
64-
65-
uint64_t atomicAdd(uint64_t *Address, uint64_t Val,
66-
atomic::OrderingTy Ordering) {
67-
return __atomic_fetch_add(Address, Val, Ordering);
68-
}
69116
///}
70117

71118
// Forward declarations of functions to be defined for AMDGCN and NVPTX.
@@ -287,7 +334,8 @@ void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
287334

288335
void setLock(omp_lock_t *Lock) {
289336
// TODO: not sure spinning is a good idea here..
290-
while (atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::seq_cst) != UNSET) {
337+
while (atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::seq_cst,
338+
atomic::seq_cst) != UNSET) {
291339
int32_t start = __nvvm_read_ptx_sreg_clock();
292340
int32_t now;
293341
for (;;) {
@@ -322,24 +370,84 @@ void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); }
322370

323371
/// Issue a fence with \p Ordering semantics by forwarding to
/// impl::fenceSystem.
void fence::system(atomic::OrderingTy Ordering) {
  impl::fenceSystem(Ordering);
}
324372

325-
uint32_t atomic::load(uint32_t *Addr, atomic::OrderingTy Ordering) {
326-
return impl::atomicLoad(Addr, Ordering);
327-
}
373+
// Define the atomic:: entry points shared by integer and floating point types.
// Each one forwards its arguments unchanged to the impl:: template of the
// corresponding name.
#define ATOMIC_COMMON_OP(TY)                                                   \
  TY atomic::load(TY *Addr, atomic::OrderingTy Ordering) {                     \
    return impl::atomicLoad(Addr, Ordering);                                   \
  }                                                                            \
  bool atomic::cas(TY *Addr, TY ExpectedV, TY DesiredV,                        \
                   atomic::OrderingTy OrderingSucc,                            \
                   atomic::OrderingTy OrderingFail) {                          \
    return impl::atomicCAS(Addr, ExpectedV, DesiredV, OrderingSucc,            \
                           OrderingFail);                                      \
  }                                                                            \
  TY atomic::add(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicAdd(Addr, V, Ordering);                                 \
  }                                                                            \
  TY atomic::mul(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicMul(Addr, V, Ordering);                                 \
  }
328389

329-
void atomic::store(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
330-
impl::atomicStore(Addr, V, Ordering);
331-
}
390+
// Define the atomic:: entry points that need a floating point specific
// implementation. STY and UTY are the signed and unsigned integer types handed
// to the impl:: helpers: min/max use the FP variants, and store writes the
// value punned to UTY through the pointer cast to UTY *.
#define ATOMIC_FP_ONLY_OP(TY, STY, UTY)                                        \
  void atomic::store(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
    impl::atomicStore(reinterpret_cast<UTY *>(Addr),                           \
                      utils::convertViaPun<UTY>(V), Ordering);                 \
  }                                                                            \
  TY atomic::min(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicMinFP<TY, STY, UTY>(Addr, V, Ordering);                 \
  }                                                                            \
  TY atomic::max(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicMaxFP<TY, STY, UTY>(Addr, V, Ordering);                 \
  }
332401

333-
uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
334-
return impl::atomicInc(Addr, V, Ordering);
335-
}
402+
// Define the atomic:: entry points that are implemented directly on the
// integer type: store, min/max, and the bitwise operations. Each one forwards
// to the impl:: template of the corresponding name.
#define ATOMIC_INT_ONLY_OP(TY)                                                 \
  void atomic::store(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
    impl::atomicStore(Addr, V, Ordering);                                      \
  }                                                                            \
  TY atomic::min(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicMin<TY>(Addr, V, Ordering);                             \
  }                                                                            \
  TY atomic::max(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicMax<TY>(Addr, V, Ordering);                             \
  }                                                                            \
  TY atomic::bit_and(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
    return impl::atomicAnd(Addr, V, Ordering);                                 \
  }                                                                            \
  TY atomic::bit_or(TY *Addr, TY V, atomic::OrderingTy Ordering) {             \
    return impl::atomicOr(Addr, V, Ordering);                                  \
  }                                                                            \
  TY atomic::bit_xor(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
    return impl::atomicXOr(Addr, V, Ordering);                                 \
  }
336421

337-
uint32_t atomic::add(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
338-
return impl::atomicAdd(Addr, V, Ordering);
339-
}
422+
#define ATOMIC_FP_OP(TY, STY, UTY) \
423+
ATOMIC_FP_ONLY_OP(TY, STY, UTY) \
424+
ATOMIC_COMMON_OP(TY)
425+
426+
#define ATOMIC_INT_OP(TY) \
427+
ATOMIC_INT_ONLY_OP(TY) \
428+
ATOMIC_COMMON_OP(TY)
429+
430+
// This needs to be kept in sync with the header. Also the reason we don't use
431+
// templates here.
432+
ATOMIC_INT_OP(int8_t)
433+
ATOMIC_INT_OP(int16_t)
434+
ATOMIC_INT_OP(int32_t)
435+
ATOMIC_INT_OP(int64_t)
436+
ATOMIC_INT_OP(uint8_t)
437+
ATOMIC_INT_OP(uint16_t)
438+
ATOMIC_INT_OP(uint32_t)
439+
ATOMIC_INT_OP(uint64_t)
440+
ATOMIC_FP_OP(float, int32_t, uint32_t)
441+
ATOMIC_FP_OP(double, int64_t, uint64_t)
442+
443+
#undef ATOMIC_INT_ONLY_OP
444+
#undef ATOMIC_FP_ONLY_OP
445+
#undef ATOMIC_COMMON_OP
446+
#undef ATOMIC_INT_OP
447+
#undef ATOMIC_FP_OP
340448

341-
uint64_t atomic::add(uint64_t *Addr, uint64_t V, atomic::OrderingTy Ordering) {
342-
return impl::atomicAdd(Addr, V, Ordering);
449+
uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
450+
return impl::atomicInc(Addr, V, Ordering);
343451
}
344452

345453
extern "C" {

0 commit comments

Comments
 (0)