@@ -32,40 +32,87 @@ namespace impl {
32
32
uint32_t atomicInc (uint32_t *Address, uint32_t Val,
33
33
atomic::OrderingTy Ordering);
34
34
35
- uint32_t atomicLoad (uint32_t *Address, atomic::OrderingTy Ordering) {
36
- return __atomic_fetch_add (Address, 0U , Ordering);
35
+ template <typename Ty>
36
+ Ty atomicAdd (Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
37
+ return __atomic_fetch_add (Address, Val, Ordering);
38
+ }
39
+
40
+ template <typename Ty>
41
+ Ty atomicMul (Ty *Address, Ty V, atomic::OrderingTy Ordering) {
42
+ Ty TypedCurrentVal, TypedResultVal, TypedNewVal;
43
+ bool Success;
44
+ do {
45
+ TypedCurrentVal = atomic::load (Address, Ordering);
46
+ TypedNewVal = TypedCurrentVal * V;
47
+ Success = atomic::cas (Address, TypedCurrentVal, TypedNewVal, Ordering,
48
+ atomic::relaxed);
49
+ } while (!Success);
50
+ return TypedResultVal;
51
+ }
52
+
53
+ template <typename Ty> Ty atomicLoad (Ty *Address, atomic::OrderingTy Ordering) {
54
+ return atomicAdd (Address, Ty (0 ), Ordering);
37
55
}
38
56
39
- void atomicStore (uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering) {
57
+ template <typename Ty>
58
+ void atomicStore (Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
40
59
__atomic_store_n (Address, Val, Ordering);
41
60
}
42
61
43
- uint32_t atomicAdd (uint32_t *Address, uint32_t Val,
44
- atomic::OrderingTy Ordering) {
45
- return __atomic_fetch_add (Address, Val, Ordering);
62
+ template <typename Ty>
63
+ bool atomicCAS (Ty *Address, Ty ExpectedV, Ty DesiredV,
64
+ atomic::OrderingTy OrderingSucc,
65
+ atomic::OrderingTy OrderingFail) {
66
+ return __atomic_compare_exchange (Address, &ExpectedV, &DesiredV, false ,
67
+ OrderingSucc, OrderingFail);
46
68
}
47
- uint32_t atomicMax (uint32_t *Address, uint32_t Val,
48
- atomic::OrderingTy Ordering) {
69
+
70
+ template <typename Ty>
71
+ Ty atomicMin (Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
72
+ return __atomic_fetch_min (Address, Val, Ordering);
73
+ }
74
+
75
+ template <typename Ty>
76
+ Ty atomicMax (Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
49
77
return __atomic_fetch_max (Address, Val, Ordering);
50
78
}
51
79
80
+ // TODO: Implement this with __atomic_fetch_max and remove the duplication.
81
+ template <typename Ty, typename STy, typename UTy>
82
+ Ty atomicMinFP (Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
83
+ if (Val >= 0 )
84
+ return atomicMin ((STy *)Address, utils::convertViaPun<STy>(Val), Ordering);
85
+ return atomicMax ((UTy *)Address, utils::convertViaPun<UTy>(Val), Ordering);
86
+ }
87
+
88
+ template <typename Ty, typename STy, typename UTy>
89
+ Ty atomicMaxFP (Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
90
+ if (Val >= 0 )
91
+ return atomicMax ((STy *)Address, utils::convertViaPun<STy>(Val), Ordering);
92
+ return atomicMin ((UTy *)Address, utils::convertViaPun<UTy>(Val), Ordering);
93
+ }
94
+
95
+ template <typename Ty>
96
+ Ty atomicOr (Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
97
+ return __atomic_fetch_or (Address, Val, Ordering);
98
+ }
99
+
100
+ template <typename Ty>
101
+ Ty atomicAnd (Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
102
+ return __atomic_fetch_and (Address, Val, Ordering);
103
+ }
104
+
105
+ template <typename Ty>
106
+ Ty atomicXOr (Ty *Address, Ty Val, atomic::OrderingTy Ordering) {
107
+ return __atomic_fetch_xor (Address, Val, Ordering);
108
+ }
109
+
52
110
uint32_t atomicExchange (uint32_t *Address, uint32_t Val,
53
111
atomic::OrderingTy Ordering) {
54
112
uint32_t R;
55
113
__atomic_exchange (Address, &Val, &R, Ordering);
56
114
return R;
57
115
}
58
- uint32_t atomicCAS (uint32_t *Address, uint32_t Compare, uint32_t Val,
59
- atomic::OrderingTy Ordering) {
60
- (void )__atomic_compare_exchange (Address, &Compare, &Val, false , Ordering,
61
- Ordering);
62
- return Compare;
63
- }
64
-
65
- uint64_t atomicAdd (uint64_t *Address, uint64_t Val,
66
- atomic::OrderingTy Ordering) {
67
- return __atomic_fetch_add (Address, Val, Ordering);
68
- }
69
116
// /}
70
117
71
118
// Forward declarations defined to be defined for AMDGCN and NVPTX.
@@ -287,7 +334,8 @@ void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
287
334
288
335
void setLock (omp_lock_t *Lock) {
289
336
// TODO: not sure spinning is a good idea here..
290
- while (atomicCAS ((uint32_t *)Lock, UNSET, SET, atomic::seq_cst) != UNSET) {
337
+ while (atomicCAS ((uint32_t *)Lock, UNSET, SET, atomic::seq_cst,
338
+ atomic::seq_cst) != UNSET) {
291
339
int32_t start = __nvvm_read_ptx_sreg_clock ();
292
340
int32_t now;
293
341
for (;;) {
@@ -322,24 +370,84 @@ void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); }
322
370
323
371
/// Public entry point that forwards \p Ordering to impl::fenceSystem.
void fence::system(atomic::OrderingTy Ordering) {
  impl::fenceSystem(Ordering);
}
324
372
325
- uint32_t atomic::load (uint32_t *Addr, atomic::OrderingTy Ordering) {
326
- return impl::atomicLoad (Addr, Ordering);
327
- }
373
// Defines the atomic:: entry points emitted for every supported type TY:
// load, add, mul and cas. Each one forwards its arguments unchanged to the
// matching impl:: helper.
#define ATOMIC_COMMON_OP(TY)                                                   \
  TY atomic::load(TY *Addr, atomic::OrderingTy Ordering) {                     \
    return impl::atomicLoad(Addr, Ordering);                                   \
  }                                                                            \
  TY atomic::add(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicAdd(Addr, V, Ordering);                                 \
  }                                                                            \
  TY atomic::mul(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicMul(Addr, V, Ordering);                                 \
  }                                                                            \
  bool atomic::cas(TY *Addr, TY ExpectedV, TY DesiredV,                        \
                   atomic::OrderingTy OrderingSucc,                            \
                   atomic::OrderingTy OrderingFail) {                          \
    return impl::atomicCAS(Addr, ExpectedV, DesiredV, OrderingSucc,            \
                           OrderingFail);                                      \
  }
328
389
329
- void atomic::store (uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
330
- impl::atomicStore (Addr, V, Ordering);
331
- }
390
// Defines the atomic:: entry points specific to a floating-point type TY,
// where STY and UTY are the signed and unsigned integer types used to
// reinterpret it:
//  - store writes the bit pattern of V through a UTY pointer to the address;
//  - min/max forward to the impl::atomicMinFP / impl::atomicMaxFP helpers.
#define ATOMIC_FP_ONLY_OP(TY, STY, UTY)                                        \
  void atomic::store(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
    impl::atomicStore(reinterpret_cast<UTY *>(Addr),                           \
                      utils::convertViaPun<UTY>(V), Ordering);                 \
  }                                                                            \
  TY atomic::min(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicMinFP<TY, STY, UTY>(Addr, V, Ordering);                 \
  }                                                                            \
  TY atomic::max(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicMaxFP<TY, STY, UTY>(Addr, V, Ordering);                 \
  }
332
401
333
- uint32_t atomic::inc (uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
334
- return impl::atomicInc (Addr, V, Ordering);
335
- }
402
// Defines the atomic:: entry points specific to an integer type TY: store,
// min, max and the bitwise or/and/xor operations. Each one forwards its
// arguments unchanged to the matching impl:: helper.
#define ATOMIC_INT_ONLY_OP(TY)                                                 \
  void atomic::store(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
    impl::atomicStore(Addr, V, Ordering);                                      \
  }                                                                            \
  TY atomic::min(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicMin<TY>(Addr, V, Ordering);                             \
  }                                                                            \
  TY atomic::max(TY *Addr, TY V, atomic::OrderingTy Ordering) {                \
    return impl::atomicMax<TY>(Addr, V, Ordering);                             \
  }                                                                            \
  TY atomic::bit_or(TY *Addr, TY V, atomic::OrderingTy Ordering) {             \
    return impl::atomicOr(Addr, V, Ordering);                                  \
  }                                                                            \
  TY atomic::bit_and(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
    return impl::atomicAnd(Addr, V, Ordering);                                 \
  }                                                                            \
  TY atomic::bit_xor(TY *Addr, TY V, atomic::OrderingTy Ordering) {            \
    return impl::atomicXOr(Addr, V, Ordering);                                 \
  }
336
421
337
- uint32_t atomic::add (uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
338
- return impl::atomicAdd (Addr, V, Ordering);
339
- }
422
// Emits the full set of atomic:: definitions for a floating-point type: the
// floating-point-only operations followed by the operations common to all
// types.
#define ATOMIC_FP_OP(TY, STY, UTY)                                             \
  ATOMIC_FP_ONLY_OP(TY, STY, UTY)                                              \
  ATOMIC_COMMON_OP(TY)
425
+
426
// Emits the full set of atomic:: definitions for an integer type: the
// integer-only operations followed by the operations common to all types.
#define ATOMIC_INT_OP(TY)                                                      \
  ATOMIC_INT_ONLY_OP(TY)                                                       \
  ATOMIC_COMMON_OP(TY)
429
+
430
+ // This needs to be kept in sync with the header. Also the reason we don't use
431
+ // templates here.
432
+ ATOMIC_INT_OP(int8_t )
433
+ ATOMIC_INT_OP(int16_t )
434
+ ATOMIC_INT_OP(int32_t )
435
+ ATOMIC_INT_OP(int64_t )
436
+ ATOMIC_INT_OP(uint8_t )
437
+ ATOMIC_INT_OP(uint16_t )
438
+ ATOMIC_INT_OP(uint32_t )
439
+ ATOMIC_INT_OP(uint64_t )
440
+ ATOMIC_FP_OP(float , int32_t , uint32_t )
441
+ ATOMIC_FP_OP(double , int64_t , uint64_t )
442
+
443
// The helper macros are only needed to emit the definitions above; remove
// them so they do not leak into the rest of the translation unit.
#undef ATOMIC_COMMON_OP
#undef ATOMIC_FP_ONLY_OP
#undef ATOMIC_INT_ONLY_OP
#undef ATOMIC_FP_OP
#undef ATOMIC_INT_OP
340
448
341
- uint64_t atomic::add ( uint64_t *Addr, uint64_t V, atomic::OrderingTy Ordering) {
342
- return impl::atomicAdd (Addr, V, Ordering);
449
+ uint32_t atomic::inc ( uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
450
+ return impl::atomicInc (Addr, V, Ordering);
343
451
}
344
452
345
453
extern " C" {
0 commit comments