@@ -29,47 +29,52 @@ namespace impl {
///
///{
/// NOTE: This function needs to be implemented by every target.
- uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering);
+ uint32_t atomicInc(uint32_t *Address, uint32_t Val,
+                    atomic::OrderingTy Ordering);

- uint32_t atomicLoad(uint32_t *Address, int Ordering) {
-   return __atomic_fetch_add(Address, 0U, __ATOMIC_SEQ_CST);
+ uint32_t atomicLoad(uint32_t *Address, atomic::OrderingTy Ordering) {
+   return __atomic_fetch_add(Address, 0U, Ordering);
}

- void atomicStore(uint32_t *Address, uint32_t Val, int Ordering) {
+ void atomicStore(uint32_t *Address, uint32_t Val, atomic::OrderingTy Ordering) {
  __atomic_store_n(Address, Val, Ordering);
}

- uint32_t atomicAdd(uint32_t *Address, uint32_t Val, int Ordering) {
+ uint32_t atomicAdd(uint32_t *Address, uint32_t Val,
+                    atomic::OrderingTy Ordering) {
  return __atomic_fetch_add(Address, Val, Ordering);
}
- uint32_t atomicMax(uint32_t *Address, uint32_t Val, int Ordering) {
+ uint32_t atomicMax(uint32_t *Address, uint32_t Val,
+                    atomic::OrderingTy Ordering) {
  return __atomic_fetch_max(Address, Val, Ordering);
}

- uint32_t atomicExchange(uint32_t *Address, uint32_t Val, int Ordering) {
+ uint32_t atomicExchange(uint32_t *Address, uint32_t Val,
+                         atomic::OrderingTy Ordering) {
  uint32_t R;
  __atomic_exchange(Address, &Val, &R, Ordering);
  return R;
}
uint32_t atomicCAS(uint32_t *Address, uint32_t Compare, uint32_t Val,
-                  int Ordering) {
+                  atomic::OrderingTy Ordering) {
  (void)__atomic_compare_exchange(Address, &Compare, &Val, false, Ordering,
                                  Ordering);
  return Compare;
}

- uint64_t atomicAdd(uint64_t *Address, uint64_t Val, int Ordering) {
+ uint64_t atomicAdd(uint64_t *Address, uint64_t Val,
+                    atomic::OrderingTy Ordering) {
  return __atomic_fetch_add(Address, Val, Ordering);
}
///}
// Forward declarations, to be defined for AMDGCN and NVPTX.
- uint32_t atomicInc(uint32_t *A, uint32_t V, int Ordering);
+ uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering);
void namedBarrierInit();
void namedBarrier();
- void fenceTeam(int Ordering);
- void fenceKernel(int Ordering);
- void fenceSystem(int Ordering);
+ void fenceTeam(atomic::OrderingTy Ordering);
+ void fenceKernel(atomic::OrderingTy Ordering);
+ void fenceSystem(atomic::OrderingTy Ordering);
void syncWarp(__kmpc_impl_lanemask_t);
void syncThreads();
void syncThreadsAligned() { syncThreads(); }
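The central change of this patch is replacing the plain int ordering parameter with atomic::OrderingTy. That enum is defined in a header not shown in this diff, so the following is only an assumed sketch of its shape: values mirroring the __ATOMIC_* builtins, which would keep the one-to-one switches later in the patch trivially correct (the aquire spelling matches the identifiers used throughout the patch).

// Assumed shape of the ordering enum used by this patch (not shown in the
// diff); values mirror the compiler's __ATOMIC_* constants.
namespace atomic {
enum OrderingTy {
  relaxed = __ATOMIC_RELAXED,
  aquire = __ATOMIC_ACQUIRE, // spelling as used in this patch
  release = __ATOMIC_RELEASE,
  acq_rel = __ATOMIC_ACQ_REL,
  seq_cst = __ATOMIC_SEQ_CST,
};
} // namespace atomic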
@@ -84,30 +89,30 @@ void setLock(omp_lock_t *);
///{
#pragma omp begin declare variant match(device = {arch(amdgcn)})

- uint32_t atomicInc(uint32_t *A, uint32_t V, int Ordering) {
+ uint32_t atomicInc(uint32_t *A, uint32_t V, atomic::OrderingTy Ordering) {
  // builtin_amdgcn_atomic_inc32 should expand to this switch when
  // passed a runtime value, but does not do so yet. Workaround here.
  switch (Ordering) {
  default:
    __builtin_unreachable();
-  case __ATOMIC_RELAXED:
-    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_RELAXED, "");
-  case __ATOMIC_ACQUIRE:
-    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_ACQUIRE, "");
-  case __ATOMIC_RELEASE:
-    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_RELEASE, "");
-  case __ATOMIC_ACQ_REL:
-    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_ACQ_REL, "");
-  case __ATOMIC_SEQ_CST:
-    return __builtin_amdgcn_atomic_inc32(A, V, __ATOMIC_SEQ_CST, "");
+  case atomic::relaxed:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::relaxed, "");
+  case atomic::aquire:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::aquire, "");
+  case atomic::release:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::release, "");
+  case atomic::acq_rel:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::acq_rel, "");
+  case atomic::seq_cst:
+    return __builtin_amdgcn_atomic_inc32(A, V, atomic::seq_cst, "");
  }
}

uint32_t SHARED(namedBarrierTracker);

void namedBarrierInit() {
  // Don't have global ctors, and shared memory is not zero init
-  atomic::store(&namedBarrierTracker, 0u, __ATOMIC_RELEASE);
+  atomic::store(&namedBarrierTracker, 0u, atomic::release);
}

void namedBarrier() {
@@ -117,7 +122,7 @@ void namedBarrier() {
  uint32_t WarpSize = mapping::getWarpSize();
  uint32_t NumWaves = NumThreads / WarpSize;

-  fence::team(__ATOMIC_ACQUIRE);
+  fence::team(atomic::aquire);

  // named barrier implementation for amdgcn.
  // Uses two 16 bit unsigned counters. One for the number of waves to have
@@ -133,7 +138,7 @@ void namedBarrier() {
  // Increment the low 16 bits once, using the lowest active thread.
  if (mapping::isLeaderInWarp()) {
    uint32_t load = atomic::add(&namedBarrierTracker, 1,
-                               __ATOMIC_RELAXED); // commutative
+                               atomic::relaxed); // commutative

    // Record the number of times the barrier has been passed
    uint32_t generation = load & 0xffff0000u;
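As the comments above describe, the 32-bit tracker packs two 16-bit fields: the barrier generation in the high half and the number of arrived waves in the low half. A small hypothetical illustration of that split, consistent with the 0xffff0000u masks in the diff (the helper names are not part of the patch):

#include <cstdint>
// Hypothetical helpers showing the 16/16 split of namedBarrierTracker.
static inline uint32_t trackerGeneration(uint32_t Tracker) {
  return Tracker & 0xffff0000u; // kept shifted-left, as in the code above
}
static inline uint32_t trackerArrivedWaves(uint32_t Tracker) {
  return Tracker & 0x0000ffffu;
}
static inline uint32_t trackerBumpGeneration(uint32_t Tracker) {
  return Tracker + 0x00010000u; // adding 1 << 16 advances the generation
}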
@@ -145,61 +150,61 @@ void namedBarrier() {
      load &= 0xffff0000u; // because bits zeroed second

      // Reset the wave counter and release the waiting waves
-      atomic::store(&namedBarrierTracker, load, __ATOMIC_RELAXED);
+      atomic::store(&namedBarrierTracker, load, atomic::relaxed);
    } else {
      // more waves still to go, spin until generation counter changes
      do {
        __builtin_amdgcn_s_sleep(0);
-        load = atomic::load(&namedBarrierTracker, __ATOMIC_RELAXED);
+        load = atomic::load(&namedBarrierTracker, atomic::relaxed);
      } while ((load & 0xffff0000u) == generation);
    }
  }
-  fence::team(__ATOMIC_RELEASE);
+  fence::team(atomic::release);
}

// sema checking of amdgcn_fence is aggressive. Intention is to patch clang
// so that it is usable within a template environment and so that a runtime
// value of the memory order is expanded to this switch within clang/llvm.
- void fenceTeam(int Ordering) {
+ void fenceTeam(atomic::OrderingTy Ordering) {
  switch (Ordering) {
  default:
    __builtin_unreachable();
-  case __ATOMIC_ACQUIRE:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "workgroup");
-  case __ATOMIC_RELEASE:
-    return __builtin_amdgcn_fence(__ATOMIC_RELEASE, "workgroup");
-  case __ATOMIC_ACQ_REL:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "workgroup");
-  case __ATOMIC_SEQ_CST:
-    return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "workgroup");
+  case atomic::aquire:
+    return __builtin_amdgcn_fence(atomic::aquire, "workgroup");
+  case atomic::release:
+    return __builtin_amdgcn_fence(atomic::release, "workgroup");
+  case atomic::acq_rel:
+    return __builtin_amdgcn_fence(atomic::acq_rel, "workgroup");
+  case atomic::seq_cst:
+    return __builtin_amdgcn_fence(atomic::seq_cst, "workgroup");
  }
}
- void fenceKernel(int Ordering) {
+ void fenceKernel(atomic::OrderingTy Ordering) {
  switch (Ordering) {
  default:
    __builtin_unreachable();
-  case __ATOMIC_ACQUIRE:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "agent");
-  case __ATOMIC_RELEASE:
-    return __builtin_amdgcn_fence(__ATOMIC_RELEASE, "agent");
-  case __ATOMIC_ACQ_REL:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "agent");
-  case __ATOMIC_SEQ_CST:
-    return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "agent");
+  case atomic::aquire:
+    return __builtin_amdgcn_fence(atomic::aquire, "agent");
+  case atomic::release:
+    return __builtin_amdgcn_fence(atomic::release, "agent");
+  case atomic::acq_rel:
+    return __builtin_amdgcn_fence(atomic::acq_rel, "agent");
+  case atomic::seq_cst:
+    return __builtin_amdgcn_fence(atomic::seq_cst, "agent");
  }
}
- void fenceSystem(int Ordering) {
+ void fenceSystem(atomic::OrderingTy Ordering) {
  switch (Ordering) {
  default:
    __builtin_unreachable();
-  case __ATOMIC_ACQUIRE:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, "");
-  case __ATOMIC_RELEASE:
-    return __builtin_amdgcn_fence(__ATOMIC_RELEASE, "");
-  case __ATOMIC_ACQ_REL:
-    return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, "");
-  case __ATOMIC_SEQ_CST:
-    return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, "");
+  case atomic::aquire:
+    return __builtin_amdgcn_fence(atomic::aquire, "");
+  case atomic::release:
+    return __builtin_amdgcn_fence(atomic::release, "");
+  case atomic::acq_rel:
+    return __builtin_amdgcn_fence(atomic::acq_rel, "");
+  case atomic::seq_cst:
+    return __builtin_amdgcn_fence(atomic::seq_cst, "");
  }
}
@@ -226,7 +231,8 @@ void setLock(omp_lock_t *) { __builtin_trap(); }
#pragma omp begin declare variant match(                                       \
    device = {arch(nvptx, nvptx64)}, implementation = {extension(match_any)})

- uint32_t atomicInc(uint32_t *Address, uint32_t Val, int Ordering) {
+ uint32_t atomicInc(uint32_t *Address, uint32_t Val,
+                    atomic::OrderingTy Ordering) {
  return __nvvm_atom_inc_gen_ui(Address, Val);
}
@@ -268,11 +274,11 @@ constexpr uint32_t SET = 1;
// called before it is defined
// here the overload won't happen. Investigate later!
void unsetLock(omp_lock_t *Lock) {
-  (void)atomicExchange((uint32_t *)Lock, UNSET, __ATOMIC_SEQ_CST);
+  (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst);
}

int testLock(omp_lock_t *Lock) {
-  return atomicAdd((uint32_t *)Lock, 0u, __ATOMIC_SEQ_CST);
+  return atomicAdd((uint32_t *)Lock, 0u, atomic::seq_cst);
}

void initLock(omp_lock_t *Lock) { unsetLock(Lock); }
@@ -281,7 +287,7 @@ void destroyLock(omp_lock_t *Lock) { unsetLock(Lock); }
void setLock(omp_lock_t *Lock) {
  // TODO: not sure spinning is a good idea here..
-  while (atomicCAS((uint32_t *)Lock, UNSET, SET, __ATOMIC_SEQ_CST) != UNSET) {
+  while (atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::seq_cst) != UNSET) {
    int32_t start = __nvvm_read_ptx_sreg_clock();
    int32_t now;
    for (;;) {
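These helpers back the device-side OpenMP lock routines. A rough sketch of the acquire/release pattern they implement on top of atomicCAS and atomicExchange, assuming UNSET is 0 and SET is 1 as declared in the surrounding context (the function name is made up and the sketch relies on the declarations in this file):

// Hypothetical illustration of the spin-lock protocol used by setLock and
// unsetLock above; not part of the patch.
void criticalRegionSketch(omp_lock_t *Lock) {
  // Acquire: spin until the CAS observes UNSET and installs SET.
  while (atomicCAS((uint32_t *)Lock, UNSET, SET, atomic::seq_cst) != UNSET) {
    // back off here, e.g. by sleeping or polling the clock as setLock does
  }
  // ... critical section ...
  // Release: swap UNSET back in, discarding the previous value.
  (void)atomicExchange((uint32_t *)Lock, UNSET, atomic::seq_cst);
}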
@@ -310,29 +316,29 @@ void synchronize::threads() { impl::syncThreads(); }
void synchronize::threadsAligned() { impl::syncThreadsAligned(); }

- void fence::team(int Ordering) { impl::fenceTeam(Ordering); }
+ void fence::team(atomic::OrderingTy Ordering) { impl::fenceTeam(Ordering); }

- void fence::kernel(int Ordering) { impl::fenceKernel(Ordering); }
+ void fence::kernel(atomic::OrderingTy Ordering) { impl::fenceKernel(Ordering); }

- void fence::system(int Ordering) { impl::fenceSystem(Ordering); }
+ void fence::system(atomic::OrderingTy Ordering) { impl::fenceSystem(Ordering); }

- uint32_t atomic::load(uint32_t *Addr, int Ordering) {
+ uint32_t atomic::load(uint32_t *Addr, atomic::OrderingTy Ordering) {
  return impl::atomicLoad(Addr, Ordering);
}

- void atomic::store(uint32_t *Addr, uint32_t V, int Ordering) {
+ void atomic::store(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
  impl::atomicStore(Addr, V, Ordering);
}

- uint32_t atomic::inc(uint32_t *Addr, uint32_t V, int Ordering) {
+ uint32_t atomic::inc(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
  return impl::atomicInc(Addr, V, Ordering);
}

- uint32_t atomic::add(uint32_t *Addr, uint32_t V, int Ordering) {
+ uint32_t atomic::add(uint32_t *Addr, uint32_t V, atomic::OrderingTy Ordering) {
  return impl::atomicAdd(Addr, V, Ordering);
}

- uint64_t atomic::add(uint64_t *Addr, uint64_t V, int Ordering) {
+ uint64_t atomic::add(uint64_t *Addr, uint64_t V, atomic::OrderingTy Ordering) {
  return impl::atomicAdd(Addr, V, Ordering);
}
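With this change, callers inside the runtime pass the enum instead of a raw integer. A small hypothetical caller, only to illustrate the intended call sites (Counter and the function name are made up, and the sketch relies on the wrappers declared in this file):

// Hypothetical caller of the updated public wrappers; not part of the patch.
void bumpAndPublish(uint32_t *Counter, uint32_t Value) {
  atomic::store(Counter, Value, atomic::release);        // publish a value
  (void)atomic::add(Counter, 1u, atomic::relaxed);        // commutative bump
  uint32_t Seen = atomic::load(Counter, atomic::aquire);  // observe it again
  (void)Seen;
  fence::team(atomic::seq_cst); // order against the rest of the team
}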
@@ -389,7 +395,7 @@ void __kmpc_end_single(IdentTy *Loc, int32_t TId) {
void __kmpc_flush(IdentTy *Loc) {
  FunctionTracingRAII();
-  fence::kernel(__ATOMIC_SEQ_CST);
+  fence::kernel(atomic::seq_cst);
}

uint64_t __kmpc_warp_active_thread_mask(void) {
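__kmpc_flush is the runtime entry point generated for the OpenMP flush construct, so after this patch a flush inside a target region ends up in fence::kernel(atomic::seq_cst). An illustrative user-level example, not taken from the patch:

// Illustrative only: the flush below lowers to a call to __kmpc_flush.
#include <omp.h>
void flushExample(int *Flag) {
#pragma omp target map(tofrom : Flag[0:1])
  {
    Flag[0] = 1;
#pragma omp flush
  }
}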