17
17
18
18
// Initialized with a 64-bit mask with bits set in positions less than the
19
19
// thread's lane number in the warp
20
- DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt () {
20
+ EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt () {
21
21
uint32_t lane = GetLaneId ();
22
22
int64_t ballot = __kmpc_impl_activemask ();
23
23
uint64_t mask = ((uint64_t )1 << lane) - (uint64_t )1 ;
@@ -26,7 +26,7 @@ DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() {
26
26
27
27
// Initialized with a 64-bit mask with bits set in positions greater than the
28
28
// thread's lane number in the warp
29
- DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt () {
29
+ EXTERN __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt () {
30
30
uint32_t lane = GetLaneId ();
31
31
if (lane == (WARPSIZE - 1 ))
32
32
return 0 ;
@@ -35,9 +35,9 @@ DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() {
35
35
return mask & ballot;
36
36
}
37
37
38
- DEVICE double __kmpc_impl_get_wtick () { return ((double )1E-9 ); }
38
+ EXTERN double __kmpc_impl_get_wtick () { return ((double )1E-9 ); }
39
39
40
- DEVICE double __kmpc_impl_get_wtime () {
40
+ EXTERN double __kmpc_impl_get_wtime () {
41
41
// The intrinsics for measuring time have undocumented frequency
42
42
// This will probably need to be found by measurement on a number of
43
43
// architectures. Until then, return 0, which is very inaccurate as a
@@ -46,19 +46,19 @@ DEVICE double __kmpc_impl_get_wtime() {
46
46
}
47
47
48
48
// Warp vote function
49
- DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask () {
49
+ EXTERN __kmpc_impl_lanemask_t __kmpc_impl_activemask () {
50
50
return __builtin_amdgcn_read_exec ();
51
51
}
52
52
53
- DEVICE int32_t __kmpc_impl_shfl_sync (__kmpc_impl_lanemask_t , int32_t var,
53
+ EXTERN int32_t __kmpc_impl_shfl_sync (__kmpc_impl_lanemask_t , int32_t var,
54
54
int32_t srcLane) {
55
55
int width = WARPSIZE;
56
56
int self = GetLaneId ();
57
57
int index = srcLane + (self & ~(width - 1 ));
58
58
return __builtin_amdgcn_ds_bpermute (index << 2 , var);
59
59
}
60
60
61
- DEVICE int32_t __kmpc_impl_shfl_down_sync (__kmpc_impl_lanemask_t , int32_t var,
61
+ EXTERN int32_t __kmpc_impl_shfl_down_sync (__kmpc_impl_lanemask_t , int32_t var,
62
62
uint32_t laneDelta, int32_t width) {
63
63
int self = GetLaneId ();
64
64
int index = self + laneDelta;
@@ -68,12 +68,12 @@ DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var,
68
68
69
69
static DEVICE SHARED uint32_t L1_Barrier;
70
70
71
- DEVICE void __kmpc_impl_target_init () {
71
+ EXTERN void __kmpc_impl_target_init () {
72
72
// Don't have global ctors, and shared memory is not zero init
73
73
__atomic_store_n (&L1_Barrier, 0u , __ATOMIC_RELEASE);
74
74
}
75
75
76
- DEVICE void __kmpc_impl_named_sync (uint32_t num_threads) {
76
+ EXTERN void __kmpc_impl_named_sync (uint32_t num_threads) {
77
77
__atomic_thread_fence (__ATOMIC_ACQUIRE);
78
78
79
79
uint32_t num_waves = num_threads / WARPSIZE;
@@ -85,9 +85,9 @@ DEVICE void __kmpc_impl_named_sync(uint32_t num_threads) {
85
85
// Low bits for the number of waves, assumed zero before this call.
86
86
// High bits to count the number of times the barrier has been passed.
87
87
88
- assert ( num_waves != 0 ) ;
89
- assert ( num_waves * WARPSIZE == num_threads) ;
90
- assert ( num_waves < 0xffffu ) ;
88
+ // precondition: num_waves != 0;
89
+ // invariant: num_waves * WARPSIZE == num_threads;
90
+ // precondition: num_waves < 0xffffu;
91
91
92
92
// Increment the low 16 bits once, using the lowest active thread.
93
93
uint64_t lowestActiveThread = __kmpc_impl_ffs (__kmpc_impl_activemask ()) - 1 ;
@@ -131,19 +131,19 @@ DEVICE uint32_t get_workgroup_dim(uint32_t group_id, uint32_t grid_size,
131
131
}
132
132
} // namespace
133
133
134
- DEVICE int GetNumberOfBlocksInKernel () {
134
+ EXTERN int GetNumberOfBlocksInKernel () {
135
135
return get_grid_dim (__builtin_amdgcn_grid_size_x (),
136
136
__builtin_amdgcn_workgroup_size_x ());
137
137
}
138
138
139
- DEVICE int GetNumberOfThreadsInBlock () {
139
+ EXTERN int GetNumberOfThreadsInBlock () {
140
140
return get_workgroup_dim (__builtin_amdgcn_workgroup_id_x (),
141
141
__builtin_amdgcn_grid_size_x (),
142
142
__builtin_amdgcn_workgroup_size_x ());
143
143
}
144
144
145
- DEVICE unsigned GetWarpId () { return GetThreadIdInBlock () / WARPSIZE; }
146
- DEVICE unsigned GetLaneId () {
145
+ EXTERN unsigned GetWarpId () { return GetThreadIdInBlock () / WARPSIZE; }
146
+ EXTERN unsigned GetLaneId () {
147
147
return __builtin_amdgcn_mbcnt_hi (~0u , __builtin_amdgcn_mbcnt_lo (~0u , 0u ));
148
148
}
149
149
@@ -186,38 +186,38 @@ DEVICE unsigned long long __kmpc_atomic_add(unsigned long long *Address,
186
186
}
187
187
188
188
// Stub implementations
189
- DEVICE void *__kmpc_impl_malloc (size_t ) { return nullptr ; }
190
- DEVICE void __kmpc_impl_free (void *) {}
189
+ EXTERN void *__kmpc_impl_malloc (size_t ) { return nullptr ; }
190
+ EXTERN void __kmpc_impl_free (void *) {}
191
191
192
- DEVICE void __kmpc_impl_unpack (uint64_t val, uint32_t &lo, uint32_t &hi) {
192
+ EXTERN void __kmpc_impl_unpack (uint64_t val, uint32_t &lo, uint32_t &hi) {
193
193
lo = (uint32_t )(val & UINT64_C (0x00000000FFFFFFFF ));
194
194
hi = (uint32_t )((val & UINT64_C (0xFFFFFFFF00000000 )) >> 32 );
195
195
}
196
196
197
- DEVICE uint64_t __kmpc_impl_pack (uint32_t lo, uint32_t hi) {
197
+ EXTERN uint64_t __kmpc_impl_pack (uint32_t lo, uint32_t hi) {
198
198
return (((uint64_t )hi) << 32 ) | (uint64_t )lo;
199
199
}
200
200
201
- DEVICE void __kmpc_impl_syncthreads () { __builtin_amdgcn_s_barrier (); }
201
+ EXTERN void __kmpc_impl_syncthreads () { __builtin_amdgcn_s_barrier (); }
202
202
203
- DEVICE void __kmpc_impl_syncwarp (__kmpc_impl_lanemask_t ) {
203
+ EXTERN void __kmpc_impl_syncwarp (__kmpc_impl_lanemask_t ) {
204
204
// AMDGCN doesn't need to sync threads in a warp
205
205
}
206
206
207
- DEVICE void __kmpc_impl_threadfence () {
207
+ EXTERN void __kmpc_impl_threadfence () {
208
208
__builtin_amdgcn_fence (__ATOMIC_SEQ_CST, " agent" );
209
209
}
210
210
211
- DEVICE void __kmpc_impl_threadfence_block () {
211
+ EXTERN void __kmpc_impl_threadfence_block () {
212
212
__builtin_amdgcn_fence (__ATOMIC_SEQ_CST, " workgroup" );
213
213
}
214
214
215
- DEVICE void __kmpc_impl_threadfence_system () {
215
+ EXTERN void __kmpc_impl_threadfence_system () {
216
216
__builtin_amdgcn_fence (__ATOMIC_SEQ_CST, " " );
217
217
}
218
218
219
219
// Calls to the AMDGCN layer (assuming 1D layout)
220
- DEVICE int GetThreadIdInBlock () { return __builtin_amdgcn_workitem_id_x (); }
221
- DEVICE int GetBlockIdInKernel () { return __builtin_amdgcn_workgroup_id_x (); }
220
+ EXTERN int GetThreadIdInBlock () { return __builtin_amdgcn_workitem_id_x (); }
221
+ EXTERN int GetBlockIdInKernel () { return __builtin_amdgcn_workgroup_id_x (); }
222
222
223
223
#pragma omp end declare target
0 commit comments