@@ -124,11 +124,7 @@ void SPIRV_OVERLOADABLE SPIRV_BUILTIN(ControlBarrier, _i32_i32_i32, )(int Execut
         __intel_atomic_work_item_fence( Memory, Semantics );
     }
 
-    if (Execution == Device)
-    {
-        global_barrier();
-    }
-    else if (Execution <= Workgroup)
+    if (Execution <= Workgroup)
     {
         __builtin_IB_thread_group_barrier();
     }
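Note: this hunk drops the Device-scope path, so a SPIR-V ControlBarrier with Device execution scope no longer falls through to global_barrier(); only Workgroup scope (and narrower) still emits the hardware work-group barrier. A kernel needing a device-wide rendezvous would call the software barrier itself. A minimal usage sketch, assuming global_barrier() is reachable from kernel code and that the runtime keeps every work-group resident (kernel name and logic are illustrative, not part of this patch):

    __kernel void two_phase(__global int* data, __global int* out)
    {
        size_t gid = get_global_id(0);
        data[gid] = (int)gid;                             // phase 1: publish a value
        global_barrier();                                 // device-wide rendezvous
        out[gid] = data[(gid + 1) % get_global_size(0)];  // phase 2: read a neighbour's value
    }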
@@ -286,18 +282,15 @@ void __builtin_spirv_OpMemoryNamedBarrierWrapperOCL_p3__namedBarrier_i32_i32(loc
 }
 
 __global volatile uchar* __builtin_IB_get_sync_buffer();
-uint __intel_get_local_linear_id( void );
-uint __intel_get_local_size( void );
 
 void global_barrier()
 {
     //Make sure each WKG item hit the barrier.
-    __intel_atomic_work_item_fence( Device, AcquireRelease | CrossWorkgroupMemory );
-    __builtin_IB_thread_group_barrier();
+    barrier(CLK_GLOBAL_MEM_FENCE);
 
     __global volatile uchar* syncBuffer = __builtin_IB_get_sync_buffer();
-    bool firstThreadPerWg = __intel_is_first_work_group_item();
-    uint groupLinearId = (__builtin_IB_get_group_id(2) * __builtin_IB_get_num_groups(1) * __builtin_IB_get_num_groups(0)) + (__builtin_IB_get_group_id(1) * __builtin_IB_get_num_groups(0)) + __builtin_IB_get_group_id(0);
+    bool firstThreadPerWg = (get_local_id(0) == 0) && (get_local_id(1) == 0) && (get_local_id(2) == 0);
+    size_t groupLinearId = (get_group_id(2) * get_num_groups(1) * get_num_groups(0)) + (get_group_id(1) * get_num_groups(0)) + get_group_id(0);
 
     //Now first thread of each wkg writes to designated place in syncBuffer
     if (firstThreadPerWg)
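Note: barrier(CLK_GLOBAL_MEM_FENCE) is the OpenCL 1.x spelling of a work-group barrier with a global-memory fence, while the pair it replaces fenced at device scope. In OpenCL C 2.0 terms (a hedged equivalence, not something this patch spells out):

    barrier(CLK_GLOBAL_MEM_FENCE);                                  // == work_group_barrier(CLK_GLOBAL_MEM_FENCE), work-group scope
    work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device);  // closer to the removed fence + barrier pair

The narrower scope appears to be compensated by the explicit atomic_work_item_fence(..., memory_scope_device) calls the function keeps around the syncBuffer accesses.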
@@ -306,27 +299,26 @@ void global_barrier()
         atomic_work_item_fence(CLK_GLOBAL_MEM_FENCE, memory_order_release, memory_scope_device); // == write_mem_fence(CLK_GLOBAL_MEM_FENCE);
     }
 
-    uint numGroups = __builtin_IB_get_num_groups(0) * __builtin_IB_get_num_groups(1) * __builtin_IB_get_num_groups(2);
+    size_t numGroups = get_num_groups(0) * get_num_groups(1) * get_num_groups(2);
     //Higher wkg ids tend to not have work to do in all cases, therefore I choose last wkg to wait for the others, as it is most likely it will hit this code sooner.
     if (groupLinearId == (numGroups - 1))
     {
-        uint localSize = __intel_get_local_size();
+        size_t localSize = get_local_size(0) * get_local_size(1) * get_local_size(2);
         //24-48 case
         volatile uchar Value;
         do
         {
             atomic_work_item_fence(CLK_GLOBAL_MEM_FENCE, memory_order_acquire, memory_scope_device); // == read_mem_fence(CLK_GLOBAL_MEM_FENCE);
             Value = 1;
-            for (uint i = __intel_get_local_linear_id(); i < numGroups; i += localSize)
+            for (size_t i = get_local_linear_id(); i < numGroups; i += localSize)
             {
                 Value = Value & syncBuffer[i];
             }
 
         } while (Value == 0);
-        __intel_atomic_work_item_fence( Device, AcquireRelease | CrossWorkgroupMemory );
-        __builtin_IB_thread_group_barrier();
+        barrier(CLK_GLOBAL_MEM_FENCE);
 
-        for (uint i = __intel_get_local_linear_id(); i < numGroups; i += localSize)
+        for (size_t i = get_local_linear_id(); i < numGroups; i += localSize)
         {
             syncBuffer[i] = 0;
         }
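Note: the do/while above splits the numGroups flags across the localSize threads of the last work-group in a strided fashion; each thread ANDs only its own share into its private Value, and the barrier() after the loop joins the per-thread shares before any flag is cleared. As a worked example (illustrative numbers), with numGroups = 6 and localSize = 4:

    // thread 0 polls syncBuffer[0], syncBuffer[4]
    // thread 1 polls syncBuffer[1], syncBuffer[5]
    // thread 2 polls syncBuffer[2]
    // thread 3 polls syncBuffer[3]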
@@ -339,8 +331,7 @@ void global_barrier()
             atomic_work_item_fence(CLK_GLOBAL_MEM_FENCE, memory_order_acquire, memory_scope_device); // == read_mem_fence(CLK_GLOBAL_MEM_FENCE);
         };
     }
-    __intel_atomic_work_item_fence( Device, AcquireRelease | CrossWorkgroupMemory );
-    __builtin_IB_thread_group_barrier();
+    barrier(CLK_GLOBAL_MEM_FENCE);
 }
 
 void system_memfence(char fence_typed_memory)
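Note: for readers skimming the hunks, a condensed sketch of global_barrier() as it reads after this patch. Fences are elided and names shortened for clarity; this is illustrative, not the committed code:

    void global_barrier_sketch()
    {
        barrier(CLK_GLOBAL_MEM_FENCE);                       // local rendezvous first
        __global volatile uchar* flags = __builtin_IB_get_sync_buffer();
        size_t numGroups = get_num_groups(0) * get_num_groups(1) * get_num_groups(2);
        size_t localSize = get_local_size(0) * get_local_size(1) * get_local_size(2);
        size_t me = (get_group_id(2) * get_num_groups(1) + get_group_id(1)) * get_num_groups(0)
                  + get_group_id(0);

        if (get_local_linear_id() == 0)
            flags[me] = 1;                                   // announce this group's arrival

        if (me == numGroups - 1)                             // last group collects and releases
        {
            for (size_t i = get_local_linear_id(); i < numGroups; i += localSize)
                while (flags[i] == 0) { /* spin until group i arrives */ }
            barrier(CLK_GLOBAL_MEM_FENCE);                   // join the per-thread shares
            for (size_t i = get_local_linear_id(); i < numGroups; i += localSize)
                flags[i] = 0;                                // release group i
        }
        else
        {
            while (flags[me] != 0) { /* wait to be released */ }
        }
        barrier(CLK_GLOBAL_MEM_FENCE);                       // regroup before returning
    }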