@@ -124,7 +124,11 @@ void SPIRV_OVERLOADABLE SPIRV_BUILTIN(ControlBarrier, _i32_i32_i32, )(int Execut
         __intel_atomic_work_item_fence(Memory, Semantics);
     }

-    if (Execution <= Workgroup)
+    if (Execution == Device)
+    {
+        global_barrier();
+    }
+    else if (Execution <= Workgroup)
     {
         __builtin_IB_thread_group_barrier();
     }
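The fix above hinges on the numeric ordering of SPIR-V execution scopes. For reference, the Scope values below come from the SPIR-V specification (they are not part of this diff); smaller values denote wider scopes:

    // SPIR-V Scope encoding (per the SPIR-V specification):
    enum Scope
    {
        CrossDevice = 0,
        Device      = 1,
        Workgroup   = 2,
        Subgroup    = 3,
        Invocation  = 4
    };

Because Device (1) <= Workgroup (2), the old test "Execution <= Workgroup" silently demoted device-scope barriers to plain work-group barriers. The new code catches Device first and routes it to global_barrier(), while CrossDevice and Workgroup still fall through to the hardware thread-group barrier.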
@@ -282,15 +286,18 @@ void __builtin_spirv_OpMemoryNamedBarrierWrapperOCL_p3__namedBarrier_i32_i32(loc
 }

 __global volatile uchar* __builtin_IB_get_sync_buffer();
+uint __intel_get_local_linear_id(void);
+uint __intel_get_local_size(void);

 void global_barrier()
 {
     // Make sure every work item of this work-group hits the barrier.
-    barrier(CLK_GLOBAL_MEM_FENCE);
+    __intel_atomic_work_item_fence(Device, AcquireRelease | CrossWorkgroupMemory);
+    __builtin_IB_thread_group_barrier();

     __global volatile uchar* syncBuffer = __builtin_IB_get_sync_buffer();
-    bool firstThreadPerWg = (get_local_id(0) == 0) && (get_local_id(1) == 0) && (get_local_id(2) == 0);
-    size_t groupLinearId = (get_group_id(2) * get_num_groups(1) * get_num_groups(0)) + (get_group_id(1) * get_num_groups(0)) + get_group_id(0);
+    bool firstThreadPerWg = __intel_is_first_work_group_item();
+    uint groupLinearId = (__builtin_IB_get_group_id(2) * __builtin_IB_get_num_groups(1) * __builtin_IB_get_num_groups(0)) + (__builtin_IB_get_group_id(1) * __builtin_IB_get_num_groups(0)) + __builtin_IB_get_group_id(0);

     // Now the first thread of each work-group writes to its designated slot in syncBuffer.
     if (firstThreadPerWg)
@@ -299,26 +306,27 @@ void global_barrier()
         atomic_work_item_fence(CLK_GLOBAL_MEM_FENCE, memory_order_release, memory_scope_device); // == write_mem_fence(CLK_GLOBAL_MEM_FENCE);
     }

-    size_t numGroups = get_num_groups(0) * get_num_groups(1) * get_num_groups(2);
+    uint numGroups = __builtin_IB_get_num_groups(0) * __builtin_IB_get_num_groups(1) * __builtin_IB_get_num_groups(2);
     // Higher work-group ids often have no work left to do, so the last work-group is chosen to wait for the others: it is the one most likely to reach this code first.
     if (groupLinearId == (numGroups - 1))
     {
-        size_t localSize = get_local_size(0) * get_local_size(1) * get_local_size(2);
+        uint localSize = __intel_get_local_size();
         // 24-48 case
         volatile uchar Value;
         do
         {
             atomic_work_item_fence(CLK_GLOBAL_MEM_FENCE, memory_order_acquire, memory_scope_device); // == read_mem_fence(CLK_GLOBAL_MEM_FENCE);
             Value = 1;
-            for (size_t i = get_local_linear_id(); i < numGroups; i += localSize)
+            for (uint i = __intel_get_local_linear_id(); i < numGroups; i += localSize)
             {
                 Value = Value & syncBuffer[i];
             }

         } while (Value == 0);
-        barrier(CLK_GLOBAL_MEM_FENCE);
+        __intel_atomic_work_item_fence(Device, AcquireRelease | CrossWorkgroupMemory);
+        __builtin_IB_thread_group_barrier();

-        for (size_t i = get_local_linear_id(); i < numGroups; i += localSize)
+        for (uint i = __intel_get_local_linear_id(); i < numGroups; i += localSize)
         {
             syncBuffer[i] = 0;
         }
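Taken together, this hunk is the waiter side of a two-phase handshake over syncBuffer. A sketch of the protocol in comments (my paraphrase of the diff; the release path for the non-last work-groups sits outside this hunk):

    // Phase 1 (every group): the first work item sets syncBuffer[groupLinearId] = 1
    // and release-fences, announcing that its group has arrived.
    // Phase 2 (last group only): its work items cooperatively AND together
    // syncBuffer[0..numGroups-1], each scanning a stride of localSize slots,
    // and spin until every slot reads 1, i.e. until all groups have arrived.
    // Phase 3 (last group only): after a local barrier, the same work items
    // clear every slot back to 0; the other groups, which spin on their own
    // slot being cleared (code below this hunk), are thereby released.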
@@ -331,7 +339,8 @@ void global_barrier()
             atomic_work_item_fence(CLK_GLOBAL_MEM_FENCE, memory_order_acquire, memory_scope_device); // == read_mem_fence(CLK_GLOBAL_MEM_FENCE);
         };
     }
-    barrier(CLK_GLOBAL_MEM_FENCE);
+    __intel_atomic_work_item_fence(Device, AcquireRelease | CrossWorkgroupMemory);
+    __builtin_IB_thread_group_barrier();
 }

 void system_memfence(char fence_typed_memory)
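As a usage sketch (an illustration under assumptions, not code from this commit): after this change, a module that executes OpControlBarrier with Device scope reaches global_barrier() through the dispatch in the first hunk. Written as a direct call to the builtin, with the SPIR-V enum values spelled out:

    // Hypothetical caller; assumes the module's Scope and MemorySemantics
    // enums are in scope (in the SPIR-V encoding: Device == 1,
    // AcquireRelease == 0x8, CrossWorkgroupMemory == 0x200).
    void example_device_scope_barrier(void)
    {
        SPIRV_BUILTIN(ControlBarrier, _i32_i32_i32, )(Device, Device,
                                                      AcquireRelease | CrossWorkgroupMemory);
    }

Note the usual caveat for software global barriers: every work-group of the dispatch must be resident on the device and must reach the barrier, otherwise the sync-buffer handshake in global_barrier() spins forever waiting for groups that were never scheduled.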