Skip to content

Commit b98e914

Browse files
ficoligcbot
authored andcommitted
Enable Device scope for OpControlBarrier
Invoke global_barrier for Device scope. Remove call to OpControlBarrier in global_barrier to avoid recursion. Avoid calling functions with size_t return type in global_barrier.
1 parent 2557c11 commit b98e914

File tree

1 file changed

+19
-10
lines changed

1 file changed

+19
-10
lines changed

IGC/BiFModule/Implementation/barrier.cl

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,11 @@ void SPIRV_OVERLOADABLE SPIRV_BUILTIN(ControlBarrier, _i32_i32_i32, )(int Execut
124124
__intel_atomic_work_item_fence( Memory, Semantics );
125125
}
126126

127-
if( Execution <= Workgroup )
127+
if (Execution == Device)
128+
{
129+
global_barrier();
130+
}
131+
else if( Execution <= Workgroup )
128132
{
129133
__builtin_IB_thread_group_barrier();
130134
}
@@ -282,15 +286,18 @@ void __builtin_spirv_OpMemoryNamedBarrierWrapperOCL_p3__namedBarrier_i32_i32(loc
282286
}
283287

284288
__global volatile uchar* __builtin_IB_get_sync_buffer();
289+
uint __intel_get_local_linear_id( void );
290+
uint __intel_get_local_size( void );
285291

286292
void global_barrier()
287293
{
288294
//Make sure each WKG item hit the barrier.
289-
barrier(CLK_GLOBAL_MEM_FENCE);
295+
__intel_atomic_work_item_fence(Device, AcquireRelease | CrossWorkgroupMemory);
296+
__builtin_IB_thread_group_barrier();
290297

291298
__global volatile uchar* syncBuffer = __builtin_IB_get_sync_buffer();
292-
bool firstThreadPerWg = (get_local_id(0) == 0) && (get_local_id(1) == 0) && (get_local_id(2) == 0);
293-
size_t groupLinearId = (get_group_id(2) * get_num_groups(1) * get_num_groups(0)) + (get_group_id(1) * get_num_groups(0)) + get_group_id(0);
299+
bool firstThreadPerWg = __intel_is_first_work_group_item();
300+
uint groupLinearId = (__builtin_IB_get_group_id(2) * __builtin_IB_get_num_groups(1) * __builtin_IB_get_num_groups(0)) + (__builtin_IB_get_group_id(1) * __builtin_IB_get_num_groups(0)) + __builtin_IB_get_group_id(0);
294301

295302
//Now first thread of each wkg writes to designated place in syncBuffer
296303
if (firstThreadPerWg)
@@ -299,26 +306,27 @@ void global_barrier()
299306
atomic_work_item_fence(CLK_GLOBAL_MEM_FENCE, memory_order_release, memory_scope_device); // == write_mem_fence(CLK_GLOBAL_MEM_FENCE);
300307
}
301308

302-
size_t numGroups = get_num_groups(0) * get_num_groups(1) * get_num_groups(2);
309+
uint numGroups = __builtin_IB_get_num_groups(0) * __builtin_IB_get_num_groups(1) * __builtin_IB_get_num_groups(2);
303310
//Higher wkg ids tend to not have work to do in all cases, therefore I choose last wkg to wait for the others, as it is most likely it will hit this code sooner.
304311
if (groupLinearId == (numGroups - 1))
305312
{
306-
size_t localSize = get_local_size(0) * get_local_size(1) * get_local_size(2);
313+
uint localSize = __intel_get_local_size();
307314
//24 -48 case
308315
volatile uchar Value;
309316
do
310317
{
311318
atomic_work_item_fence(CLK_GLOBAL_MEM_FENCE, memory_order_acquire, memory_scope_device); // == read_mem_fence(CLK_GLOBAL_MEM_FENCE);
312319
Value = 1;
313-
for (size_t i = get_local_linear_id(); i < numGroups; i += localSize)
320+
for (uint i = __intel_get_local_linear_id(); i < numGroups; i += localSize)
314321
{
315322
Value = Value & syncBuffer[i];
316323
}
317324

318325
} while (Value == 0);
319-
barrier(CLK_GLOBAL_MEM_FENCE);
326+
__intel_atomic_work_item_fence(Device, AcquireRelease | CrossWorkgroupMemory);
327+
__builtin_IB_thread_group_barrier();
320328

321-
for (size_t i = get_local_linear_id(); i < numGroups; i += localSize)
329+
for (uint i = __intel_get_local_linear_id(); i < numGroups; i += localSize)
322330
{
323331
syncBuffer[i] = 0;
324332
}
@@ -331,7 +339,8 @@ void global_barrier()
331339
atomic_work_item_fence(CLK_GLOBAL_MEM_FENCE, memory_order_acquire, memory_scope_device); // == read_mem_fence(CLK_GLOBAL_MEM_FENCE);
332340
};
333341
}
334-
barrier(CLK_GLOBAL_MEM_FENCE);
342+
__intel_atomic_work_item_fence(Device, AcquireRelease | CrossWorkgroupMemory);
343+
__builtin_IB_thread_group_barrier();
335344
}
336345

337346
void system_memfence(char fence_typed_memory)

0 commit comments

Comments
 (0)