@@ -124,7 +124,11 @@ void SPIRV_OVERLOADABLE SPIRV_BUILTIN(ControlBarrier, _i32_i32_i32, )(int Execut
         __intel_atomic_work_item_fence(Memory, Semantics);
     }

-    if (Execution <= Workgroup)
+    if (Execution == Device)
+    {
+        global_barrier();
+    }
+    else if (Execution <= Workgroup)
     {
         __builtin_IB_thread_group_barrier();
     }
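The fix above hinges on the numeric ordering of SPIR-V execution scopes. For reference, the Scope values below come from the SPIR-V specification (they are not part of this diff); smaller values denote wider scopes:

    // SPIR-V Scope encoding (per the SPIR-V specification):
    enum Scope
    {
        CrossDevice = 0,
        Device      = 1,
        Workgroup   = 2,
        Subgroup    = 3,
        Invocation  = 4
    };

Because Device (1) <= Workgroup (2), the old test "Execution <= Workgroup" silently demoted device-scope barriers to plain work-group barriers. The new code catches Device first and routes it to global_barrier(), while CrossDevice and Workgroup still fall through to the hardware thread-group barrier.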
@@ -282,15 +286,18 @@ void __builtin_spirv_OpMemoryNamedBarrierWrapperOCL_p3__namedBarrier_i32_i32(loc
 }

 __global volatile uchar* __builtin_IB_get_sync_buffer();
+uint __intel_get_local_linear_id(void);
+uint __intel_get_local_size(void);

 void global_barrier()
 {
     // Make sure every work item of this work-group hits the barrier.
-    barrier(CLK_GLOBAL_MEM_FENCE);
+    __intel_atomic_work_item_fence(Device, AcquireRelease | CrossWorkgroupMemory);
+    __builtin_IB_thread_group_barrier();

     __global volatile uchar* syncBuffer = __builtin_IB_get_sync_buffer();
-    bool firstThreadPerWg = (get_local_id(0) == 0) && (get_local_id(1) == 0) && (get_local_id(2) == 0);
-    size_t groupLinearId = (get_group_id(2) * get_num_groups(1) * get_num_groups(0)) + (get_group_id(1) * get_num_groups(0)) + get_group_id(0);
+    bool firstThreadPerWg = __intel_is_first_work_group_item();
+    uint groupLinearId = (__builtin_IB_get_group_id(2) * __builtin_IB_get_num_groups(1) * __builtin_IB_get_num_groups(0)) + (__builtin_IB_get_group_id(1) * __builtin_IB_get_num_groups(0)) + __builtin_IB_get_group_id(0);

     // Now the first thread of each work-group writes to its designated slot in syncBuffer.
     if (firstThreadPerWg)
@@ -299,26 +306,27 @@ void global_barrier()
         atomic_work_item_fence(CLK_GLOBAL_MEM_FENCE, memory_order_release, memory_scope_device); // == write_mem_fence(CLK_GLOBAL_MEM_FENCE);
     }

-    size_t numGroups = get_num_groups(0) * get_num_groups(1) * get_num_groups(2);
+    uint numGroups = __builtin_IB_get_num_groups(0) * __builtin_IB_get_num_groups(1) * __builtin_IB_get_num_groups(2);
     // Higher work-group ids often have no work left to do, so the last work-group is chosen to wait for the others: it is the one most likely to reach this code first.
     if (groupLinearId == (numGroups - 1))
     {
-        size_t localSize = get_local_size(0) * get_local_size(1) * get_local_size(2);
+        uint localSize = __intel_get_local_size();
         // 24-48 case
         volatile uchar Value;
         do
         {
             atomic_work_item_fence(CLK_GLOBAL_MEM_FENCE, memory_order_acquire, memory_scope_device); // == read_mem_fence(CLK_GLOBAL_MEM_FENCE);
             Value = 1;
-            for (size_t i = get_local_linear_id(); i < numGroups; i += localSize)
+            for (uint i = __intel_get_local_linear_id(); i < numGroups; i += localSize)
             {
                 Value = Value & syncBuffer[i];
             }

         } while (Value == 0);
-        barrier(CLK_GLOBAL_MEM_FENCE);
+        __intel_atomic_work_item_fence(Device, AcquireRelease | CrossWorkgroupMemory);
+        __builtin_IB_thread_group_barrier();

-        for (size_t i = get_local_linear_id(); i < numGroups; i += localSize)
+        for (uint i = __intel_get_local_linear_id(); i < numGroups; i += localSize)
         {
             syncBuffer[i] = 0;
         }
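Taken together, this hunk is the waiter side of a two-phase handshake over syncBuffer. A sketch of the protocol in comments (my paraphrase of the diff; the release path for the non-last work-groups sits outside this hunk):

    // Phase 1 (every group): the first work item sets syncBuffer[groupLinearId] = 1
    // and release-fences, announcing that its group has arrived.
    // Phase 2 (last group only): its work items cooperatively AND together
    // syncBuffer[0..numGroups-1], each scanning a stride of localSize slots,
    // and spin until every slot reads 1, i.e. until all groups have arrived.
    // Phase 3 (last group only): after a local barrier, the same work items
    // clear every slot back to 0; the other groups, which spin on their own
    // slot being cleared (code below this hunk), are thereby released.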
@@ -331,7 +339,8 @@ void global_barrier()
             atomic_work_item_fence(CLK_GLOBAL_MEM_FENCE, memory_order_acquire, memory_scope_device); // == read_mem_fence(CLK_GLOBAL_MEM_FENCE);
         };
     }
-    barrier(CLK_GLOBAL_MEM_FENCE);
+    __intel_atomic_work_item_fence(Device, AcquireRelease | CrossWorkgroupMemory);
+    __builtin_IB_thread_group_barrier();
 }

 void system_memfence(char fence_typed_memory)
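As a usage sketch (an illustration under assumptions, not code from this commit): after this change, a module that executes OpControlBarrier with Device scope reaches global_barrier() through the dispatch in the first hunk. Written as a direct call to the builtin, with the SPIR-V enum values spelled out:

    // Hypothetical caller; assumes the module's Scope and MemorySemantics
    // enums are in scope (in the SPIR-V encoding: Device == 1,
    // AcquireRelease == 0x8, CrossWorkgroupMemory == 0x200).
    void example_device_scope_barrier(void)
    {
        SPIRV_BUILTIN(ControlBarrier, _i32_i32_i32, )(Device, Device,
                                                      AcquireRelease | CrossWorkgroupMemory);
    }

Note the usual caveat for software global barriers: every work-group of the dispatch must be resident on the device and must reach the barrier, otherwise the sync-buffer handshake in global_barrier() spins forever waiting for groups that were never scheduled.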