@@ -124,11 +124,7 @@ void SPIRV_OVERLOADABLE SPIRV_BUILTIN(ControlBarrier, _i32_i32_i32, )(int Execut
         __intel_atomic_work_item_fence( Memory, Semantics );
     }
 
-    if (Execution == Device)
-    {
-        global_barrier();
-    }
-    else if (Execution <= Workgroup)
+    if (Execution <= Workgroup)
     {
         __builtin_IB_thread_group_barrier();
     }
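Note: this hunk drops the Device-scope path, so a SPIR-V ControlBarrier with Device execution scope no longer falls through to global_barrier(); only Workgroup scope (and narrower) still emits the hardware work-group barrier. A kernel needing a device-wide rendezvous would call the software barrier itself. A minimal usage sketch, assuming global_barrier() is reachable from kernel code and that the runtime keeps every work-group resident (kernel name and logic are illustrative, not part of this patch):

    __kernel void two_phase(__global int* data, __global int* out)
    {
        size_t gid = get_global_id(0);
        data[gid] = (int)gid;                             // phase 1: publish a value
        global_barrier();                                 // device-wide rendezvous
        out[gid] = data[(gid + 1) % get_global_size(0)];  // phase 2: read a neighbour's value
    }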
@@ -286,18 +282,15 @@ void __builtin_spirv_OpMemoryNamedBarrierWrapperOCL_p3__namedBarrier_i32_i32(loc
 }
 
 __global volatile uchar* __builtin_IB_get_sync_buffer();
-uint __intel_get_local_linear_id( void );
-uint __intel_get_local_size( void );
 
 void global_barrier()
 {
     //Make sure each WKG item hit the barrier.
-    __intel_atomic_work_item_fence( Device, AcquireRelease | CrossWorkgroupMemory );
-    __builtin_IB_thread_group_barrier();
+    barrier(CLK_GLOBAL_MEM_FENCE);
 
     __global volatile uchar* syncBuffer = __builtin_IB_get_sync_buffer();
-    bool firstThreadPerWg = __intel_is_first_work_group_item();
-    uint groupLinearId = (__builtin_IB_get_group_id(2) * __builtin_IB_get_num_groups(1) * __builtin_IB_get_num_groups(0)) + (__builtin_IB_get_group_id(1) * __builtin_IB_get_num_groups(0)) + __builtin_IB_get_group_id(0);
+    bool firstThreadPerWg = (get_local_id(0) == 0) && (get_local_id(1) == 0) && (get_local_id(2) == 0);
+    size_t groupLinearId = (get_group_id(2) * get_num_groups(1) * get_num_groups(0)) + (get_group_id(1) * get_num_groups(0)) + get_group_id(0);
 
     //Now first thread of each wkg writes to designated place in syncBuffer
     if (firstThreadPerWg)
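Note: barrier(CLK_GLOBAL_MEM_FENCE) is the OpenCL 1.x spelling of a work-group barrier with a global-memory fence, while the pair it replaces fenced at device scope. In OpenCL C 2.0 terms (a hedged equivalence, not something this patch spells out):

    barrier(CLK_GLOBAL_MEM_FENCE);                                  // == work_group_barrier(CLK_GLOBAL_MEM_FENCE), work-group scope
    work_group_barrier(CLK_GLOBAL_MEM_FENCE, memory_scope_device);  // closer to the removed fence + barrier pair

The narrower scope appears to be compensated by the explicit atomic_work_item_fence(..., memory_scope_device) calls the function keeps around the syncBuffer accesses.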
@@ -306,27 +299,26 @@ void global_barrier()
         atomic_work_item_fence(CLK_GLOBAL_MEM_FENCE, memory_order_release, memory_scope_device); // == write_mem_fence(CLK_GLOBAL_MEM_FENCE);
     }
 
-    uint numGroups = __builtin_IB_get_num_groups(0) * __builtin_IB_get_num_groups(1) * __builtin_IB_get_num_groups(2);
+    size_t numGroups = get_num_groups(0) * get_num_groups(1) * get_num_groups(2);
     //Higher wkg ids tend to not have work to do in all cases, therefore I choose last wkg to wait for the others, as it is most likely it will hit this code sooner.
     if (groupLinearId == (numGroups - 1))
     {
-        uint localSize = __intel_get_local_size();
+        size_t localSize = get_local_size(0) * get_local_size(1) * get_local_size(2);
         //24-48 case
         volatile uchar Value;
         do
         {
             atomic_work_item_fence(CLK_GLOBAL_MEM_FENCE, memory_order_acquire, memory_scope_device); // == read_mem_fence(CLK_GLOBAL_MEM_FENCE);
             Value = 1;
-            for (uint i = __intel_get_local_linear_id(); i < numGroups; i += localSize)
+            for (size_t i = get_local_linear_id(); i < numGroups; i += localSize)
             {
                 Value = Value & syncBuffer[i];
             }
 
         } while (Value == 0);
-        __intel_atomic_work_item_fence( Device, AcquireRelease | CrossWorkgroupMemory );
-        __builtin_IB_thread_group_barrier();
+        barrier(CLK_GLOBAL_MEM_FENCE);
 
-        for (uint i = __intel_get_local_linear_id(); i < numGroups; i += localSize)
+        for (size_t i = get_local_linear_id(); i < numGroups; i += localSize)
         {
             syncBuffer[i] = 0;
         }
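Note: the do/while above splits the numGroups flags across the localSize threads of the last work-group in a strided fashion; each thread ANDs only its own share into its private Value, and the barrier() after the loop joins the per-thread shares before any flag is cleared. As a worked example (illustrative numbers), with numGroups = 6 and localSize = 4:

    // thread 0 polls syncBuffer[0], syncBuffer[4]
    // thread 1 polls syncBuffer[1], syncBuffer[5]
    // thread 2 polls syncBuffer[2]
    // thread 3 polls syncBuffer[3]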
@@ -339,8 +331,7 @@ void global_barrier()
             atomic_work_item_fence(CLK_GLOBAL_MEM_FENCE, memory_order_acquire, memory_scope_device); // == read_mem_fence(CLK_GLOBAL_MEM_FENCE);
         };
     }
-    __intel_atomic_work_item_fence( Device, AcquireRelease | CrossWorkgroupMemory );
-    __builtin_IB_thread_group_barrier();
+    barrier(CLK_GLOBAL_MEM_FENCE);
 }
 
 void system_memfence(char fence_typed_memory)
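Note: for readers skimming the hunks, a condensed sketch of global_barrier() as it reads after this patch. Fences are elided and names shortened for clarity; this is illustrative, not the committed code:

    void global_barrier_sketch()
    {
        barrier(CLK_GLOBAL_MEM_FENCE);                       // local rendezvous first
        __global volatile uchar* flags = __builtin_IB_get_sync_buffer();
        size_t numGroups = get_num_groups(0) * get_num_groups(1) * get_num_groups(2);
        size_t localSize = get_local_size(0) * get_local_size(1) * get_local_size(2);
        size_t me = (get_group_id(2) * get_num_groups(1) + get_group_id(1)) * get_num_groups(0)
                  + get_group_id(0);

        if (get_local_linear_id() == 0)
            flags[me] = 1;                                   // announce this group's arrival

        if (me == numGroups - 1)                             // last group collects and releases
        {
            for (size_t i = get_local_linear_id(); i < numGroups; i += localSize)
                while (flags[i] == 0) { /* spin until group i arrives */ }
            barrier(CLK_GLOBAL_MEM_FENCE);                   // join the per-thread shares
            for (size_t i = get_local_linear_id(); i < numGroups; i += localSize)
                flags[i] = 0;                                // release group i
        }
        else
        {
            while (flags[me] != 0) { /* wait to be released */ }
        }
        barrier(CLK_GLOBAL_MEM_FENCE);                       // regroup before returning
    }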