[CLC][AMDGPU] Refactor fence helper to process order semantic explicitly (#12872)

GeorgeWeb · kbenzie · web-flow · commit 4acca904c0e0 · 2024-06-13T17:58:22.000+02:00
This PR refactors the builtin fence helper macro for AMDGPU to take in
and process the order semantic explicitly because that is the only
semantic argument accepted by the amdgcn builtin.

Additionally, it makes the `None` (Monotonic) order semantic, which maps
to C++/SYCL's `relaxed`, a no-op instead of falling back to the previous
default order of `acq_rel`.

---------

Co-authored-by: Kenneth Benzie (Benie) <k.benzie83@gmail.com>
diff --git a/libclc/amdgcn-amdhsa/libspirv/synchronization/barrier.cl b/libclc/amdgcn-amdhsa/libspirv/synchronization/barrier.cl
@@ -10,42 +10,56 @@
 #include <spirv/spirv.h>
 #include <spirv/spirv_types.h>
 
-#define BUILTIN_FENCE(semantics, scope_memory)                                 \
-  if (semantics & Acquire)                                                     \
-    return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, scope_memory);             \
-  else if (semantics & Release)                                                \
-    return __builtin_amdgcn_fence(__ATOMIC_RELEASE, scope_memory);             \
-  else if (semantics & AcquireRelease)                                         \
-    return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, scope_memory);             \
-  else if (semantics & SequentiallyConsistent)                                 \
-    return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, scope_memory);             \
-  else                                                                         \
-    return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, scope_memory);
 
-_CLC_DEF _CLC_OVERLOAD void __mem_fence(unsigned int scope_memory,
-                                        unsigned int semantics) {
+#define BUILTIN_FENCE(order, scope_memory)                                     \
+  /* None implies Monotonic (for llvm/AMDGPU), or relaxed in C++.              \
+   * This does not make sense as ordering argument for a fence instruction     \
+   * and is not part of the supported orderings for a fence in AMDGPU. */      \
+  if (order != None) {                                                         \
+    switch (order) {                                                           \
+    case Acquire:                                                              \
+      return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, scope_memory);           \
+    case Release:                                                              \
+      return __builtin_amdgcn_fence(__ATOMIC_RELEASE, scope_memory);           \
+    case AcquireRelease:                                                       \
+      return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, scope_memory);           \
+    case SequentiallyConsistent:                                               \
+      return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, scope_memory);           \
+    default:                                                                   \
+      __builtin_trap();                                                        \
+      __builtin_unreachable();                                                 \
+    }                                                                          \
+  }
+
+_CLC_INLINE void builtin_fence_order(unsigned int scope_memory,
+                                     unsigned int order) {
   switch ((enum Scope)scope_memory) {
   case CrossDevice:
-    BUILTIN_FENCE(semantics, "")
+    BUILTIN_FENCE(order, "")
   case Device:
-    BUILTIN_FENCE(semantics, "agent")
+    BUILTIN_FENCE(order, "agent")
   case Workgroup:
-    BUILTIN_FENCE(semantics, "workgroup")
+    BUILTIN_FENCE(order, "workgroup")
   case Subgroup:
-    BUILTIN_FENCE(semantics, "wavefront")
+    BUILTIN_FENCE(order, "wavefront")
   case Invocation:
-    BUILTIN_FENCE(semantics, "singlethread")
+    BUILTIN_FENCE(order, "singlethread")
   }
 }
 #undef BUILTIN_FENCE
 
+_CLC_DEF _CLC_OVERLOAD void __mem_fence(unsigned int scope_memory,
+                                        unsigned int semantics) {
+  builtin_fence_order(scope_memory, semantics & 0x1F);
+}
+
 _CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int scope_memory,
                                                   unsigned int semantics) {
   __mem_fence(scope_memory, semantics);
 }
 
 _CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void
-__spirv_ControlBarrier(unsigned int scope_execution, unsigned scope_memory,
+__spirv_ControlBarrier(unsigned int scope_execution, unsigned int scope_memory,
                        unsigned int semantics) {
   if (semantics) {
     __mem_fence(scope_memory, semantics);
diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -131,7 +131,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
 
   fetch_adapter_source(hip
     ${UNIFIED_RUNTIME_REPO}
-    ${UNIFIED_RUNTIME_TAG}
+    # commit 2c4303c25b026f7edb215accdccb1bc5ae2e237b
+    # Merge: abe85cc9 3e011c70
+    # Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
+    # Date:   Thu Jun 13 10:22:34 2024 +0100
+    #     Merge pull request #1414 from GeorgeWeb/georgi/hip-fences
+    #     [HIP] Enable more ordering and scope capabilities for atomic fences
+    2c4303c25b026f7edb215accdccb1bc5ae2e237b
   )
 
   fetch_adapter_source(native_cpu