[SYCL][HIP] Add coarse-grained memory advice for HIP on AMD (#12394)

GeorgeWeb · kbenzie · aarongreig · web-flow · commit ab86d0dbac54 · 2024-02-05T14:39:57.000+01:00
Enables and tests coarse grained memory access via the memadvise implementation for HIP platforms on AMD hardware. See related UR changes for the adapter implementation: oneapi-src/unified-runtime#1249 --------- Co-authored-by: Kenneth Benzie (Benie) <k.benzie@codeplay.com> Co-authored-by: aarongreig <aarongreig01@gmail.com>
diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h
@@ -151,9 +151,10 @@
 // 14.42 Added piextCommandBufferPrefetchUSM and piextCommandBufferAdviseUSM
 // 15.43 Changed the signature of piextMemGetNativeHandle to also take a
 // pi_device
+// 15.44 Add coarse-grain memory advice flag for HIP.
 
 #define _PI_H_VERSION_MAJOR 15
-#define _PI_H_VERSION_MINOR 43
+#define _PI_H_VERSION_MINOR 44
 
 #define _PI_STRING_HELPER(a) #a
 #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
@@ -582,6 +583,8 @@ typedef enum {
   PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION_HOST = 1 << 7,
   PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY_HOST = 1 << 8,
   PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST = 1 << 9,
+  PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED = 1 << 10,
+  PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED = 1 << 11,
   PI_MEM_ADVICE_UNKNOWN = 0x7FFFFFFF,
 } _pi_mem_advice;
 
diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -56,14 +56,14 @@ endif()
 if(SYCL_PI_UR_USE_FETCH_CONTENT)
   include(FetchContent)
 
-  set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime")
-  # commit 9363574db721d2388c7d76a10edb128764872352
-  # Merge: 553a6b82 5e513738
+  set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
+  # commit cd97e172cbbfc411fccb0b80e0fff6f9126574f4
+  # Merge: bd745d10 2a9ded6f
   # Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
-  # Date:   Thu Feb 1 11:50:16 2024 +0000
-  #     Merge pull request #1302 from kbenzie/benie/cl-binary-type-intermediate
-  #     [CL] Handle INTERMEDIATE binary type
-  set(UNIFIED_RUNTIME_TAG 9363574db721d2388c7d76a10edb128764872352)
+  # Date:   Fri Feb 2 14:24:16 2024 +0000
+  #     Merge pull request #1249 from GeorgeWeb/georgi/hip_memadvise_coarse_grained
+  #     [HIP] Implement coarse-grained memory advice for the HIP adapter
+  set(UNIFIED_RUNTIME_TAG cd97e172cbbfc411fccb0b80e0fff6f9126574f4)
 
   if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
     set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")
diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp
@@ -3375,6 +3375,12 @@ inline pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr,
   if (Advice & PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST) {
     UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_HOST;
   }
+  if (Advice & PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED) {
+    UrAdvice |= UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY;
+  }
+  if (Advice & PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED) {
+    UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY;
+  }
   if (Advice & PI_MEM_ADVICE_RESET) {
     UrAdvice |= UR_USM_ADVICE_FLAG_DEFAULT;
   }
diff --git a/sycl/test-e2e/USM/memadvise_flags.cpp b/sycl/test-e2e/USM/memadvise_flags.cpp
@@ -59,6 +59,8 @@ int main() {
     valid_advices.emplace_back(PI_MEM_ADVICE_HIP_UNSET_PREFERRED_LOCATION_HOST);
     valid_advices.emplace_back(PI_MEM_ADVICE_HIP_SET_ACCESSED_BY_HOST);
     valid_advices.emplace_back(PI_MEM_ADVICE_HIP_UNSET_ACCESSED_BY_HOST);
+    valid_advices.emplace_back(PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED);
+    valid_advices.emplace_back(PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED);
   } else {
     // Skip
     return 0;
diff --git a/sycl/test-e2e/USM/memory_coherency_hip.cpp b/sycl/test-e2e/USM/memory_coherency_hip.cpp
@@ -0,0 +1,147 @@
+// RUN: %{build} -o %t1.out
+// REQUIRES: hip_amd
+// RUN: %{run} %t1.out
+
+//==---- memory_coherency_hip.cpp  -----------------------------------------==//
+// USM coarse/fine grain memory coherency test for the HIP-AMD backend.
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <sycl/sycl.hpp>
+
+#include <chrono>
+#include <iostream>
+#include <thread>
+
+namespace kernels {
+class SquareKrnl final {
+  int *mPtr;
+
+public:
+  SquareKrnl(int *ptr) : mPtr{ptr} {}
+
+  void operator()(sycl::id<1>) const { *mPtr = (*mPtr) * (*mPtr); }
+};
+
+class CoherencyTestKrnl final {
+  int *mPtr;
+
+public:
+  CoherencyTestKrnl(int *ptr) : mPtr{ptr} {}
+
+  void operator()(sycl::id<1>) const {
+    auto atm = sycl::atomic_ref<int, sycl::memory_order::relaxed,
+                                sycl::memory_scope::device>(mPtr[0]);
+
+    // mPtr was initialized to 1 by the host, now set it to 2.
+    atm.fetch_add(1);
+
+    // spin until mPtr is 3, then change it to 4.
+    int expected{3};
+    int old = atm.load();
+    while (true) {
+      old = atm.load();
+      if (old == expected) {
+        if (atm.compare_exchange_strong(old, 4)) {
+          break;
+        }
+      }
+    }
+  }
+};
+} // namespace kernels
+
+int main() {
+  sycl::queue q{};
+  sycl::device dev = q.get_device();
+  sycl::context ctx = q.get_context();
+  if (!dev.get_info<sycl::info::device::usm_shared_allocations>()) {
+    std::cout << "Shared USM is not supported. Skipping test.\n";
+    return 0;
+  }
+
+  bool coherent{false};
+
+  int *ptr = sycl::malloc_shared<int>(1, q);
+
+  // Coherency test 1
+  //
+  // The following test validates if memory access is fine with memory allocated
+  // using malloc_managed() and COARSE_GRAINED advice set via mem_advise().
+  //
+  // Coarse grained memory is only guaranteed to be coherent outside of GPU
+  // kernels that modify it. Changes applied to coarse-grained memory by a GPU
+  // kernel are only visible to the rest of the system (CPU or other GPUs) when
+  // the kernel has completed. A GPU kernel is only guaranteed to see changes
+  // applied to coarse grained memory by the rest of the system (CPU or other
+  // GPUs) if those changes were made before the kernel launched.
+
+  // Hint to use coarse-grain memory.
+  q.mem_advise(ptr, sizeof(int), int{PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED});
+
+  int init_val{9};
+  int expected{init_val * init_val};
+
+  *ptr = init_val;
+  q.parallel_for(sycl::range{1}, kernels::SquareKrnl{ptr});
+  // Synchronise the underlying stream.
+  q.wait();
+
+  // Check if caches are flushed correctly and same memory is between devices.
+  if (*ptr == expected) {
+    coherent = true;
+  } else {
+    std::cerr << "Coherency test failed. Value: " << *ptr
+              << " (expected: " << expected << ")\n";
+    coherent = false;
+  }
+
+  // Coherency test 2
+  //
+  // The following test validates if fine-grain behavior is observed or not with
+  // memory allocated using malloc_managed().
+  //
+  // Fine grained memory allows CPUs and GPUs to synchronize (via atomics) and
+  // coherently communicate with each other while the GPU kernel is running.
+
+  // Hint to use fine-grain memory.
+  q.mem_advise(ptr, sizeof(int), int{PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED});
+
+  init_val = 1;
+  expected = 4;
+
+  *ptr = init_val;
+  q.parallel_for(sycl::range{1}, kernels::CoherencyTestKrnl{ptr});
+
+  // wait until ptr is 2 from the kernel (or 3 seconds), then increment to 3.
+  while (*ptr == 2) {
+    using std::chrono_literals::operator""s;
+    std::this_thread::sleep_for(3s);
+    break;
+  }
+  *ptr += 1;
+
+  // Synchronise the underlying stream.
+  q.wait();
+
+  // Check if caches are flushed correctly and same memory is between devices.
+  if (*ptr == expected) {
+    coherent &= true;
+  } else {
+    std::cerr << "Coherency test failed. Value: " << *ptr
+              << " (expected: " << expected << ")\n";
+    coherent = false;
+  }
+
+  // Cleanup
+  sycl::free(ptr, q);
+
+  // Check if all coherency tests passed.
+  assert(coherent);
+  // The above assert won't trigger with NDEBUG, so ensure the right exit code.
+  return coherent ? 0 : 1;
+}

Original file line number	Diff line number	Diff line change
`@@ -3375,6 +3375,12 @@ inline pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr,`
`3375`	`3375`	`if (Advice & PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST) {`
`3376`	`3376`	`UrAdvice \|= UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_HOST;`
`3377`	`3377`	`}`
	`3378`	`+ if (Advice & PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED) {`
	`3379`	`+ UrAdvice \|= UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY;`
	`3380`	`+ }`
	`3381`	`+ if (Advice & PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED) {`
	`3382`	`+ UrAdvice \|= UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY;`
	`3383`	`+ }`
`3378`	`3384`	`if (Advice & PI_MEM_ADVICE_RESET) {`
`3379`	`3385`	`UrAdvice \|= UR_USM_ADVICE_FLAG_DEFAULT;`
`3380`	`3386`	`}`