Skip to content

Commit ab86d0d

Browse files
GeorgeWebkbenzieaarongreig
authored
[SYCL][HIP] Add coarse-grained memory advice for HIP on AMD (#12394)
Enables and tests coarse grained memory access via the memadvise implementation for HIP platforms on AMD hardware. See related UR changes for the adapter implementation: oneapi-src/unified-runtime#1249 --------- Co-authored-by: Kenneth Benzie (Benie) <[email protected]> Co-authored-by: aarongreig <[email protected]>
1 parent b781e6c commit ab86d0d

File tree

5 files changed

+166
-8
lines changed

5 files changed

+166
-8
lines changed

sycl/include/sycl/detail/pi.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,9 +151,10 @@
151151
// 14.42 Added piextCommandBufferPrefetchUSM and piextCommandBufferAdviseUSM
152152
// 15.43 Changed the signature of piextMemGetNativeHandle to also take a
153153
// pi_device
154+
// 15.44 Add coarse-grain memory advice flag for HIP.
154155

155156
#define _PI_H_VERSION_MAJOR 15
156-
#define _PI_H_VERSION_MINOR 43
157+
#define _PI_H_VERSION_MINOR 44
157158

158159
#define _PI_STRING_HELPER(a) #a
159160
#define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
@@ -582,6 +583,8 @@ typedef enum {
582583
PI_MEM_ADVICE_CUDA_UNSET_PREFERRED_LOCATION_HOST = 1 << 7,
583584
PI_MEM_ADVICE_CUDA_SET_ACCESSED_BY_HOST = 1 << 8,
584585
PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST = 1 << 9,
586+
PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED = 1 << 10,
587+
PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED = 1 << 11,
585588
PI_MEM_ADVICE_UNKNOWN = 0x7FFFFFFF,
586589
} _pi_mem_advice;
587590

sycl/plugins/unified_runtime/CMakeLists.txt

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,14 +56,14 @@ endif()
5656
if(SYCL_PI_UR_USE_FETCH_CONTENT)
5757
include(FetchContent)
5858

59-
set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime")
60-
# commit 9363574db721d2388c7d76a10edb128764872352
61-
# Merge: 553a6b82 5e513738
59+
set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
60+
# commit cd97e172cbbfc411fccb0b80e0fff6f9126574f4
61+
# Merge: bd745d10 2a9ded6f
6262
# Author: Kenneth Benzie (Benie) <[email protected]>
63-
# Date: Thu Feb 1 11:50:16 2024 +0000
64-
# Merge pull request #1302 from kbenzie/benie/cl-binary-type-intermediate
65-
# [CL] Handle INTERMEDIATE binary type
66-
set(UNIFIED_RUNTIME_TAG 9363574db721d2388c7d76a10edb128764872352)
63+
# Date: Fri Feb 2 14:24:16 2024 +0000
64+
# Merge pull request #1249 from GeorgeWeb/georgi/hip_memadvise_coarse_grained
65+
# [HIP] Implement coarse-grained memory advice for the HIP adapter
66+
set(UNIFIED_RUNTIME_TAG cd97e172cbbfc411fccb0b80e0fff6f9126574f4)
6767

6868
if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
6969
set(UNIFIED_RUNTIME_REPO "${SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO}")

sycl/plugins/unified_runtime/pi2ur.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3375,6 +3375,12 @@ inline pi_result piextUSMEnqueueMemAdvise(pi_queue Queue, const void *Ptr,
33753375
if (Advice & PI_MEM_ADVICE_CUDA_UNSET_ACCESSED_BY_HOST) {
33763376
UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_ACCESSED_BY_HOST;
33773377
}
3378+
if (Advice & PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED) {
3379+
UrAdvice |= UR_USM_ADVICE_FLAG_SET_NON_COHERENT_MEMORY;
3380+
}
3381+
if (Advice & PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED) {
3382+
UrAdvice |= UR_USM_ADVICE_FLAG_CLEAR_NON_COHERENT_MEMORY;
3383+
}
33783384
if (Advice & PI_MEM_ADVICE_RESET) {
33793385
UrAdvice |= UR_USM_ADVICE_FLAG_DEFAULT;
33803386
}

sycl/test-e2e/USM/memadvise_flags.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,8 @@ int main() {
5959
valid_advices.emplace_back(PI_MEM_ADVICE_HIP_UNSET_PREFERRED_LOCATION_HOST);
6060
valid_advices.emplace_back(PI_MEM_ADVICE_HIP_SET_ACCESSED_BY_HOST);
6161
valid_advices.emplace_back(PI_MEM_ADVICE_HIP_UNSET_ACCESSED_BY_HOST);
62+
valid_advices.emplace_back(PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED);
63+
valid_advices.emplace_back(PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED);
6264
} else {
6365
// Skip
6466
return 0;
Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
// RUN: %{build} -o %t1.out
2+
// REQUIRES: hip_amd
3+
// RUN: %{run} %t1.out
4+
5+
//==---- memory_coherency_hip.cpp -----------------------------------------==//
6+
// USM coarse/fine grain memory coherency test for the HIP-AMD backend.
7+
//
8+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
9+
// See https://llvm.org/LICENSE.txt for license information.
10+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#include <sycl/sycl.hpp>
15+
16+
#include <chrono>
17+
#include <iostream>
18+
#include <thread>
19+
20+
namespace kernels {
21+
class SquareKrnl final {
22+
int *mPtr;
23+
24+
public:
25+
SquareKrnl(int *ptr) : mPtr{ptr} {}
26+
27+
void operator()(sycl::id<1>) const { *mPtr = (*mPtr) * (*mPtr); }
28+
};
29+
30+
class CoherencyTestKrnl final {
31+
int *mPtr;
32+
33+
public:
34+
CoherencyTestKrnl(int *ptr) : mPtr{ptr} {}
35+
36+
void operator()(sycl::id<1>) const {
37+
auto atm = sycl::atomic_ref<int, sycl::memory_order::relaxed,
38+
sycl::memory_scope::device>(mPtr[0]);
39+
40+
// mPtr was initialized to 1 by the host, now set it to 2.
41+
atm.fetch_add(1);
42+
43+
// spin until mPtr is 3, then change it to 4.
44+
int expected{3};
45+
int old = atm.load();
46+
while (true) {
47+
old = atm.load();
48+
if (old == expected) {
49+
if (atm.compare_exchange_strong(old, 4)) {
50+
break;
51+
}
52+
}
53+
}
54+
}
55+
};
56+
} // namespace kernels
57+
58+
int main() {
59+
sycl::queue q{};
60+
sycl::device dev = q.get_device();
61+
sycl::context ctx = q.get_context();
62+
if (!dev.get_info<sycl::info::device::usm_shared_allocations>()) {
63+
std::cout << "Shared USM is not supported. Skipping test.\n";
64+
return 0;
65+
}
66+
67+
bool coherent{false};
68+
69+
int *ptr = sycl::malloc_shared<int>(1, q);
70+
71+
// Coherency test 1
72+
//
73+
// The following test validates if memory access is fine with memory allocated
74+
// using malloc_managed() and COARSE_GRAINED advice set via mem_advise().
75+
//
76+
// Coarse grained memory is only guaranteed to be coherent outside of GPU
77+
// kernels that modify it. Changes applied to coarse-grained memory by a GPU
78+
// kernel are only visible to the rest of the system (CPU or other GPUs) when
79+
// the kernel has completed. A GPU kernel is only guaranteed to see changes
80+
// applied to coarse grained memory by the rest of the system (CPU or other
81+
// GPUs) if those changes were made before the kernel launched.
82+
83+
// Hint to use coarse-grain memory.
84+
q.mem_advise(ptr, sizeof(int), int{PI_MEM_ADVICE_HIP_SET_COARSE_GRAINED});
85+
86+
int init_val{9};
87+
int expected{init_val * init_val};
88+
89+
*ptr = init_val;
90+
q.parallel_for(sycl::range{1}, kernels::SquareKrnl{ptr});
91+
// Synchronise the underlying stream.
92+
q.wait();
93+
94+
// Check if caches are flushed correctly and same memory is between devices.
95+
if (*ptr == expected) {
96+
coherent = true;
97+
} else {
98+
std::cerr << "Coherency test failed. Value: " << *ptr
99+
<< " (expected: " << expected << ")\n";
100+
coherent = false;
101+
}
102+
103+
// Coherency test 2
104+
//
105+
// The following test validates if fine-grain behavior is observed or not with
106+
// memory allocated using malloc_managed().
107+
//
108+
// Fine grained memory allows CPUs and GPUs to synchronize (via atomics) and
109+
// coherently communicate with each other while the GPU kernel is running.
110+
111+
// Hint to use fine-grain memory.
112+
q.mem_advise(ptr, sizeof(int), int{PI_MEM_ADVICE_HIP_UNSET_COARSE_GRAINED});
113+
114+
init_val = 1;
115+
expected = 4;
116+
117+
*ptr = init_val;
118+
q.parallel_for(sycl::range{1}, kernels::CoherencyTestKrnl{ptr});
119+
120+
// wait until ptr is 2 from the kernel (or 3 seconds), then increment to 3.
121+
while (*ptr == 2) {
122+
using std::chrono_literals::operator""s;
123+
std::this_thread::sleep_for(3s);
124+
break;
125+
}
126+
*ptr += 1;
127+
128+
// Synchronise the underlying stream.
129+
q.wait();
130+
131+
// Check if caches are flushed correctly and same memory is between devices.
132+
if (*ptr == expected) {
133+
coherent &= true;
134+
} else {
135+
std::cerr << "Coherency test failed. Value: " << *ptr
136+
<< " (expected: " << expected << ")\n";
137+
coherent = false;
138+
}
139+
140+
// Cleanup
141+
sycl::free(ptr, q);
142+
143+
// Check if all coherency tests passed.
144+
assert(coherent);
145+
// The above assert won't trigger with NDEBUG, so ensure the right exit code.
146+
return coherent ? 0 : 1;
147+
}

0 commit comments

Comments
 (0)