Skip to content

Commit ab57810

Browse files
authored
[SYCL][COMPAT] Added constant memory_region for read_only memory (#11024)
This PR extends the memory wrapper of SYCLcompat to support allocating read only memory. To allow the allocation of `device_memory` directly for source to source translation: - Extended `device_memory` to support a queue parameter in its constructor, which defaults to `syclcompat::get_default_queue` - Added device_read_only property to device memory if usm device read only extension is available for `memory_region::constant` Documentation was already updated, except for the usm device read only extension
1 parent 00ec5be commit ab57810

File tree

5 files changed

+336
-28
lines changed

5 files changed

+336
-28
lines changed

sycl/doc/syclcompat/README.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ Specifically, this library depends on the following SYCL extensions:
4242
../extensions/supported/sycl_ext_oneapi_assert.asciidoc)
4343
* [sycl_ext_oneapi_enqueue_barrier](
4444
../extensions/supported/sycl_ext_oneapi_enqueue_barrier.asciidoc)
45+
* [sycl_ext_oneapi_usm_device_read_only](../extensions/supported/sycl_ext_oneapi_usm_device_read_only.asciidoc)
4546

4647
## Usage
4748

@@ -265,7 +266,7 @@ void vectorAdd(const float *A, const float *B, float *C, int n,
265266
Then, `vectorAdd` can be launched like this:
266267

267268
``` c++
268-
syclcompat::launch<vectorAdd>(blocksPerGrid, threadsPerBlock, mem_size, d_A,
269+
syclcompat::launch<vectorAdd>(blocksPerGrid, threadsPerBlock, mem_size, d_A,
269270
d_B, d_C, n);
270271
```
271272

sycl/include/syclcompat/memory.hpp

Lines changed: 38 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -876,12 +876,14 @@ template <class T, memory_region Memory, size_t Dimension> class device_memory {
876876
using value_t = typename detail::memory_traits<Memory, T>::value_t;
877877
using compat_accessor_t = syclcompat::accessor<T, Memory, Dimension>;
878878

879-
device_memory() : device_memory(sycl::range<Dimension>(1)) {}
879+
device_memory(sycl::queue q = get_default_queue())
880+
: device_memory(sycl::range<Dimension>(1), q) {}
880881

881882
/// Constructor of 1-D array with initializer list
882883
device_memory(const sycl::range<Dimension> &in_range,
883-
std::initializer_list<value_t> &&init_list)
884-
: device_memory(in_range) {
884+
std::initializer_list<value_t> &&init_list,
885+
sycl::queue q = get_default_queue())
886+
: device_memory(in_range, q) {
885887
assert(init_list.size() <= in_range.size());
886888
_host_ptr = (value_t *)std::malloc(_size);
887889
std::memset(_host_ptr, 0, _size);
@@ -892,8 +894,9 @@ template <class T, memory_region Memory, size_t Dimension> class device_memory {
892894
template <size_t D = Dimension>
893895
device_memory(
894896
const typename std::enable_if<D == 2, sycl::range<2>>::type &in_range,
895-
std::initializer_list<std::initializer_list<value_t>> &&init_list)
896-
: device_memory(in_range) {
897+
std::initializer_list<std::initializer_list<value_t>> &&init_list,
898+
sycl::queue q = get_default_queue())
899+
: device_memory(in_range, q) {
897900
assert(init_list.size() <= in_range[0]);
898901
_host_ptr = (value_t *)std::malloc(_size);
899902
std::memset(_host_ptr, 0, _size);
@@ -906,9 +909,10 @@ template <class T, memory_region Memory, size_t Dimension> class device_memory {
906909
}
907910

908911
/// Constructor with range
909-
device_memory(const sycl::range<Dimension> &range_in)
912+
device_memory(const sycl::range<Dimension> &range_in,
913+
sycl::queue q = get_default_queue())
910914
: _size(range_in.size() * sizeof(T)), _range(range_in), _reference(false),
911-
_host_ptr(nullptr), _device_ptr(nullptr) {
915+
_host_ptr(nullptr), _device_ptr(nullptr), _q(q) {
912916
static_assert((Memory == memory_region::global) ||
913917
(Memory == memory_region::constant) ||
914918
(Memory == memory_region::usm_shared),
@@ -918,19 +922,28 @@ template <class T, memory_region Memory, size_t Dimension> class device_memory {
918922
}
919923

920924
/// Constructor with range
921-
template <class... Args>
925+
// enable_if_t SFINAE to avoid ambiguity with
926+
// device_memory(Args... Arguments, sycl::queue q)
927+
template <class... Args, size_t D = Dimension,
928+
typename = std::enable_if_t<sizeof...(Args) == D>>
922929
device_memory(Args... Arguments)
923-
: device_memory(sycl::range<Dimension>(Arguments...)) {}
930+
: device_memory(sycl::range<Dimension>(Arguments...),
931+
get_default_queue()) {}
932+
933+
/// Constructor with range and queue
934+
template <class... Args>
935+
device_memory(Args... Arguments, sycl::queue q)
936+
: device_memory(sycl::range<Dimension>(Arguments...), q) {}
924937

925938
~device_memory() {
926939
if (_device_ptr && !_reference)
927-
free(_device_ptr);
940+
syclcompat::free(_device_ptr, _q);
928941
if (_host_ptr)
929942
std::free(_host_ptr);
930943
}
931944

932945
/// Allocate memory with default queue, and init memory if has initial value.
933-
void init() { init(get_default_queue()); }
946+
void init() { init(_q); }
934947
/// Allocate memory with specified queue, and init memory if has initial
935948
/// value.
936949
void init(sycl::queue q) {
@@ -946,12 +959,12 @@ template <class T, memory_region Memory, size_t Dimension> class device_memory {
946959
/// The variable is assigned to a device pointer.
947960
void assign(value_t *src, size_t size) {
948961
this->~device_memory();
949-
new (this) device_memory(src, size);
962+
new (this) device_memory(src, size, _q);
950963
}
951964

952965
/// Get memory pointer of the memory object, which is virtual pointer when
953966
/// usm is not used, and device pointer when usm is used.
954-
value_t *get_ptr() { return get_ptr(get_default_queue()); }
967+
value_t *get_ptr() { return get_ptr(_q); }
955968
/// Get memory pointer of the memory object, which is virtual pointer when
956969
/// usm is not used, and device pointer when usm is used.
957970
value_t *get_ptr(sycl::queue q) {
@@ -977,16 +990,25 @@ template <class T, memory_region Memory, size_t Dimension> class device_memory {
977990
}
978991

979992
private:
980-
device_memory(value_t *memory_ptr, size_t size)
993+
device_memory(value_t *memory_ptr, size_t size,
994+
sycl::queue q = get_default_queue())
981995
: _size(size), _range(size / sizeof(T)), _reference(true),
982-
_device_ptr(memory_ptr) {}
996+
_device_ptr(memory_ptr), _q(q) {}
983997

984998
void allocate_device(sycl::queue q) {
985999
if (Memory == memory_region::usm_shared) {
9861000
_device_ptr = (value_t *)sycl::malloc_shared(_size, q.get_device(),
9871001
q.get_context());
9881002
return;
9891003
}
1004+
#ifdef SYCL_EXT_ONEAPI_USM_DEVICE_READ_ONLY
1005+
if (Memory == memory_region::constant) {
1006+
_device_ptr = (value_t *)sycl::malloc_device(
1007+
_size, q.get_device(), q.get_context(),
1008+
sycl::ext::oneapi::property::usm::device_read_only());
1009+
return;
1010+
}
1011+
#endif
9901012
_device_ptr = (value_t *)detail::malloc(_size, q);
9911013
}
9921014

@@ -995,6 +1017,7 @@ template <class T, memory_region Memory, size_t Dimension> class device_memory {
9951017
bool _reference;
9961018
value_t *_host_ptr;
9971019
value_t *_device_ptr;
1020+
sycl::queue _q;
9981021
};
9991022
template <class T, memory_region Memory>
10001023
class device_memory<T, Memory, 0> : public device_memory<T, Memory, 1> {

sycl/test-e2e/syclcompat/memory/memory_management_test1.cpp

Lines changed: 109 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -368,11 +368,120 @@ template <typename T> void test_fill_q() {
368368
free(h_A);
369369
}
370370

371+
void test_constant_memcpy() {
372+
std::cout << __PRETTY_FUNCTION__ << std::endl;
373+
374+
constexpr size_t size = 2000;
375+
constexpr size_t offset = 1000;
376+
377+
syclcompat::constant_memory<float, 1> d_A(size);
378+
syclcompat::constant_memory<float, 1> d_B(size);
379+
380+
float *h_A = (float *)malloc(size / 2 * sizeof(float));
381+
float *h_B = (float *)malloc(size / 2 * sizeof(float));
382+
float *h_C = (float *)malloc(size * sizeof(float));
383+
float *h_D = (float *)malloc(size * sizeof(float));
384+
385+
for (int i = 0; i < size / 2; i++) {
386+
h_A[i] = 1.0f;
387+
h_B[i] = 2.0f;
388+
}
389+
390+
// hostA[0..999] -> deviceA[0..999]
391+
// hostB[0..999] -> deviceA[1000..1999]
392+
// deviceA[0..1999] -> hostC[0..1999]
393+
// deviceA[0..999] -> deviceB[0..999]
394+
// deviceA[1000..1999] -> deviceB[1000..1999]
395+
// deviceB[0..1999] -> hostD[0..1999]
396+
397+
syclcompat::memcpy(d_A.get_ptr(), h_A, offset * sizeof(float));
398+
syclcompat::memcpy((char *)d_A.get_ptr() + offset * sizeof(float), h_B,
399+
(size - offset) * sizeof(float));
400+
syclcompat::memcpy(h_C, d_A.get_ptr(), size * sizeof(float));
401+
syclcompat::memcpy(d_B.get_ptr(), d_A.get_ptr(), offset * sizeof(float));
402+
syclcompat::memcpy((char *)d_B.get_ptr() + offset * sizeof(float),
403+
(void *)((size_t)d_A.get_ptr() + offset * sizeof(float)),
404+
(size - offset) * sizeof(float));
405+
syclcompat::memcpy(h_D, d_B.get_ptr(), size * sizeof(float));
406+
407+
// verify hostD
408+
for (int i = 0; i < offset; i++) {
409+
assert(fabs(h_A[i] - h_D[i]) <= 1e-5);
410+
}
411+
412+
for (int i = offset; i < size; i++) {
413+
assert(fabs(h_B[i - offset] - h_D[i]) <= 1e-5);
414+
}
415+
416+
free(h_A);
417+
free(h_B);
418+
free(h_C);
419+
free(h_D);
420+
}
421+
422+
void test_constant_memcpy_q() {
423+
std::cout << __PRETTY_FUNCTION__ << std::endl;
424+
425+
sycl::queue q{{sycl::property::queue::in_order()}};
426+
427+
constexpr size_t size = 2000;
428+
constexpr size_t offset = 1000;
429+
syclcompat::constant_memory<float, 1> d_A(size, q);
430+
syclcompat::constant_memory<float, 1> d_B(size, q);
431+
432+
float *h_A = (float *)malloc(size / 2 * sizeof(float));
433+
float *h_B = (float *)malloc(size / 2 * sizeof(float));
434+
float *h_C = (float *)malloc(size * sizeof(float));
435+
float *h_D = (float *)malloc(size * sizeof(float));
436+
437+
for (int i = 0; i < size / 2; i++) {
438+
h_A[i] = 1.0f;
439+
h_B[i] = 2.0f;
440+
}
441+
442+
// hostA[0..999] -> deviceA[0..999]
443+
// hostB[0..999] -> deviceA[1000..1999]
444+
// deviceA[0..1999] -> hostC[0..1999]
445+
// deviceA[0..999] -> deviceB[0..999]
446+
// deviceA[1000..1999] -> deviceB[1000..1999]
447+
// deviceB[0..1999] -> hostD[0..1999]
448+
449+
syclcompat::memcpy(d_A.get_ptr(), h_A, offset * sizeof(float), q);
450+
451+
syclcompat::memcpy((char *)d_A.get_ptr() + offset * sizeof(float), h_B,
452+
(size - offset) * sizeof(float), q);
453+
syclcompat::memcpy(h_C, d_A.get_ptr(), size * sizeof(float), q);
454+
455+
syclcompat::memcpy(d_B.get_ptr(), d_A.get_ptr(), offset * sizeof(float), q);
456+
457+
syclcompat::memcpy((char *)d_B.get_ptr() + offset * sizeof(float),
458+
(void *)((size_t)d_A.get_ptr() + offset * sizeof(float)),
459+
(size - offset) * sizeof(float), q);
460+
461+
syclcompat::memcpy(h_D, d_B.get_ptr(), size * sizeof(float), q);
462+
463+
// verify hostD
464+
for (int i = 0; i < offset; i++) {
465+
assert(fabs(h_A[i] - h_D[i]) <= 1e-5);
466+
}
467+
468+
for (int i = offset; i < size; i++) {
469+
assert(fabs(h_B[i - offset] - h_D[i]) <= 1e-5);
470+
}
471+
472+
free(h_A);
473+
free(h_B);
474+
free(h_C);
475+
free(h_D);
476+
}
477+
371478
int main() {
372479
test_memcpy();
373480
test_memcpy_q();
374481
test_memset();
375482
test_memset_q();
483+
test_constant_memcpy();
484+
test_constant_memcpy_q();
376485

377486
INSTANTIATE_ALL_TYPES(value_type_list, test_memcpy_t);
378487
INSTANTIATE_ALL_TYPES(value_type_list, test_memcpy_t_q);

sycl/test-e2e/syclcompat/memory/memory_management_test2.cpp

Lines changed: 66 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@
3939

4040
#include "memory_common.hpp"
4141

42+
constexpr size_t DataW = 100;
43+
constexpr size_t DataH = 100;
44+
4245
void test_memcpy_pitched() {
4346
std::cout << __PRETTY_FUNCTION__ << std::endl;
4447

@@ -151,13 +154,6 @@ void test_memcpy_kernel() {
151154
free(h_C);
152155
}
153156

154-
#define DataW 100
155-
#define DataH 100
156-
157-
syclcompat::global_memory<float, 2> g_A(DataW, DataH);
158-
syclcompat::global_memory<float, 2> g_B(DataW, DataH);
159-
syclcompat::global_memory<float, 2> g_C(DataW, DataH);
160-
161157
void test_global_memory() {
162158
std::cout << __PRETTY_FUNCTION__ << std::endl;
163159

@@ -172,6 +168,10 @@ void test_global_memory() {
172168
}
173169
}
174170

171+
syclcompat::global_memory<float, 2> g_A(DataW, DataH);
172+
syclcompat::global_memory<float, 2> g_B(DataW, DataH);
173+
syclcompat::global_memory<float, 2> g_C(DataW, DataH);
174+
175175
g_A.init();
176176
g_B.init();
177177
g_C.init();
@@ -216,13 +216,13 @@ void test_global_memory() {
216216
}
217217
}
218218

219-
syclcompat::shared_memory<float, 1> s_A(DataW);
220-
syclcompat::shared_memory<float, 1> s_B(DataW);
221-
syclcompat::shared_memory<float, 1> s_C(DataW);
222-
223219
void test_shared_memory() {
224220
std::cout << __PRETTY_FUNCTION__ << std::endl;
225221

222+
syclcompat::shared_memory<float, 1> s_A(DataW);
223+
syclcompat::shared_memory<float, 1> s_B(DataW);
224+
syclcompat::shared_memory<float, 1> s_C(DataW);
225+
226226
s_A.init();
227227
s_B.init();
228228
s_C.init();
@@ -256,6 +256,60 @@ void test_shared_memory() {
256256
}
257257
}
258258

259+
void test_constant_memory() {
260+
std::cout << __PRETTY_FUNCTION__ << std::endl;
261+
262+
float h_A[DataW][DataH];
263+
float h_B[DataW][DataH];
264+
float h_C[DataW][DataH];
265+
266+
for (int i = 0; i < DataW; i++) {
267+
for (int j = 0; j < DataH; j++) {
268+
h_A[i][j] = 1.0f;
269+
h_B[i][j] = 2.0f;
270+
}
271+
}
272+
273+
syclcompat::constant_memory<float, 2> c_A(DataW, DataH);
274+
syclcompat::constant_memory<float, 2> c_B(DataW, DataH);
275+
syclcompat::global_memory<float, 2> g_C(DataW, DataH);
276+
277+
c_A.init();
278+
c_B.init();
279+
g_C.init();
280+
syclcompat::memcpy((void *)c_A.get_ptr(), (void *)&h_A[0][0],
281+
DataW * DataH * sizeof(float));
282+
syclcompat::memcpy((void *)c_B.get_ptr(), (void *)&h_B[0][0],
283+
DataW * DataH * sizeof(float));
284+
285+
{
286+
syclcompat::get_default_queue().submit([&](sycl::handler &cgh) {
287+
auto c_A_acc = c_A.get_access(cgh);
288+
auto c_B_acc = c_B.get_access(cgh);
289+
auto g_C_acc = g_C.get_access(cgh);
290+
cgh.parallel_for(sycl::range<2>(DataW, DataH), [=](sycl::id<2> id) {
291+
syclcompat::accessor<float, syclcompat::memory_region::constant, 2> A(
292+
c_A_acc);
293+
syclcompat::accessor<float, syclcompat::memory_region::constant, 2> B(
294+
c_B_acc);
295+
syclcompat::accessor<float, syclcompat::memory_region::global, 2> C(
296+
g_C_acc);
297+
int i = id[0], j = id[1];
298+
C[i][j] = A[i][j] + B[i][j];
299+
});
300+
});
301+
syclcompat::get_default_queue().wait_and_throw();
302+
}
303+
syclcompat::memcpy((void *)&h_C[0][0], (void *)g_C.get_ptr(),
304+
DataW * DataH * sizeof(float));
305+
// verify hostD
306+
for (int i = 0; i < DataW; i++) {
307+
for (int j = 0; j < DataH; j++) {
308+
assert(fabs(h_C[i][j] - h_A[i][j] - h_B[i][j]) <= 1e-5);
309+
}
310+
}
311+
}
312+
259313
void test_memcpy_pitched_q() {
260314
std::cout << __PRETTY_FUNCTION__ << std::endl;
261315

@@ -312,6 +366,6 @@ int main() {
312366

313367
test_global_memory();
314368
test_shared_memory();
315-
369+
test_constant_memory();
316370
return 0;
317371
}

0 commit comments

Comments
 (0)