Skip to content

Commit f207062

Browse files
MrSidimsromanovvlad
authored andcommitted
[SYCL] Implement and use aligned allocator
Alignment is of 64 bytes by default. Buffer is not longer zero-initialized. Signed-off-by: Dmitry Sidorov <[email protected]>
1 parent f308c48 commit f207062

File tree

5 files changed

+145
-49
lines changed

5 files changed

+145
-49
lines changed
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
//==------------ aligned_allocator.hpp - SYCL standard header file ---------==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#pragma once
10+
11+
#include <CL/cl.h>
12+
#include <CL/sycl/detail/cnri.h>
13+
#include <CL/sycl/range.hpp>
14+
15+
#include <cstring>
16+
#include <cstdlib>
17+
#include <memory>
18+
#include <vector>
19+
20+
namespace cl {
21+
namespace sycl {
22+
template <typename T, size_t Alignment>
23+
class aligned_allocator {
24+
public:
25+
using value_type = T;
26+
using pointer = T*;
27+
using const_pointer = const T*;
28+
using reference = T&;
29+
using const_reference = const T&;
30+
31+
public:
32+
template<typename U>
33+
struct rebind {
34+
typedef aligned_allocator<U, Alignment> other;
35+
};
36+
37+
// Construct an object
38+
void construct(pointer Ptr, const_reference Val) {
39+
new (Ptr) value_type(Val);
40+
}
41+
42+
// Destroy an object
43+
void destroy(pointer Ptr) { Ptr->~value_type(); }
44+
45+
pointer address(reference Val) const { return &Val; }
46+
const_pointer address(const_reference Val) { return &Val; }
47+
48+
// Allocate aligned (to Alignment) memory
49+
pointer allocate(size_t Size) {
50+
Size += Alignment - Size % Alignment;
51+
pointer Result = reinterpret_cast<pointer>(
52+
aligned_alloc(Alignment, Size * sizeof(value_type)));
53+
if (!Result)
54+
throw std::bad_alloc();
55+
return Result;
56+
}
57+
58+
// Release allocated memory
59+
void deallocate(pointer Ptr, size_t size) {
60+
if (Ptr)
61+
free(Ptr);
62+
}
63+
64+
bool operator==(const aligned_allocator&) { return true; }
65+
bool operator!=(const aligned_allocator& rhs) { return false; }
66+
};
67+
} // namespace sycl
68+
} // namespace cl

sycl/include/CL/sycl/detail/buffer_impl.hpp

Lines changed: 47 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <CL/sycl/context.hpp>
1313
#include <CL/sycl/detail/common.hpp>
1414
#include <CL/sycl/detail/helpers.hpp>
15+
#include <CL/sycl/detail/aligned_allocator.hpp>
1516
#include <CL/sycl/detail/queue_impl.hpp>
1617
#include <CL/sycl/detail/scheduler/scheduler.h>
1718
#include <CL/sycl/handler.hpp>
@@ -37,7 +38,7 @@ class handler;
3738
class queue;
3839
template <int dimentions> class id;
3940
template <int dimentions> class range;
40-
using buffer_allocator = std::allocator<char>;
41+
using buffer_allocator = aligned_allocator<char, /*alignment*/ 64>;
4142
namespace detail {
4243
template <typename AllocatorT> class buffer_impl {
4344
public:
@@ -48,16 +49,15 @@ template <typename AllocatorT> class buffer_impl {
4849
buffer_impl(void *hostData, const size_t sizeInBytes,
4950
const property_list &propList,
5051
AllocatorT allocator = AllocatorT())
51-
: SizeInBytes(sizeInBytes), Props(propList), MAllocator(allocator) {
52-
if (Props.has_property<property::buffer::use_host_ptr>()) {
53-
BufPtr = hostData;
54-
} else {
55-
BufData.resize(get_size());
56-
BufPtr = reinterpret_cast<void *>(BufData.data());
57-
if (hostData != nullptr) {
58-
auto HostPtr = reinterpret_cast<char *>(hostData);
59-
set_final_data(HostPtr);
60-
std::copy(HostPtr, HostPtr + SizeInBytes, BufData.data());
52+
: SizeInBytes(sizeInBytes), Props(propList), BufPtr(hostData),
53+
MAllocator(allocator) {
54+
if (!Props.has_property<property::buffer::use_host_ptr>()) {
55+
BufPtr = allocateHostMem();
56+
if (hostData) {
57+
set_final_data(reinterpret_cast<char *>(hostData));
58+
std::copy(reinterpret_cast<char *>(hostData),
59+
reinterpret_cast<char *>(hostData) + SizeInBytes,
60+
reinterpret_cast<char *>(BufPtr));
6161
}
6262
}
6363
}
@@ -66,35 +66,33 @@ template <typename AllocatorT> class buffer_impl {
6666
buffer_impl(const void *hostData, const size_t sizeInBytes,
6767
const property_list &propList,
6868
AllocatorT allocator = AllocatorT())
69-
: SizeInBytes(sizeInBytes), Props(propList), MAllocator(allocator) {
70-
if (Props.has_property<property::buffer::use_host_ptr>()) {
69+
: SizeInBytes(sizeInBytes), Props(propList),
70+
BufPtr(const_cast<void *>(hostData)), MAllocator(allocator) {
71+
if (!Props.has_property<property::buffer::use_host_ptr>()) {
7172
// TODO make this buffer read only
72-
BufPtr = const_cast<void *>(hostData);
73-
} else {
74-
BufData.resize(get_size());
75-
BufPtr = reinterpret_cast<void *>(BufData.data());
76-
if (hostData != nullptr) {
77-
std::copy((char *)hostData, (char *)hostData + SizeInBytes,
78-
BufData.data());
79-
}
73+
BufPtr = allocateHostMem();
74+
if (hostData)
75+
std::copy(const_cast<char *>(reinterpret_cast<const char *>(hostData)),
76+
const_cast<char *>(
77+
reinterpret_cast<const char *>(hostData)) + SizeInBytes,
78+
const_cast<char *>(reinterpret_cast<const char *>(BufPtr)));
8079
}
8180
}
8281

8382
template <typename T>
8483
buffer_impl(const shared_ptr_class<T> &hostData, const size_t sizeInBytes,
8584
const property_list &propList,
8685
AllocatorT allocator = AllocatorT())
87-
: SizeInBytes(sizeInBytes), Props(propList), MAllocator(allocator) {
88-
if (Props.has_property<property::buffer::use_host_ptr>()) {
89-
BufPtr = hostData.get();
90-
} else {
91-
BufData.resize(get_size());
92-
BufPtr = reinterpret_cast<void *>(BufData.data());
93-
if (hostData.get() != nullptr) {
86+
: SizeInBytes(sizeInBytes), Props(propList), BufPtr(hostData.get()),
87+
MAllocator(allocator) {
88+
if (!Props.has_property<property::buffer::use_host_ptr>()) {
89+
BufPtr = allocateHostMem();
90+
if (hostData.get()) {
9491
weak_ptr_class<T> hostDataWeak = hostData;
9592
set_final_data(hostDataWeak);
96-
std::copy((char *)hostData.get(), (char *)hostData.get() + SizeInBytes,
97-
BufData.data());
93+
std::copy(reinterpret_cast<char *>(hostData.get()),
94+
reinterpret_cast<char *>(hostData.get()) + SizeInBytes,
95+
reinterpret_cast<char *>(BufPtr));
9896
}
9997
}
10098
}
@@ -120,8 +118,7 @@ template <typename AllocatorT> class buffer_impl {
120118
const size_t sizeInBytes, const property_list &propList,
121119
AllocatorT allocator = AllocatorT())
122120
: SizeInBytes(sizeInBytes), Props(propList), MAllocator(allocator) {
123-
BufData.resize(get_size());
124-
BufPtr = reinterpret_cast<void *>(BufData.data());
121+
BufPtr = allocateHostMem();
125122
// We need cast BufPtr to pointer to the iteration type to get correct
126123
// offset in std::copy when it will increment destination pointer.
127124
auto *Ptr =
@@ -137,8 +134,7 @@ template <typename AllocatorT> class buffer_impl {
137134
const size_t sizeInBytes, const property_list &propList,
138135
AllocatorT allocator = AllocatorT())
139136
: SizeInBytes(sizeInBytes), Props(propList), MAllocator(allocator) {
140-
BufData.resize(get_size());
141-
BufPtr = reinterpret_cast<void *>(BufData.data());
137+
BufPtr = allocateHostMem();
142138
// We need cast BufPtr to pointer to the iteration type to get correct
143139
// offset in std::copy when it will increment destination pointer.
144140
typedef typename std::iterator_traits<InputIterator>::value_type value;
@@ -148,9 +144,10 @@ template <typename AllocatorT> class buffer_impl {
148144
}
149145

150146
buffer_impl(cl_mem MemObject, const context &SyclContext,
151-
const size_t sizeInBytes, event AvailableEvent = {})
147+
const size_t sizeInBytes, event AvailableEvent = {},
148+
AllocatorT allocator = AllocatorT())
152149
: OpenCLInterop(true), SizeInBytes(sizeInBytes),
153-
AvailableEvent(AvailableEvent) {
150+
AvailableEvent(AvailableEvent), MAllocator(allocator) {
154151
if (SyclContext.is_host())
155152
throw cl::sycl::invalid_parameter_error(
156153
"Creation of interoperability buffer using host context is not "
@@ -165,8 +162,7 @@ template <typename AllocatorT> class buffer_impl {
165162
OCLState.Mem = MemObject;
166163
CHECK_OCL_CODE(clRetainMemObject(MemObject));
167164

168-
BufData.resize(get_size());
169-
BufPtr = reinterpret_cast<void *>(BufData.data());
165+
BufPtr = allocateHostMem();
170166
}
171167

172168
size_t get_size() const { return SizeInBytes; }
@@ -184,6 +180,12 @@ template <typename AllocatorT> class buffer_impl {
184180

185181
if (OpenCLInterop)
186182
CHECK_OCL_CODE_NO_EXC(clReleaseMemObject(OCLState.Mem));
183+
184+
if (!Props.has_property<property::buffer::use_host_ptr>()) {
185+
if (BufPtr)
186+
MAllocator.deallocate(reinterpret_cast<
187+
typename AllocatorT::pointer>(BufPtr), SizeInBytes);
188+
}
187189
}
188190

189191
void set_final_data(std::nullptr_t) { uploadData = nullptr; }
@@ -252,6 +254,13 @@ template <typename AllocatorT> class buffer_impl {
252254
accessOffset);
253255
}
254256

257+
inline void *allocateHostMem() {
258+
size_t AllocatorValueSize = sizeof(typename AllocatorT::value_type);
259+
size_t AllocationSize = get_size() / AllocatorValueSize;
260+
AllocationSize += (get_size() % AllocatorValueSize) ? 1 : 0;
261+
return MAllocator.allocate(AllocationSize);
262+
}
263+
255264
template <typename propertyT> bool has_property() const {
256265
return Props.has_property<propertyT>();
257266
}
@@ -311,7 +320,6 @@ template <typename AllocatorT> class buffer_impl {
311320
event AvailableEvent;
312321
cl_context OpenCLContext = nullptr;
313322
void *BufPtr = nullptr;
314-
vector_class<byte> BufData;
315323
// TODO: enable support of cl_mem objects from multiple contexts
316324
// TODO: at the current moment, using a buffer on multiple devices
317325
// or on a device and a host simultaneously is not supported (the

sycl/include/CL/sycl/handler.hpp

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include <CL/sycl/access/access.hpp>
1313
#include <CL/sycl/context.hpp>
1414
#include <CL/sycl/detail/common.hpp>
15+
#include <CL/sycl/detail/aligned_allocator.hpp>
1516
#include <CL/sycl/detail/os_util.hpp>
1617
#include <CL/sycl/detail/scheduler/scheduler.h>
1718
#include <CL/sycl/event.hpp>
@@ -54,6 +55,7 @@ class queue;
5455
template <typename DataT, int Dimensions, access::mode AccessMode,
5556
access::target AccessTarget, access::placeholder IsPlaceholder>
5657
class accessor;
58+
using buffer_allocator = aligned_allocator<char, /*alignment*/ 64>;
5759
template <typename T, int Dimensions, typename AllocatorT> class buffer;
5860
namespace detail {
5961

@@ -587,8 +589,7 @@ class handler {
587589
range<dim> Range =
588590
getAccessorRangeHelper<T_src, dim, mode, tgt,
589591
isPlaceholder>::getAccessorRange(src);
590-
// TODO use buffer_allocator when it is possible
591-
buffer<T_src, dim, std::allocator<char>> Buffer(
592+
buffer<T_src, dim, buffer_allocator> Buffer(
592593
(shared_ptr_class<T_src>)dest, Range,
593594
{property::buffer::use_host_ptr()});
594595
accessor<T_src, dim, access::mode::write, access::target::global_buffer,
@@ -608,8 +609,7 @@ class handler {
608609
range<dim> Range =
609610
getAccessorRangeHelper<T_dest, dim, mode, tgt,
610611
isPlaceholder>::getAccessorRange(dest);
611-
// TODO use buffer_allocator when it is possible
612-
buffer<T_dest, dim, std::allocator<char>> Buffer(
612+
buffer<T_dest, dim, buffer_allocator> Buffer(
613613
(shared_ptr_class<T_dest>)src, Range,
614614
{property::buffer::use_host_ptr()});
615615
accessor<T_dest, dim, access::mode::read, access::target::global_buffer,
@@ -628,8 +628,7 @@ class handler {
628628
range<dim> Range =
629629
getAccessorRangeHelper<T_src, dim, mode, tgt,
630630
isPlaceholder>::getAccessorRange(src);
631-
// TODO use buffer_allocator when it is possible
632-
buffer<T_src, dim, std::allocator<char>> Buffer(
631+
buffer<T_src, dim, buffer_allocator> Buffer(
633632
(T_src *)dest, Range, {property::buffer::use_host_ptr()});
634633
accessor<T_src, dim, access::mode::write, access::target::global_buffer,
635634
access::placeholder::false_t>
@@ -647,8 +646,7 @@ class handler {
647646
range<dim> Range =
648647
getAccessorRangeHelper<T_dest, dim, mode, tgt,
649648
isPlaceholder>::getAccessorRange(dest);
650-
// TODO use buffer_allocator when it is possible
651-
buffer<T_dest, dim, std::allocator<char>> Buffer(
649+
buffer<T_dest, dim, buffer_allocator> Buffer(
652650
(T_dest *)src, Range, {property::buffer::use_host_ptr()});
653651
accessor<T_dest, dim, access::mode::read, access::target::global_buffer,
654652
access::placeholder::false_t>

sycl/test/basic_tests/buffer/buffer.cpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -328,6 +328,12 @@ int main() {
328328
buffer<int, 2> Buffer(range<2>(20, 20));
329329
Buffer.set_final_data((int *)result);
330330
queue myQueue;
331+
myQueue.submit([&](handler &cgh) {
332+
auto B = Buffer.get_access<access::mode::write>(cgh);
333+
cgh.parallel_for<class bufferByRange2Init>(
334+
range<2>{20, 20}, [=](id<2> index) { B[index] = 0; });
335+
});
336+
331337
myQueue.submit([&](handler &cgh) {
332338
auto B = Buffer.get_access<access::mode::write>(cgh);
333339
cgh.parallel_for<class bufferByRange2>(
@@ -362,6 +368,12 @@ int main() {
362368
buffer<int, 2> Buffer(range<2>(20, 20));
363369
Buffer.set_final_data((int *)result);
364370
queue myQueue;
371+
myQueue.submit([&](handler &cgh) {
372+
auto B = Buffer.get_access<access::mode::write>(cgh);
373+
cgh.parallel_for<class bufferByRangeOffsetInit>(
374+
range<2>{20, 20}, [=](id<2> index) { B[index] = 0; });
375+
});
376+
365377
myQueue.submit([&](handler &cgh) {
366378
accessor<int, 2, access::mode::write, access::target::global_buffer,
367379
access::placeholder::false_t>

sycl/test/sub_group/vote.cpp

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,19 +25,29 @@ void check(queue Queue, const int G, const int L, const int D, const int R) {
2525
buffer<int, 1> sganybuf(G);
2626
buffer<int, 1> sgallbuf(G);
2727

28+
// Initialise buffer with zeros
29+
Queue.submit([&](handler &cgh) {
30+
auto sganyacc = sganybuf.get_access<access::mode::read_write>(cgh);
31+
auto sgallacc = sgallbuf.get_access<access::mode::read_write>(cgh);
32+
cgh.parallel_for<class init>(range<1>{(unsigned)G}, [=](id<1> index) {
33+
sganyacc[index] = 0;
34+
sgallacc[index] = 0;
35+
});
36+
});
37+
2838
Queue.submit([&](handler &cgh) {
2939
auto sganyacc = sganybuf.get_access<access::mode::read_write>(cgh);
3040
auto sgallacc = sgallbuf.get_access<access::mode::read_write>(cgh);
3141
cgh.parallel_for<class subgr>(NdRange, [=](nd_item<1> NdItem) {
3242
intel::sub_group SG = NdItem.get_sub_group();
3343
/* Set to 1 if any local ID in subgroup devided by D has remainder R */
3444
if (SG.any(SG.get_local_id().get(0) % D == R)) {
35-
sganyacc[NdItem.get_global_id()]++;
45+
sganyacc[NdItem.get_global_id()] = 1;
3646
}
3747
/* Set to 1 if remainder of division of subgroup local ID by D is less
3848
* than R for all work items in subgroup */
3949
if (SG.all(SG.get_local_id().get(0) % D < R)) {
40-
sgallacc[NdItem.get_global_id()]++;
50+
sgallacc[NdItem.get_global_id()] = 1;
4151
}
4252
});
4353
});

0 commit comments

Comments
 (0)