Skip to content

[SYCL] Use a global flush buffer in stream #1678

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
May 27, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion sycl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ option(SYCL_ADD_DEV_VERSION_POSTFIX "Adds -V postfix to version string" ON)
set(SYCL_MAJOR_VERSION 1)
set(SYCL_MINOR_VERSION 0)
set(SYCL_PATCH_VERSION 0)
set(SYCL_DEV_ABI_VERSION 0)
set(SYCL_DEV_ABI_VERSION 1)
if (SYCL_ADD_DEV_VERSION_POSTFIX)
set(SYCL_VERSION_POSTFIX "-${SYCL_DEV_ABI_VERSION}")
endif()
Expand Down
30 changes: 13 additions & 17 deletions sycl/include/CL/sycl/detail/accessor_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,17 @@ class __SYCL_EXPORT AccessorImplHost {
MElemSize(Other.MElemSize), MOffsetInBytes(Other.MOffsetInBytes),
MIsSubBuffer(Other.MIsSubBuffer) {}

// The resize method provides a way to change the size of the
// allocated memory and corresponding properties for the accessor.
// These are normally fixed for the accessor, but this capability
// is needed to support the stream class.
// Stream implementation creates an accessor with initial size for
// work item. But the number of work items is not available during
// stream construction. The resize method allows to update the accessor
// as the information becomes available to the handler.

void resize(size_t GlobalSize);

id<3> MOffset;
// The size of accessing region.
range<3> MAccessRange;
Expand All @@ -96,6 +107,8 @@ class __SYCL_EXPORT AccessorImplHost {
void *MData = nullptr;

Command *MBlockedCmd = nullptr;

bool PerWI = false;
};

using AccessorImplPtr = shared_ptr_class<AccessorImplHost>;
Expand Down Expand Up @@ -139,23 +152,6 @@ class __SYCL_EXPORT LocalAccessorImplHost {
int MDims;
int MElemSize;
std::vector<char> MMem;

bool PerWI = false;
size_t LocalMemSize;
size_t MaxWGSize;
void resize(size_t LocalSize, size_t GlobalSize) {
if (GlobalSize != 1 && LocalSize != 1) {
// If local size is not specified then work group size is chosen by
// runtime. That is why try to allocate based on max work group size or
// global size. In the worst case allocate 80% of local memory.
size_t MinEstWGSize = LocalSize ? LocalSize : GlobalSize;
MinEstWGSize = MinEstWGSize > MaxWGSize ? MaxWGSize : MinEstWGSize;
size_t NewSize = MinEstWGSize * MSize[0];
MSize[0] =
NewSize > 8 * LocalMemSize / 10 ? 8 * LocalMemSize / 10 : NewSize;
MMem.resize(NewSize * MElemSize);
}
}
};

using LocalAccessorImplPtr = shared_ptr_class<LocalAccessorImplHost>;
Expand Down
2 changes: 2 additions & 0 deletions sycl/include/CL/sycl/detail/buffer_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ class __SYCL_EXPORT buffer_impl final : public SYCLMemObjT {
} catch (...) {
}
}

void resize(size_t size) { BaseT::MSizeInBytes = size; }
};

} // namespace detail
Expand Down
81 changes: 35 additions & 46 deletions sycl/include/CL/sycl/detail/stream_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -90,16 +90,6 @@ class __SYCL_EXPORT stream_impl {
cl::sycl::access::target::global_buffer,
cl::sycl::access::placeholder::false_t>;

using FlushBufAccessorT =
accessor<char, 1, cl::sycl::access::mode::read_write,
cl::sycl::access::target::local,
cl::sycl::access::placeholder::false_t>;

using LocalOffsetAccessorT =
accessor<unsigned, 1, cl::sycl::access::mode::atomic,
cl::sycl::access::target::local,
cl::sycl::access::placeholder::false_t>;

stream_impl(size_t BufferSize, size_t MaxStatementSize, handler &CGH);

// Method to provide an access to the global stream buffer
Expand All @@ -108,6 +98,12 @@ class __SYCL_EXPORT stream_impl {
CGH, range<1>(BufferSize_), id<1>(OffsetSize));
}

// Method to provide an accessor to the global flush buffer
GlobalBufAccessorT accessGlobalFlushBuf(handler &CGH) {
return FlushBuf.get_access<cl::sycl::access::mode::read_write>(
CGH, range<1>(MaxStatementSize_), id<1>(0));
}

// Method to provide an atomic access to the offset in the global stream
// buffer
GlobalOffsetAccessorT accessGlobalOffset(handler &CGH) {
Expand All @@ -117,12 +113,6 @@ class __SYCL_EXPORT stream_impl {
CGH, range<1>(1), id<1>(0));
}

// Method to provide an atomic access to the flush buffer size
GlobalOffsetAccessorT accessFlushBufferSize(handler &CGH) {
return FlushBufferSize.get_access<cl::sycl::access::mode::atomic>(
CGH, range<1>(1), id<1>(0));
}

// Copy stream buffer to the host and print the contents
void flush();

Expand All @@ -149,8 +139,8 @@ class __SYCL_EXPORT stream_impl {
// Stream buffer
buffer<char, 1> Buf;

// Buffer for flush buffer size
buffer<unsigned, 1> FlushBufferSize;
// Global flush buffer
buffer<char, 1> FlushBuf;
};

template <typename T>
Expand Down Expand Up @@ -310,35 +300,34 @@ inline bool updateOffset(stream_impl::GlobalOffsetAccessorT &GlobalOffset,

inline void flushBuffer(stream_impl::GlobalOffsetAccessorT &GlobalOffset,
stream_impl::GlobalBufAccessorT &GlobalBuf,
stream_impl::FlushBufAccessorT &FlushBufs,
stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
unsigned &WIOffset, unsigned &Offset) {
// Copy data from flush buffer (local memory) to global buffer (global
// memory)

unsigned Cur = 0;
if (!updateOffset(GlobalOffset, GlobalBuf, Offset, Cur))
return;

for (unsigned I = WIOffset; I < WIOffset + Offset; I++) {
GlobalBuf[Cur++] = FlushBufs[I];
GlobalBuf[Cur++] = GlobalFlushBuf[I];
}
// Reset the offset in the flush buffer
Offset = 0;
}

inline void write(stream_impl::FlushBufAccessorT &FlushBufs,
inline void write(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
size_t FlushBufferSize, unsigned WIOffset, unsigned &Offset,
const char *Str, unsigned Len, unsigned Padding = 0) {
if ((FlushBufferSize - Offset < Len + Padding) ||
(WIOffset + Offset + Len + Padding > FlushBufs.get_count()))
(WIOffset + Offset + Len + Padding > GlobalFlushBuf.get_count()))
// TODO: flush here
return;

// Write padding
for (size_t I = 0; I < Padding; ++I, ++Offset)
FlushBufs[WIOffset + Offset] = ' ';
GlobalFlushBuf[WIOffset + Offset] = ' ';

for (size_t I = 0; I < Len; ++I, ++Offset) {
FlushBufs[WIOffset + Offset] = Str[I];
GlobalFlushBuf[WIOffset + Offset] = Str[I];
}
}

Expand Down Expand Up @@ -477,25 +466,25 @@ ScalarToStr(const T &Val, char *Buf, unsigned Flags, int Width,

template <typename T>
inline typename std::enable_if<std::is_integral<T>::value>::type
writeIntegral(stream_impl::FlushBufAccessorT &FlushBufs, size_t FlushBufferSize,
unsigned WIOffset, unsigned &Offset, unsigned Flags, int Width,
const T &Val) {
writeIntegral(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
size_t FlushBufferSize, unsigned WIOffset, unsigned &Offset,
unsigned Flags, int Width, const T &Val) {
char Digits[MAX_INTEGRAL_DIGITS] = {0};
unsigned Len = ScalarToStr(Val, Digits, Flags, Width);
write(FlushBufs, FlushBufferSize, WIOffset, Offset, Digits, Len,
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Digits, Len,
(Width > 0 && static_cast<unsigned>(Width) > Len)
? static_cast<unsigned>(Width) - Len
: 0);
}

template <typename T>
inline EnableIfFP<T>
writeFloatingPoint(stream_impl::FlushBufAccessorT &FlushBufs,
writeFloatingPoint(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
size_t FlushBufferSize, unsigned WIOffset, unsigned &Offset,
unsigned Flags, int Width, int Precision, const T &Val) {
char Digits[MAX_FLOATING_POINT_DIGITS] = {0};
unsigned Len = ScalarToStr(Val, Digits, Flags, Width, Precision);
write(FlushBufs, FlushBufferSize, WIOffset, Offset, Digits, Len,
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Digits, Len,
(Width > 0 && static_cast<unsigned>(Width) > Len)
? static_cast<unsigned>(Width) - Len
: 0);
Expand Down Expand Up @@ -533,7 +522,7 @@ VecToStr(const vec<T, VecLength> &Vec, char *VecStr, unsigned Flags, int Width,
}

template <typename T, int VecLength>
inline void writeVec(stream_impl::FlushBufAccessorT &FlushBufs,
inline void writeVec(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
size_t FlushBufferSize, unsigned WIOffset,
unsigned &Offset, unsigned Flags, int Width, int Precision,
const vec<T, VecLength> &Vec) {
Expand All @@ -542,7 +531,7 @@ inline void writeVec(stream_impl::FlushBufAccessorT &FlushBufs,
MAX_FLOATING_POINT_DIGITS * VecLength + (VecLength - 1) * 2;
char VecStr[MAX_VEC_SIZE] = {0};
unsigned Len = VecToStr<T, VecLength>(Vec, VecStr, Flags, Width, Precision);
write(FlushBufs, FlushBufferSize, WIOffset, Offset, VecStr, Len,
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, VecStr, Len,
(Width > 0 && Width > Len) ? Width - Len : 0);
}

Expand All @@ -563,16 +552,16 @@ inline unsigned ArrayToStr(char *Buf, const array<ArrayLength> &Arr) {
}

template <int ArrayLength>
inline void writeArray(stream_impl::FlushBufAccessorT &FlushBufs,
inline void writeArray(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
size_t FlushBufferSize, unsigned WIOffset,
unsigned &Offset, const array<ArrayLength> &Arr) {
char Buf[MAX_ARRAY_SIZE];
unsigned Len = ArrayToStr(Buf, Arr);
write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len);
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
}

template <int Dimensions>
inline void writeItem(stream_impl::FlushBufAccessorT &FlushBufs,
inline void writeItem(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
size_t FlushBufferSize, unsigned WIOffset,
unsigned &Offset, const item<Dimensions> &Item) {
// Reserve space for 3 arrays and additional place (40 symbols) for printing
Expand All @@ -587,11 +576,11 @@ inline void writeItem(stream_impl::FlushBufAccessorT &FlushBufs,
Len += append(Buf + Len, ", offset: ");
Len += ArrayToStr(Buf + Len, Item.get_offset());
Buf[Len++] = ')';
write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len);
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
}

template <int Dimensions>
inline void writeNDRange(stream_impl::FlushBufAccessorT &FlushBufs,
inline void writeNDRange(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
size_t FlushBufferSize, unsigned WIOffset,
unsigned &Offset,
const nd_range<Dimensions> &ND_Range) {
Expand All @@ -607,11 +596,11 @@ inline void writeNDRange(stream_impl::FlushBufAccessorT &FlushBufs,
Len += append(Buf + Len, ", offset: ");
Len += ArrayToStr(Buf + Len, ND_Range.get_offset());
Buf[Len++] = ')';
write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len);
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
}

template <int Dimensions>
inline void writeNDItem(stream_impl::FlushBufAccessorT &FlushBufs,
inline void writeNDItem(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
size_t FlushBufferSize, unsigned WIOffset,
unsigned &Offset, const nd_item<Dimensions> &ND_Item) {
// Reserve space for 2 arrays and additional place (40 symbols) for printing
Expand All @@ -624,11 +613,11 @@ inline void writeNDItem(stream_impl::FlushBufAccessorT &FlushBufs,
Len += append(Buf + Len, ", local_id: ");
Len += ArrayToStr(Buf + Len, ND_Item.get_local_id());
Buf[Len++] = ')';
write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len);
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
}

template <int Dimensions>
inline void writeGroup(stream_impl::FlushBufAccessorT &FlushBufs,
inline void writeGroup(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
size_t FlushBufferSize, unsigned WIOffset,
unsigned &Offset, const group<Dimensions> &Group) {
// Reserve space for 4 arrays and additional place (60 symbols) for printing
Expand All @@ -645,7 +634,7 @@ inline void writeGroup(stream_impl::FlushBufAccessorT &FlushBufs,
Len += append(Buf + Len, ", group_range: ");
Len += ArrayToStr(Buf + Len, Group.get_group_range());
Buf[Len++] = ')';
write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len);
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
}

// Space for 2 arrays and additional place (20 symbols) for printing
Expand All @@ -665,7 +654,7 @@ inline unsigned ItemToStr(char *Buf, const item<Dimensions, false> &Item) {
}

template <int Dimensions>
inline void writeHItem(stream_impl::FlushBufAccessorT &FlushBufs,
inline void writeHItem(stream_impl::GlobalBufAccessorT &GlobalFlushBuf,
size_t FlushBufferSize, unsigned WIOffset,
unsigned &Offset, const h_item<Dimensions> &HItem) {
// Reserve space for 3 items and additional place (60 symbols) for printing
Expand All @@ -682,7 +671,7 @@ inline void writeHItem(stream_impl::FlushBufAccessorT &FlushBufs,
: HItem.get_physical_local());
}
Len += append(Buf + Len, "\n)");
write(FlushBufs, FlushBufferSize, WIOffset, Offset, Buf, Len);
write(GlobalFlushBuf, FlushBufferSize, WIOffset, Offset, Buf, Len);
}

} // namespace detail
Expand Down
Loading