Skip to content

[SYCL] Fix rectangle argument order when passing to PI routines #2608

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Oct 20, 2020
Merged
14 changes: 10 additions & 4 deletions sycl/include/CL/sycl/detail/image_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ class __SYCL_EXPORT image_impl final : public SYCLMemObjT {
size_t WHD[3] = {1, 1, 1}; // Width, Height, Depth.
for (int I = 0; I < Dimensions; I++)
WHD[I] = MRange[I];

MRowPitch = MElementSize * WHD[0];
MSlicePitch = MRowPitch * WHD[1];
BaseT::MSizeInBytes = MSlicePitch * WHD[2];
Expand All @@ -95,6 +96,7 @@ class __SYCL_EXPORT image_impl final : public SYCLMemObjT {
// NumSlices is depth when dim==3, and height when dim==2.
size_t NumSlices =
(Dimensions == 3) ? MRange[2] : MRange[1]; // Dimensions will be 2/3.

BaseT::MSizeInBytes = MSlicePitch * NumSlices;
}

Expand Down Expand Up @@ -245,15 +247,19 @@ class __SYCL_EXPORT image_impl final : public SYCLMemObjT {
RT::PiMemImageDesc getImageDesc(bool InitFromHostPtr) {
RT::PiMemImageDesc Desc;
Desc.image_type = getImageType();
Desc.image_width = MRange[0];
Desc.image_height = Dimensions > 1 ? MRange[1] : 1;
Desc.image_depth = Dimensions > 2 ? MRange[2] : 1;

// MRange<> is [width], [width,height], or [width,height,depth] (which
// is different than MAccessRange, etc in bufffers)
static constexpr int XTermPos = 0, YTermPos = 1, ZTermPos = 2;
Desc.image_width = MRange[XTermPos];
Desc.image_height = Dimensions > 1 ? MRange[YTermPos] : 1;
Desc.image_depth = Dimensions > 2 ? MRange[ZTermPos] : 1;

// TODO handle cases with IMAGE1D_ARRAY and IMAGE2D_ARRAY
Desc.image_array_size = 0;
// Pitches must be 0 if host ptr is not provided.
Desc.image_row_pitch = InitFromHostPtr ? MRowPitch : 0;
Desc.image_slice_pitch = InitFromHostPtr ? MSlicePitch : 0;

Desc.num_mip_levels = 0;
Desc.num_samples = 0;
Desc.buffer = nullptr;
Expand Down
10 changes: 10 additions & 0 deletions sycl/include/CL/sycl/detail/pi.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -206,6 +206,16 @@ template <> inline void print<>(pi_image_offset off) {
<< off->z << std::endl;
}

template <> inline void print<>(const pi_image_desc *desc) {
std::cout << "image_desc w/h/d : " << desc->image_width << " / "
<< desc->image_height << " / " << desc->image_depth
<< " -- arrSz/row/slice : " << desc->image_array_size << " / "
<< desc->image_row_pitch << " / " << desc->image_slice_pitch
<< " -- num_mip_lvls/num_smpls/image_type : "
<< desc->num_mip_levels << " / " << desc->num_samples << " / "
<< desc->image_type << std::endl;
}

template <> inline void print<>(PiResult val) {
std::cout << "pi_result : ";
if (val == PI_SUCCESS)
Expand Down
12 changes: 6 additions & 6 deletions sycl/include/CL/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -462,16 +462,16 @@ class __SYCL_EXPORT handler {
}

static id<2> getDelinearizedIndex(const range<2> Range, const size_t Index) {
size_t x = Index / Range[1];
size_t y = Index % Range[1];
return {x, y};
size_t x = Index % Range[1];
size_t y = Index / Range[1];
return {y, x};
}

static id<3> getDelinearizedIndex(const range<3> Range, const size_t Index) {
size_t x = Index / (Range[1] * Range[2]);
size_t z = Index / (Range[1] * Range[2]);
size_t y = (Index / Range[2]) % Range[1];
size_t z = Index % Range[2];
return {x, y, z};
size_t x = Index % Range[2];
return {z, y, x};
}

/// Stores lambda to the template-free object
Expand Down
235 changes: 154 additions & 81 deletions sycl/source/detail/memory_manager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,36 @@ void *MemoryManager::allocateMemSubBuffer(ContextImplPtr TargetContext,
return NewMem;
}

struct TermPositions {
int XTerm;
int YTerm;
int ZTerm;
};
void prepTermPositions(TermPositions &pos, int Dimensions,
detail::SYCLMemObjI::MemObjType Type) {
// For buffers, the offsets/ranges coming from accessor are always
// id<3>/range<3> But their organization varies by dimension:
// 1 ==> {width, 1, 1}
// 2 ==> {height, width, 1}
// 3 ==> {depth, height, width}
// Some callers schedule 0 as DimDst/DimSrc.

if (Type == detail::SYCLMemObjI::MemObjType::BUFFER) {
if (Dimensions == 3) {
pos.XTerm = 2, pos.YTerm = 1, pos.ZTerm = 0;
} else if (Dimensions == 2) {
pos.XTerm = 1, pos.YTerm = 0, pos.ZTerm = 2;
} else { // Dimension is 1 or 0
pos.XTerm = 0, pos.YTerm = 1, pos.ZTerm = 2;
}
} else { // While range<>/id<> use by images is different than buffers, it's
// consistent with their accessors.
pos.XTerm = 0;
pos.YTerm = 1;
pos.ZTerm = 2;
}
}

void copyH2D(SYCLMemObjI *SYCLMemObj, char *SrcMem, QueueImplPtr,
unsigned int DimSrc, sycl::range<3> SrcSize,
sycl::range<3> SrcAccessRange, sycl::id<3> SrcOffset,
Expand All @@ -255,34 +285,40 @@ void copyH2D(SYCLMemObjI *SYCLMemObj, char *SrcMem, QueueImplPtr,
assert(SYCLMemObj && "The SYCLMemObj is nullptr");

const RT::PiQueue Queue = TgtQueue->getHandleRef();
// Adjust first dimension of copy range and offset as OpenCL expects size in
// bytes.
DstSize[0] *= DstElemSize;
const detail::plugin &Plugin = TgtQueue->getPlugin();
if (SYCLMemObj->getType() == detail::SYCLMemObjI::MemObjType::BUFFER) {
DstOffset[0] *= DstElemSize;
SrcOffset[0] *= SrcElemSize;
SrcAccessRange[0] *= SrcElemSize;
DstAccessRange[0] *= DstElemSize;
SrcSize[0] *= SrcElemSize;

detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType();
TermPositions SrcPos, DstPos;
prepTermPositions(SrcPos, DimSrc, MemType);
prepTermPositions(DstPos, DimDst, MemType);

size_t DstXOffBytes = DstOffset[DstPos.XTerm] * DstElemSize;
size_t SrcXOffBytes = SrcOffset[SrcPos.XTerm] * SrcElemSize;
size_t DstAccessRangeWidthBytes = DstAccessRange[DstPos.XTerm] * DstElemSize;
size_t DstSzWidthBytes = DstSize[DstPos.XTerm] * DstElemSize;
size_t SrcSzWidthBytes = SrcSize[SrcPos.XTerm] * SrcElemSize;

if (MemType == detail::SYCLMemObjI::MemObjType::BUFFER) {
if (1 == DimDst && 1 == DimSrc) {
Plugin.call<PiApiKind::piEnqueueMemBufferWrite>(
Queue, DstMem,
/*blocking_write=*/CL_FALSE, DstOffset[0], DstAccessRange[0],
SrcMem + SrcOffset[0], DepEvents.size(), DepEvents.data(), &OutEvent);
/*blocking_write=*/CL_FALSE, DstXOffBytes, DstAccessRangeWidthBytes,
SrcMem + SrcXOffBytes, DepEvents.size(), DepEvents.data(), &OutEvent);
} else {
size_t BufferRowPitch = (1 == DimDst) ? 0 : DstSize[0];
size_t BufferSlicePitch = (3 == DimDst) ? DstSize[0] * DstSize[1] : 0;
size_t HostRowPitch = (1 == DimSrc) ? 0 : SrcSize[0];
size_t HostSlicePitch = (3 == DimSrc) ? SrcSize[0] * SrcSize[1] : 0;

pi_buff_rect_offset_struct BufferOffset{DstOffset[0], DstOffset[1],
DstOffset[2]};
pi_buff_rect_offset_struct HostOffset{SrcOffset[0], SrcOffset[1],
SrcOffset[2]};
pi_buff_rect_region_struct RectRegion{
DstAccessRange[0], DstAccessRange[1], DstAccessRange[2]};
size_t BufferRowPitch = (1 == DimDst) ? 0 : DstSzWidthBytes;
size_t BufferSlicePitch =
(3 == DimDst) ? DstSzWidthBytes * DstSize[DstPos.YTerm] : 0;
size_t HostRowPitch = (1 == DimSrc) ? 0 : SrcSzWidthBytes;
size_t HostSlicePitch =
(3 == DimSrc) ? SrcSzWidthBytes * SrcSize[SrcPos.YTerm] : 0;

pi_buff_rect_offset_struct BufferOffset{
DstXOffBytes, DstOffset[DstPos.YTerm], DstOffset[DstPos.ZTerm]};
pi_buff_rect_offset_struct HostOffset{
SrcXOffBytes, SrcOffset[SrcPos.YTerm], SrcOffset[SrcPos.ZTerm]};
pi_buff_rect_region_struct RectRegion{DstAccessRangeWidthBytes,
DstAccessRange[DstPos.YTerm],
DstAccessRange[DstPos.ZTerm]};

Plugin.call<PiApiKind::piEnqueueMemBufferWriteRect>(
Queue, DstMem,
Expand All @@ -291,12 +327,16 @@ void copyH2D(SYCLMemObjI *SYCLMemObj, char *SrcMem, QueueImplPtr,
SrcMem, DepEvents.size(), DepEvents.data(), &OutEvent);
}
} else {
size_t InputRowPitch = (1 == DimDst) ? 0 : DstSize[0];
size_t InputSlicePitch = (3 == DimDst) ? DstSize[0] * DstSize[1] : 0;
size_t InputRowPitch = (1 == DimDst) ? 0 : DstSzWidthBytes;
size_t InputSlicePitch =
(3 == DimDst) ? DstSzWidthBytes * DstSize[DstPos.YTerm] : 0;

pi_image_offset_struct Origin{DstOffset[0], DstOffset[1], DstOffset[2]};
pi_image_region_struct Region{DstAccessRange[0], DstAccessRange[1],
DstAccessRange[2]};
pi_image_offset_struct Origin{DstOffset[DstPos.XTerm],
DstOffset[DstPos.YTerm],
DstOffset[DstPos.ZTerm]};
pi_image_region_struct Region{DstAccessRange[DstPos.XTerm],
DstAccessRange[DstPos.YTerm],
DstAccessRange[DstPos.ZTerm]};

Plugin.call<PiApiKind::piEnqueueMemImageWrite>(
Queue, DstMem,
Expand All @@ -316,34 +356,46 @@ void copyD2H(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue,
assert(SYCLMemObj && "The SYCLMemObj is nullptr");

const RT::PiQueue Queue = SrcQueue->getHandleRef();
// Adjust sizes of 1 dimensions as OpenCL expects size in bytes.
SrcSize[0] *= SrcElemSize;
const detail::plugin &Plugin = SrcQueue->getPlugin();
if (SYCLMemObj->getType() == detail::SYCLMemObjI::MemObjType::BUFFER) {
DstOffset[0] *= DstElemSize;
SrcOffset[0] *= SrcElemSize;
SrcAccessRange[0] *= SrcElemSize;
DstAccessRange[0] *= DstElemSize;
DstSize[0] *= DstElemSize;

detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType();
TermPositions SrcPos, DstPos;
prepTermPositions(SrcPos, DimSrc, MemType);
prepTermPositions(DstPos, DimDst, MemType);

// For a given buffer, the various mem copy routines (copyD2H, copyH2D,
// copyD2D) will usually have the same values for AccessRange, Size,
// Dimension, Offset, etc. EXCEPT when the dtor for ~SYCLMemObjT is called.
// Essentially, it schedules a copyBack of chars thus in copyD2H the
// Dimension will then be 1 and DstAccessRange[0] and DstSize[0] will be
// sized to bytes with a DstElemSize of 1.
size_t DstXOffBytes = DstOffset[DstPos.XTerm] * DstElemSize;
size_t SrcXOffBytes = SrcOffset[SrcPos.XTerm] * SrcElemSize;
size_t SrcAccessRangeWidthBytes = SrcAccessRange[SrcPos.XTerm] * SrcElemSize;
size_t DstSzWidthBytes = DstSize[DstPos.XTerm] * DstElemSize;
size_t SrcSzWidthBytes = SrcSize[SrcPos.XTerm] * SrcElemSize;

if (MemType == detail::SYCLMemObjI::MemObjType::BUFFER) {
if (1 == DimDst && 1 == DimSrc) {
Plugin.call<PiApiKind::piEnqueueMemBufferRead>(
Queue, SrcMem,
/*blocking_read=*/CL_FALSE, SrcOffset[0], SrcAccessRange[0],
DstMem + DstOffset[0], DepEvents.size(), DepEvents.data(), &OutEvent);
/*blocking_read=*/CL_FALSE, SrcXOffBytes, SrcAccessRangeWidthBytes,
DstMem + DstXOffBytes, DepEvents.size(), DepEvents.data(), &OutEvent);
} else {
size_t BufferRowPitch = (1 == DimSrc) ? 0 : SrcSize[0];
size_t BufferSlicePitch = (3 == DimSrc) ? SrcSize[0] * SrcSize[1] : 0;

size_t HostRowPitch = (1 == DimDst) ? 0 : DstSize[0];
size_t HostSlicePitch = (3 == DimDst) ? DstSize[0] * DstSize[1] : 0;

pi_buff_rect_offset_struct BufferOffset{SrcOffset[0], SrcOffset[1],
SrcOffset[2]};
pi_buff_rect_offset_struct HostOffset{DstOffset[0], DstOffset[1],
DstOffset[2]};
pi_buff_rect_region_struct RectRegion{
SrcAccessRange[0], SrcAccessRange[1], SrcAccessRange[2]};
size_t BufferRowPitch = (1 == DimSrc) ? 0 : SrcSzWidthBytes;
size_t BufferSlicePitch =
(3 == DimSrc) ? SrcSzWidthBytes * SrcSize[SrcPos.YTerm] : 0;
size_t HostRowPitch = (1 == DimDst) ? 0 : DstSzWidthBytes;
size_t HostSlicePitch =
(3 == DimDst) ? DstSzWidthBytes * DstSize[DstPos.YTerm] : 0;

pi_buff_rect_offset_struct BufferOffset{
SrcXOffBytes, SrcOffset[SrcPos.YTerm], SrcOffset[SrcPos.ZTerm]};
pi_buff_rect_offset_struct HostOffset{
DstXOffBytes, DstOffset[DstPos.YTerm], DstOffset[DstPos.ZTerm]};
pi_buff_rect_region_struct RectRegion{SrcAccessRangeWidthBytes,
SrcAccessRange[SrcPos.YTerm],
SrcAccessRange[SrcPos.ZTerm]};

Plugin.call<PiApiKind::piEnqueueMemBufferReadRect>(
Queue, SrcMem,
Expand All @@ -352,12 +404,16 @@ void copyD2H(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue,
DstMem, DepEvents.size(), DepEvents.data(), &OutEvent);
}
} else {
size_t RowPitch = (1 == DimSrc) ? 0 : SrcSize[0];
size_t SlicePitch = (3 == DimSrc) ? SrcSize[0] * SrcSize[1] : 0;
size_t RowPitch = (1 == DimSrc) ? 0 : SrcSzWidthBytes;
size_t SlicePitch =
(3 == DimSrc) ? SrcSzWidthBytes * SrcSize[SrcPos.YTerm] : 0;

pi_image_offset_struct Offset{SrcOffset[0], SrcOffset[1], SrcOffset[2]};
pi_image_region_struct Region{SrcAccessRange[0], SrcAccessRange[1],
SrcAccessRange[2]};
pi_image_offset_struct Offset{SrcOffset[SrcPos.XTerm],
SrcOffset[SrcPos.YTerm],
SrcOffset[SrcPos.ZTerm]};
pi_image_region_struct Region{SrcAccessRange[SrcPos.XTerm],
SrcAccessRange[SrcPos.YTerm],
SrcAccessRange[SrcPos.ZTerm]};

Plugin.call<PiApiKind::piEnqueueMemImageRead>(
Queue, SrcMem, CL_FALSE, &Offset, &Region, RowPitch, SlicePitch, DstMem,
Expand All @@ -376,43 +432,60 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, RT::PiMem SrcMem, QueueImplPtr SrcQueue,

const RT::PiQueue Queue = SrcQueue->getHandleRef();
const detail::plugin &Plugin = SrcQueue->getPlugin();
if (SYCLMemObj->getType() == detail::SYCLMemObjI::MemObjType::BUFFER) {
// Adjust sizes of 1 dimensions as OpenCL expects size in bytes.
DstOffset[0] *= DstElemSize;
SrcOffset[0] *= SrcElemSize;
SrcAccessRange[0] *= SrcElemSize;
SrcSize[0] *= SrcElemSize;
DstSize[0] *= DstElemSize;

detail::SYCLMemObjI::MemObjType MemType = SYCLMemObj->getType();
TermPositions SrcPos, DstPos;
prepTermPositions(SrcPos, DimSrc, MemType);
prepTermPositions(DstPos, DimDst, MemType);

size_t DstXOffBytes = DstOffset[DstPos.XTerm] * DstElemSize;
size_t SrcXOffBytes = SrcOffset[SrcPos.XTerm] * SrcElemSize;
size_t SrcAccessRangeWidthBytes = SrcAccessRange[SrcPos.XTerm] * SrcElemSize;
size_t DstSzWidthBytes = DstSize[DstPos.XTerm] * DstElemSize;
size_t SrcSzWidthBytes = SrcSize[SrcPos.XTerm] * SrcElemSize;

if (MemType == detail::SYCLMemObjI::MemObjType::BUFFER) {
if (1 == DimDst && 1 == DimSrc) {
Plugin.call<PiApiKind::piEnqueueMemBufferCopy>(
Queue, SrcMem, DstMem, SrcOffset[0], DstOffset[0], SrcAccessRange[0],
DepEvents.size(), DepEvents.data(), &OutEvent);
Queue, SrcMem, DstMem, SrcXOffBytes, DstXOffBytes,
SrcAccessRangeWidthBytes, DepEvents.size(), DepEvents.data(),
&OutEvent);
} else {
size_t SrcRowPitch = (1 == DimSrc) ? 0 : SrcSize[0];
size_t SrcSlicePitch =
(DimSrc > 1) ? SrcSize[0] * SrcSize[1] : SrcSize[0];

size_t DstRowPitch = (1 == DimDst) ? 0 : DstSize[0];
size_t DstSlicePitch =
(DimDst > 1) ? DstSize[0] * DstSize[1] : DstSize[0];

pi_buff_rect_offset_struct SrcOrigin{SrcOffset[0], SrcOffset[1],
SrcOffset[2]};
pi_buff_rect_offset_struct DstOrigin{DstOffset[0], DstOffset[1],
DstOffset[2]};
pi_buff_rect_region_struct Region{SrcAccessRange[0], SrcAccessRange[1],
SrcAccessRange[2]};
// passing 0 for pitches not allowed. Because clEnqueueCopyBufferRect will
// calculate both src and dest pitch using region[0], which is not correct
// if src and dest are not the same size.
size_t SrcRowPitch = SrcSzWidthBytes;
size_t SrcSlicePitch = (DimSrc <= 1)
? SrcSzWidthBytes
: SrcSzWidthBytes * SrcSize[SrcPos.YTerm];
size_t DstRowPitch = DstSzWidthBytes;
size_t DstSlicePitch = (DimDst <= 1)
? DstSzWidthBytes
: DstSzWidthBytes * DstSize[DstPos.YTerm];

pi_buff_rect_offset_struct SrcOrigin{
SrcXOffBytes, SrcOffset[SrcPos.YTerm], SrcOffset[SrcPos.ZTerm]};
pi_buff_rect_offset_struct DstOrigin{
DstXOffBytes, DstOffset[DstPos.YTerm], DstOffset[DstPos.ZTerm]};
pi_buff_rect_region_struct Region{SrcAccessRangeWidthBytes,
SrcAccessRange[SrcPos.YTerm],
SrcAccessRange[SrcPos.ZTerm]};

Plugin.call<PiApiKind::piEnqueueMemBufferCopyRect>(
Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region, SrcRowPitch,
SrcSlicePitch, DstRowPitch, DstSlicePitch, DepEvents.size(),
DepEvents.data(), &OutEvent);
}
} else {
pi_image_offset_struct SrcOrigin{SrcOffset[0], SrcOffset[1], SrcOffset[2]};
pi_image_offset_struct DstOrigin{DstOffset[0], DstOffset[1], DstOffset[2]};
pi_image_region_struct Region{SrcAccessRange[0], SrcAccessRange[1],
SrcAccessRange[2]};
pi_image_offset_struct SrcOrigin{SrcOffset[SrcPos.XTerm],
SrcOffset[SrcPos.YTerm],
SrcOffset[SrcPos.ZTerm]};
pi_image_offset_struct DstOrigin{DstOffset[DstPos.XTerm],
DstOffset[DstPos.YTerm],
DstOffset[DstPos.ZTerm]};
pi_image_region_struct Region{SrcAccessRange[SrcPos.XTerm],
SrcAccessRange[SrcPos.YTerm],
SrcAccessRange[SrcPos.ZTerm]};

Plugin.call<PiApiKind::piEnqueueMemImageCopy>(
Queue, SrcMem, DstMem, &SrcOrigin, &DstOrigin, &Region,
Expand Down
Loading