Skip to content

Commit 1e2e6ba

Browse files
[SYCL][Bindless] Add image_mem_handle to image_mem_handle devices copies. (#12449)
Add support for device to device copy for `image_mem_handle` to `image_mem_handle`. --------- Co-authored-by: Przemek Malon <[email protected]>
1 parent 269a0af commit 1e2e6ba

File tree

9 files changed

+294
-4
lines changed

9 files changed

+294
-4
lines changed

sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc

Lines changed: 27 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -810,6 +810,12 @@ public:
810810
size_t DeviceRowPitch,
811811
sycl::range<3> HostExtent,
812812
sycl::range<3> CopyExtent);
813+
814+
// Simple device to device copy
815+
void ext_oneapi_copy(
816+
ext::oneapi::experimental::image_mem_handle Src,
817+
ext::oneapi::experimental::image_mem_handle Dest,
818+
const ext::oneapi::experimental::image_descriptor &ImageDesc);
813819
};
814820

815821
class queue {
@@ -935,14 +941,32 @@ public:
935941
size_t DeviceRowPitch,
936942
sycl::range<3> HostExtent,
937943
sycl::range<3> CopyExtent);
944+
945+
// Simple device to device copy
946+
event ext_oneapi_copy(
947+
ext::oneapi::experimental::image_mem_handle Src,
948+
ext::oneapi::experimental::image_mem_handle Dest,
949+
const ext::oneapi::experimental::image_descriptor &ImageDesc);
950+
event ext_oneapi_copy(
951+
ext::oneapi::experimental::image_mem_handle Src,
952+
ext::oneapi::experimental::image_mem_handle Dest,
953+
const ext::oneapi::experimental::image_descriptor &ImageDesc,
954+
event DepEvent);
955+
event ext_oneapi_copy(
956+
ext::oneapi::experimental::image_mem_handle Src,
957+
ext::oneapi::experimental::image_mem_handle Dest,
958+
const ext::oneapi::experimental::image_descriptor &ImageDesc,
959+
const std::vector<event> &DepEvents);
938960
};
939961
}
940962
```
941963

942964
To enable the copying of images an `ext_oneapi_copy` function is proposed as a
943965
method of the queue and handler. It can be used to copy image memory, whether
944966
allocated through USM or using an `image_mem_handle`, from host to
945-
device, or device to host. For the `ext_oneapi_copy` variants that do not take
967+
device, or device to host. Device to device copies are currently supported only
968+
through `image_mem_handle` allocations.
969+
For the `ext_oneapi_copy` variants that do not take
946970
offsets and extents, the image descriptor passed to the `ext_oneapi_copy` API
947971
is used to determine the pixel size, dimensions, and extent in memory of the
948972
image to copy. If performing sub-region copy, the size of the memory region is
@@ -2588,6 +2612,8 @@ These features still need to be handled:
25882612
wording around what types are allowed to be read or written.
25892613
- Allow `read_image` and `read_mipmap` to return a
25902614
user-defined type.
2615+
|5.1|2024-01-17| - Added overload for `ext_oneapi_copy` enabling device to device
2616+
copies using `image_mem_handle`.
25912617
|5.1|2023-12-06| - Added unique addressing modes per dimension to the
25922618
`bindless_image_sampler`
25932619
|5.2|2024-02-14| - Image read and write functions now accept 3-component

sycl/include/sycl/handler.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3225,6 +3225,18 @@ class __SYCL_EXPORT handler {
32253225
const ext::oneapi::experimental::image_descriptor &DeviceImgDesc,
32263226
size_t DeviceRowPitch);
32273227

3228+
/// Copies data from device to device memory, where \p Src and \p Dest
3229+
/// are opaque image memory handles.
3230+
/// An exception is thrown if either \p Src or \p Dest is incomplete
3231+
///
3232+
/// \param Src is an opaque image memory handle to the source memory.
3233+
/// \param Dest is an opaque image memory handle to the destination memory.
3234+
/// \param ImageDesc is the source image descriptor
3235+
void
3236+
ext_oneapi_copy(ext::oneapi::experimental::image_mem_handle Src,
3237+
ext::oneapi::experimental::image_mem_handle Dest,
3238+
const ext::oneapi::experimental::image_descriptor &ImageDesc);
3239+
32283240
/// Copies data from one memory region to another, where \p Src and \p Dest
32293241
/// are USM pointers. Allows for a sub-region copy, where \p SrcOffset ,
32303242
/// \p DestOffset , and \p Extent are used to determine the sub-region.

sycl/include/sycl/queue.hpp

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1838,6 +1838,74 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
18381838
CodeLoc);
18391839
}
18401840

1841+
/// Copies data from device to device memory, where \p Src and \p Dest
1842+
/// are opaque image memory handles.
1843+
/// An exception is thrown if either \p Src or \p Dest is incomplete
1844+
///
1845+
/// \param Src is an opaque image memory handle to the source memory.
1846+
/// \param Dest is an opaque image memory handle to the destination memory.
1847+
/// \param ImageDesc is the source image descriptor
1848+
/// \param DepEvent is an events that specifies the kernel dependency.
1849+
/// \return an event representing the copy operation.
1850+
event ext_oneapi_copy(
1851+
ext::oneapi::experimental::image_mem_handle Src,
1852+
ext::oneapi::experimental::image_mem_handle Dest,
1853+
const ext::oneapi::experimental::image_descriptor &ImageDesc,
1854+
event DepEvent,
1855+
const detail::code_location &CodeLoc = detail::code_location::current()) {
1856+
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
1857+
return submit(
1858+
[&](handler &CGH) {
1859+
CGH.depends_on(DepEvent);
1860+
CGH.ext_oneapi_copy(Src, Dest, ImageDesc);
1861+
},
1862+
CodeLoc);
1863+
}
1864+
1865+
/// Copies data from device to device memory, where \p Src and \p Dest
1866+
/// are opaque image memory handles.
1867+
/// An exception is thrown if either \p Src or \p Dest is incomplete
1868+
///
1869+
/// \param Src is an opaque image memory handle to the source memory.
1870+
/// \param Dest is an opaque image memory handle to the destination memory.
1871+
/// \param ImageDesc is the source image descriptor
1872+
/// \param DepEvents is a vector of events that specifies the kernel
1873+
/// dependencies.
1874+
/// \return an event representing the copy operation.
1875+
event ext_oneapi_copy(
1876+
ext::oneapi::experimental::image_mem_handle Src,
1877+
ext::oneapi::experimental::image_mem_handle Dest,
1878+
const ext::oneapi::experimental::image_descriptor &ImageDesc,
1879+
const std::vector<event> &DepEvents,
1880+
const detail::code_location &CodeLoc = detail::code_location::current()) {
1881+
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
1882+
return submit(
1883+
[&](handler &CGH) {
1884+
CGH.depends_on(DepEvents);
1885+
CGH.ext_oneapi_copy(Src, Dest, ImageDesc);
1886+
},
1887+
CodeLoc);
1888+
}
1889+
1890+
/// Copies data from device to device memory, where \p Src and \p Dest
1891+
/// are opaque image memory handles.
1892+
/// An exception is thrown if either \p Src or \p Dest is incomplete
1893+
///
1894+
/// \param Src is an opaque image memory handle to the source memory.
1895+
/// \param Dest is an opaque image memory handle to the destination memory.
1896+
/// \param ImageDesc is the source image descriptor
1897+
/// \return an event representing the copy operation.
1898+
event ext_oneapi_copy(
1899+
ext::oneapi::experimental::image_mem_handle Src,
1900+
ext::oneapi::experimental::image_mem_handle Dest,
1901+
const ext::oneapi::experimental::image_descriptor &ImageDesc,
1902+
const detail::code_location &CodeLoc = detail::code_location::current()) {
1903+
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
1904+
return submit(
1905+
[&](handler &CGH) { CGH.ext_oneapi_copy(Src, Dest, ImageDesc); },
1906+
CodeLoc);
1907+
}
1908+
18411909
/// Copies data from one memory region to another, where \p Src and \p Dest
18421910
/// are USM pointers. Allows for a sub-region copy, where \p SrcOffset ,
18431911
/// \p DestOffset , and \p Extent are used to determine the sub-region.

sycl/plugins/unified_runtime/CMakeLists.txt

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -114,8 +114,15 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
114114
)
115115

116116
fetch_adapter_source(cuda
117-
${UNIFIED_RUNTIME_REPO}
118-
${UNIFIED_RUNTIME_TAG}
117+
"https://github.com/oneapi-src/unified-runtime.git"
118+
# commit 7fcfe3ad8882fee23d83fa0fc4c4c944262a9ea3
119+
# Merge: b37fa2c4 f9fb1167
120+
# Author: Kenneth Benzie (Benie) <[email protected]>
121+
# Date: Wed Apr 24 10:38:00 2024 +0100
122+
# Merge pull request #1265 from cppchedy/chedy/device-to-device-copy
123+
#
124+
# [Bindless][Exp] Add support for device to device copies between CuArrays
125+
7fcfe3ad8882fee23d83fa0fc4c4c944262a9ea3
119126
)
120127

121128
fetch_adapter_source(hip

sycl/source/detail/memory_manager.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1776,7 +1776,9 @@ void MemoryManager::copy_image_bindless(
17761776
assert((Flags == (sycl::detail::pi::PiImageCopyFlags)
17771777
ext::oneapi::experimental::image_copy_flags::HtoD ||
17781778
Flags == (sycl::detail::pi::PiImageCopyFlags)
1779-
ext::oneapi::experimental::image_copy_flags::DtoH) &&
1779+
ext::oneapi::experimental::image_copy_flags::DtoH ||
1780+
Flags == (sycl::detail::pi::PiImageCopyFlags)
1781+
ext::oneapi::experimental::image_copy_flags::DtoD) &&
17801782
"Invalid flags passed to copy_image_bindless.");
17811783
if (!Dst || !Src)
17821784
throw sycl::exception(

sycl/source/handler.cpp

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1167,6 +1167,57 @@ void handler::ext_oneapi_copy(
11671167
setType(detail::CG::CopyImage);
11681168
}
11691169

1170+
void handler::ext_oneapi_copy(
1171+
ext::oneapi::experimental::image_mem_handle Src,
1172+
ext::oneapi::experimental::image_mem_handle Dest,
1173+
const ext::oneapi::experimental::image_descriptor &ImageDesc) {
1174+
throwIfGraphAssociated<
1175+
ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
1176+
sycl_ext_oneapi_bindless_images>();
1177+
ImageDesc.verify();
1178+
1179+
MSrcPtr = Src.raw_handle;
1180+
MDstPtr = Dest.raw_handle;
1181+
1182+
sycl::detail::pi::PiMemImageDesc PiDesc = {};
1183+
PiDesc.image_width = ImageDesc.width;
1184+
PiDesc.image_height = ImageDesc.height;
1185+
PiDesc.image_depth = ImageDesc.depth;
1186+
PiDesc.image_array_size = ImageDesc.array_size;
1187+
if (ImageDesc.array_size > 1) {
1188+
// Image Array.
1189+
PiDesc.image_type = ImageDesc.height > 0 ? PI_MEM_TYPE_IMAGE2D_ARRAY
1190+
: PI_MEM_TYPE_IMAGE1D_ARRAY;
1191+
1192+
// Cubemap.
1193+
PiDesc.image_type =
1194+
ImageDesc.type == sycl::ext::oneapi::experimental::image_type::cubemap
1195+
? PI_MEM_TYPE_IMAGE_CUBEMAP
1196+
: PiDesc.image_type;
1197+
} else {
1198+
PiDesc.image_type = ImageDesc.depth > 0
1199+
? PI_MEM_TYPE_IMAGE3D
1200+
: (ImageDesc.height > 0 ? PI_MEM_TYPE_IMAGE2D
1201+
: PI_MEM_TYPE_IMAGE1D);
1202+
}
1203+
1204+
sycl::detail::pi::PiMemImageFormat PiFormat;
1205+
PiFormat.image_channel_data_type =
1206+
sycl::_V1::detail::convertChannelType(ImageDesc.channel_type);
1207+
PiFormat.image_channel_order =
1208+
sycl::_V1::detail::convertChannelOrder(ImageDesc.channel_order);
1209+
1210+
MImpl->MSrcOffset = {0, 0, 0};
1211+
MImpl->MDestOffset = {0, 0, 0};
1212+
MImpl->MCopyExtent = {ImageDesc.width, ImageDesc.height, ImageDesc.depth};
1213+
MImpl->MHostExtent = {ImageDesc.width, ImageDesc.height, ImageDesc.depth};
1214+
MImpl->MImageDesc = PiDesc;
1215+
MImpl->MImageFormat = PiFormat;
1216+
MImpl->MImageCopyFlags =
1217+
sycl::detail::pi::PiImageCopyFlags::PI_IMAGE_COPY_DEVICE_TO_DEVICE;
1218+
setType(detail::CG::CopyImage);
1219+
}
1220+
11701221
void handler::ext_oneapi_copy(
11711222
ext::oneapi::experimental::image_mem_handle Src, sycl::range<3> SrcOffset,
11721223
const ext::oneapi::experimental::image_descriptor &SrcImgDesc, void *Dest,
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
// REQUIRES: linux
2+
// REQUIRES: cuda
3+
4+
// RUN: %{build} -o %t.out
5+
// RUN: %{run} %t.out
6+
7+
#include <iostream>
8+
#include <sycl/sycl.hpp>
9+
10+
// Uncomment to print additional test information
11+
// #define VERBOSE_PRINT
12+
13+
namespace syclexp = sycl::ext::oneapi::experimental;
14+
15+
void copy_image_mem_handle_to_image_mem_handle(
16+
syclexp::image_descriptor &desc, const std::vector<float> &testData,
17+
sycl::device dev, sycl::queue q, std::vector<float> &out) {
18+
syclexp::image_mem imgMemSrc(desc, dev, q.get_context());
19+
syclexp::image_mem imgMemDst(desc, dev, q.get_context());
20+
21+
q.ext_oneapi_copy((void *)testData.data(), imgMemSrc.get_handle(), desc);
22+
q.wait_and_throw();
23+
24+
q.ext_oneapi_copy(imgMemSrc.get_handle(), imgMemDst.get_handle(), desc);
25+
q.wait_and_throw();
26+
27+
q.ext_oneapi_copy(imgMemDst.get_handle(), (void *)out.data(), desc);
28+
q.wait_and_throw();
29+
}
30+
31+
bool check_test(const std::vector<float> &out,
32+
const std::vector<float> &expected) {
33+
assert(out.size() == expected.size());
34+
bool validated = true;
35+
for (int i = 0; i < out.size(); i++) {
36+
bool mismatch = false;
37+
if (out[i] != expected[i]) {
38+
mismatch = true;
39+
validated = false;
40+
}
41+
42+
if (mismatch) {
43+
#ifdef VERBOSE_PRINT
44+
std::cout << "Result mismatch! Expected: " << expected[i]
45+
<< ", Actual: " << out[i] << std::endl;
46+
#else
47+
break;
48+
#endif
49+
}
50+
}
51+
return validated;
52+
}
53+
54+
template <sycl::image_channel_order channelOrder,
55+
sycl::image_channel_type channelType, int dim,
56+
syclexp::image_type type = syclexp::image_type::standard>
57+
bool run_copy_test_with(sycl::device &dev, sycl::queue &q,
58+
sycl::range<dim> dims) {
59+
std::vector<float> dataSequence(dims.size());
60+
std::vector<float> out(dims.size());
61+
62+
std::vector<float> expected(dims.size());
63+
64+
std::iota(dataSequence.begin(), dataSequence.end(), 0);
65+
std::iota(expected.begin(), expected.end(), 0);
66+
67+
syclexp::image_descriptor desc;
68+
69+
if constexpr (type == syclexp::image_type::standard) {
70+
desc = syclexp::image_descriptor(dims, channelOrder, channelType);
71+
} else {
72+
desc = syclexp::image_descriptor(
73+
{dims[0], dim > 2 ? dims[1] : 0}, channelOrder, channelType,
74+
syclexp::image_type::array, 1, dim > 2 ? dims[2] : dims[1]);
75+
}
76+
77+
copy_image_mem_handle_to_image_mem_handle(desc, dataSequence, dev, q, out);
78+
79+
return check_test(out, expected);
80+
}
81+
82+
int main() {
83+
84+
sycl::device dev;
85+
sycl::queue q(dev);
86+
auto ctxt = q.get_context();
87+
88+
// Standard images copies
89+
bool validated = run_copy_test_with<sycl::image_channel_order::r,
90+
sycl::image_channel_type::fp32, 2>(
91+
dev, q, {2048 * 8, 2048 * 8});
92+
93+
validated &= run_copy_test_with<sycl::image_channel_order::r,
94+
sycl::image_channel_type::fp32, 1>(
95+
dev, q, {512 * 256});
96+
97+
validated &= run_copy_test_with<sycl::image_channel_order::r,
98+
sycl::image_channel_type::fp32, 3>(
99+
dev, q, {2048, 2048, 64});
100+
101+
// Layered images copies
102+
validated &=
103+
run_copy_test_with<sycl::image_channel_order::r,
104+
sycl::image_channel_type::fp32, 2,
105+
syclexp::image_type::array>(dev, q, {956, 38});
106+
validated &=
107+
run_copy_test_with<sycl::image_channel_order::r,
108+
sycl::image_channel_type::fp32, 3,
109+
syclexp::image_type::array>(dev, q, {2048, 2048, 64});
110+
111+
if (!validated) {
112+
std::cout << "Tests failed";
113+
return 1;
114+
}
115+
116+
std::cout << "Tests passed";
117+
118+
return 0;
119+
}

sycl/test/abi/sycl_symbols_linux.dump

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3577,6 +3577,7 @@ _ZN4sycl3_V17handler13getKernelNameEv
35773577
_ZN4sycl3_V17handler14setNDRangeUsedEb
35783578
_ZN4sycl3_V17handler15ext_oneapi_copyENS0_3ext6oneapi12experimental16image_mem_handleENS0_5rangeILi3EEERKNS4_16image_descriptorEPvS7_S7_S7_
35793579
_ZN4sycl3_V17handler15ext_oneapi_copyENS0_3ext6oneapi12experimental16image_mem_handleEPvRKNS4_16image_descriptorE
3580+
_ZN4sycl3_V17handler15ext_oneapi_copyENS0_3ext6oneapi12experimental16image_mem_handleES5_RKNS4_16image_descriptorE
35803581
_ZN4sycl3_V17handler15ext_oneapi_copyEPvNS0_3ext6oneapi12experimental16image_mem_handleERKNS5_16image_descriptorE
35813582
_ZN4sycl3_V17handler15ext_oneapi_copyEPvNS0_5rangeILi3EEES2_S4_RKNS0_3ext6oneapi12experimental16image_descriptorEmS4_S4_
35823583
_ZN4sycl3_V17handler15ext_oneapi_copyEPvNS0_5rangeILi3EEES4_NS0_3ext6oneapi12experimental16image_mem_handleES4_RKNS7_16image_descriptorES4_

sycl/test/abi/sycl_symbols_windows.dump

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4067,6 +4067,7 @@
40674067
?ext_oneapi_copy@handler@_V1@sycl@@QEAAXPEAXUimage_mem_handle@experimental@oneapi@ext@23@AEBUimage_descriptor@56723@@Z
40684068
?ext_oneapi_copy@handler@_V1@sycl@@QEAAXPEAXV?$range@$02@23@01AEBUimage_descriptor@experimental@oneapi@ext@23@_K11@Z
40694069
?ext_oneapi_copy@handler@_V1@sycl@@QEAAXPEAXV?$range@$02@23@1Uimage_mem_handle@experimental@oneapi@ext@23@1AEBUimage_descriptor@67823@1@Z
4070+
?ext_oneapi_copy@handler@_V1@sycl@@QEAAXUimage_mem_handle@experimental@oneapi@ext@23@0AEBUimage_descriptor@56723@@Z
40704071
?ext_oneapi_copy@handler@_V1@sycl@@QEAAXUimage_mem_handle@experimental@oneapi@ext@23@PEAXAEBUimage_descriptor@56723@@Z
40714072
?ext_oneapi_copy@handler@_V1@sycl@@QEAAXUimage_mem_handle@experimental@oneapi@ext@23@V?$range@$02@23@AEBUimage_descriptor@56723@PEAX111@Z
40724073
?ext_oneapi_copy@queue@_V1@sycl@@QEAA?AVevent@23@PEAX0AEBUimage_descriptor@experimental@oneapi@ext@23@_KAEBUcode_location@detail@23@@Z
@@ -4081,6 +4082,9 @@
40814082
?ext_oneapi_copy@queue@_V1@sycl@@QEAA?AVevent@23@PEAXV?$range@$02@23@1Uimage_mem_handle@experimental@oneapi@ext@23@1AEBUimage_descriptor@78923@1AEBUcode_location@detail@23@@Z
40824083
?ext_oneapi_copy@queue@_V1@sycl@@QEAA?AVevent@23@PEAXV?$range@$02@23@1Uimage_mem_handle@experimental@oneapi@ext@23@1AEBUimage_descriptor@78923@1AEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z
40834084
?ext_oneapi_copy@queue@_V1@sycl@@QEAA?AVevent@23@PEAXV?$range@$02@23@1Uimage_mem_handle@experimental@oneapi@ext@23@1AEBUimage_descriptor@78923@1V423@AEBUcode_location@detail@23@@Z
4085+
?ext_oneapi_copy@queue@_V1@sycl@@QEAA?AVevent@23@Uimage_mem_handle@experimental@oneapi@ext@23@0AEBUimage_descriptor@67823@AEBUcode_location@detail@23@@Z
4086+
?ext_oneapi_copy@queue@_V1@sycl@@QEAA?AVevent@23@Uimage_mem_handle@experimental@oneapi@ext@23@0AEBUimage_descriptor@67823@AEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z
4087+
?ext_oneapi_copy@queue@_V1@sycl@@QEAA?AVevent@23@Uimage_mem_handle@experimental@oneapi@ext@23@0AEBUimage_descriptor@67823@V423@AEBUcode_location@detail@23@@Z
40844088
?ext_oneapi_copy@queue@_V1@sycl@@QEAA?AVevent@23@Uimage_mem_handle@experimental@oneapi@ext@23@PEAXAEBUimage_descriptor@67823@AEBUcode_location@detail@23@@Z
40854089
?ext_oneapi_copy@queue@_V1@sycl@@QEAA?AVevent@23@Uimage_mem_handle@experimental@oneapi@ext@23@PEAXAEBUimage_descriptor@67823@AEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z
40864090
?ext_oneapi_copy@queue@_V1@sycl@@QEAA?AVevent@23@Uimage_mem_handle@experimental@oneapi@ext@23@PEAXAEBUimage_descriptor@67823@V423@AEBUcode_location@detail@23@@Z

0 commit comments

Comments
 (0)