Skip to content

Commit 2dcc581

Browse files
[SYCL] Fix handling of host-side memory in 2D memops (#8359)
The current implementation of 2D USM memory operations will launch a kernel as a fallback mechanism. However, these interfaces are also intended to accept pointers to USM and non-USM host memory. To allow this, the implementation now picks different fallback mechanisms depending on the type of pointer passed. For fill/memset this means: 1. Use a host-task with a fill per row if the pointer is to host memory. 2. Use backend-defined interface if available. 3. Use auxiliary fill kernel if pointer is to device memory. For memcpy/copy this means: 1. Use a host-task to copy memory when both pointers are to host memory. 2. Use backend-defined interface if available. 3. Enqueue a USM memcpy per row if one pointer is to in host memory and the other pointer is in device memory. 4. Use auxiliary copy kernel if both pointers are to device memory. --------- Signed-off-by: Larsen, Steffen <[email protected]>
1 parent f31df05 commit 2dcc581

File tree

8 files changed

+219
-29
lines changed

8 files changed

+219
-29
lines changed

sycl/include/sycl/handler.hpp

Lines changed: 103 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
#include <sycl/reduction_forward.hpp>
3535
#include <sycl/sampler.hpp>
3636
#include <sycl/stl.hpp>
37+
#include <sycl/usm/usm_pointer_info.hpp>
3738

3839
#include <functional>
3940
#include <limits>
@@ -2469,13 +2470,30 @@ class __SYCL_EXPORT handler {
24692470
throw sycl::exception(sycl::make_error_code(errc::invalid),
24702471
"Source pitch must be greater than or equal "
24712472
"to the width specified in 'ext_oneapi_memcpy2d'");
2472-
// If the backends supports 2D copy we use that. Otherwise we use a fallback
2473-
// kernel.
2474-
if (supportsUSMMemcpy2D())
2473+
2474+
// Get the type of the pointers.
2475+
context Ctx = detail::createSyclObjFromImpl<context>(getContextImplPtr());
2476+
usm::alloc SrcAllocType = get_pointer_type(Src, Ctx);
2477+
usm::alloc DestAllocType = get_pointer_type(Dest, Ctx);
2478+
bool SrcIsHost =
2479+
SrcAllocType == usm::alloc::unknown || SrcAllocType == usm::alloc::host;
2480+
bool DestIsHost = DestAllocType == usm::alloc::unknown ||
2481+
DestAllocType == usm::alloc::host;
2482+
2483+
// Do the following:
2484+
// 1. If both are host, use host_task to copy.
2485+
// 2. If either pointer is host or of the backend supports native memcpy2d,
2486+
// use special command.
2487+
// 3. Otherwise, launch a kernel for copying.
2488+
if (SrcIsHost && DestIsHost) {
2489+
commonUSMCopy2DFallbackHostTask<T>(Src, SrcPitch, Dest, DestPitch, Width,
2490+
Height);
2491+
} else if (SrcIsHost || DestIsHost || supportsUSMMemcpy2D()) {
24752492
ext_oneapi_memcpy2d_impl(Dest, DestPitch, Src, SrcPitch, Width, Height);
2476-
else
2493+
} else {
24772494
commonUSMCopy2DFallbackKernel<T>(Src, SrcPitch, Dest, DestPitch, Width,
24782495
Height);
2496+
}
24792497
}
24802498

24812499
/// Copies data from one 2D memory region to another, both pointed by
@@ -2503,14 +2521,31 @@ class __SYCL_EXPORT handler {
25032521
throw sycl::exception(sycl::make_error_code(errc::invalid),
25042522
"Source pitch must be greater than or equal "
25052523
"to the width specified in 'ext_oneapi_copy2d'");
2506-
// If the backends supports 2D copy we use that. Otherwise we use a fallback
2507-
// kernel.
2508-
if (supportsUSMMemcpy2D())
2524+
2525+
// Get the type of the pointers.
2526+
context Ctx = detail::createSyclObjFromImpl<context>(getContextImplPtr());
2527+
usm::alloc SrcAllocType = get_pointer_type(Src, Ctx);
2528+
usm::alloc DestAllocType = get_pointer_type(Dest, Ctx);
2529+
bool SrcIsHost =
2530+
SrcAllocType == usm::alloc::unknown || SrcAllocType == usm::alloc::host;
2531+
bool DestIsHost = DestAllocType == usm::alloc::unknown ||
2532+
DestAllocType == usm::alloc::host;
2533+
2534+
// Do the following:
2535+
// 1. If both are host, use host_task to copy.
2536+
// 2. If either pointer is host or of the backend supports native memcpy2d,
2537+
// use special command.
2538+
// 3. Otherwise, launch a kernel for copying.
2539+
if (SrcIsHost && DestIsHost) {
2540+
commonUSMCopy2DFallbackHostTask<T>(Src, SrcPitch, Dest, DestPitch, Width,
2541+
Height);
2542+
} else if (SrcIsHost || DestIsHost || supportsUSMMemcpy2D()) {
25092543
ext_oneapi_memcpy2d_impl(Dest, DestPitch * sizeof(T), Src,
25102544
SrcPitch * sizeof(T), Width * sizeof(T), Height);
2511-
else
2545+
} else {
25122546
commonUSMCopy2DFallbackKernel<T>(Src, SrcPitch, Dest, DestPitch, Width,
25132547
Height);
2548+
}
25142549
}
25152550

25162551
/// Fills the memory pointed by a USM pointer with the value specified.
@@ -2538,9 +2573,16 @@ class __SYCL_EXPORT handler {
25382573
"Destination pitch must be greater than or equal "
25392574
"to the width specified in 'ext_oneapi_memset2d'");
25402575
T CharVal = static_cast<T>(Value);
2576+
2577+
context Ctx = detail::createSyclObjFromImpl<context>(getContextImplPtr());
2578+
usm::alloc DestAllocType = get_pointer_type(Dest, Ctx);
2579+
25412580
// If the backends supports 2D fill we use that. Otherwise we use a fallback
2542-
// kernel.
2543-
if (supportsUSMMemset2D())
2581+
// kernel. If the target is on host we will always do the operation on host.
2582+
if (DestAllocType == usm::alloc::unknown ||
2583+
DestAllocType == usm::alloc::host)
2584+
commonUSMFill2DFallbackHostTask(Dest, DestPitch, CharVal, Width, Height);
2585+
else if (supportsUSMMemset2D())
25442586
ext_oneapi_memset2d_impl(Dest, DestPitch, Value, Width, Height);
25452587
else
25462588
commonUSMFill2DFallbackKernel(Dest, DestPitch, CharVal, Width, Height);
@@ -2568,9 +2610,16 @@ class __SYCL_EXPORT handler {
25682610
throw sycl::exception(sycl::make_error_code(errc::invalid),
25692611
"Destination pitch must be greater than or equal "
25702612
"to the width specified in 'ext_oneapi_fill2d'");
2613+
2614+
context Ctx = detail::createSyclObjFromImpl<context>(getContextImplPtr());
2615+
usm::alloc DestAllocType = get_pointer_type(Dest, Ctx);
2616+
25712617
// If the backends supports 2D fill we use that. Otherwise we use a fallback
2572-
// kernel.
2573-
if (supportsUSMFill2D())
2618+
// kernel. If the target is on host we will always do the operation on host.
2619+
if (DestAllocType == usm::alloc::unknown ||
2620+
DestAllocType == usm::alloc::host)
2621+
commonUSMFill2DFallbackHostTask(Dest, DestPitch, Pattern, Width, Height);
2622+
else if (supportsUSMFill2D())
25742623
ext_oneapi_fill2d_impl(Dest, DestPitch, &Pattern, sizeof(T), Width,
25752624
Height);
25762625
else
@@ -2792,6 +2841,8 @@ class __SYCL_EXPORT handler {
27922841
NumWorkItems, KernelFunc);
27932842
}
27942843

2844+
const std::shared_ptr<detail::context_impl> &getContextImplPtr() const;
2845+
27952846
// Checks if 2D memory operations are supported by the underlying platform.
27962847
bool supportsUSMMemcpy2D();
27972848
bool supportsUSMFill2D();
@@ -2806,6 +2857,8 @@ class __SYCL_EXPORT handler {
28062857
void commonUSMCopy2DFallbackKernel(const void *Src, size_t SrcPitch,
28072858
void *Dest, size_t DestPitch, size_t Width,
28082859
size_t Height) {
2860+
// Otherwise the data is accessible on the device so we do the operation
2861+
// there instead.
28092862
// Limit number of work items to be resistant to big copies.
28102863
id<2> Chunk = computeFallbackKernelBounds(Height, Width);
28112864
id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
@@ -2825,12 +2878,33 @@ class __SYCL_EXPORT handler {
28252878
});
28262879
}
28272880

2881+
// Common function for launching a 2D USM memcpy host-task to avoid
2882+
// redefinitions of the kernel from copy and memcpy.
2883+
template <typename T>
2884+
void commonUSMCopy2DFallbackHostTask(const void *Src, size_t SrcPitch,
2885+
void *Dest, size_t DestPitch,
2886+
size_t Width, size_t Height) {
2887+
// If both pointers are host USM or unknown (assumed non-USM) we use a
2888+
// host-task to satisfy dependencies.
2889+
host_task([=] {
2890+
const T *CastedSrc = static_cast<const T *>(Src);
2891+
T *CastedDest = static_cast<T *>(Dest);
2892+
for (size_t I = 0; I < Height; ++I) {
2893+
const T *SrcItBegin = CastedSrc + SrcPitch * I;
2894+
T *DestItBegin = CastedDest + DestPitch * I;
2895+
std::copy(SrcItBegin, SrcItBegin + Width, DestItBegin);
2896+
}
2897+
});
2898+
}
2899+
28282900
// Common function for launching a 2D USM fill kernel to avoid redefinitions
28292901
// of the kernel from memset and fill.
28302902
template <typename T>
28312903
void commonUSMFill2DFallbackKernel(void *Dest, size_t DestPitch,
28322904
const T &Pattern, size_t Width,
28332905
size_t Height) {
2906+
// Otherwise the data is accessible on the device so we do the operation
2907+
// there instead.
28342908
// Limit number of work items to be resistant to big fill operations.
28352909
id<2> Chunk = computeFallbackKernelBounds(Height, Width);
28362910
id<2> Iterations = (Chunk + id<2>{Height, Width} - 1) / Chunk;
@@ -2849,6 +2923,23 @@ class __SYCL_EXPORT handler {
28492923
});
28502924
}
28512925

2926+
// Common function for launching a 2D USM fill kernel or host_task to avoid
2927+
// redefinitions of the kernel from memset and fill.
2928+
template <typename T>
2929+
void commonUSMFill2DFallbackHostTask(void *Dest, size_t DestPitch,
2930+
const T &Pattern, size_t Width,
2931+
size_t Height) {
2932+
// If the pointer is host USM or unknown (assumed non-USM) we use a
2933+
// host-task to satisfy dependencies.
2934+
host_task([=] {
2935+
T *CastedDest = static_cast<T *>(Dest);
2936+
for (size_t I = 0; I < Height; ++I) {
2937+
T *ItBegin = CastedDest + DestPitch * I;
2938+
std::fill(ItBegin, ItBegin + Width, Pattern);
2939+
}
2940+
});
2941+
}
2942+
28522943
// Implementation of ext_oneapi_memcpy2d using command for native 2D memcpy.
28532944
void ext_oneapi_memcpy2d_impl(void *Dest, size_t DestPitch, const void *Src,
28542945
size_t SrcPitch, size_t Width, size_t Height);

sycl/include/sycl/sycl.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
#include <sycl/sub_group.hpp>
5858
#include <sycl/types.hpp>
5959
#include <sycl/usm.hpp>
60+
#include <sycl/usm/usm_pointer_info.hpp>
6061
#include <sycl/version.hpp>
6162
#if SYCL_EXT_ONEAPI_BACKEND_LEVEL_ZERO
6263
#include <sycl/ext/oneapi/backend/level_zero.hpp>

sycl/include/sycl/usm.hpp

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -277,19 +277,5 @@ T *aligned_alloc(
277277
Kind, PropList, CodeLoc);
278278
}
279279

280-
// Pointer queries
281-
/// Query the allocation type from a USM pointer
282-
///
283-
/// \param ptr is the USM pointer to query
284-
/// \param ctxt is the sycl context the ptr was allocated in
285-
__SYCL_EXPORT usm::alloc get_pointer_type(const void *ptr, const context &ctxt);
286-
287-
/// Queries the device against which the pointer was allocated
288-
/// Throws an invalid_object_error if ptr is a host allocation.
289-
///
290-
/// \param ptr is the USM pointer to query
291-
/// \param ctxt is the sycl context the ptr was allocated in
292-
__SYCL_EXPORT device get_pointer_device(const void *ptr, const context &ctxt);
293-
294280
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
295281
} // namespace sycl
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
//==---- usm_pointer_info.hpp - SYCL USM pointer info queries --*- C++ -*---==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
// ===--------------------------------------------------------------------=== //
8+
#pragma once
9+
10+
#include <sycl/detail/common.hpp>
11+
#include <sycl/detail/export.hpp>
12+
#include <sycl/usm/usm_enums.hpp>
13+
14+
namespace sycl {
15+
__SYCL_INLINE_VER_NAMESPACE(_V1) {
16+
17+
class device;
18+
class context;
19+
20+
// Pointer queries
21+
/// Query the allocation type from a USM pointer
22+
///
23+
/// \param ptr is the USM pointer to query
24+
/// \param ctxt is the sycl context the ptr was allocated in
25+
__SYCL_EXPORT usm::alloc get_pointer_type(const void *ptr, const context &ctxt);
26+
27+
/// Queries the device against which the pointer was allocated
28+
/// Throws an invalid_object_error if ptr is a host allocation.
29+
///
30+
/// \param ptr is the USM pointer to query
31+
/// \param ctxt is the sycl context the ptr was allocated in
32+
__SYCL_EXPORT device get_pointer_device(const void *ptr, const context &ctxt);
33+
34+
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
35+
} // namespace sycl

sycl/source/detail/memory_manager.cpp

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -932,10 +932,49 @@ void MemoryManager::copy_2d_usm(const void *SrcMem, size_t SrcPitch,
932932
if (!DstMem || !SrcMem)
933933
throw sycl::exception(sycl::make_error_code(errc::invalid),
934934
"NULL pointer argument in 2D memory copy operation.");
935+
935936
const detail::plugin &Plugin = Queue->getPlugin();
936-
Plugin.call<PiApiKind::piextUSMEnqueueMemcpy2D>(
937-
Queue->getHandleRef(), /*blocking=*/PI_FALSE, DstMem, DstPitch, SrcMem,
938-
SrcPitch, Width, Height, DepEvents.size(), DepEvents.data(), OutEvent);
937+
938+
pi_bool SupportsUSMMemcpy2D = false;
939+
Plugin.call<detail::PiApiKind::piContextGetInfo>(
940+
Queue->getContextImplPtr()->getHandleRef(),
941+
PI_EXT_ONEAPI_CONTEXT_INFO_USM_MEMCPY2D_SUPPORT, sizeof(pi_bool),
942+
&SupportsUSMMemcpy2D, nullptr);
943+
944+
if (SupportsUSMMemcpy2D) {
945+
// Direct memcpy2D is supported so we use this function.
946+
Plugin.call<PiApiKind::piextUSMEnqueueMemcpy2D>(
947+
Queue->getHandleRef(), /*blocking=*/PI_FALSE, DstMem, DstPitch, SrcMem,
948+
SrcPitch, Width, Height, DepEvents.size(), DepEvents.data(), OutEvent);
949+
return;
950+
}
951+
952+
// Otherwise we allow the special case where the copy is to or from host.
953+
#ifndef NDEBUG
954+
context Ctx = createSyclObjFromImpl<context>(Queue->getContextImplPtr());
955+
usm::alloc SrcAllocType = get_pointer_type(SrcMem, Ctx);
956+
usm::alloc DstAllocType = get_pointer_type(DstMem, Ctx);
957+
bool SrcIsHost =
958+
SrcAllocType == usm::alloc::unknown || SrcAllocType == usm::alloc::host;
959+
bool DstIsHost =
960+
DstAllocType == usm::alloc::unknown || DstAllocType == usm::alloc::host;
961+
assert((SrcIsHost || DstIsHost) && "In fallback path for copy_2d_usm either "
962+
"source or destination must be on host.");
963+
#endif // NDEBUG
964+
965+
// The fallback in this case is to insert a copy per row.
966+
std::vector<RT::PiEvent> CopyEvents(Height);
967+
for (size_t I = 0; I < Height; ++I) {
968+
char *DstItBegin = static_cast<char *>(DstMem) + I * DstPitch;
969+
const char *SrcItBegin = static_cast<const char *>(SrcMem) + I * SrcPitch;
970+
Plugin.call<PiApiKind::piextUSMEnqueueMemcpy>(
971+
Queue->getHandleRef(), /* blocking */ PI_FALSE, DstItBegin, SrcItBegin,
972+
Width, DepEvents.size(), DepEvents.data(), CopyEvents.data() + I);
973+
}
974+
975+
// Then insert a wait to coalesce the copy events.
976+
Queue->getPlugin().call<PiApiKind::piEnqueueEventsWait>(
977+
Queue->getHandleRef(), CopyEvents.size(), CopyEvents.data(), OutEvent);
939978
}
940979

941980
void MemoryManager::fill_2d_usm(void *DstMem, QueueImplPtr Queue, size_t Pitch,

sycl/source/handler.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -877,5 +877,10 @@ void handler::memcpyFromDeviceGlobal(void *Dest, const void *DeviceGlobalPtr,
877877
setType(detail::CG::CopyFromDeviceGlobal);
878878
}
879879

880+
const std::shared_ptr<detail::context_impl> &
881+
handler::getContextImplPtr() const {
882+
return MQueue->getContextImplPtr();
883+
}
884+
880885
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
881886
} // namespace sycl

sycl/test/abi/sycl_symbols_linux.dump

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4454,6 +4454,7 @@ _ZNK4sycl3_V17context8get_infoINS0_4info7context32atomic_memory_scope_capabiliti
44544454
_ZNK4sycl3_V17context8get_infoINS0_4info7context7devicesEEENS0_6detail20is_context_info_descIT_E11return_typeEv
44554455
_ZNK4sycl3_V17context8get_infoINS0_4info7context8platformEEENS0_6detail20is_context_info_descIT_E11return_typeEv
44564456
_ZNK4sycl3_V17context9getNativeEv
4457+
_ZNK4sycl3_V17handler17getContextImplPtrEv
44574458
_ZNK4sycl3_V17handler27isStateExplicitKernelBundleEv
44584459
_ZNK4sycl3_V17handler30getOrInsertHandlerKernelBundleEb
44594460
_ZNK4sycl3_V17sampler12get_propertyINS0_3ext6oneapi4cuda8property7context19use_primary_contextEEET_v

0 commit comments

Comments
 (0)