Commit 2ae1bc9

[SYCL] Change in buffer management for integrated GPUs. (#2631)
This change special-cases GPU devices with integrated memory. Buffers for these devices are allocated in host shared memory, which improves the performance of buffer data transfers.

Signed-off-by: rdeodhar <[email protected]>
1 parent c239604

File tree: 2 files changed, +144 -77 lines

sycl/plugins/level_zero/pi_level_zero.cpp

File mode changed from 100644 to 100755
Lines changed: 135 additions & 73 deletions
@@ -1916,42 +1916,59 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
   assert(RetMem);
 
   void *Ptr;
+  ze_device_handle_t ZeDevice = Context->Devices[0]->ZeDevice;
+
+  // We treat integrated devices (physical memory shared with the CPU)
+  // differently from discrete devices (those with distinct memories).
+  // For integrated devices, allocating the buffer in host shared memory
+  // enables automatic access from the device, and makes copying
+  // unnecessary in the map/unmap operations. This improves performance.
+  bool DeviceIsIntegrated = Context->Devices.size() == 1 &&
+                            Context->Devices[0]->ZeDeviceProperties.flags &
+                                ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
+
+  if (DeviceIsIntegrated) {
+    ze_host_mem_alloc_desc_t ZeDesc = {};
+    ZeDesc.flags = 0;
 
-  ze_device_mem_alloc_desc_t ZeDeviceMemDesc = {};
-  ZeDeviceMemDesc.flags = 0;
-  ZeDeviceMemDesc.ordinal = 0;
+    ZE_CALL(zeMemAllocHost(Context->ZeContext, &ZeDesc, Size, 1, &Ptr));
 
-  if (Context->Devices.size() == 1) {
-    ZE_CALL(zeMemAllocDevice(Context->ZeContext, &ZeDeviceMemDesc, Size,
-                             1, // TODO: alignment
-                             Context->Devices[0]->ZeDevice, &Ptr));
-  } else {
-    ze_host_mem_alloc_desc_t ZeHostMemDesc = {};
-    ZeHostMemDesc.flags = 0;
-    ZE_CALL(zeMemAllocShared(Context->ZeContext, &ZeDeviceMemDesc,
-                             &ZeHostMemDesc, Size,
-                             1,       // TODO: alignment
-                             nullptr, // not bound to any device
-                             &Ptr));
-  }
-
-  if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
-      (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
-    // Initialize the buffer synchronously with immediate offload
-    ZE_CALL(zeCommandListAppendMemoryCopy(Context->ZeCommandListInit, Ptr,
-                                          HostPtr, Size, nullptr, 0, nullptr));
-  } else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
-    // Nothing more to do.
   } else {
-    die("piMemBufferCreate: not implemented");
+    ze_device_mem_alloc_desc_t ZeDesc = {};
+    ZeDesc.flags = 0;
+    ZeDesc.ordinal = 0;
+
+    ZE_CALL(
+        zeMemAllocDevice(Context->ZeContext, &ZeDesc, Size, 1, ZeDevice, &Ptr));
+  }
+  if (HostPtr) {
+    if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
+        (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
+      // Initialize the buffer with user data
+      if (DeviceIsIntegrated) {
+        // Do a host to host copy
+        memcpy(Ptr, HostPtr, Size);
+      } else {
+
+        // Initialize the buffer synchronously with immediate offload
+        ZE_CALL(zeCommandListAppendMemoryCopy(Context->ZeCommandListInit, Ptr,
+                                              HostPtr, Size, nullptr, 0,
+                                              nullptr));
+      }
+    } else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
+      // Nothing more to do.
+    } else {
+      die("piMemBufferCreate: not implemented");
+    }
   }
 
   auto HostPtrOrNull =
       (Flags & PI_MEM_FLAGS_HOST_PTR_USE) ? pi_cast<char *>(HostPtr) : nullptr;
   try {
     *RetMem = new _pi_buffer(
         Context, pi_cast<char *>(Ptr) /* Level Zero Memory Handle */,
-        HostPtrOrNull);
+        HostPtrOrNull, nullptr, 0, 0,
+        DeviceIsIntegrated /* Flag indicating allocation in host memory */);
   } catch (const std::bad_alloc &) {
     return PI_OUT_OF_HOST_MEMORY;
   } catch (...) {
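
For readers who want the allocation decision above in isolation: a minimal sketch using plain Level Zero calls, assuming a context with a single device. The helper name allocBufferForDevice and the early-return error handling are illustrative only; the plugin itself wraps these calls in ZE_CALL and reads the cached ZeDeviceProperties rather than querying the driver again.

// Sketch only: mirrors the allocation policy this commit introduces.
#include <level_zero/ze_api.h>
#include <cstddef>

// Hypothetical helper: pick host-shared or device-local memory for a buffer.
static void *allocBufferForDevice(ze_context_handle_t Context,
                                  ze_device_handle_t Device, size_t Size) {
  ze_device_properties_t Props = {};
  Props.stype = ZE_STRUCTURE_TYPE_DEVICE_PROPERTIES;
  if (zeDeviceGetProperties(Device, &Props) != ZE_RESULT_SUCCESS)
    return nullptr;

  void *Ptr = nullptr;
  if (Props.flags & ZE_DEVICE_PROPERTY_FLAG_INTEGRATED) {
    // Integrated GPU: host and device share physical memory, so a host
    // allocation is directly accessible from the device and map/unmap
    // need no extra copies.
    ze_host_mem_alloc_desc_t HostDesc = {};
    HostDesc.stype = ZE_STRUCTURE_TYPE_HOST_MEM_ALLOC_DESC;
    if (zeMemAllocHost(Context, &HostDesc, Size, /*alignment*/ 1, &Ptr) !=
        ZE_RESULT_SUCCESS)
      return nullptr;
  } else {
    // Discrete GPU: keep the buffer in device-local memory.
    ze_device_mem_alloc_desc_t DevDesc = {};
    DevDesc.stype = ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC;
    DevDesc.ordinal = 0;
    if (zeMemAllocDevice(Context, &DevDesc, Size, /*alignment*/ 1, Device,
                         &Ptr) != ZE_RESULT_SUCCESS)
      return nullptr;
  }
  return Ptr;
}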
@@ -4031,17 +4048,11 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap,
   assert(Buffer);
   assert(Queue);
 
-  // Lock automatically releases when this goes out of scope.
-  std::lock_guard<std::mutex> lock(Queue->PiQueueMutex);
-
-  // Get a new command list to be used on this call
+  // For discrete devices we don't need a commandlist
   ze_command_list_handle_t ZeCommandList = nullptr;
   ze_fence_handle_t ZeFence = nullptr;
-  if (auto Res = Queue->Device->getAvailableCommandList(Queue, &ZeCommandList,
-                                                        &ZeFence))
-    return Res;
-
   ze_event_handle_t ZeEvent = nullptr;
+
   if (Event) {
     auto Res = piEventCreate(Queue->Context, Event);
     if (Res != PI_SUCCESS)
@@ -4054,38 +4065,64 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap,
     ZeEvent = (*Event)->ZeEvent;
   }
 
+  // TODO: Level Zero is missing the memory "mapping" capabilities, so we are
+  // left to doing new memory allocation and a copy (read) on discrete devices.
+  // On integrated devices we have allocated the buffer in host memory
+  // so no actions are needed here except for synchronizing on incoming events
+  // and doing a host-to-host copy if a host pointer had been supplied
+  // during buffer creation.
+  //
+  // TODO: for discrete, check if the input buffer is already allocated
+  // in shared memory and thus is accessible from the host as is.
+  // Can we get SYCL RT to predict/allocate in shared memory
+  // from the beginning?
+  //
+  // On integrated devices the buffer has been allocated in host memory.
+  if (Buffer->OnHost) {
+    // Wait on incoming events before doing the copy
+    piEventsWait(NumEventsInWaitList, EventWaitList);
+    if (Buffer->MapHostPtr) {
+      *RetMap = Buffer->MapHostPtr + Offset;
+      memcpy(*RetMap, pi_cast<char *>(Buffer->getZeHandle()) + Offset, Size);
+    } else {
+      *RetMap = pi_cast<char *>(Buffer->getZeHandle()) + Offset;
+    }
+
+    // Signal this event
+    ZE_CALL(zeEventHostSignal(ZeEvent));
+
+    return Buffer->addMapping(*RetMap, Offset, Size);
+  }
+
+  // Lock automatically releases when this goes out of scope.
+  std::lock_guard<std::mutex> lock(Queue->PiQueueMutex);
+
+  // For discrete devices we need a command list
+  if (auto Res = Queue->Device->getAvailableCommandList(Queue, &ZeCommandList,
+                                                        &ZeFence))
+    return Res;
+
+  // Set the commandlist in the event
+  if (Event) {
+    (*Event)->ZeCommandList = ZeCommandList;
+  }
+
   ze_event_handle_t *ZeEventWaitList =
       _pi_event::createZeEventList(NumEventsInWaitList, EventWaitList);
 
-  ZE_CALL(zeCommandListAppendWaitOnEvents(ZeCommandList, NumEventsInWaitList,
-                                          ZeEventWaitList));
-
-  // TODO: Level Zero is missing the memory "mapping" capabilities, so we are
-  // left to doing new memory allocation and a copy (read).
-  //
-  // TODO: check if the input buffer is already allocated in shared
-  // memory and thus is accessible from the host as is. Can we get SYCL RT
-  // to predict/allocate in shared memory from the beginning?
   if (Buffer->MapHostPtr) {
-    // NOTE: borrowing below semantics from OpenCL as SYCL RT relies on it.
-    // It is also better for performance.
-    //
-    // "If the buffer object is created with CL_MEM_USE_HOST_PTR set in
-    // mem_flags, the following will be true:
-    // - The host_ptr specified in clCreateBuffer is guaranteed to contain the
-    //   latest bits in the region being mapped when the clEnqueueMapBuffer
-    //   command has completed.
-    // - The pointer value returned by clEnqueueMapBuffer will be derived from
-    //   the host_ptr specified when the buffer object is created."
     *RetMap = Buffer->MapHostPtr + Offset;
   } else {
     ze_host_mem_alloc_desc_t ZeDesc = {};
     ZeDesc.flags = 0;
-    ZE_CALL(zeMemAllocHost(Queue->Context->ZeContext, &ZeDesc, Size,
-                           1, // TODO: alignment
-                           RetMap));
+
+    ZE_CALL(
+        zeMemAllocHost(Queue->Context->ZeContext, &ZeDesc, Size, 1, RetMap));
  }
 
+  ZE_CALL(zeCommandListAppendWaitOnEvents(ZeCommandList, NumEventsInWaitList,
+                                          ZeEventWaitList));
+
   ZE_CALL(zeCommandListAppendMemoryCopy(
       ZeCommandList, *RetMap, pi_cast<char *>(Buffer->getZeHandle()) + Offset,
       Size, ZeEvent, 0, nullptr));
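
The map path above is where the win shows up: on an integrated device the mapped pointer is either the user's host pointer (after a host-to-host copy) or a pointer straight into the host-allocated backing store, and no command list is created at all. A hedged sketch of that split, with the buffer state reduced to a small struct; BufferInfo and mapIntegrated are illustrative stand-ins for _pi_buffer and piEnqueueMemBufferMap, and event synchronization is omitted.

#include <cstddef>
#include <cstring>

// Illustrative stand-in for the _pi_buffer state this path depends on.
struct BufferInfo {
  char *ZeHandle;   // backing store (host-allocated when OnHost is true)
  char *MapHostPtr; // user-supplied host pointer, if any
  bool OnHost;      // set at creation for integrated devices
};

// Sketch of the integrated-device map: pure pointer arithmetic plus an
// optional host-to-host copy into the user's pointer; no staging allocation
// and no zeCommandListAppendMemoryCopy.
static char *mapIntegrated(const BufferInfo &Buf, size_t Offset, size_t Size) {
  if (Buf.MapHostPtr) {
    std::memcpy(Buf.MapHostPtr + Offset, Buf.ZeHandle + Offset, Size);
    return Buf.MapHostPtr + Offset;
  }
  return Buf.ZeHandle + Offset;
}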
@@ -4103,15 +4140,10 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr,
                             const pi_event *EventWaitList, pi_event *Event) {
   assert(Queue);
 
-  // Lock automatically releases when this goes out of scope.
-  std::lock_guard<std::mutex> lock(Queue->PiQueueMutex);
-
-  // Get a new command list to be used on this call
+  // Integrated devices don't need a command list.
+  // If discrete we will get a commandlist later.
   ze_command_list_handle_t ZeCommandList = nullptr;
   ze_fence_handle_t ZeFence = nullptr;
-  if (auto Res = Queue->Device->getAvailableCommandList(Queue, &ZeCommandList,
-                                                        &ZeFence))
-    return Res;
 
   // TODO: handle the case when user does not care to follow the event
   // of unmap completion.
@@ -4130,6 +4162,46 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr,
     ZeEvent = (*Event)->ZeEvent;
   }
 
+  _pi_mem::Mapping MapInfo = {};
+  if (pi_result Res = MemObj->removeMapping(MappedPtr, MapInfo))
+    return Res;
+
+  // NOTE: we still have to free the host memory allocated/returned by
+  // piEnqueueMemBufferMap, but can only do so after the above copy
+  // is completed. Instead of waiting for It here (blocking), we shall
+  // do so in piEventRelease called for the pi_event tracking the unmap.
+  // In the case of an integrated device, the map operation does not allocate
+  // any memory, so there is nothing to free. This is indicated by a nullptr.
+  if (Event)
+    (*Event)->CommandData =
+        (MemObj->OnHost ? nullptr : (MemObj->MapHostPtr ? nullptr : MappedPtr));
+
+  // On integrated devices the buffer is allocated in host memory.
+  if (MemObj->OnHost) {
+    // Wait on incoming events before doing the copy
+    piEventsWait(NumEventsInWaitList, EventWaitList);
+    if (MemObj->MapHostPtr)
+      memcpy(pi_cast<char *>(MemObj->getZeHandle()) + MapInfo.Offset, MappedPtr,
+             MapInfo.Size);
+
+    // Signal this event
+    ZE_CALL(zeEventHostSignal(ZeEvent));
+
+    return PI_SUCCESS;
+  }
+
+  // Lock automatically releases when this goes out of scope.
+  std::lock_guard<std::mutex> lock(Queue->PiQueueMutex);
+
+  if (auto Res = Queue->Device->getAvailableCommandList(Queue, &ZeCommandList,
+                                                        &ZeFence))
+    return Res;
+
+  // Set the commandlist in the event
+  if (Event) {
+    (*Event)->ZeCommandList = ZeCommandList;
+  }
+
   ze_event_handle_t *ZeEventWaitList =
       _pi_event::createZeEventList(NumEventsInWaitList, EventWaitList);
 
@@ -4141,21 +4213,11 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr,
   //
   // NOTE: Keep this in sync with the implementation of
   // piEnqueueMemBufferMap/piEnqueueMemImageMap.
-  _pi_mem::Mapping MapInfo = {};
-  if (pi_result Res = MemObj->removeMapping(MappedPtr, MapInfo))
-    return Res;
 
   ZE_CALL(zeCommandListAppendMemoryCopy(
       ZeCommandList, pi_cast<char *>(MemObj->getZeHandle()) + MapInfo.Offset,
       MappedPtr, MapInfo.Size, ZeEvent, 0, nullptr));
 
-  // NOTE: we still have to free the host memory allocated/returned by
-  // piEnqueueMemBufferMap, but can only do so after the above copy
-  // is completed. Instead of waiting for It here (blocking), we shall
-  // do so in piEventRelease called for the pi_event tracking the unmap.
-  if (Event)
-    (*Event)->CommandData = MemObj->MapHostPtr ? nullptr : MappedPtr;
-
   // Execute command list asynchronously, as the event will be used
   // to track down its completion.
   if (auto Res = Queue->executeCommandList(ZeCommandList, ZeFence))
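
The unmap counterpart is equally small on an integrated device: if a host pointer was supplied at buffer creation, the mapped region is copied back into the backing store; otherwise the mapping aliased the backing store and there is nothing to copy or free. A self-contained sketch under the same assumptions as the previous one; unmapIntegrated is an illustrative name, not plugin API, and event handling is again omitted.

#include <cstddef>
#include <cstring>

// Sketch of the integrated-device unmap write-back. ZeHandle is the
// host-allocated backing store; a non-null MapHostPtr means the mapping was a
// separate user-visible copy that must now be written back.
static void unmapIntegrated(char *ZeHandle, const char *MapHostPtr,
                            const char *MappedPtr, size_t Offset, size_t Size) {
  if (MapHostPtr)
    std::memcpy(ZeHandle + Offset, MappedPtr, Size);
  // Otherwise MappedPtr aliases ZeHandle + Offset: nothing to copy back, and
  // (unlike the discrete path) no staging allocation to free later.
}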

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 9 additions & 4 deletions
@@ -355,6 +355,9 @@ struct _pi_mem : _pi_object {
   // piEnqueueMemBufferMap for details).
   char *MapHostPtr;
 
+  // Flag to indicate that this memory is allocated in host memory
+  bool OnHost;
+
   // Supplementary data to keep track of the mappings of this memory
   // created with piEnqueueMemBufferMap and piEnqueueMemImageMap.
   struct Mapping {
@@ -382,8 +385,8 @@ struct _pi_mem : _pi_object {
   pi_result removeMapping(void *MappedTo, Mapping &MapInfo);
 
 protected:
-  _pi_mem(pi_context Ctx, char *HostPtr)
-      : Context{Ctx}, MapHostPtr{HostPtr}, Mappings{} {}
+  _pi_mem(pi_context Ctx, char *HostPtr, bool MemOnHost = false)
+      : Context{Ctx}, MapHostPtr{HostPtr}, OnHost{MemOnHost}, Mappings{} {}
 
 private:
   // The key is the host pointer representing an active mapping.
@@ -399,8 +402,10 @@ struct _pi_mem : _pi_object {
 struct _pi_buffer final : _pi_mem {
   // Buffer/Sub-buffer constructor
   _pi_buffer(pi_context Ctx, char *Mem, char *HostPtr,
-             _pi_mem *Parent = nullptr, size_t Origin = 0, size_t Size = 0)
-      : _pi_mem(Ctx, HostPtr), ZeMem{Mem}, SubBuffer{Parent, Origin, Size} {}
+             _pi_mem *Parent = nullptr, size_t Origin = 0, size_t Size = 0,
+             bool MemOnHost = false)
+      : _pi_mem(Ctx, HostPtr, MemOnHost), ZeMem{Mem}, SubBuffer{Parent, Origin,
+                                                                Size} {}
 
   void *getZeHandle() override { return ZeMem; }
 