Skip to content

Commit 6445c61

Browse files
committed
[SYCL] Change in buffer management for integrated GPUs.
Signed-off-by: rdeodhar <[email protected]>
1 parent 2939b92 commit 6445c61

File tree

2 files changed

+225
-75
lines changed

2 files changed

+225
-75
lines changed

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 209 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1862,34 +1862,50 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
18621862
assert(RetMem);
18631863

18641864
void *Ptr;
1865+
ze_device_handle_t ZeDevice = Context->Devices[0]->ZeDevice;
1866+
1867+
// We treat integrated devices (physical memory shared with the CPU)
1868+
// differently from discrete devices (those with distinct memories).
1869+
// For integrated devices, allocating the buffer in host shared memory
1870+
// enables automatic access from the device, and makes copying
1871+
// unnecessary in the map/unmap operations. This improves performance.
1872+
bool DeviceIsIntegrated = Context->Devices.size() == 1 &&
1873+
Context->Devices[0]->ZeDeviceProperties.flags &
1874+
ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
1875+
1876+
if (DeviceIsIntegrated) {
1877+
ze_host_mem_alloc_desc_t ZeDesc = {};
1878+
ZeDesc.flags = 0;
18651879

1866-
ze_device_mem_alloc_desc_t ZeDeviceMemDesc = {};
1867-
ZeDeviceMemDesc.flags = 0;
1868-
ZeDeviceMemDesc.ordinal = 0;
1880+
ZE_CALL(zeMemAllocHost(Context->ZeContext, &ZeDesc, Size, 4096, &Ptr));
18691881

1870-
if (Context->Devices.size() == 1) {
1871-
ZE_CALL(zeMemAllocDevice(Context->ZeContext, &ZeDeviceMemDesc, Size,
1872-
1, // TODO: alignment
1873-
Context->Devices[0]->ZeDevice, &Ptr));
18741882
} else {
1875-
ze_host_mem_alloc_desc_t ZeHostMemDesc = {};
1876-
ZeHostMemDesc.flags = 0;
1877-
ZE_CALL(zeMemAllocShared(Context->ZeContext, &ZeDeviceMemDesc,
1878-
&ZeHostMemDesc, Size,
1879-
1, // TODO: alignment
1880-
nullptr, // not bound to any device
1883+
ze_device_mem_alloc_desc_t ZeDesc = {};
1884+
ZeDesc.flags = 0;
1885+
ZeDesc.ordinal = 0;
1886+
1887+
ZE_CALL(zeMemAllocDevice(Context->ZeContext, &ZeDesc, Size, 4096, ZeDevice,
18811888
&Ptr));
18821889
}
1890+
if (HostPtr) {
1891+
if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
1892+
(Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
1893+
// Initialize the buffer with user data
1894+
if (DeviceIsIntegrated) {
1895+
// Do a host to host copy
1896+
memcpy(Ptr, HostPtr, Size);
1897+
} else {
18831898

1884-
if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
1885-
(Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
1886-
// Initialize the buffer synchronously with immediate offload
1887-
ZE_CALL(zeCommandListAppendMemoryCopy(Context->ZeCommandListInit, Ptr,
1888-
HostPtr, Size, nullptr, 0, nullptr));
1889-
} else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
1890-
// Nothing more to do.
1891-
} else {
1892-
die("piMemBufferCreate: not implemented");
1899+
// Initialize the buffer synchronously with immediate offload
1900+
ZE_CALL(zeCommandListAppendMemoryCopy(Context->ZeCommandListInit, Ptr,
1901+
HostPtr, Size, nullptr, 0,
1902+
nullptr));
1903+
}
1904+
} else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
1905+
// Nothing more to do.
1906+
} else {
1907+
die("piMemBufferCreate: not implemented");
1908+
}
18931909
}
18941910

18951911
auto HostPtrOrNull =
@@ -3305,7 +3321,26 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
33053321
for (uint32_t I = 0; I < NumEvents; I++) {
33063322
ze_event_handle_t ZeEvent = EventList[I]->ZeEvent;
33073323
zePrint("ZeEvent = %lx\n", pi_cast<std::uintptr_t>(ZeEvent));
3308-
ZE_CALL(zeEventHostSynchronize(ZeEvent, UINT32_MAX));
3324+
3325+
// If the event comes from a Map/UnMap operation on an integrated device, then do
3326+
// sync, memcpy, and signaling on the host
3327+
if (EventList[I]->HostSyncforMap) {
3328+
#if 0
3329+
for (auto ZeWaitEvent : EventList[I]->waitEvents) {
3330+
zePrint("ZeWaitEvent = %lx\n", pi_cast<std::uintptr_t>(ZeWaitEvent));
3331+
if (ZeWaitEvent)
3332+
ZE_CALL(zeEventHostSynchronize(ZeWaitEvent, UINT32_MAX));
3333+
}
3334+
if (EventList[I]->CopyPending) {
3335+
memcpy(EventList[I]->DstBuffer, EventList[I]->SrcBuffer,
3336+
EventList[I]->RetMapSize);
3337+
EventList[I]->CopyPending = false;
3338+
}
3339+
#endif
3340+
ZE_CALL(zeEventHostSignal(ZeEvent));
3341+
} else {
3342+
ZE_CALL(zeEventHostSynchronize(ZeEvent, UINT32_MAX));
3343+
}
33093344

33103345
// NOTE: we are destroying associated command lists here to free
33113346
// resources sooner in case RT is not calling piEventRelease soon enough.
@@ -3983,14 +4018,22 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap,
39834018
// Lock automatically releases when this goes out of scope.
39844019
std::lock_guard<std::mutex> lock(Queue->PiQueueMutex);
39854020

3986-
// Get a new command list to be used on this call
4021+
// Query the buffer allocation to determine if it is a host allocation
4022+
ze_memory_allocation_properties_t ZeMemoryAllocationProperties = {};
4023+
ze_device_handle_t ZeDeviceHandle;
4024+
4025+
ZE_CALL(
4026+
zeMemGetAllocProperties(Queue->Context->ZeContext, Buffer->getZeHandle(),
4027+
&ZeMemoryAllocationProperties, &ZeDeviceHandle));
4028+
4029+
bool BufferUsesHostMem =
4030+
(ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_HOST);
4031+
4032+
// For discrete devices we don't need a commandlist
39874033
ze_command_list_handle_t ZeCommandList = nullptr;
39884034
ze_fence_handle_t ZeFence = nullptr;
3989-
if (auto Res = Queue->Device->getAvailableCommandList(Queue, &ZeCommandList,
3990-
&ZeFence))
3991-
return Res;
3992-
39934035
ze_event_handle_t ZeEvent = nullptr;
4036+
39944037
if (Event) {
39954038
auto Res = piEventCreate(Queue->Context, Event);
39964039
if (Res != PI_SUCCESS)
@@ -4003,38 +4046,84 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap,
40034046
ZeEvent = (*Event)->ZeEvent;
40044047
}
40054048

4049+
// TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4050+
// left to doing new memory allocation and a copy (read) on discrete devices.
4051+
// On integrated devices we have allocated the buffer in host memory
4052+
// so no actions are needed here except for synchronizing on incoming events
4053+
// and doing a host-to-host copy if a host pointer had been supplied
4054+
// during buffer creation.
4055+
//
4056+
// TODO: for discrete, check if the input buffer is already allocated
4057+
// in shared memory and thus is accessible from the host as is.
4058+
// Can we get SYCL RT to predict/allocate in shared memory
4059+
// from the beginning?
4060+
if (BufferUsesHostMem) {
4061+
(*Event)->HostSyncforMap = true;
4062+
#if 0
4063+
for (uint32_t i = 0; i < NumEventsInWaitList; i++) {
4064+
zePrint("Map added ZeWaitEvent = %lx\n",
4065+
pi_cast<std::uintptr_t>(EventWaitList[i]->ZeEvent));
4066+
(*Event)->waitEvents.push_back(EventWaitList[i]->ZeEvent);
4067+
}
4068+
#else
4069+
for (uint32_t i = 0; i < NumEventsInWaitList; i++) {
4070+
auto Queue = EventWaitList[i]->Queue;
4071+
zePrint("Got Q\n");
4072+
if (Queue->RefCount > 0) {
4073+
zePrint("Executing commandlist\n");
4074+
if (auto Res = Queue->executeOpenCommandList())
4075+
return Res;
4076+
}
4077+
}
4078+
for (uint32_t i = 0; i < NumEventsInWaitList; i++) {
4079+
zePrint("Going to wait on ZeWaitEvent = %lx\n",
4080+
pi_cast<std::uintptr_t>(EventWaitList[i]->ZeEvent));
4081+
auto ZeWaitEvent = EventWaitList[i]->ZeEvent;
4082+
if (ZeWaitEvent)
4083+
ZE_CALL(zeEventHostSynchronize(ZeWaitEvent, UINT32_MAX));
4084+
}
4085+
#endif
4086+
if (Buffer->MapHostPtr) {
4087+
*RetMap = Buffer->MapHostPtr + Offset;
4088+
#if 0
4089+
(*Event)->SrcBuffer = pi_cast<char*>(Buffer->getZeHandle()) + Offset;
4090+
(*Event)->DstBuffer = *RetMap;
4091+
(*Event)->RetMapSize = Size;
4092+
(*Event)->CopyPending = true;
4093+
#else
4094+
zePrint("Doing memcpy %p %p %zu\n", *RetMap,
4095+
pi_cast<char *>(Buffer->getZeHandle()) + Offset, Size);
4096+
memcpy(*RetMap, pi_cast<char *>(Buffer->getZeHandle()) + Offset, Size);
4097+
zePrint("DONE\n");
4098+
#endif
4099+
} else {
4100+
*RetMap = pi_cast<char *>(Buffer->getZeHandle()) + Offset;
4101+
}
4102+
4103+
return Buffer->addMapping(*RetMap, Offset, Size);
4104+
}
4105+
4106+
// For discrete devices we need a command list
4107+
if (auto Res = Queue->Device->getAvailableCommandList(Queue, &ZeCommandList,
4108+
&ZeFence))
4109+
return Res;
4110+
40064111
ze_event_handle_t *ZeEventWaitList =
40074112
_pi_event::createZeEventList(NumEventsInWaitList, EventWaitList);
40084113

4009-
ZE_CALL(zeCommandListAppendWaitOnEvents(ZeCommandList, NumEventsInWaitList,
4010-
ZeEventWaitList));
4011-
4012-
// TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4013-
// left to doing new memory allocation and a copy (read).
4014-
//
4015-
// TODO: check if the input buffer is already allocated in shared
4016-
// memory and thus is accessible from the host as is. Can we get SYCL RT
4017-
// to predict/allocate in shared memory from the beginning?
40184114
if (Buffer->MapHostPtr) {
4019-
// NOTE: borrowing below semantics from OpenCL as SYCL RT relies on it.
4020-
// It is also better for performance.
4021-
//
4022-
// "If the buffer object is created with CL_MEM_USE_HOST_PTR set in
4023-
// mem_flags, the following will be true:
4024-
// - The host_ptr specified in clCreateBuffer is guaranteed to contain the
4025-
// latest bits in the region being mapped when the clEnqueueMapBuffer
4026-
// command has completed.
4027-
// - The pointer value returned by clEnqueueMapBuffer will be derived from
4028-
// the host_ptr specified when the buffer object is created."
40294115
*RetMap = Buffer->MapHostPtr + Offset;
40304116
} else {
40314117
ze_host_mem_alloc_desc_t ZeDesc = {};
40324118
ZeDesc.flags = 0;
4033-
ZE_CALL(zeMemAllocHost(Queue->Context->ZeContext, &ZeDesc, Size,
4034-
1, // TODO: alignment
4035-
RetMap));
4119+
4120+
ZE_CALL(
4121+
zeMemAllocHost(Queue->Context->ZeContext, &ZeDesc, Size, 4096, RetMap));
40364122
}
40374123

4124+
ZE_CALL(zeCommandListAppendWaitOnEvents(ZeCommandList, NumEventsInWaitList,
4125+
ZeEventWaitList));
4126+
40384127
ZE_CALL(zeCommandListAppendMemoryCopy(
40394128
ZeCommandList, *RetMap, pi_cast<char *>(Buffer->getZeHandle()) + Offset,
40404129
Size, ZeEvent, 0, nullptr));
@@ -4055,12 +4144,21 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr,
40554144
// Lock automatically releases when this goes out of scope.
40564145
std::lock_guard<std::mutex> lock(Queue->PiQueueMutex);
40574146

4058-
// Get a new command list to be used on this call
4147+
// Query the buffer allocation to determine if it is a host allocation
4148+
ze_memory_allocation_properties_t ZeMemoryAllocationProperties = {};
4149+
ze_device_handle_t ZeDeviceHandle;
4150+
4151+
ZE_CALL(
4152+
zeMemGetAllocProperties(Queue->Context->ZeContext, MemObj->getZeHandle(),
4153+
&ZeMemoryAllocationProperties, &ZeDeviceHandle));
4154+
4155+
bool BufferUsesHostMem =
4156+
(ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_HOST);
4157+
4158+
// Integrated devices don't need a command list.
4159+
// If the device is discrete, we will get a command list later.
40594160
ze_command_list_handle_t ZeCommandList = nullptr;
40604161
ze_fence_handle_t ZeFence = nullptr;
4061-
if (auto Res = Queue->Device->getAvailableCommandList(Queue, &ZeCommandList,
4062-
&ZeFence))
4063-
return Res;
40644162

40654163
// TODO: handle the case when user does not care to follow the event
40664164
// of unmap completion.
@@ -4079,38 +4177,74 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr,
40794177
ZeEvent = (*Event)->ZeEvent;
40804178
}
40814179

4082-
ze_event_handle_t *ZeEventWaitList =
4083-
_pi_event::createZeEventList(NumEventsInWaitList, EventWaitList);
4084-
4085-
ZE_CALL(zeCommandListAppendWaitOnEvents(ZeCommandList, NumEventsInWaitList,
4086-
ZeEventWaitList));
4087-
4088-
// TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4089-
// left to doing copy (write back to the device).
4090-
//
4091-
// NOTE: Keep this in sync with the implementation of
4092-
// piEnqueueMemBufferMap/piEnqueueMemImageMap.
40934180
_pi_mem::Mapping MapInfo = {};
40944181
if (pi_result Res = MemObj->removeMapping(MappedPtr, MapInfo))
40954182
return Res;
40964183

4097-
ZE_CALL(zeCommandListAppendMemoryCopy(
4098-
ZeCommandList, pi_cast<char *>(MemObj->getZeHandle()) + MapInfo.Offset,
4099-
MappedPtr, MapInfo.Size, ZeEvent, 0, nullptr));
4100-
41014184
// NOTE: we still have to free the host memory allocated/returned by
41024185
// piEnqueueMemBufferMap, but can only do so after the above copy
41034186
// is completed. Instead of waiting for It here (blocking), we shall
41044187
// do so in piEventRelease called for the pi_event tracking the unmap.
4188+
// In the case of an integrated device, the map operation does not allocate
4189+
// any memory, so there is nothing to free. This is indicated by a nullptr.
41054190
if (Event)
4106-
(*Event)->CommandData = MemObj->MapHostPtr ? nullptr : MappedPtr;
4191+
(*Event)->CommandData =
4192+
(BufferUsesHostMem ? nullptr
4193+
: (MemObj->MapHostPtr ? nullptr : MappedPtr));
4194+
4195+
if (BufferUsesHostMem) {
4196+
(*Event)->HostSyncforMap = true;
4197+
#if 0
4198+
for (uint32_t i = 0; i < NumEventsInWaitList; i++) {
4199+
zePrint("UnMap Added ZeWaitEvent = %lx\n",
4200+
pi_cast<std::uintptr_t>(EventWaitList[i]->ZeEvent));
4201+
(*Event)->waitEvents.push_back(EventWaitList[i]->ZeEvent);
4202+
}
4203+
(*Event)->SrcBuffer = MappedPtr;
4204+
(*Event)->DstBuffer =
4205+
pi_cast<char*>(MemObj->getZeHandle()) + MapInfo.Offset;
4206+
(*Event)->RetMapSize = MapInfo.Size;
4207+
(*Event)->CopyPending = true;
4208+
#else
4209+
for (uint32_t i = 0; i < NumEventsInWaitList; i++) {
4210+
auto ZeWaitEvent = EventWaitList[i]->ZeEvent;
4211+
if (ZeWaitEvent)
4212+
ZE_CALL(zeEventHostSynchronize(ZeWaitEvent, UINT32_MAX));
4213+
}
4214+
memcpy(pi_cast<char *>(MemObj->getZeHandle()) + MapInfo.Offset, MappedPtr,
4215+
MapInfo.Size);
4216+
#endif
4217+
} else {
41074218

4108-
// Execute command list asynchronously, as the event will be used
4109-
// to track down its completion.
4110-
if (auto Res = Queue->executeCommandList(ZeCommandList, ZeFence))
4111-
return Res;
4219+
if (auto Res = Queue->Device->getAvailableCommandList(Queue, &ZeCommandList,
4220+
&ZeFence))
4221+
return Res;
41124222

4113-
_pi_event::deleteZeEventList(ZeEventWaitList);
4223+
ze_event_handle_t *ZeEventWaitList =
4224+
_pi_event::createZeEventList(NumEventsInWaitList, EventWaitList);
4225+
4226+
ZE_CALL(zeCommandListAppendWaitOnEvents(ZeCommandList, NumEventsInWaitList,
4227+
ZeEventWaitList));
4228+
4229+
// TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4230+
// left to doing copy (write back to the device).
4231+
// See https://gitlab.devtools.intel.com/one-api/level_zero/issues/293. //
4232+
// INTEL
4233+
//
4234+
// NOTE: Keep this in sync with the implementation of
4235+
// piEnqueueMemBufferMap/piEnqueueMemImageMap.
4236+
4237+
ZE_CALL(zeCommandListAppendMemoryCopy(
4238+
ZeCommandList, pi_cast<char *>(MemObj->getZeHandle()) + MapInfo.Offset,
4239+
MappedPtr, MapInfo.Size, ZeEvent, 0, nullptr));
4240+
4241+
// Execute command list asynchronously, as the event will be used
4242+
// to track down its completion.
4243+
if (auto Res = Queue->executeCommandList(ZeCommandList, ZeFence))
4244+
return Res;
4245+
4246+
_pi_event::deleteZeEventList(ZeEventWaitList);
4247+
}
41144248

41154249
return PI_SUCCESS;
41164250
}

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,22 @@ struct _pi_event : _pi_object {
451451
// Level Zero event pool handle.
452452
ze_event_pool_handle_t ZeEventPool;
453453

454+
// The following are used by MemBufferMap/UnMap on integrated devices
455+
// Flag to indicate that a copy is pending from a map/unmap operation
456+
bool HostSyncforMap = false;
457+
#if 0
458+
// List of incoming events to be satisfied before the copy can be done
459+
std::vector<ze_event_handle_t> waitEvents;
460+
// The destination of the map/unmap copy
461+
void *DstBuffer = nullptr;
462+
// The source of the map/unmap copy
463+
void *SrcBuffer = nullptr;
464+
// The size of the copy
465+
size_t RetMapSize = 0;
466+
// A flag that enables doing the copy only once, for a list of events
467+
bool CopyPending = false;
468+
#endif
469+
454470
// Level Zero command list where the command signaling this event was appended
455471
// to. This is currently used to remember/destroy the command list after all
456472
// commands in it are completed, i.e. this event signaled.

0 commit comments

Comments
 (0)