Skip to content

Commit dd683b3

Browse files
committed
[SYCL] Change in buffer management for integrated GPUs.
Signed-off-by: rdeodhar <[email protected]>
1 parent 2939b92 commit dd683b3

File tree

2 files changed

+152
-75
lines changed

2 files changed

+152
-75
lines changed

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 144 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -1862,34 +1862,50 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
18621862
assert(RetMem);
18631863

18641864
void *Ptr;
1865+
ze_device_handle_t ZeDevice = Context->Devices[0]->ZeDevice;
1866+
1867+
// We treat integrated devices (physical memory shared with the CPU)
1868+
// differently from discrete devices (those with distinct memories).
1869+
// For integrated devices, allocating the buffer in host shared memory
1870+
// enables automatic access from the device, and makes copying
1871+
// unnecessary in the map/unmap operations. This improves performance.
1872+
bool DeviceIsIntegrated = Context->Devices.size() == 1 &&
1873+
Context->Devices[0]->ZeDeviceProperties.flags &
1874+
ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
1875+
1876+
if (DeviceIsIntegrated) {
1877+
ze_host_mem_alloc_desc_t ZeDesc = {};
1878+
ZeDesc.flags = 0;
18651879

1866-
ze_device_mem_alloc_desc_t ZeDeviceMemDesc = {};
1867-
ZeDeviceMemDesc.flags = 0;
1868-
ZeDeviceMemDesc.ordinal = 0;
1880+
ZE_CALL(zeMemAllocHost(Context->ZeContext, &ZeDesc, Size, 4096, &Ptr));
18691881

1870-
if (Context->Devices.size() == 1) {
1871-
ZE_CALL(zeMemAllocDevice(Context->ZeContext, &ZeDeviceMemDesc, Size,
1872-
1, // TODO: alignment
1873-
Context->Devices[0]->ZeDevice, &Ptr));
18741882
} else {
1875-
ze_host_mem_alloc_desc_t ZeHostMemDesc = {};
1876-
ZeHostMemDesc.flags = 0;
1877-
ZE_CALL(zeMemAllocShared(Context->ZeContext, &ZeDeviceMemDesc,
1878-
&ZeHostMemDesc, Size,
1879-
1, // TODO: alignment
1880-
nullptr, // not bound to any device
1883+
ze_device_mem_alloc_desc_t ZeDesc = {};
1884+
ZeDesc.flags = 0;
1885+
ZeDesc.ordinal = 0;
1886+
1887+
ZE_CALL(zeMemAllocDevice(Context->ZeContext, &ZeDesc, Size, 4096, ZeDevice,
18811888
&Ptr));
18821889
}
1890+
if (HostPtr) {
1891+
if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
1892+
(Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
1893+
// Initialize the buffer with user data
1894+
if (DeviceIsIntegrated) {
1895+
// Do a host to host copy
1896+
memcpy(Ptr, HostPtr, Size);
1897+
} else {
18831898

1884-
if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
1885-
(Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0) {
1886-
// Initialize the buffer synchronously with immediate offload
1887-
ZE_CALL(zeCommandListAppendMemoryCopy(Context->ZeCommandListInit, Ptr,
1888-
HostPtr, Size, nullptr, 0, nullptr));
1889-
} else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
1890-
// Nothing more to do.
1891-
} else {
1892-
die("piMemBufferCreate: not implemented");
1899+
// Initialize the buffer synchronously with immediate offload
1900+
ZE_CALL(zeCommandListAppendMemoryCopy(Context->ZeCommandListInit, Ptr,
1901+
HostPtr, Size, nullptr, 0,
1902+
nullptr));
1903+
}
1904+
} else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
1905+
// Nothing more to do.
1906+
} else {
1907+
die("piMemBufferCreate: not implemented");
1908+
}
18931909
}
18941910

18951911
auto HostPtrOrNull =
@@ -3305,7 +3321,24 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
33053321
for (uint32_t I = 0; I < NumEvents; I++) {
33063322
ze_event_handle_t ZeEvent = EventList[I]->ZeEvent;
33073323
zePrint("ZeEvent = %lx\n", pi_cast<std::uintptr_t>(ZeEvent));
3308-
ZE_CALL(zeEventHostSynchronize(ZeEvent, UINT32_MAX));
3324+
3325+
// If the event comes from a Map/UnMap operation on an integrated device,
3326+
// then do the sync, memcpy, and signaling on the host.
3327+
if (EventList[I]->HostSyncforMap) {
3328+
for (auto ZeWaitEvent : EventList[I]->waitEvents) {
3329+
zePrint("ZeWaitEvent = %lx\n", pi_cast<std::uintptr_t>(ZeWaitEvent));
3330+
if (ZeWaitEvent)
3331+
ZE_CALL(zeEventHostSynchronize(ZeWaitEvent, UINT32_MAX));
3332+
}
3333+
if (EventList[I]->CopyPending) {
3334+
memcpy(EventList[I]->DstBuffer, EventList[I]->SrcBuffer,
3335+
EventList[I]->RetMapSize);
3336+
EventList[I]->CopyPending = false;
3337+
}
3338+
ZE_CALL(zeEventHostSignal(ZeEvent));
3339+
} else {
3340+
ZE_CALL(zeEventHostSynchronize(ZeEvent, UINT32_MAX));
3341+
}
33093342

33103343
// NOTE: we are destroying associated command lists here to free
33113344
// resources sooner in case RT is not calling piEventRelease soon enough.
@@ -3983,14 +4016,14 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap,
39834016
// Lock automatically releases when this goes out of scope.
39844017
std::lock_guard<std::mutex> lock(Queue->PiQueueMutex);
39854018

3986-
// Get a new command list to be used on this call
4019+
bool DeviceIsIntegrated = Queue->Device->ZeDeviceProperties.flags &
4020+
ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
4021+
4022+
// For integrated devices we don't need a command list
39874023
ze_command_list_handle_t ZeCommandList = nullptr;
39884024
ze_fence_handle_t ZeFence = nullptr;
3989-
if (auto Res = Queue->Device->getAvailableCommandList(Queue, &ZeCommandList,
3990-
&ZeFence))
3991-
return Res;
3992-
39934025
ze_event_handle_t ZeEvent = nullptr;
4026+
39944027
if (Event) {
39954028
auto Res = piEventCreate(Queue->Context, Event);
39964029
if (Res != PI_SUCCESS)
@@ -4003,38 +4036,47 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap,
40034036
ZeEvent = (*Event)->ZeEvent;
40044037
}
40054038

4039+
if (DeviceIsIntegrated) {
4040+
(*Event)->HostSyncforMap = true;
4041+
for (uint32_t i = 0; i < NumEventsInWaitList; i++) {
4042+
zePrint("Map added ZeWaitEvent = %lx\n",
4043+
pi_cast<std::uintptr_t>(EventWaitList[i]->ZeEvent));
4044+
(*Event)->waitEvents.push_back(EventWaitList[i]->ZeEvent);
4045+
}
4046+
if (Buffer->MapHostPtr) {
4047+
*RetMap = Buffer->MapHostPtr + Offset;
4048+
(*Event)->SrcBuffer = pi_cast<char *>(Buffer->getZeHandle()) + Offset;
4049+
(*Event)->DstBuffer = *RetMap;
4050+
(*Event)->RetMapSize = Size;
4051+
(*Event)->CopyPending = true;
4052+
} else {
4053+
*RetMap = pi_cast<char *>(Buffer->getZeHandle()) + Offset;
4054+
}
4055+
4056+
return Buffer->addMapping(*RetMap, Offset, Size);
4057+
}
4058+
4059+
// For discrete devices we need a command list
4060+
if (auto Res = Queue->Device->getAvailableCommandList(Queue, &ZeCommandList,
4061+
&ZeFence))
4062+
return Res;
4063+
40064064
ze_event_handle_t *ZeEventWaitList =
40074065
_pi_event::createZeEventList(NumEventsInWaitList, EventWaitList);
40084066

4009-
ZE_CALL(zeCommandListAppendWaitOnEvents(ZeCommandList, NumEventsInWaitList,
4010-
ZeEventWaitList));
4011-
4012-
// TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4013-
// left to doing new memory allocation and a copy (read).
4014-
//
4015-
// TODO: check if the input buffer is already allocated in shared
4016-
// memory and thus is accessible from the host as is. Can we get SYCL RT
4017-
// to predict/allocate in shared memory from the beginning?
40184067
if (Buffer->MapHostPtr) {
4019-
// NOTE: borrowing below semantics from OpenCL as SYCL RT relies on it.
4020-
// It is also better for performance.
4021-
//
4022-
// "If the buffer object is created with CL_MEM_USE_HOST_PTR set in
4023-
// mem_flags, the following will be true:
4024-
// - The host_ptr specified in clCreateBuffer is guaranteed to contain the
4025-
// latest bits in the region being mapped when the clEnqueueMapBuffer
4026-
// command has completed.
4027-
// - The pointer value returned by clEnqueueMapBuffer will be derived from
4028-
// the host_ptr specified when the buffer object is created."
40294068
*RetMap = Buffer->MapHostPtr + Offset;
40304069
} else {
40314070
ze_host_mem_alloc_desc_t ZeDesc = {};
40324071
ZeDesc.flags = 0;
4033-
ZE_CALL(zeMemAllocHost(Queue->Context->ZeContext, &ZeDesc, Size,
4034-
1, // TODO: alignment
4035-
RetMap));
4072+
4073+
ZE_CALL(
4074+
zeMemAllocHost(Queue->Context->ZeContext, &ZeDesc, Size, 4096, RetMap));
40364075
}
40374076

4077+
ZE_CALL(zeCommandListAppendWaitOnEvents(ZeCommandList, NumEventsInWaitList,
4078+
ZeEventWaitList));
4079+
40384080
ZE_CALL(zeCommandListAppendMemoryCopy(
40394081
ZeCommandList, *RetMap, pi_cast<char *>(Buffer->getZeHandle()) + Offset,
40404082
Size, ZeEvent, 0, nullptr));
@@ -4055,12 +4097,13 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr,
40554097
// Lock automatically releases when this goes out of scope.
40564098
std::lock_guard<std::mutex> lock(Queue->PiQueueMutex);
40574099

4058-
// Get a new command list to be used on this call
4100+
bool DeviceIsIntegrated = Queue->Device->ZeDeviceProperties.flags &
4101+
ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
4102+
4103+
// Integrated devices don't need a command list.
4104+
// For a discrete device, a command list is obtained later.
40594105
ze_command_list_handle_t ZeCommandList = nullptr;
40604106
ze_fence_handle_t ZeFence = nullptr;
4061-
if (auto Res = Queue->Device->getAvailableCommandList(Queue, &ZeCommandList,
4062-
&ZeFence))
4063-
return Res;
40644107

40654108
// TODO: handle the case when user does not care to follow the event
40664109
// of unmap completion.
@@ -4079,38 +4122,64 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr,
40794122
ZeEvent = (*Event)->ZeEvent;
40804123
}
40814124

4082-
ze_event_handle_t *ZeEventWaitList =
4083-
_pi_event::createZeEventList(NumEventsInWaitList, EventWaitList);
4084-
4085-
ZE_CALL(zeCommandListAppendWaitOnEvents(ZeCommandList, NumEventsInWaitList,
4086-
ZeEventWaitList));
4087-
4088-
// TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4089-
// left to doing copy (write back to the device).
4090-
//
4091-
// NOTE: Keep this in sync with the implementation of
4092-
// piEnqueueMemBufferMap/piEnqueueMemImageMap.
40934125
_pi_mem::Mapping MapInfo = {};
40944126
if (pi_result Res = MemObj->removeMapping(MappedPtr, MapInfo))
40954127
return Res;
40964128

4097-
ZE_CALL(zeCommandListAppendMemoryCopy(
4098-
ZeCommandList, pi_cast<char *>(MemObj->getZeHandle()) + MapInfo.Offset,
4099-
MappedPtr, MapInfo.Size, ZeEvent, 0, nullptr));
4100-
41014129
// NOTE: we still have to free the host memory allocated/returned by
41024130
// piEnqueueMemBufferMap, but can only do so after the above copy
41034131
// is completed. Instead of waiting for it here (blocking), we shall
41044132
// do so in piEventRelease called for the pi_event tracking the unmap.
4133+
// In the case of an integrated device, the map operation does not allocate
4134+
// any memory, so there is nothing to free. This is indicated by a nullptr.
41054135
if (Event)
4106-
(*Event)->CommandData = MemObj->MapHostPtr ? nullptr : MappedPtr;
4136+
(*Event)->CommandData =
4137+
(DeviceIsIntegrated ? nullptr
4138+
: (MemObj->MapHostPtr ? nullptr : MappedPtr));
4139+
4140+
if (DeviceIsIntegrated) {
4141+
(*Event)->HostSyncforMap = true;
4142+
for (uint32_t i = 0; i < NumEventsInWaitList; i++) {
4143+
zePrint("UnMap Added ZeWaitEvent = %lx\n",
4144+
pi_cast<std::uintptr_t>(EventWaitList[i]->ZeEvent));
4145+
(*Event)->waitEvents.push_back(EventWaitList[i]->ZeEvent);
4146+
}
4147+
(*Event)->SrcBuffer = MappedPtr;
4148+
(*Event)->DstBuffer =
4149+
pi_cast<char *>(MemObj->getZeHandle()) + MapInfo.Offset;
4150+
(*Event)->RetMapSize = MapInfo.Size;
4151+
(*Event)->CopyPending = true;
4152+
} else {
41074153

4108-
// Execute command list asynchronously, as the event will be used
4109-
// to track down its completion.
4110-
if (auto Res = Queue->executeCommandList(ZeCommandList, ZeFence))
4111-
return Res;
4154+
if (auto Res = Queue->Device->getAvailableCommandList(Queue, &ZeCommandList,
4155+
&ZeFence))
4156+
return Res;
41124157

4113-
_pi_event::deleteZeEventList(ZeEventWaitList);
4158+
ze_event_handle_t *ZeEventWaitList =
4159+
_pi_event::createZeEventList(NumEventsInWaitList, EventWaitList);
4160+
4161+
ZE_CALL(zeCommandListAppendWaitOnEvents(ZeCommandList, NumEventsInWaitList,
4162+
ZeEventWaitList));
4163+
4164+
// TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4165+
// left to doing copy (write back to the device).
4166+
// See https://gitlab.devtools.intel.com/one-api/level_zero/issues/293. //
4167+
// INTEL
4168+
//
4169+
// NOTE: Keep this in sync with the implementation of
4170+
// piEnqueueMemBufferMap/piEnqueueMemImageMap.
4171+
4172+
ZE_CALL(zeCommandListAppendMemoryCopy(
4173+
ZeCommandList, pi_cast<char *>(MemObj->getZeHandle()) + MapInfo.Offset,
4174+
MappedPtr, MapInfo.Size, ZeEvent, 0, nullptr));
4175+
4176+
// Execute command list asynchronously, as the event will be used
4177+
// to track down its completion.
4178+
if (auto Res = Queue->executeCommandList(ZeCommandList, ZeFence))
4179+
return Res;
4180+
4181+
_pi_event::deleteZeEventList(ZeEventWaitList);
4182+
}
41144183

41154184
return PI_SUCCESS;
41164185
}

sycl/plugins/level_zero/pi_level_zero.hpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -451,6 +451,14 @@ struct _pi_event : _pi_object {
451451
// Level Zero event pool handle.
452452
ze_event_pool_handle_t ZeEventPool;
453453

454+
// The following are used by MemBufferMap/UnMap on integrated devices
455+
bool HostSyncforMap = false;
456+
std::vector<ze_event_handle_t> waitEvents;
457+
void *DstBuffer = nullptr;
458+
void *SrcBuffer = nullptr;
459+
size_t RetMapSize = 0;
460+
bool CopyPending = false;
461+
454462
// Level Zero command list where the command signaling this event was appended
455463
// to. This is currently used to remember/destroy the command list after all
456464
// commands in it are completed, i.e. this event signaled.

0 commit comments

Comments
 (0)