@@ -1862,34 +1862,50 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
1862
1862
assert (RetMem);
1863
1863
1864
1864
void *Ptr;
1865
+ ze_device_handle_t ZeDevice = Context->Devices [0 ]->ZeDevice ;
1866
+
1867
+ // We treat integrated devices (physical memory shared with the CPU)
1868
+ // differently from discrete devices (those with distinct memories).
1869
+ // For integrated devices, allocating the buffer in host shared memory
1870
+ // enables automatic access from the device, and makes copying
1871
+ // unnecessary in the map/unmap operations. This improves performance.
1872
+ bool DeviceIsIntegrated = Context->Devices .size () == 1 &&
1873
+ Context->Devices [0 ]->ZeDeviceProperties .flags &
1874
+ ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
1875
+
1876
+ if (DeviceIsIntegrated) {
1877
+ ze_host_mem_alloc_desc_t ZeDesc = {};
1878
+ ZeDesc.flags = 0 ;
1865
1879
1866
- ze_device_mem_alloc_desc_t ZeDeviceMemDesc = {};
1867
- ZeDeviceMemDesc.flags = 0 ;
1868
- ZeDeviceMemDesc.ordinal = 0 ;
1880
+ ZE_CALL (zeMemAllocHost (Context->ZeContext , &ZeDesc, Size, 4096 , &Ptr));
1869
1881
1870
- if (Context->Devices .size () == 1 ) {
1871
- ZE_CALL (zeMemAllocDevice (Context->ZeContext , &ZeDeviceMemDesc, Size,
1872
- 1 , // TODO: alignment
1873
- Context->Devices [0 ]->ZeDevice , &Ptr));
1874
1882
} else {
1875
- ze_host_mem_alloc_desc_t ZeHostMemDesc = {};
1876
- ZeHostMemDesc.flags = 0 ;
1877
- ZE_CALL (zeMemAllocShared (Context->ZeContext , &ZeDeviceMemDesc,
1878
- &ZeHostMemDesc, Size,
1879
- 1 , // TODO: alignment
1880
- nullptr , // not bound to any device
1883
+ ze_device_mem_alloc_desc_t ZeDesc = {};
1884
+ ZeDesc.flags = 0 ;
1885
+ ZeDesc.ordinal = 0 ;
1886
+
1887
+ ZE_CALL (zeMemAllocDevice (Context->ZeContext , &ZeDesc, Size, 4096 , ZeDevice,
1881
1888
&Ptr));
1882
1889
}
1890
+ if (HostPtr) {
1891
+ if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
1892
+ (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0 ) {
1893
+ // Initialize the buffer with user data
1894
+ if (DeviceIsIntegrated) {
1895
+ // Do a host to host copy
1896
+ memcpy (Ptr, HostPtr, Size);
1897
+ } else {
1883
1898
1884
- if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
1885
- (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0 ) {
1886
- // Initialize the buffer synchronously with immediate offload
1887
- ZE_CALL (zeCommandListAppendMemoryCopy (Context->ZeCommandListInit , Ptr,
1888
- HostPtr, Size, nullptr , 0 , nullptr ));
1889
- } else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
1890
- // Nothing more to do.
1891
- } else {
1892
- die (" piMemBufferCreate: not implemented" );
1899
+ // Initialize the buffer synchronously with immediate offload
1900
+ ZE_CALL (zeCommandListAppendMemoryCopy (Context->ZeCommandListInit , Ptr,
1901
+ HostPtr, Size, nullptr , 0 ,
1902
+ nullptr ));
1903
+ }
1904
+ } else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
1905
+ // Nothing more to do.
1906
+ } else {
1907
+ die (" piMemBufferCreate: not implemented" );
1908
+ }
1893
1909
}
1894
1910
1895
1911
auto HostPtrOrNull =
@@ -3305,7 +3321,26 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
3305
3321
for (uint32_t I = 0 ; I < NumEvents; I++) {
3306
3322
ze_event_handle_t ZeEvent = EventList[I]->ZeEvent ;
3307
3323
zePrint (" ZeEvent = %lx\n " , pi_cast<std::uintptr_t >(ZeEvent));
3308
- ZE_CALL (zeEventHostSynchronize (ZeEvent, UINT32_MAX));
3324
+
3325
+ // If the event comes from a Map/UnMap operation on an integrated device, then do
3326
+ // sync, memcpy, and signaling on the host
3327
+ if (EventList[I]->HostSyncforMap ) {
3328
+ #if 0
3329
+ for (auto ZeWaitEvent : EventList[I]->waitEvents) {
3330
+ zePrint("ZeWaitEvent = %lx\n", pi_cast<std::uintptr_t>(ZeWaitEvent));
3331
+ if (ZeWaitEvent)
3332
+ ZE_CALL(zeEventHostSynchronize(ZeWaitEvent, UINT32_MAX));
3333
+ }
3334
+ if (EventList[I]->CopyPending) {
3335
+ memcpy(EventList[I]->DstBuffer, EventList[I]->SrcBuffer,
3336
+ EventList[I]->RetMapSize);
3337
+ EventList[I]->CopyPending = false;
3338
+ }
3339
+ #endif
3340
+ ZE_CALL (zeEventHostSignal (ZeEvent));
3341
+ } else {
3342
+ ZE_CALL (zeEventHostSynchronize (ZeEvent, UINT32_MAX));
3343
+ }
3309
3344
3310
3345
// NOTE: we are destroying associated command lists here to free
3311
3346
// resources sooner in case RT is not calling piEventRelease soon enough.
@@ -3983,14 +4018,22 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap,
3983
4018
// Lock automatically releases when this goes out of scope.
3984
4019
std::lock_guard<std::mutex> lock (Queue->PiQueueMutex );
3985
4020
3986
- // Get a new command list to be used on this call
4021
+ // Query the buffer allocation to determine whether it is a host allocation
4022
+ ze_memory_allocation_properties_t ZeMemoryAllocationProperties = {};
4023
+ ze_device_handle_t ZeDeviceHandle;
4024
+
4025
+ ZE_CALL (
4026
+ zeMemGetAllocProperties (Queue->Context ->ZeContext , Buffer->getZeHandle (),
4027
+ &ZeMemoryAllocationProperties, &ZeDeviceHandle));
4028
+
4029
+ bool BufferUsesHostMem =
4030
+ (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_HOST);
4031
+
4032
+ // For integrated devices (host-memory buffers) we don't need a command list
3987
4033
ze_command_list_handle_t ZeCommandList = nullptr ;
3988
4034
ze_fence_handle_t ZeFence = nullptr ;
3989
- if (auto Res = Queue->Device ->getAvailableCommandList (Queue, &ZeCommandList,
3990
- &ZeFence))
3991
- return Res;
3992
-
3993
4035
ze_event_handle_t ZeEvent = nullptr ;
4036
+
3994
4037
if (Event) {
3995
4038
auto Res = piEventCreate (Queue->Context , Event);
3996
4039
if (Res != PI_SUCCESS)
@@ -4003,38 +4046,84 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap,
4003
4046
ZeEvent = (*Event)->ZeEvent ;
4004
4047
}
4005
4048
4049
+ // TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4050
+ // left to doing new memory allocation and a copy (read) on discrete devices.
4051
+ // On integrated devices we have allocated the buffer in host memory
4052
+ // so no actions are needed here except for synchronizing on incoming events
4053
+ // and doing a host-to-host copy if a host pointer had been supplied
4054
+ // during buffer creation.
4055
+ //
4056
+ // TODO: for discrete, check if the input buffer is already allocated
4057
+ // in shared memory and thus is accessible from the host as is.
4058
+ // Can we get SYCL RT to predict/allocate in shared memory
4059
+ // from the beginning?
4060
+ if (BufferUsesHostMem) {
4061
+ (*Event)->HostSyncforMap = true ;
4062
+ #if 0
4063
+ for (uint32_t i = 0; i < NumEventsInWaitList; i++) {
4064
+ zePrint("Map added ZeWaitEvent = %lx\n",
4065
+ pi_cast<std::uintptr_t>(EventWaitList[i]->ZeEvent));
4066
+ (*Event)->waitEvents.push_back(EventWaitList[i]->ZeEvent);
4067
+ }
4068
+ #else
4069
+ for (uint32_t i = 0 ; i < NumEventsInWaitList; i++) {
4070
+ auto Queue = EventWaitList[i]->Queue ;
4071
+ zePrint (" Got Q\n " );
4072
+ if (Queue->RefCount > 0 ) {
4073
+ zePrint (" Executing commandlist\n " );
4074
+ if (auto Res = Queue->executeOpenCommandList ())
4075
+ return Res;
4076
+ }
4077
+ }
4078
+ for (uint32_t i = 0 ; i < NumEventsInWaitList; i++) {
4079
+ zePrint (" Going to wait on ZeWaitEvent = %lx\n " ,
4080
+ pi_cast<std::uintptr_t >(EventWaitList[i]->ZeEvent ));
4081
+ auto ZeWaitEvent = EventWaitList[i]->ZeEvent ;
4082
+ if (ZeWaitEvent)
4083
+ ZE_CALL (zeEventHostSynchronize (ZeWaitEvent, UINT32_MAX));
4084
+ }
4085
+ #endif
4086
+ if (Buffer->MapHostPtr ) {
4087
+ *RetMap = Buffer->MapHostPtr + Offset;
4088
+ #if 0
4089
+ (*Event)->SrcBuffer = pi_cast<char*>(Buffer->getZeHandle()) + Offset;
4090
+ (*Event)->DstBuffer = *RetMap;
4091
+ (*Event)->RetMapSize = Size;
4092
+ (*Event)->CopyPending = true;
4093
+ #else
4094
+ zePrint (" Doing memcpy %p %p %zu\n " , *RetMap,
4095
+ pi_cast<char *>(Buffer->getZeHandle ()) + Offset, Size);
4096
+ memcpy (*RetMap, pi_cast<char *>(Buffer->getZeHandle ()) + Offset, Size);
4097
+ zePrint (" DONE\n " );
4098
+ #endif
4099
+ } else {
4100
+ *RetMap = pi_cast<char *>(Buffer->getZeHandle ()) + Offset;
4101
+ }
4102
+
4103
+ return Buffer->addMapping (*RetMap, Offset, Size);
4104
+ }
4105
+
4106
+ // For discrete devices we need a command list
4107
+ if (auto Res = Queue->Device ->getAvailableCommandList (Queue, &ZeCommandList,
4108
+ &ZeFence))
4109
+ return Res;
4110
+
4006
4111
ze_event_handle_t *ZeEventWaitList =
4007
4112
_pi_event::createZeEventList (NumEventsInWaitList, EventWaitList);
4008
4113
4009
- ZE_CALL (zeCommandListAppendWaitOnEvents (ZeCommandList, NumEventsInWaitList,
4010
- ZeEventWaitList));
4011
-
4012
- // TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4013
- // left to doing new memory allocation and a copy (read).
4014
- //
4015
- // TODO: check if the input buffer is already allocated in shared
4016
- // memory and thus is accessible from the host as is. Can we get SYCL RT
4017
- // to predict/allocate in shared memory from the beginning?
4018
4114
if (Buffer->MapHostPtr ) {
4019
- // NOTE: borrowing below semantics from OpenCL as SYCL RT relies on it.
4020
- // It is also better for performance.
4021
- //
4022
- // "If the buffer object is created with CL_MEM_USE_HOST_PTR set in
4023
- // mem_flags, the following will be true:
4024
- // - The host_ptr specified in clCreateBuffer is guaranteed to contain the
4025
- // latest bits in the region being mapped when the clEnqueueMapBuffer
4026
- // command has completed.
4027
- // - The pointer value returned by clEnqueueMapBuffer will be derived from
4028
- // the host_ptr specified when the buffer object is created."
4029
4115
*RetMap = Buffer->MapHostPtr + Offset;
4030
4116
} else {
4031
4117
ze_host_mem_alloc_desc_t ZeDesc = {};
4032
4118
ZeDesc.flags = 0 ;
4033
- ZE_CALL ( zeMemAllocHost (Queue-> Context -> ZeContext , &ZeDesc, Size,
4034
- 1 , // TODO: alignment
4035
- RetMap));
4119
+
4120
+ ZE_CALL (
4121
+ zeMemAllocHost (Queue-> Context -> ZeContext , &ZeDesc, Size, 4096 , RetMap));
4036
4122
}
4037
4123
4124
+ ZE_CALL (zeCommandListAppendWaitOnEvents (ZeCommandList, NumEventsInWaitList,
4125
+ ZeEventWaitList));
4126
+
4038
4127
ZE_CALL (zeCommandListAppendMemoryCopy (
4039
4128
ZeCommandList, *RetMap, pi_cast<char *>(Buffer->getZeHandle ()) + Offset,
4040
4129
Size, ZeEvent, 0 , nullptr ));
@@ -4055,12 +4144,21 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr,
4055
4144
// Lock automatically releases when this goes out of scope.
4056
4145
std::lock_guard<std::mutex> lock (Queue->PiQueueMutex );
4057
4146
4058
- // Get a new command list to be used on this call
4147
+ // Query the buffer allocation to determine whether it is a host allocation
4148
+ ze_memory_allocation_properties_t ZeMemoryAllocationProperties = {};
4149
+ ze_device_handle_t ZeDeviceHandle;
4150
+
4151
+ ZE_CALL (
4152
+ zeMemGetAllocProperties (Queue->Context ->ZeContext , MemObj->getZeHandle (),
4153
+ &ZeMemoryAllocationProperties, &ZeDeviceHandle));
4154
+
4155
+ bool BufferUsesHostMem =
4156
+ (ZeMemoryAllocationProperties.type == ZE_MEMORY_TYPE_HOST);
4157
+
4158
+ // Integrated devices don't need a command list.
4159
+ // If discrete we will get a commandlist later.
4059
4160
ze_command_list_handle_t ZeCommandList = nullptr ;
4060
4161
ze_fence_handle_t ZeFence = nullptr ;
4061
- if (auto Res = Queue->Device ->getAvailableCommandList (Queue, &ZeCommandList,
4062
- &ZeFence))
4063
- return Res;
4064
4162
4065
4163
// TODO: handle the case when user does not care to follow the event
4066
4164
// of unmap completion.
@@ -4079,38 +4177,74 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr,
4079
4177
ZeEvent = (*Event)->ZeEvent ;
4080
4178
}
4081
4179
4082
- ze_event_handle_t *ZeEventWaitList =
4083
- _pi_event::createZeEventList (NumEventsInWaitList, EventWaitList);
4084
-
4085
- ZE_CALL (zeCommandListAppendWaitOnEvents (ZeCommandList, NumEventsInWaitList,
4086
- ZeEventWaitList));
4087
-
4088
- // TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4089
- // left to doing copy (write back to the device).
4090
- //
4091
- // NOTE: Keep this in sync with the implementation of
4092
- // piEnqueueMemBufferMap/piEnqueueMemImageMap.
4093
4180
_pi_mem::Mapping MapInfo = {};
4094
4181
if (pi_result Res = MemObj->removeMapping (MappedPtr, MapInfo))
4095
4182
return Res;
4096
4183
4097
- ZE_CALL (zeCommandListAppendMemoryCopy (
4098
- ZeCommandList, pi_cast<char *>(MemObj->getZeHandle ()) + MapInfo.Offset ,
4099
- MappedPtr, MapInfo.Size , ZeEvent, 0 , nullptr ));
4100
-
4101
4184
// NOTE: we still have to free the host memory allocated/returned by
4102
4185
// piEnqueueMemBufferMap, but can only do so after the above copy
4103
4186
// is completed. Instead of waiting for It here (blocking), we shall
4104
4187
// do so in piEventRelease called for the pi_event tracking the unmap.
4188
+ // In the case of an integrated device, the map operation does not allocate
4189
+ // any memory, so there is nothing to free. This is indicated by a nullptr.
4105
4190
if (Event)
4106
- (*Event)->CommandData = MemObj->MapHostPtr ? nullptr : MappedPtr;
4191
+ (*Event)->CommandData =
4192
+ (BufferUsesHostMem ? nullptr
4193
+ : (MemObj->MapHostPtr ? nullptr : MappedPtr));
4194
+
4195
+ if (BufferUsesHostMem) {
4196
+ (*Event)->HostSyncforMap = true ;
4197
+ #if 0
4198
+ for (uint32_t i = 0; i < NumEventsInWaitList; i++) {
4199
+ zePrint("UnMap Added ZeWaitEvent = %lx\n",
4200
+ pi_cast<std::uintptr_t>(EventWaitList[i]->ZeEvent));
4201
+ (*Event)->waitEvents.push_back(EventWaitList[i]->ZeEvent);
4202
+ }
4203
+ (*Event)->SrcBuffer = MappedPtr;
4204
+ (*Event)->DstBuffer =
4205
+ pi_cast<char*>(MemObj->getZeHandle()) + MapInfo.Offset;
4206
+ (*Event)->RetMapSize = MapInfo.Size;
4207
+ (*Event)->CopyPending = true;
4208
+ #else
4209
+ for (uint32_t i = 0 ; i < NumEventsInWaitList; i++) {
4210
+ auto ZeWaitEvent = EventWaitList[i]->ZeEvent ;
4211
+ if (ZeWaitEvent)
4212
+ ZE_CALL (zeEventHostSynchronize (ZeWaitEvent, UINT32_MAX));
4213
+ }
4214
+ memcpy (pi_cast<char *>(MemObj->getZeHandle ()) + MapInfo.Offset , MappedPtr,
4215
+ MapInfo.Size );
4216
+ #endif
4217
+ } else {
4107
4218
4108
- // Execute command list asynchronously, as the event will be used
4109
- // to track down its completion.
4110
- if (auto Res = Queue->executeCommandList (ZeCommandList, ZeFence))
4111
- return Res;
4219
+ if (auto Res = Queue->Device ->getAvailableCommandList (Queue, &ZeCommandList,
4220
+ &ZeFence))
4221
+ return Res;
4112
4222
4113
- _pi_event::deleteZeEventList (ZeEventWaitList);
4223
+ ze_event_handle_t *ZeEventWaitList =
4224
+ _pi_event::createZeEventList (NumEventsInWaitList, EventWaitList);
4225
+
4226
+ ZE_CALL (zeCommandListAppendWaitOnEvents (ZeCommandList, NumEventsInWaitList,
4227
+ ZeEventWaitList));
4228
+
4229
+ // TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4230
+ // left to doing copy (write back to the device).
4231
+ // See https://gitlab.devtools.intel.com/one-api/level_zero/issues/293. //
4232
+ // INTEL
4233
+ //
4234
+ // NOTE: Keep this in sync with the implementation of
4235
+ // piEnqueueMemBufferMap/piEnqueueMemImageMap.
4236
+
4237
+ ZE_CALL (zeCommandListAppendMemoryCopy (
4238
+ ZeCommandList, pi_cast<char *>(MemObj->getZeHandle ()) + MapInfo.Offset ,
4239
+ MappedPtr, MapInfo.Size , ZeEvent, 0 , nullptr ));
4240
+
4241
+ // Execute command list asynchronously, as the event will be used
4242
+ // to track down its completion.
4243
+ if (auto Res = Queue->executeCommandList (ZeCommandList, ZeFence))
4244
+ return Res;
4245
+
4246
+ _pi_event::deleteZeEventList (ZeEventWaitList);
4247
+ }
4114
4248
4115
4249
return PI_SUCCESS;
4116
4250
}
0 commit comments