@@ -1862,34 +1862,50 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
1862
1862
assert (RetMem);
1863
1863
1864
1864
void *Ptr;
1865
+ ze_device_handle_t ZeDevice = Context->Devices [0 ]->ZeDevice ;
1866
+
1867
+ // We treat integrated devices (physical memory shared with the CPU)
1868
+ // differently from discrete devices (those with distinct memories).
1869
+ // For integrated devices, allocating the buffer in host shared memory
1870
+ // enables automatic access from the device, and makes copying
1871
+ // unnecessary in the map/unmap operations. This improves performance.
1872
+ bool DeviceIsIntegrated = Context->Devices .size () == 1 &&
1873
+ Context->Devices [0 ]->ZeDeviceProperties .flags &
1874
+ ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
1875
+
1876
+ if (DeviceIsIntegrated) {
1877
+ ze_host_mem_alloc_desc_t ZeDesc = {};
1878
+ ZeDesc.flags = 0 ;
1865
1879
1866
- ze_device_mem_alloc_desc_t ZeDeviceMemDesc = {};
1867
- ZeDeviceMemDesc.flags = 0 ;
1868
- ZeDeviceMemDesc.ordinal = 0 ;
1880
+ ZE_CALL (zeMemAllocHost (Context->ZeContext , &ZeDesc, Size, 4096 , &Ptr));
1869
1881
1870
- if (Context->Devices .size () == 1 ) {
1871
- ZE_CALL (zeMemAllocDevice (Context->ZeContext , &ZeDeviceMemDesc, Size,
1872
- 1 , // TODO: alignment
1873
- Context->Devices [0 ]->ZeDevice , &Ptr));
1874
1882
} else {
1875
- ze_host_mem_alloc_desc_t ZeHostMemDesc = {};
1876
- ZeHostMemDesc.flags = 0 ;
1877
- ZE_CALL (zeMemAllocShared (Context->ZeContext , &ZeDeviceMemDesc,
1878
- &ZeHostMemDesc, Size,
1879
- 1 , // TODO: alignment
1880
- nullptr , // not bound to any device
1883
+ ze_device_mem_alloc_desc_t ZeDesc = {};
1884
+ ZeDesc.flags = 0 ;
1885
+ ZeDesc.ordinal = 0 ;
1886
+
1887
+ ZE_CALL (zeMemAllocDevice (Context->ZeContext , &ZeDesc, Size, 4096 , ZeDevice,
1881
1888
&Ptr));
1882
1889
}
1890
+ if (HostPtr) {
1891
+ if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
1892
+ (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0 ) {
1893
+ // Initialize the buffer with user data
1894
+ if (DeviceIsIntegrated) {
1895
+ // Do a host to host copy
1896
+ memcpy (Ptr, HostPtr, Size);
1897
+ } else {
1883
1898
1884
- if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
1885
- (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0 ) {
1886
- // Initialize the buffer synchronously with immediate offload
1887
- ZE_CALL (zeCommandListAppendMemoryCopy (Context->ZeCommandListInit , Ptr,
1888
- HostPtr, Size, nullptr , 0 , nullptr ));
1889
- } else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
1890
- // Nothing more to do.
1891
- } else {
1892
- die (" piMemBufferCreate: not implemented" );
1899
+ // Initialize the buffer synchronously with immediate offload
1900
+ ZE_CALL (zeCommandListAppendMemoryCopy (Context->ZeCommandListInit , Ptr,
1901
+ HostPtr, Size, nullptr , 0 ,
1902
+ nullptr ));
1903
+ }
1904
+ } else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
1905
+ // Nothing more to do.
1906
+ } else {
1907
+ die (" piMemBufferCreate: not implemented" );
1908
+ }
1893
1909
}
1894
1910
1895
1911
auto HostPtrOrNull =
@@ -3305,7 +3321,24 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
3305
3321
for (uint32_t I = 0 ; I < NumEvents; I++) {
3306
3322
ze_event_handle_t ZeEvent = EventList[I]->ZeEvent ;
3307
3323
zePrint (" ZeEvent = %lx\n " , pi_cast<std::uintptr_t >(ZeEvent));
3308
- ZE_CALL (zeEventHostSynchronize (ZeEvent, UINT32_MAX));
3324
+
3325
+ // If event comes from a Map/UnMap operation in integrated device, then do
3326
+ // sync, memcpy, and signaling on the host
3327
+ if (EventList[I]->HostSyncforMap ) {
3328
+ for (auto ZeWaitEvent : EventList[I]->waitEvents ) {
3329
+ zePrint (" ZeWaitEvent = %lx\n " , pi_cast<std::uintptr_t >(ZeWaitEvent));
3330
+ if (ZeWaitEvent)
3331
+ ZE_CALL (zeEventHostSynchronize (ZeWaitEvent, UINT32_MAX));
3332
+ }
3333
+ if (EventList[I]->CopyPending ) {
3334
+ memcpy (EventList[I]->DstBuffer , EventList[I]->SrcBuffer ,
3335
+ EventList[I]->RetMapSize );
3336
+ EventList[I]->CopyPending = false ;
3337
+ }
3338
+ ZE_CALL (zeEventHostSignal (ZeEvent));
3339
+ } else {
3340
+ ZE_CALL (zeEventHostSynchronize (ZeEvent, UINT32_MAX));
3341
+ }
3309
3342
3310
3343
// NOTE: we are destroying associated command lists here to free
3311
3344
// resources sooner in case RT is not calling piEventRelease soon enough.
@@ -3983,14 +4016,14 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap,
3983
4016
// Lock automatically releases when this goes out of scope.
3984
4017
std::lock_guard<std::mutex> lock (Queue->PiQueueMutex );
3985
4018
3986
- // Get a new command list to be used on this call
4019
+ bool DeviceIsIntegrated = Queue->Device ->ZeDeviceProperties .flags &
4020
+ ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
4021
+
4022
+ // Integrated devices don't need a command list; for discrete devices one is acquired later.
3987
4023
ze_command_list_handle_t ZeCommandList = nullptr ;
3988
4024
ze_fence_handle_t ZeFence = nullptr ;
3989
- if (auto Res = Queue->Device ->getAvailableCommandList (Queue, &ZeCommandList,
3990
- &ZeFence))
3991
- return Res;
3992
-
3993
4025
ze_event_handle_t ZeEvent = nullptr ;
4026
+
3994
4027
if (Event) {
3995
4028
auto Res = piEventCreate (Queue->Context , Event);
3996
4029
if (Res != PI_SUCCESS)
@@ -4003,38 +4036,47 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap,
4003
4036
ZeEvent = (*Event)->ZeEvent ;
4004
4037
}
4005
4038
4039
+ if (DeviceIsIntegrated) {
4040
+ (*Event)->HostSyncforMap = true ;
4041
+ for (uint32_t i = 0 ; i < NumEventsInWaitList; i++) {
4042
+ zePrint (" Map added ZeWaitEvent = %lx\n " ,
4043
+ pi_cast<std::uintptr_t >(EventWaitList[i]->ZeEvent ));
4044
+ (*Event)->waitEvents .push_back (EventWaitList[i]->ZeEvent );
4045
+ }
4046
+ if (Buffer->MapHostPtr ) {
4047
+ *RetMap = Buffer->MapHostPtr + Offset;
4048
+ (*Event)->SrcBuffer = pi_cast<char *>(Buffer->getZeHandle ()) + Offset;
4049
+ (*Event)->DstBuffer = *RetMap;
4050
+ (*Event)->RetMapSize = Size;
4051
+ (*Event)->CopyPending = true ;
4052
+ } else {
4053
+ *RetMap = pi_cast<char *>(Buffer->getZeHandle ()) + Offset;
4054
+ }
4055
+
4056
+ return Buffer->addMapping (*RetMap, Offset, Size);
4057
+ }
4058
+
4059
+ // For discrete devices we need a command list
4060
+ if (auto Res = Queue->Device ->getAvailableCommandList (Queue, &ZeCommandList,
4061
+ &ZeFence))
4062
+ return Res;
4063
+
4006
4064
ze_event_handle_t *ZeEventWaitList =
4007
4065
_pi_event::createZeEventList (NumEventsInWaitList, EventWaitList);
4008
4066
4009
- ZE_CALL (zeCommandListAppendWaitOnEvents (ZeCommandList, NumEventsInWaitList,
4010
- ZeEventWaitList));
4011
-
4012
- // TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4013
- // left to doing new memory allocation and a copy (read).
4014
- //
4015
- // TODO: check if the input buffer is already allocated in shared
4016
- // memory and thus is accessible from the host as is. Can we get SYCL RT
4017
- // to predict/allocate in shared memory from the beginning?
4018
4067
if (Buffer->MapHostPtr ) {
4019
- // NOTE: borrowing below semantics from OpenCL as SYCL RT relies on it.
4020
- // It is also better for performance.
4021
- //
4022
- // "If the buffer object is created with CL_MEM_USE_HOST_PTR set in
4023
- // mem_flags, the following will be true:
4024
- // - The host_ptr specified in clCreateBuffer is guaranteed to contain the
4025
- // latest bits in the region being mapped when the clEnqueueMapBuffer
4026
- // command has completed.
4027
- // - The pointer value returned by clEnqueueMapBuffer will be derived from
4028
- // the host_ptr specified when the buffer object is created."
4029
4068
*RetMap = Buffer->MapHostPtr + Offset;
4030
4069
} else {
4031
4070
ze_host_mem_alloc_desc_t ZeDesc = {};
4032
4071
ZeDesc.flags = 0 ;
4033
- ZE_CALL ( zeMemAllocHost (Queue-> Context -> ZeContext , &ZeDesc, Size,
4034
- 1 , // TODO: alignment
4035
- RetMap));
4072
+
4073
+ ZE_CALL (
4074
+ zeMemAllocHost (Queue-> Context -> ZeContext , &ZeDesc, Size, 4096 , RetMap));
4036
4075
}
4037
4076
4077
+ ZE_CALL (zeCommandListAppendWaitOnEvents (ZeCommandList, NumEventsInWaitList,
4078
+ ZeEventWaitList));
4079
+
4038
4080
ZE_CALL (zeCommandListAppendMemoryCopy (
4039
4081
ZeCommandList, *RetMap, pi_cast<char *>(Buffer->getZeHandle ()) + Offset,
4040
4082
Size, ZeEvent, 0 , nullptr ));
@@ -4055,12 +4097,13 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr,
4055
4097
// Lock automatically releases when this goes out of scope.
4056
4098
std::lock_guard<std::mutex> lock (Queue->PiQueueMutex );
4057
4099
4058
- // Get a new command list to be used on this call
4100
+ bool DeviceIsIntegrated = Queue->Device ->ZeDeviceProperties .flags &
4101
+ ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
4102
+
4103
+ // Integrated devices don't need a command list.
4104
+ // For discrete devices a command list is acquired later.
4059
4105
ze_command_list_handle_t ZeCommandList = nullptr ;
4060
4106
ze_fence_handle_t ZeFence = nullptr ;
4061
- if (auto Res = Queue->Device ->getAvailableCommandList (Queue, &ZeCommandList,
4062
- &ZeFence))
4063
- return Res;
4064
4107
4065
4108
// TODO: handle the case when user does not care to follow the event
4066
4109
// of unmap completion.
@@ -4079,38 +4122,64 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr,
4079
4122
ZeEvent = (*Event)->ZeEvent ;
4080
4123
}
4081
4124
4082
- ze_event_handle_t *ZeEventWaitList =
4083
- _pi_event::createZeEventList (NumEventsInWaitList, EventWaitList);
4084
-
4085
- ZE_CALL (zeCommandListAppendWaitOnEvents (ZeCommandList, NumEventsInWaitList,
4086
- ZeEventWaitList));
4087
-
4088
- // TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4089
- // left to doing copy (write back to the device).
4090
- //
4091
- // NOTE: Keep this in sync with the implementation of
4092
- // piEnqueueMemBufferMap/piEnqueueMemImageMap.
4093
4125
_pi_mem::Mapping MapInfo = {};
4094
4126
if (pi_result Res = MemObj->removeMapping (MappedPtr, MapInfo))
4095
4127
return Res;
4096
4128
4097
- ZE_CALL (zeCommandListAppendMemoryCopy (
4098
- ZeCommandList, pi_cast<char *>(MemObj->getZeHandle ()) + MapInfo.Offset ,
4099
- MappedPtr, MapInfo.Size , ZeEvent, 0 , nullptr ));
4100
-
4101
4129
// NOTE: we still have to free the host memory allocated/returned by
4102
4130
// piEnqueueMemBufferMap, but can only do so after the above copy
4103
4131
// is completed. Instead of waiting for It here (blocking), we shall
4104
4132
// do so in piEventRelease called for the pi_event tracking the unmap.
4133
+ // In the case of an integrated device, the map operation does not allocate
4134
+ // any memory, so there is nothing to free. This is indicated by a nullptr.
4105
4135
if (Event)
4106
- (*Event)->CommandData = MemObj->MapHostPtr ? nullptr : MappedPtr;
4136
+ (*Event)->CommandData =
4137
+ (DeviceIsIntegrated ? nullptr
4138
+ : (MemObj->MapHostPtr ? nullptr : MappedPtr));
4139
+
4140
+ if (DeviceIsIntegrated) {
4141
+ (*Event)->HostSyncforMap = true ;
4142
+ for (uint32_t i = 0 ; i < NumEventsInWaitList; i++) {
4143
+ zePrint (" UnMap Added ZeWaitEvent = %lx\n " ,
4144
+ pi_cast<std::uintptr_t >(EventWaitList[i]->ZeEvent ));
4145
+ (*Event)->waitEvents .push_back (EventWaitList[i]->ZeEvent );
4146
+ }
4147
+ (*Event)->SrcBuffer = MappedPtr;
4148
+ (*Event)->DstBuffer =
4149
+ pi_cast<char *>(MemObj->getZeHandle ()) + MapInfo.Offset ;
4150
+ (*Event)->RetMapSize = MapInfo.Size ;
4151
+ (*Event)->CopyPending = true ;
4152
+ } else {
4107
4153
4108
- // Execute command list asynchronously, as the event will be used
4109
- // to track down its completion.
4110
- if (auto Res = Queue->executeCommandList (ZeCommandList, ZeFence))
4111
- return Res;
4154
+ if (auto Res = Queue->Device ->getAvailableCommandList (Queue, &ZeCommandList,
4155
+ &ZeFence))
4156
+ return Res;
4112
4157
4113
- _pi_event::deleteZeEventList (ZeEventWaitList);
4158
+ ze_event_handle_t *ZeEventWaitList =
4159
+ _pi_event::createZeEventList (NumEventsInWaitList, EventWaitList);
4160
+
4161
+ ZE_CALL (zeCommandListAppendWaitOnEvents (ZeCommandList, NumEventsInWaitList,
4162
+ ZeEventWaitList));
4163
+
4164
+ // TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4165
+ // left to doing copy (write back to the device).
4166
+ // See https://gitlab.devtools.intel.com/one-api/level_zero/issues/293. //
4167
+ // INTEL
4168
+ //
4169
+ // NOTE: Keep this in sync with the implementation of
4170
+ // piEnqueueMemBufferMap/piEnqueueMemImageMap.
4171
+
4172
+ ZE_CALL (zeCommandListAppendMemoryCopy (
4173
+ ZeCommandList, pi_cast<char *>(MemObj->getZeHandle ()) + MapInfo.Offset ,
4174
+ MappedPtr, MapInfo.Size , ZeEvent, 0 , nullptr ));
4175
+
4176
+ // Execute command list asynchronously, as the event will be used
4177
+ // to track down its completion.
4178
+ if (auto Res = Queue->executeCommandList (ZeCommandList, ZeFence))
4179
+ return Res;
4180
+
4181
+ _pi_event::deleteZeEventList (ZeEventWaitList);
4182
+ }
4114
4183
4115
4184
return PI_SUCCESS;
4116
4185
}
0 commit comments