@@ -1916,42 +1916,59 @@ pi_result piMemBufferCreate(pi_context Context, pi_mem_flags Flags, size_t Size,
1916
1916
assert (RetMem);
1917
1917
1918
1918
void *Ptr;
1919
+ ze_device_handle_t ZeDevice = Context->Devices [0 ]->ZeDevice ;
1920
+
1921
+ // We treat integrated devices (physical memory shared with the CPU)
1922
+ // differently from discrete devices (those with distinct memories).
1923
+ // For integrated devices, allocating the buffer in host shared memory
1924
+ // enables automatic access from the device, and makes copying
1925
+ // unnecessary in the map/unmap operations. This improves performance.
1926
+ bool DeviceIsIntegrated = Context->Devices .size () == 1 &&
1927
+ Context->Devices [0 ]->ZeDeviceProperties .flags &
1928
+ ZE_DEVICE_PROPERTY_FLAG_INTEGRATED;
1929
+
1930
+ if (DeviceIsIntegrated) {
1931
+ ze_host_mem_alloc_desc_t ZeDesc = {};
1932
+ ZeDesc.flags = 0 ;
1919
1933
1920
- ze_device_mem_alloc_desc_t ZeDeviceMemDesc = {};
1921
- ZeDeviceMemDesc.flags = 0 ;
1922
- ZeDeviceMemDesc.ordinal = 0 ;
1934
+ ZE_CALL (zeMemAllocHost (Context->ZeContext , &ZeDesc, Size, 1 , &Ptr));
1923
1935
1924
- if (Context->Devices .size () == 1 ) {
1925
- ZE_CALL (zeMemAllocDevice (Context->ZeContext , &ZeDeviceMemDesc, Size,
1926
- 1 , // TODO: alignment
1927
- Context->Devices [0 ]->ZeDevice , &Ptr));
1928
- } else {
1929
- ze_host_mem_alloc_desc_t ZeHostMemDesc = {};
1930
- ZeHostMemDesc.flags = 0 ;
1931
- ZE_CALL (zeMemAllocShared (Context->ZeContext , &ZeDeviceMemDesc,
1932
- &ZeHostMemDesc, Size,
1933
- 1 , // TODO: alignment
1934
- nullptr , // not bound to any device
1935
- &Ptr));
1936
- }
1937
-
1938
- if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
1939
- (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0 ) {
1940
- // Initialize the buffer synchronously with immediate offload
1941
- ZE_CALL (zeCommandListAppendMemoryCopy (Context->ZeCommandListInit , Ptr,
1942
- HostPtr, Size, nullptr , 0 , nullptr ));
1943
- } else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
1944
- // Nothing more to do.
1945
1936
} else {
1946
- die (" piMemBufferCreate: not implemented" );
1937
+ ze_device_mem_alloc_desc_t ZeDesc = {};
1938
+ ZeDesc.flags = 0 ;
1939
+ ZeDesc.ordinal = 0 ;
1940
+
1941
+ ZE_CALL (
1942
+ zeMemAllocDevice (Context->ZeContext , &ZeDesc, Size, 1 , ZeDevice, &Ptr));
1943
+ }
1944
+ if (HostPtr) {
1945
+ if ((Flags & PI_MEM_FLAGS_HOST_PTR_USE) != 0 ||
1946
+ (Flags & PI_MEM_FLAGS_HOST_PTR_COPY) != 0 ) {
1947
+ // Initialize the buffer with user data
1948
+ if (DeviceIsIntegrated) {
1949
+ // Do a host to host copy
1950
+ memcpy (Ptr, HostPtr, Size);
1951
+ } else {
1952
+
1953
+ // Initialize the buffer synchronously with immediate offload
1954
+ ZE_CALL (zeCommandListAppendMemoryCopy (Context->ZeCommandListInit , Ptr,
1955
+ HostPtr, Size, nullptr , 0 ,
1956
+ nullptr ));
1957
+ }
1958
+ } else if (Flags == 0 || (Flags == PI_MEM_FLAGS_ACCESS_RW)) {
1959
+ // Nothing more to do.
1960
+ } else {
1961
+ die (" piMemBufferCreate: not implemented" );
1962
+ }
1947
1963
}
1948
1964
1949
1965
auto HostPtrOrNull =
1950
1966
(Flags & PI_MEM_FLAGS_HOST_PTR_USE) ? pi_cast<char *>(HostPtr) : nullptr ;
1951
1967
try {
1952
1968
*RetMem = new _pi_buffer (
1953
1969
Context, pi_cast<char *>(Ptr) /* Level Zero Memory Handle */ ,
1954
- HostPtrOrNull);
1970
+ HostPtrOrNull, nullptr , 0 , 0 ,
1971
+ DeviceIsIntegrated /* Flag indicating allocation in host memory */ );
1955
1972
} catch (const std::bad_alloc &) {
1956
1973
return PI_OUT_OF_HOST_MEMORY;
1957
1974
} catch (...) {
@@ -4031,17 +4048,11 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap,
4031
4048
assert (Buffer);
4032
4049
assert (Queue);
4033
4050
4034
- // Lock automatically releases when this goes out of scope.
4035
- std::lock_guard<std::mutex> lock (Queue->PiQueueMutex );
4036
-
4037
- // Get a new command list to be used on this call
4051
+ // Integrated devices don't need a command list; discrete devices get one later.
4038
4052
ze_command_list_handle_t ZeCommandList = nullptr ;
4039
4053
ze_fence_handle_t ZeFence = nullptr ;
4040
- if (auto Res = Queue->Device ->getAvailableCommandList (Queue, &ZeCommandList,
4041
- &ZeFence))
4042
- return Res;
4043
-
4044
4054
ze_event_handle_t ZeEvent = nullptr ;
4055
+
4045
4056
if (Event) {
4046
4057
auto Res = piEventCreate (Queue->Context , Event);
4047
4058
if (Res != PI_SUCCESS)
@@ -4054,38 +4065,64 @@ piEnqueueMemBufferMap(pi_queue Queue, pi_mem Buffer, pi_bool BlockingMap,
4054
4065
ZeEvent = (*Event)->ZeEvent ;
4055
4066
}
4056
4067
4068
+ // TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4069
+ // left to doing new memory allocation and a copy (read) on discrete devices.
4070
+ // On integrated devices we have allocated the buffer in host memory
4071
+ // so no actions are needed here except for synchronizing on incoming events
4072
+ // and doing a host-to-host copy if a host pointer had been supplied
4073
+ // during buffer creation.
4074
+ //
4075
+ // TODO: for discrete, check if the input buffer is already allocated
4076
+ // in shared memory and thus is accessible from the host as is.
4077
+ // Can we get SYCL RT to predict/allocate in shared memory
4078
+ // from the beginning?
4079
+ //
4080
+ // On integrated devices the buffer has been allocated in host memory.
4081
+ if (Buffer->OnHost ) {
4082
+ // Wait on incoming events before doing the copy
4083
+ piEventsWait (NumEventsInWaitList, EventWaitList);
4084
+ if (Buffer->MapHostPtr ) {
4085
+ *RetMap = Buffer->MapHostPtr + Offset;
4086
+ memcpy (*RetMap, pi_cast<char *>(Buffer->getZeHandle ()) + Offset, Size);
4087
+ } else {
4088
+ *RetMap = pi_cast<char *>(Buffer->getZeHandle ()) + Offset;
4089
+ }
4090
+
4091
+ // Signal this event
4092
+ ZE_CALL (zeEventHostSignal (ZeEvent));
4093
+
4094
+ return Buffer->addMapping (*RetMap, Offset, Size);
4095
+ }
4096
+
4097
+ // Lock automatically releases when this goes out of scope.
4098
+ std::lock_guard<std::mutex> lock (Queue->PiQueueMutex );
4099
+
4100
+ // For discrete devices we need a command list
4101
+ if (auto Res = Queue->Device ->getAvailableCommandList (Queue, &ZeCommandList,
4102
+ &ZeFence))
4103
+ return Res;
4104
+
4105
+ // Set the commandlist in the event
4106
+ if (Event) {
4107
+ (*Event)->ZeCommandList = ZeCommandList;
4108
+ }
4109
+
4057
4110
ze_event_handle_t *ZeEventWaitList =
4058
4111
_pi_event::createZeEventList (NumEventsInWaitList, EventWaitList);
4059
4112
4060
- ZE_CALL (zeCommandListAppendWaitOnEvents (ZeCommandList, NumEventsInWaitList,
4061
- ZeEventWaitList));
4062
-
4063
- // TODO: Level Zero is missing the memory "mapping" capabilities, so we are
4064
- // left to doing new memory allocation and a copy (read).
4065
- //
4066
- // TODO: check if the input buffer is already allocated in shared
4067
- // memory and thus is accessible from the host as is. Can we get SYCL RT
4068
- // to predict/allocate in shared memory from the beginning?
4069
4113
if (Buffer->MapHostPtr ) {
4070
- // NOTE: borrowing below semantics from OpenCL as SYCL RT relies on it.
4071
- // It is also better for performance.
4072
- //
4073
- // "If the buffer object is created with CL_MEM_USE_HOST_PTR set in
4074
- // mem_flags, the following will be true:
4075
- // - The host_ptr specified in clCreateBuffer is guaranteed to contain the
4076
- // latest bits in the region being mapped when the clEnqueueMapBuffer
4077
- // command has completed.
4078
- // - The pointer value returned by clEnqueueMapBuffer will be derived from
4079
- // the host_ptr specified when the buffer object is created."
4080
4114
*RetMap = Buffer->MapHostPtr + Offset;
4081
4115
} else {
4082
4116
ze_host_mem_alloc_desc_t ZeDesc = {};
4083
4117
ZeDesc.flags = 0 ;
4084
- ZE_CALL ( zeMemAllocHost (Queue-> Context -> ZeContext , &ZeDesc, Size,
4085
- 1 , // TODO: alignment
4086
- RetMap));
4118
+
4119
+ ZE_CALL (
4120
+ zeMemAllocHost (Queue-> Context -> ZeContext , &ZeDesc, Size, 1 , RetMap));
4087
4121
}
4088
4122
4123
+ ZE_CALL (zeCommandListAppendWaitOnEvents (ZeCommandList, NumEventsInWaitList,
4124
+ ZeEventWaitList));
4125
+
4089
4126
ZE_CALL (zeCommandListAppendMemoryCopy (
4090
4127
ZeCommandList, *RetMap, pi_cast<char *>(Buffer->getZeHandle ()) + Offset,
4091
4128
Size, ZeEvent, 0 , nullptr ));
@@ -4103,15 +4140,10 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr,
4103
4140
const pi_event *EventWaitList, pi_event *Event) {
4104
4141
assert (Queue);
4105
4142
4106
- // Lock automatically releases when this goes out of scope.
4107
- std::lock_guard<std::mutex> lock (Queue->PiQueueMutex );
4108
-
4109
- // Get a new command list to be used on this call
4143
+ // Integrated devices don't need a command list.
4144
+ // If discrete we will get a commandlist later.
4110
4145
ze_command_list_handle_t ZeCommandList = nullptr ;
4111
4146
ze_fence_handle_t ZeFence = nullptr ;
4112
- if (auto Res = Queue->Device ->getAvailableCommandList (Queue, &ZeCommandList,
4113
- &ZeFence))
4114
- return Res;
4115
4147
4116
4148
// TODO: handle the case when user does not care to follow the event
4117
4149
// of unmap completion.
@@ -4130,6 +4162,46 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr,
4130
4162
ZeEvent = (*Event)->ZeEvent ;
4131
4163
}
4132
4164
4165
+ _pi_mem::Mapping MapInfo = {};
4166
+ if (pi_result Res = MemObj->removeMapping (MappedPtr, MapInfo))
4167
+ return Res;
4168
+
4169
+ // NOTE: we still have to free the host memory allocated/returned by
4170
+ // piEnqueueMemBufferMap, but can only do so after the copy below
4171
+ // is completed. Instead of waiting for it here (blocking), we shall
4172
+ // do so in piEventRelease called for the pi_event tracking the unmap.
4173
+ // In the case of an integrated device, the map operation does not allocate
4174
+ // any memory, so there is nothing to free. This is indicated by a nullptr.
4175
+ if (Event)
4176
+ (*Event)->CommandData =
4177
+ (MemObj->OnHost ? nullptr : (MemObj->MapHostPtr ? nullptr : MappedPtr));
4178
+
4179
+ // On integrated devices the buffer is allocated in host memory.
4180
+ if (MemObj->OnHost ) {
4181
+ // Wait on incoming events before doing the copy
4182
+ piEventsWait (NumEventsInWaitList, EventWaitList);
4183
+ if (MemObj->MapHostPtr )
4184
+ memcpy (pi_cast<char *>(MemObj->getZeHandle ()) + MapInfo.Offset , MappedPtr,
4185
+ MapInfo.Size );
4186
+
4187
+ // Signal this event
4188
+ ZE_CALL (zeEventHostSignal (ZeEvent));
4189
+
4190
+ return PI_SUCCESS;
4191
+ }
4192
+
4193
+ // Lock automatically releases when this goes out of scope.
4194
+ std::lock_guard<std::mutex> lock (Queue->PiQueueMutex );
4195
+
4196
+ if (auto Res = Queue->Device ->getAvailableCommandList (Queue, &ZeCommandList,
4197
+ &ZeFence))
4198
+ return Res;
4199
+
4200
+ // Set the commandlist in the event
4201
+ if (Event) {
4202
+ (*Event)->ZeCommandList = ZeCommandList;
4203
+ }
4204
+
4133
4205
ze_event_handle_t *ZeEventWaitList =
4134
4206
_pi_event::createZeEventList (NumEventsInWaitList, EventWaitList);
4135
4207
@@ -4141,21 +4213,11 @@ pi_result piEnqueueMemUnmap(pi_queue Queue, pi_mem MemObj, void *MappedPtr,
4141
4213
//
4142
4214
// NOTE: Keep this in sync with the implementation of
4143
4215
// piEnqueueMemBufferMap/piEnqueueMemImageMap.
4144
- _pi_mem::Mapping MapInfo = {};
4145
- if (pi_result Res = MemObj->removeMapping (MappedPtr, MapInfo))
4146
- return Res;
4147
4216
4148
4217
ZE_CALL (zeCommandListAppendMemoryCopy (
4149
4218
ZeCommandList, pi_cast<char *>(MemObj->getZeHandle ()) + MapInfo.Offset ,
4150
4219
MappedPtr, MapInfo.Size , ZeEvent, 0 , nullptr ));
4151
4220
4152
- // NOTE: we still have to free the host memory allocated/returned by
4153
- // piEnqueueMemBufferMap, but can only do so after the above copy
4154
- // is completed. Instead of waiting for It here (blocking), we shall
4155
- // do so in piEventRelease called for the pi_event tracking the unmap.
4156
- if (Event)
4157
- (*Event)->CommandData = MemObj->MapHostPtr ? nullptr : MappedPtr;
4158
-
4159
4221
// Execute command list asynchronously, as the event will be used
4160
4222
// to track down its completion.
4161
4223
if (auto Res = Queue->executeCommandList (ZeCommandList, ZeFence))
0 commit comments