Skip to content

Commit 77b85c0

Browse files
authored
Merge pull request #2658 from igchor/fill_non_pow2
[L0 v2] extend USMFill implementation to support sizes which are not powers of 2
2 parents e5cb366 + a0b895e commit 77b85c0

File tree

5 files changed

+85
-93
lines changed

5 files changed

+85
-93
lines changed

source/adapters/level_zero/v2/command_list_manager.cpp

Lines changed: 5 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,7 @@ ur_command_list_manager::~ur_command_list_manager() {
3030
ur::level_zero::urDeviceRelease(device);
3131
}
3232

33-
std::pair<ze_event_handle_t *, uint32_t>
33+
wait_list_view
3434
ur_command_list_manager::getWaitListView(const ur_event_handle_t *phWaitEvents,
3535
uint32_t numWaitEvents) {
3636

@@ -80,32 +80,24 @@ ur_result_t ur_command_list_manager::appendKernelLaunch(
8080

8181
auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH);
8282

83-
auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList);
83+
auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList);
8484

85-
bool memoryMigrated = false;
8685
auto memoryMigrate = [&](void *src, void *dst, size_t size) {
8786
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
8887
(zeCommandList.get(), dst, src, size, nullptr,
89-
waitList.second, waitList.first));
90-
memoryMigrated = true;
88+
waitListView.num, waitListView.handles));
89+
waitListView.clear();
9190
};
9291

9392
UR_CALL(hKernel->prepareForSubmission(context, device, pGlobalWorkOffset,
9493
workDim, WG[0], WG[1], WG[2],
9594
memoryMigrate));
9695

97-
if (memoryMigrated) {
98-
// If memory was migrated, we don't need to pass the wait list to
99-
// the copy command again.
100-
waitList.first = nullptr;
101-
waitList.second = 0;
102-
}
103-
10496
TRACK_SCOPE_LATENCY(
10597
"ur_command_list_manager::zeCommandListAppendLaunchKernel");
10698
ZE2UR_CALL(zeCommandListAppendLaunchKernel,
10799
(zeCommandList.get(), hZeKernel, &zeThreadGroupDimensions,
108-
zeSignalEvent, waitList.second, waitList.first));
100+
zeSignalEvent, waitListView.num, waitListView.handles));
109101

110102
return UR_RESULT_SUCCESS;
111103
}

source/adapters/level_zero/v2/command_list_manager.hpp

Lines changed: 17 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,21 @@
1515
#include "queue_api.hpp"
1616
#include <ze_api.h>
1717

18+
struct wait_list_view {
19+
ze_event_handle_t *handles;
20+
uint32_t num;
21+
22+
operator bool() const {
23+
assert((handles != nullptr) == (num > 0));
24+
return handles != nullptr;
25+
}
26+
27+
void clear() {
28+
handles = nullptr;
29+
num = 0;
30+
}
31+
};
32+
1833
struct ur_command_list_manager : public _ur_object {
1934

2035
ur_command_list_manager(ur_context_handle_t context,
@@ -34,9 +49,8 @@ struct ur_command_list_manager : public _ur_object {
3449

3550
ze_command_list_handle_t getZeCommandList();
3651

37-
std::pair<ze_event_handle_t *, uint32_t>
38-
getWaitListView(const ur_event_handle_t *phWaitEvents,
39-
uint32_t numWaitEvents);
52+
wait_list_view getWaitListView(const ur_event_handle_t *phWaitEvents,
53+
uint32_t numWaitEvents);
4054
ze_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent,
4155
ur_command_t commandType);
4256

source/adapters/level_zero/v2/queue_immediate_in_order.cpp

Lines changed: 59 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,7 @@
2121

2222
namespace v2 {
2323

24-
std::pair<ze_event_handle_t *, uint32_t>
25-
ur_queue_immediate_in_order_t::getWaitListView(
24+
// Produces the wait-list view for the given events by forwarding the request
// to this queue's command list manager.
wait_list_view ur_queue_immediate_in_order_t::getWaitListView(
    const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents) {
  auto view = commandListManager.getWaitListView(phWaitEvents, numWaitEvents);
  return view;
}
@@ -291,37 +290,31 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCopyUnlocked(
291290
ur_command_t commandType) {
292291
auto zeSignalEvent = getSignalEvent(phEvent, commandType);
293292

294-
auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList);
293+
auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList);
295294

296-
bool memoryMigrated = false;
297295
auto pSrc = ur_cast<char *>(src->getDevicePtr(
298296
hDevice, ur_mem_handle_t_::device_access_mode_t::read_only, srcOffset,
299297
size, [&](void *src, void *dst, size_t size) {
300298
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
301299
(commandListManager.getZeCommandList(), dst, src,
302-
size, nullptr, waitList.second, waitList.first));
303-
memoryMigrated = true;
300+
size, nullptr, waitListView.num,
301+
waitListView.handles));
302+
waitListView.clear();
304303
}));
305304

306305
auto pDst = ur_cast<char *>(dst->getDevicePtr(
307306
hDevice, ur_mem_handle_t_::device_access_mode_t::write_only, dstOffset,
308307
size, [&](void *src, void *dst, size_t size) {
309308
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
310309
(commandListManager.getZeCommandList(), dst, src,
311-
size, nullptr, waitList.second, waitList.first));
312-
memoryMigrated = true;
310+
size, nullptr, waitListView.num,
311+
waitListView.handles));
312+
waitListView.clear();
313313
}));
314314

315-
if (memoryMigrated) {
316-
// If memory was migrated, we don't need to pass the wait list to
317-
// the copy command again.
318-
waitList.first = nullptr;
319-
waitList.second = 0;
320-
}
321-
322315
ZE2UR_CALL(zeCommandListAppendMemoryCopy,
323316
(commandListManager.getZeCommandList(), pDst, pSrc, size,
324-
zeSignalEvent, waitList.second, waitList.first));
317+
zeSignalEvent, waitListView.num, waitListView.handles));
325318

326319
if (blocking) {
327320
ZE2UR_CALL(zeCommandListHostSynchronize,
@@ -379,38 +372,32 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueRegionCopyUnlocked(
379372

380373
auto zeSignalEvent = getSignalEvent(phEvent, commandType);
381374

382-
auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList);
375+
auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList);
383376

384-
bool memoryMigrated = false;
385377
auto pSrc = ur_cast<char *>(src->getDevicePtr(
386378
hDevice, ur_mem_handle_t_::device_access_mode_t::read_only, 0,
387379
src->getSize(), [&](void *src, void *dst, size_t size) {
388380
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
389381
(commandListManager.getZeCommandList(), dst, src,
390-
size, nullptr, waitList.second, waitList.first));
391-
memoryMigrated = true;
382+
size, nullptr, waitListView.num,
383+
waitListView.handles));
384+
waitListView.clear();
392385
}));
393386
auto pDst = ur_cast<char *>(dst->getDevicePtr(
394387
hDevice, ur_mem_handle_t_::device_access_mode_t::write_only, 0,
395388
dst->getSize(), [&](void *src, void *dst, size_t size) {
396389
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
397390
(commandListManager.getZeCommandList(), dst, src,
398-
size, nullptr, waitList.second, waitList.first));
399-
memoryMigrated = true;
391+
size, nullptr, waitListView.num,
392+
waitListView.handles));
393+
waitListView.clear();
400394
}));
401395

402-
if (memoryMigrated) {
403-
// If memory was migrated, we don't need to pass the wait list to
404-
// the copy command again.
405-
waitList.first = nullptr;
406-
waitList.second = 0;
407-
}
408-
409396
ZE2UR_CALL(zeCommandListAppendMemoryCopyRegion,
410397
(commandListManager.getZeCommandList(), pDst, &zeParams.dstRegion,
411398
zeParams.dstPitch, zeParams.dstSlicePitch, pSrc,
412399
&zeParams.srcRegion, zeParams.srcPitch, zeParams.srcSlicePitch,
413-
zeSignalEvent, waitList.second, waitList.first));
400+
zeSignalEvent, waitListView.num, waitListView.handles));
414401

415402
if (blocking) {
416403
ZE2UR_CALL(zeCommandListHostSynchronize,
@@ -580,23 +567,23 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap(
580567

581568
auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_BUFFER_MAP);
582569

583-
auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList);
570+
auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList);
584571

585-
bool memoryMigrated = false;
586572
auto pDst = ur_cast<char *>(hBuffer->mapHostPtr(
587573
mapFlags, offset, size, [&](void *src, void *dst, size_t size) {
588574
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
589575
(commandListManager.getZeCommandList(), dst, src,
590-
size, nullptr, waitList.second, waitList.first));
591-
memoryMigrated = true;
576+
size, nullptr, waitListView.num,
577+
waitListView.handles));
578+
waitListView.clear();
592579
}));
593580
*ppRetMap = pDst;
594581

595-
if (!memoryMigrated && waitList.second) {
582+
if (waitListView) {
596583
// If no migration copy consumed the wait list (the view is still non-empty), we need to wait on the events here.
597584
ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
598-
(commandListManager.getZeCommandList(), waitList.second,
599-
waitList.first));
585+
(commandListManager.getZeCommandList(), waitListView.num,
586+
waitListView.handles));
600587
}
601588

602589
if (zeSignalEvent) {
@@ -621,21 +608,20 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap(
621608

622609
auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_UNMAP);
623610

624-
auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList);
611+
auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList);
625612

626613
// TODO: currently unmapHostPtr deallocates memory immediately,
627614
// since the memory might be used by the user, we need to make sure
628615
// all dependencies are completed.
629-
ZE2UR_CALL(
630-
zeCommandListAppendWaitOnEvents,
631-
(commandListManager.getZeCommandList(), waitList.second, waitList.first));
616+
ZE2UR_CALL(zeCommandListAppendWaitOnEvents,
617+
(commandListManager.getZeCommandList(), waitListView.num,
618+
waitListView.handles));
619+
waitListView.clear();
632620

633-
bool memoryMigrated = false;
634621
hMem->unmapHostPtr(pMappedPtr, [&](void *src, void *dst, size_t size) {
635622
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
636623
(commandListManager.getZeCommandList(), dst, src, size,
637-
nullptr, waitList.second, waitList.first));
638-
memoryMigrated = true;
624+
nullptr, waitListView.num, waitListView.handles));
639625
});
640626
if (zeSignalEvent) {
641627
ZE2UR_CALL(zeCommandListAppendSignalEvent,
@@ -652,33 +638,40 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericFillUnlocked(
652638

653639
auto zeSignalEvent = getSignalEvent(phEvent, commandType);
654640

655-
auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList);
641+
auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList);
656642

657-
bool memoryMigrated = false;
658643
auto pDst = ur_cast<char *>(dst->getDevicePtr(
659644
hDevice, ur_mem_handle_t_::device_access_mode_t::read_only, offset, size,
660645
[&](void *src, void *dst, size_t size) {
661646
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
662647
(commandListManager.getZeCommandList(), dst, src,
663-
size, nullptr, waitList.second, waitList.first));
664-
memoryMigrated = true;
648+
size, nullptr, waitListView.num,
649+
waitListView.handles));
650+
waitListView.clear();
665651
}));
666652

667-
if (memoryMigrated) {
668-
// If memory was migrated, we don't need to pass the wait list to
669-
// the copy command again.
670-
waitList.first = nullptr;
671-
waitList.second = 0;
672-
}
673-
674-
// TODO: support non-power-of-two pattern sizes
675-
676653
// PatternSize must be a power of two for zeCommandListAppendMemoryFill.
677654
// When it's not, the fill is emulated with zeCommandListAppendMemoryCopy.
678-
ZE2UR_CALL(zeCommandListAppendMemoryFill,
679-
(commandListManager.getZeCommandList(), pDst, pPattern,
680-
patternSize, size, zeSignalEvent, waitList.second,
681-
waitList.first));
655+
if (isPowerOf2(patternSize)) {
656+
ZE2UR_CALL(zeCommandListAppendMemoryFill,
657+
(commandListManager.getZeCommandList(), pDst, pPattern,
658+
patternSize, size, zeSignalEvent, waitListView.num,
659+
waitListView.handles));
660+
} else {
661+
// Copy the pattern into every patternSize-sized entry of the memory pointed to by pDst.
662+
uint32_t numOfCopySteps = size / patternSize;
663+
const void *src = pPattern;
664+
665+
for (uint32_t step = 0; step < numOfCopySteps; ++step) {
666+
void *dst = reinterpret_cast<void *>(reinterpret_cast<uint8_t *>(pDst) +
667+
step * patternSize);
668+
ZE2UR_CALL(zeCommandListAppendMemoryCopy,
669+
(commandListManager.getZeCommandList(), dst, src, patternSize,
670+
step == numOfCopySteps - 1 ? zeSignalEvent : nullptr,
671+
waitListView.num, waitListView.handles));
672+
waitListView.clear();
673+
}
674+
}
682675

683676
return UR_RESULT_SUCCESS;
684677
}
@@ -988,33 +981,25 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp(
988981

989982
auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH);
990983

991-
auto waitList = getWaitListView(phEventWaitList, numEventsInWaitList);
984+
auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList);
992985

993-
bool memoryMigrated = false;
994986
auto memoryMigrate = [&](void *src, void *dst, size_t size) {
995987
ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy,
996988
(commandListManager.getZeCommandList(), dst, src, size,
997-
nullptr, waitList.second, waitList.first));
998-
memoryMigrated = true;
989+
nullptr, waitListView.num, waitListView.handles));
990+
waitListView.clear();
999991
};
1000992

1001993
UR_CALL(hKernel->prepareForSubmission(hContext, hDevice, pGlobalWorkOffset,
1002994
workDim, WG[0], WG[1], WG[2],
1003995
memoryMigrate));
1004996

1005-
if (memoryMigrated) {
1006-
// If memory was migrated, we don't need to pass the wait list to
1007-
// the copy command again.
1008-
waitList.first = nullptr;
1009-
waitList.second = 0;
1010-
}
1011-
1012997
TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::"
1013998
"zeCommandListAppendLaunchCooperativeKernel");
1014999
ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel,
10151000
(commandListManager.getZeCommandList(), hZeKernel,
1016-
&zeThreadGroupDimensions, zeSignalEvent, waitList.second,
1017-
waitList.first));
1001+
&zeThreadGroupDimensions, zeSignalEvent, waitListView.num,
1002+
waitListView.handles));
10181003

10191004
recordSubmittedKernel(hKernel);
10201005

source/adapters/level_zero/v2/queue_immediate_in_order.hpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,9 +35,8 @@ struct ur_queue_immediate_in_order_t : _ur_object, public ur_queue_t_ {
3535
std::vector<ur_event_handle_t> deferredEvents;
3636
std::vector<ur_kernel_handle_t> submittedKernels;
3737

38-
std::pair<ze_event_handle_t *, uint32_t>
39-
getWaitListView(const ur_event_handle_t *phWaitEvents,
40-
uint32_t numWaitEvents);
38+
wait_list_view getWaitListView(const ur_event_handle_t *phWaitEvents,
39+
uint32_t numWaitEvents);
4140

4241
ze_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent,
4342
ur_command_t commandType);

test/conformance/enqueue/urEnqueueUSMFill.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@ static std::vector<testParametersFill> test_cases{
8686
{256, 256},
8787
/* pattern_size < size */
8888
{1024, 256},
89+
/* sizes which are not powers of 2 */
90+
{1000, 10},
8991
/* pattern sizes corresponding to some common scalar and vector types */
9092
{256, 4},
9193
{256, 8},

0 commit comments

Comments
 (0)