@@ -435,12 +435,46 @@ _pi_queue::resetCommandListFenceEntry(ze_command_list_handle_t ZeCommandList,
435
435
return PI_SUCCESS;
436
436
}
437
437
438
+ static const pi_uint32 ZeCommandListBatchSize = [] {
439
+ // Default value of 4. This has been seen as a good tradeoff between
440
+ // lower overhead of number of enqueue and fence calls, and getting
441
+ // commands seen as soon possible (i.e. lazy vs eager submission).
442
+ pi_uint32 BatchSizeVal = 4 ;
443
+ const auto BatchSizeStr = std::getenv (" SYCL_PI_LEVEL_ZERO_BATCH_SIZE" );
444
+ if (BatchSizeStr) {
445
+ pi_int32 BatchSizeStrVal = std::atoi (BatchSizeStr);
446
+ // Level Zero may only support a limted number of commands per command
447
+ // list. The actual upper limit is not specified by the Level Zero
448
+ // Specification. For now we allow an arbitrary upper limit.
449
+ // Negative numbers will be silently ignored.
450
+ if (BatchSizeStrVal >= 0 )
451
+ BatchSizeVal = BatchSizeStrVal;
452
+ }
453
+ return BatchSizeVal;
454
+ }();
455
+
438
456
// Retrieve an available command list to be used in a PI call
439
457
// Caller must hold a lock on the Queue passed in.
440
- pi_result
441
- _pi_device::getAvailableCommandList (pi_queue Queue,
442
- ze_command_list_handle_t *ZeCommandList,
443
- ze_fence_handle_t *ZeFence) {
458
+ pi_result _pi_device::getAvailableCommandList (
459
+ pi_queue Queue, ze_command_list_handle_t *ZeCommandList,
460
+ ze_fence_handle_t *ZeFence, bool AllowBatching) {
461
+ // First see if there is a command-list open for batching commands
462
+ // for this queue.
463
+ if (Queue->ZeOpenCommandList ) {
464
+ if (AllowBatching) {
465
+ *ZeCommandList = Queue->ZeOpenCommandList ;
466
+ *ZeFence = Queue->ZeOpenCommandListFence ;
467
+ return PI_SUCCESS;
468
+ }
469
+
470
+ // If this command isn't allowed to be batched, then we need to
471
+ // go ahead and execute what is already in the batched list,
472
+ // and then go on to process this. On exit from executeOpenCommandList
473
+ // ZeOpenCommandList will be nullptr.
474
+ if (auto Res = Queue->executeOpenCommandList ())
475
+ return Res;
476
+ }
477
+
444
478
// Create/Reuse the command list, because in Level Zero commands are added to
445
479
// the command lists, and later are then added to the command queue.
446
480
// Each command list is paired with an associated fence to track when the
@@ -525,6 +559,55 @@ pi_result _pi_queue::executeCommandList(ze_command_list_handle_t ZeCommandList,
525
559
return PI_SUCCESS;
526
560
}
527
561
562
+ bool _pi_queue::isBatchingAllowed () {
563
+ return (this ->QueueBatchSize > 1 && ((ZeSerialize & ZeSerializeBlock) == 0 ));
564
+ }
565
+
566
+ pi_result _pi_queue::batchCommandList (ze_command_list_handle_t ZeCommandList,
567
+ ze_fence_handle_t ZeFence) {
568
+ if (this ->isBatchingAllowed ()) {
569
+ assert (this ->ZeOpenCommandList == nullptr ||
570
+ this ->ZeOpenCommandList == ZeCommandList);
571
+
572
+ if (this ->ZeOpenCommandListSize + 1 < QueueBatchSize) {
573
+ this ->ZeOpenCommandList = ZeCommandList;
574
+ this ->ZeOpenCommandListFence = ZeFence;
575
+
576
+ // NOTE: we don't know here how many commands are in the ZeCommandList
577
+ // but most PI interfaces translate to a single Level-Zero command.
578
+ // Some do translate to multiple commands so we may be undercounting
579
+ // a bit here, but this is a heuristic, not an exact measure.
580
+ //
581
+ this ->ZeOpenCommandListSize += 1 ;
582
+
583
+ return PI_SUCCESS;
584
+ }
585
+
586
+ this ->ZeOpenCommandList = nullptr ;
587
+ this ->ZeOpenCommandListFence = nullptr ;
588
+ this ->ZeOpenCommandListSize = 0 ;
589
+ }
590
+
591
+ return executeCommandList (ZeCommandList, ZeFence);
592
+ }
593
+
594
+ pi_result _pi_queue::executeOpenCommandList () {
595
+ // If there are any commands still in the open command list for this
596
+ // queue, then close and execute that command list now.
597
+ auto OpenList = this ->ZeOpenCommandList ;
598
+ if (OpenList) {
599
+ auto OpenListFence = this ->ZeOpenCommandListFence ;
600
+
601
+ this ->ZeOpenCommandList = nullptr ;
602
+ this ->ZeOpenCommandListFence = nullptr ;
603
+ this ->ZeOpenCommandListSize = 0 ;
604
+
605
+ return executeCommandList (OpenList, OpenListFence);
606
+ }
607
+
608
+ return PI_SUCCESS;
609
+ }
610
+
528
611
ze_event_handle_t *_pi_event::createZeEventList (pi_uint32 EventListLength,
529
612
const pi_event *EventList) {
530
613
try {
@@ -1650,7 +1733,8 @@ pi_result piQueueCreate(pi_context Context, pi_device Device,
1650
1733
1651
1734
assert (Queue);
1652
1735
try {
1653
- *Queue = new _pi_queue (ZeCommandQueue, Context, Device);
1736
+ *Queue =
1737
+ new _pi_queue (ZeCommandQueue, Context, Device, ZeCommandListBatchSize);
1654
1738
} catch (const std::bad_alloc &) {
1655
1739
return PI_OUT_OF_HOST_MEMORY;
1656
1740
} catch (...) {
@@ -1706,6 +1790,12 @@ pi_result piQueueRelease(pi_queue Queue) {
1706
1790
std::lock_guard<std::mutex> lock (Queue->PiQueueMutex );
1707
1791
1708
1792
if (--(Queue->RefCount ) == 0 ) {
1793
+ // It is possible to get to here and still have an open command list
1794
+ // if no wait or finish ever occurred for this queue. But still need
1795
+ // to make sure commands get executed.
1796
+ if (auto Res = Queue->executeOpenCommandList ())
1797
+ return Res;
1798
+
1709
1799
// Destroy all the fences created associated with this queue.
1710
1800
for (const auto &MapEntry : Queue->ZeCommandListFenceMap ) {
1711
1801
ZE_CALL (zeFenceDestroy (MapEntry.second ));
@@ -1724,6 +1814,10 @@ pi_result piQueueFinish(pi_queue Queue) {
1724
1814
// Lock automatically releases when this goes out of scope.
1725
1815
std::lock_guard<std::mutex> lock (Queue->PiQueueMutex );
1726
1816
1817
+ // Execute any command list that may still be open.
1818
+ if (auto Res = Queue->executeOpenCommandList ())
1819
+ return Res;
1820
+
1727
1821
ZE_CALL (zeCommandQueueSynchronize (Queue->ZeCommandQueue , UINT32_MAX));
1728
1822
return PI_SUCCESS;
1729
1823
}
@@ -1754,7 +1848,7 @@ pi_result piextQueueCreateWithNativeHandle(pi_native_handle NativeHandle,
1754
1848
// Attach the queue to the "0" device.
1755
1849
// TODO: see if we need to let user choose the device.
1756
1850
pi_device Device = Context->Devices [0 ];
1757
- *Queue = new _pi_queue (ZeQueue, Context, Device);
1851
+ *Queue = new _pi_queue (ZeQueue, Context, Device, ZeCommandListBatchSize );
1758
1852
return PI_SUCCESS;
1759
1853
}
1760
1854
@@ -3022,7 +3116,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
3022
3116
ze_command_list_handle_t ZeCommandList = nullptr ;
3023
3117
ze_fence_handle_t ZeFence = nullptr ;
3024
3118
if (auto Res = Queue->Device ->getAvailableCommandList (Queue, &ZeCommandList,
3025
- &ZeFence))
3119
+ &ZeFence, true ))
3026
3120
return Res;
3027
3121
3028
3122
ze_event_handle_t ZeEvent = nullptr ;
@@ -3059,7 +3153,7 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
3059
3153
3060
3154
// Execute command list asynchronously, as the event will be used
3061
3155
// to track down its completion.
3062
- if (auto Res = Queue->executeCommandList (ZeCommandList, ZeFence))
3156
+ if (auto Res = Queue->batchCommandList (ZeCommandList, ZeFence))
3063
3157
return Res;
3064
3158
3065
3159
_pi_event::deleteZeEventList (ZeEventWaitList);
@@ -3194,6 +3288,19 @@ pi_result piEventsWait(pi_uint32 NumEvents, const pi_event *EventList) {
3194
3288
return PI_INVALID_EVENT;
3195
3289
}
3196
3290
3291
+ // Submit dependent open command lists for execution, if any
3292
+ for (uint32_t I = 0 ; I < NumEvents; I++) {
3293
+ auto Queue = EventList[I]->Queue ;
3294
+
3295
+ // Lock automatically releases when this goes out of scope.
3296
+ std::lock_guard<std::mutex> lock (Queue->PiQueueMutex );
3297
+
3298
+ if (Queue->RefCount > 0 ) {
3299
+ if (auto Res = Queue->executeOpenCommandList ())
3300
+ return Res;
3301
+ }
3302
+ }
3303
+
3197
3304
for (uint32_t I = 0 ; I < NumEvents; I++) {
3198
3305
ze_event_handle_t ZeEvent = EventList[I]->ZeEvent ;
3199
3306
zePrint (" ZeEvent = %lx\n " , pi_cast<std::uintptr_t >(ZeEvent));
0 commit comments