Skip to content

Commit 53f07ab

Browse files
committed
[L0] optimize # of event status queries through batching
Currently, for out-of-order immediate command lists, each enqueued operation creates an event on an internal vector to indicate when that operation has completed. When that vector reaches 1k entries (by default), each event is individually queried for status and, if it has completed, is cleaned up and removed. This causes latency spikes for enqueue operations and reduces overall performance when using short-lived operations.

This patch introduces an event batching mechanism: instead of querying each event individually, the adapter groups events in batches of 512 entries (by default) that are queried collectively using a barrier. Most of the credit goes to Michal Mrozek <[email protected]> for profiling the problem and proposing a solution.

This new mechanism is disabled by default. To use it, this patch adds three new environment variables:
- UR_L0_IMMEDIATE_COMMANDLISTS_BATCH_EVENT_COMPLETIONS. Enables event completion batching. Defaults to off.
- UR_L0_IMMEDIATE_COMMANDLISTS_BATCH_MAX. Controls the maximum number of batches for grouping the events. If there are no batches available, the adapter will fall back to querying events individually. Defaults to 10.
- UR_L0_IMMEDIATE_COMMANDLISTS_EVENTS_PER_BATCH. The number of events in each batch. Once a batch reaches this threshold, the adapter appends a barrier to the command list, and then another batch is selected to group incoming events. Defaults to 512.

Early results show an appreciable reduction in enqueue tail latency, but more experiments are needed.
1 parent 6ccaf38 commit 53f07ab

File tree

6 files changed

+452
-44
lines changed

6 files changed

+452
-44
lines changed

source/adapters/level_zero/context.cpp

Lines changed: 7 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <mutex>
1414
#include <string.h>
1515

16+
#include "adapters/level_zero/queue.hpp"
1617
#include "context.hpp"
1718
#include "ur_level_zero.hpp"
1819

@@ -596,29 +597,6 @@ ur_context_handle_t_::decrementUnreleasedEventsInPool(ur_event_handle_t Event) {
596597
return UR_RESULT_SUCCESS;
597598
}
598599

599-
// Get value of the threshold for number of events in immediate command lists.
600-
// If number of events in the immediate command list exceeds this threshold then
601-
// cleanup process for those events is executed.
602-
static const size_t ImmCmdListsEventCleanupThreshold = [] {
603-
const char *UrRet =
604-
std::getenv("UR_L0_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD");
605-
const char *PiRet = std::getenv(
606-
"SYCL_PI_LEVEL_ZERO_IMMEDIATE_COMMANDLISTS_EVENT_CLEANUP_THRESHOLD");
607-
const char *ImmCmdListsEventCleanupThresholdStr =
608-
UrRet ? UrRet : (PiRet ? PiRet : nullptr);
609-
static constexpr int Default = 1000;
610-
if (!ImmCmdListsEventCleanupThresholdStr)
611-
return Default;
612-
613-
int Threshold = std::atoi(ImmCmdListsEventCleanupThresholdStr);
614-
615-
// Basically disable threshold if negative value is provided.
616-
if (Threshold < 0)
617-
return INT_MAX;
618-
619-
return Threshold;
620-
}();
621-
622600
// Get value of the threshold for number of active command lists allowed before
623601
// we start heuristically cleaning them up.
624602
static const size_t CmdListsCleanupThreshold = [] {
@@ -648,8 +626,8 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
648626
// Immediate commandlists have been pre-allocated and are always available.
649627
if (Queue->UsingImmCmdLists) {
650628
CommandList = Queue->getQueueGroup(UseCopyEngine).getImmCmdList();
651-
if (CommandList->second.EventList.size() >
652-
ImmCmdListsEventCleanupThreshold) {
629+
if (CommandList->second.EventList.size() >=
630+
Queue->getImmdCmmdListsEventCleanupThreshold()) {
653631
std::vector<ur_event_handle_t> EventListToCleanup;
654632
Queue->resetCommandList(CommandList, false, EventListToCleanup);
655633
CleanupEventListFromResetCmdList(EventListToCleanup, true);
@@ -743,11 +721,13 @@ ur_result_t ur_context_handle_t_::getAvailableCommandList(
743721
ZE2UR_CALL(zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
744722
ZeStruct<ze_command_queue_desc_t> ZeQueueDesc;
745723
ZeQueueDesc.ordinal = QueueGroupOrdinal;
724+
746725
CommandList =
747726
Queue->CommandListMap
748727
.emplace(ZeCommandList,
749-
ur_command_list_info_t{ZeFence, true, false,
750-
ZeCommandQueue, ZeQueueDesc})
728+
ur_command_list_info_t(ZeFence, true, false,
729+
ZeCommandQueue, ZeQueueDesc,
730+
Queue->useCompletionBatching()))
751731
.first;
752732
}
753733
ZeCommandListCache.erase(ZeCommandListIt);

source/adapters/level_zero/event.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <algorithm>
1212
#include <climits>
1313
#include <mutex>
14+
#include <optional>
1415
#include <string.h>
1516

1617
#include "command_buffer.hpp"
@@ -1129,6 +1130,7 @@ ur_result_t ur_event_handle_t_::reset() {
11291130
RefCountExternal = 0;
11301131
RefCount.reset();
11311132
CommandList = std::nullopt;
1133+
completionBatch = std::nullopt;
11321134

11331135
if (!isHostVisible())
11341136
HostVisibleEvent = nullptr;

source/adapters/level_zero/event.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,10 @@ struct ur_event_handle_t_ : _ur_object {
222222

223223
// Get the host-visible event or create one and enqueue its signal.
224224
ur_result_t getOrCreateHostVisibleEvent(ze_event_handle_t &HostVisibleEvent);
225+
226+
// completion batch for this event. Only used for out-of-order immediate
227+
// command lists.
228+
std::optional<ur_completion_batch_it> completionBatch;
225229
};
226230

227231
// Helper function to implement zeHostSynchronize.

0 commit comments

Comments
 (0)