Skip to content

Commit 68fda6c

Browse files
committed
[SYCL][UR][L0 v2] implement OOO immediate queue
by using multiple in-order queues and a round-robin strategy to dispatch work. With this approach we don't need to worry about events' lifetime. Since we are still using counter-based events, we don't need any special logic to handle cases where event release is called right after being passed as signal event or as part of a wait list.
1 parent 133fee5 commit 68fda6c

File tree

5 files changed

+784
-4
lines changed

5 files changed

+784
-4
lines changed

unified-runtime/source/adapters/level_zero/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ if(UR_BUILD_ADAPTER_L0_V2)
171171
${CMAKE_CURRENT_SOURCE_DIR}/v2/lockable.hpp
172172
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.hpp
173173
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.hpp
174+
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_out_of_order.hpp
174175
${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.hpp
175176
${CMAKE_CURRENT_SOURCE_DIR}/v2/api.cpp
176177
${CMAKE_CURRENT_SOURCE_DIR}/v2/command_buffer.cpp
@@ -187,6 +188,7 @@ if(UR_BUILD_ADAPTER_L0_V2)
187188
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.cpp
188189
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_create.cpp
189190
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.cpp
191+
${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_out_of_order.cpp
190192
${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.cpp
191193
)
192194
install_ur_library(ur_adapter_level_zero_v2)

unified-runtime/source/adapters/level_zero/v2/queue_create.cpp

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -69,9 +69,18 @@ ur_result_t urQueueCreate(ur_context_handle_t hContext,
6969

7070
auto zeIndex = v2::getZeIndex(pProperties);
7171

72-
*phQueue = ur_queue_handle_t_::create<v2::ur_queue_immediate_in_order_t>(
73-
hContext, hDevice, v2::getZeOrdinal(hDevice), v2::getZePriority(flags),
74-
zeIndex, v2::eventFlagsFromQueueFlags(flags), flags);
72+
if ((flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0 &&
73+
!zeIndex.has_value()) {
74+
*phQueue =
75+
ur_queue_handle_t_::create<v2::ur_queue_immediate_out_of_order_t>(
76+
hContext, hDevice, v2::getZeOrdinal(hDevice),
77+
v2::getZePriority(flags), v2::eventFlagsFromQueueFlags(flags),
78+
flags);
79+
} else {
80+
*phQueue = ur_queue_handle_t_::create<v2::ur_queue_immediate_in_order_t>(
81+
hContext, hDevice, v2::getZeOrdinal(hDevice), v2::getZePriority(flags),
82+
zeIndex, v2::eventFlagsFromQueueFlags(flags), flags);
83+
}
7584

7685
return UR_RESULT_SUCCESS;
7786
} catch (...) {

unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,13 @@
1515

1616
#include "../common.hpp"
1717
#include "queue_immediate_in_order.hpp"
18+
#include "queue_immediate_out_of_order.hpp"
1819
#include <ur_api.h>
1920
#include <variant>
2021

2122
struct ur_queue_handle_t_ : ur::handle_base<ur::level_zero::ddi_getter> {
22-
using data_variant = std::variant<v2::ur_queue_immediate_in_order_t>;
23+
using data_variant = std::variant<v2::ur_queue_immediate_in_order_t,
24+
v2::ur_queue_immediate_out_of_order_t>;
2325
data_variant queue_data;
2426

2527
static constexpr uintptr_t queue_offset =
Lines changed: 197 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,197 @@
1+
//===------- queue_immediate_out_of_order.cpp - Level Zero Adapter -------===//
2+
//
3+
// Copyright (C) 2024 Intel Corporation
4+
//
5+
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
6+
// Exceptions. See LICENSE.TXT
7+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
8+
//
9+
//===----------------------------------------------------------------------===//
10+
11+
#include "queue_immediate_out_of_order.hpp"
12+
#include "../common/latency_tracker.hpp"
13+
#include "ur.hpp"
14+
15+
namespace v2 {
16+
17+
// Helper function to intialize std::array of command list manager.
18+
// This is needed because command list manager does not have a default
19+
// constructor.
20+
template <size_t... Is>
21+
std::array<lockable<ur_command_list_manager>, sizeof...(Is)>
22+
createCommandListManagers(ur_context_handle_t hContext,
23+
ur_device_handle_t hDevice, uint32_t ordinal,
24+
ze_command_queue_priority_t priority,
25+
std::index_sequence<Is...>) {
26+
return {
27+
((void)Is, lockable<ur_command_list_manager>(
28+
hContext, hDevice,
29+
hContext->getCommandListCache().getImmediateCommandList(
30+
hDevice->ZeDevice,
31+
{true, ordinal, true /* always enable copy offload */},
32+
ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, priority)))...};
33+
}
34+
35+
template <size_t N>
36+
std::array<lockable<ur_command_list_manager>, N>
37+
createCommandListManagers(ur_context_handle_t hContext,
38+
ur_device_handle_t hDevice, uint32_t ordinal,
39+
ze_command_queue_priority_t priority) {
40+
return createCommandListManagers(hContext, hDevice, ordinal, priority,
41+
std::make_index_sequence<N>{});
42+
}
43+
44+
void ur_queue_immediate_out_of_order_t::initializeSignalEvents() {
45+
// Counter-based events are not signaled by default. Use dummy operation
46+
// to ensure proper state. TODO: we can use zexCounterBasedEventCreate for
47+
// this.
48+
void *tmpMem = nullptr;
49+
uint32_t tmpPattern = 0;
50+
UR_CALL_THROWS(ur::level_zero::urUSMHostAlloc(hContext, nullptr, nullptr,
51+
sizeof(tmpPattern), &tmpMem));
52+
53+
for (size_t i = 0; i < numCommandLists; ++i) {
54+
internalSignalEvents[i] = eventPool->allocate();
55+
commandListManagers[i].get_no_lock()->appendUSMFill(
56+
tmpMem, sizeof(tmpPattern), &tmpPattern, sizeof(tmpPattern), 0, nullptr,
57+
internalSignalEvents[i]);
58+
ZE2UR_CALL_THROWS(
59+
zeCommandListHostSynchronize,
60+
(commandListManagers[i].get_no_lock()->getZeCommandList(), UINT64_MAX));
61+
62+
signalEvents.assign(i, internalSignalEvents[i], false);
63+
}
64+
65+
UR_CALL_THROWS(ur::level_zero::urUSMFree(hContext, tmpMem));
66+
}
67+
68+
// Constructs an out-of-order immediate queue.
//
// Creates `numCommandLists` command list managers for the given device,
// ordinal and priority, borrows an event pool from the context's immediate
// event pool cache (keyed by the device id and `eventFlags`), stores the
// queue flags, and finally prepares the internal signal events.
//
// NOTE(review): `hDevice->Id.value()` presumably throws when the device has
// no id assigned - confirm against the declaration of `Id`.
ur_queue_immediate_out_of_order_t::ur_queue_immediate_out_of_order_t(
    ur_context_handle_t hContext, ur_device_handle_t hDevice, uint32_t ordinal,
    ze_command_queue_priority_t priority, event_flags_t eventFlags,
    ur_queue_flags_t flags)
    : hContext(hContext),
      hDevice(hDevice),
      commandListManagers(createCommandListManagers<numCommandLists>(
          hContext, hDevice, ordinal, priority)),
      eventPool(hContext->getEventPoolCache(PoolCacheType::Immediate)
                    .borrow(hDevice->Id.value(), eventFlags)),
      flags(flags) {
  // Needs the command lists and the event pool, so it runs in the body
  // after all members have been initialized.
  initializeSignalEvents();
}
80+
81+
// Implements urQueueGetInfo for the out-of-order immediate queue.
//
// Supported properties: CONTEXT, DEVICE, REFERENCE_COUNT, FLAGS and EMPTY.
// SIZE and DEVICE_DEFAULT yield UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; any
// other property is logged and yields UR_RESULT_ERROR_INVALID_VALUE.
//
// For EMPTY, every command list is polled with a zero-timeout host
// synchronize; the queue is reported empty only if all of them report
// success. Any status other than success / not-ready is thrown as a
// ur_result_t.
ur_result_t ur_queue_immediate_out_of_order_t::queueGetInfo(
    ur_queue_info_t propName, size_t propSize, void *pPropValue,
    size_t *pPropSizeRet) {
  UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
  // TODO: consider support for queue properties and size
  // Cast to avoid warnings on EXT enum values.
  switch (static_cast<uint32_t>(propName)) {
  case UR_QUEUE_INFO_CONTEXT:
    return ReturnValue(hContext);
  case UR_QUEUE_INFO_DEVICE:
    return ReturnValue(hDevice);
  case UR_QUEUE_INFO_REFERENCE_COUNT:
    return ReturnValue(uint32_t{RefCount.load()});
  case UR_QUEUE_INFO_FLAGS:
    return ReturnValue(flags);
  case UR_QUEUE_INFO_SIZE:
  case UR_QUEUE_INFO_DEVICE_DEFAULT:
    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
  case UR_QUEUE_INFO_EMPTY: {
    // Non-blocking check (timeout 0) whether a command list has finished
    // all of its work.
    auto isCmdListEmpty = [](ze_command_list_handle_t cmdList) {
      auto status = ZE_CALL_NOCHECK(zeCommandListHostSynchronize, (cmdList, 0));
      if (status == ZE_RESULT_SUCCESS) {
        return true;
      } else if (status == ZE_RESULT_NOT_READY) {
        return false;
      } else {
        throw ze2urResult(status);
      }
    };

    // The queue is empty only when every underlying command list is.
    bool empty = std::all_of(
        commandListManagers.begin(), commandListManagers.end(),
        [&](auto &cmdListManager) {
          return isCmdListEmpty(cmdListManager.lock()->getZeCommandList());
        });

    return ReturnValue(empty);
  }
  default:
    // The original message repeated "ParamName=" twice.
    UR_LOG(ERR,
           "Unsupported ParamName in urQueueGetInfo: "
           "ParamName={}(0x{})",
           propName, logger::toHex(propName));
    return UR_RESULT_ERROR_INVALID_VALUE;
  }

  return UR_RESULT_SUCCESS;
}
128+
129+
// Returns the native Level Zero command list of one of the queue's command
// lists, selected through getNextCommandListId(). The descriptor argument is
// unused. Always returns UR_RESULT_SUCCESS.
ur_result_t ur_queue_immediate_out_of_order_t::queueGetNativeHandle(
    ur_queue_native_desc_t * /*pDesc*/, ur_native_handle_t *phNativeQueue) {
  auto &selectedManager = commandListManagers[getNextCommandListId()];
  // The handle is read without taking the manager's lock.
  auto zeCommandList = selectedManager.get_no_lock()->getZeCommandList();
  *phNativeQueue = reinterpret_cast<ur_native_handle_t>(zeCommandList);
  return UR_RESULT_SUCCESS;
}
137+
138+
// Blocks until the queue has finished: on the command list selected by the
// current value of commandListIndex, appends a wait on the signal events of
// all command lists and then synchronizes the host with that command list.
// Returns the first error reported by either step, UR_RESULT_SUCCESS
// otherwise.
ur_result_t ur_queue_immediate_out_of_order_t::queueFinish() {
  TRACK_SCOPE_LATENCY("ur_queue_immediate_out_of_order_t::queueFinish");

  const auto lastCommandListId =
      commandListIndex.load(std::memory_order_relaxed) % numCommandLists;
  auto &lastManager = commandListManagers[lastCommandListId];

  // No signal event is needed; the host synchronize below waits for this
  // command list directly.
  UR_CALL(lastManager.lock()->appendEventsWait(
      numCommandLists, signalEvents.events.data(), nullptr));

  ZE2UR_CALL(zeCommandListHostSynchronize,
             (lastManager.lock()->getZeCommandList(), UINT64_MAX));

  return UR_RESULT_SUCCESS;
}
152+
153+
// Flush is a no-op for this queue type: it performs no work and always
// reports success.
ur_result_t ur_queue_immediate_out_of_order_t::queueFlush() {
  return UR_RESULT_SUCCESS;
}
156+
157+
// Waits for the queue to finish before its members are destroyed. A
// destructor must not let exceptions escape, so any failure reported by
// queueFinish() is swallowed here.
ur_queue_immediate_out_of_order_t::~ur_queue_immediate_out_of_order_t() {
  try {
    UR_CALL_THROWS(queueFinish());
  } catch (...) {
    // Ignore errors during destruction
  }
}
164+
165+
// Enqueues a barrier on the out-of-order queue.
//
// The queue dispatches work to `numCommandLists` command lists, so a barrier
// has to order commands across all of them:
//   1. every command list waits for the caller's wait list and then signals
//      its own internal signal event;
//   2. every command list waits for the internal signal events of *all*
//      command lists, so nothing enqueued afterwards on any list can start
//      before the work enqueued before the barrier on every list is done.
// If `phEvent` is non-null, the step-2 wait on the first command list signals
// the event returned to the caller.
//
// Previously step 2 was only performed on the first command list, and only
// when `phEvent` was requested, so later commands on the other command lists
// were not blocked by earlier commands on different lists.
//
// Returns the first error reported by any append call, UR_RESULT_SUCCESS
// otherwise.
ur_result_t ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrier(
    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    ur_event_handle_t *phEvent) {
  TRACK_SCOPE_LATENCY(
      "ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrier");
  // Within a single command list we don't need a real L0 barrier: waiting
  // for the requested events and signaling a "barrier" event is enough,
  // because previous commands on that list are already guaranteed to be
  // completed when the signal is started. However, we do need to use a
  // barrier if profiling is enabled: see zeCommandListAppendWaitOnEvents.
  const bool needsRealBarrier = (flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0;

  // Step 1: per-list wait on the user's events + signal of the list's
  // internal event.
  for (size_t id = 0; id < numCommandLists; id++) {
    if (needsRealBarrier) {
      UR_CALL(commandListManagers[id].lock()->appendEventsWaitWithBarrier(
          numEventsInWaitList, phEventWaitList, internalSignalEvents[id]));
    } else {
      UR_CALL(commandListManagers[id].lock()->appendEventsWait(
          numEventsInWaitList, phEventWaitList, internalSignalEvents[id]));
    }
  }

  // Step 2: cross-list synchronization. Each wait only refers to signals
  // that were appended in step 1, so the lists cannot wait on each other in
  // a cycle.
  // NOTE(review): the waits appended with a null signal event are not
  // reflected in `signalEvents` - confirm queueFinish does not need to
  // observe them.
  for (size_t id = 0; id < numCommandLists; id++) {
    if (id == 0 && phEvent) {
      // The user-visible event is signaled once all lists reached the
      // barrier.
      UR_CALL(commandListManagers[id].lock()->appendEventsWait(
          numCommandLists, internalSignalEvents.data(),
          createOrUseInternalSignalEvent<false>(0, phEvent)));
    } else {
      UR_CALL(commandListManagers[id].lock()->appendEventsWait(
          numCommandLists, internalSignalEvents.data(), nullptr));
    }
  }

  return UR_RESULT_SUCCESS;
}
196+
197+
} // namespace v2

0 commit comments

Comments
 (0)