Skip to content

Commit f3d30b6

Browse files
[SYCL] Use scheduler in queue shortcuts to avoid waiting for deps (#11758)
Some queue shortcuts attempt to bypass the scheduler to avoid the overhead of creating a command node. In cases where dependencies couldn't be passed to the backend directly, there was a blocking wait during submission. This patch changes that behavior to bypass the scheduler only if all dependency events can be used directly: host task dependencies have been completed, others have been enqueued and there are no cross-context dependencies. --------- Signed-off-by: Tikhomirova, Kseniya <[email protected]> Co-authored-by: Sergey Semenov <[email protected]>
1 parent dbc0272 commit f3d30b6

File tree

10 files changed

+338
-149
lines changed

10 files changed

+338
-149
lines changed

sycl/include/sycl/queue.hpp

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -361,11 +361,9 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
361361
}
362362
};
363363

364-
auto Event = submit_impl_and_postprocess(CGF, CodeLoc, PostProcess);
365-
return discard_or_return(Event);
364+
return submit_impl_and_postprocess(CGF, CodeLoc, PostProcess);
366365
#else
367-
auto Event = submit_impl(CGF, CodeLoc);
368-
return discard_or_return(Event);
366+
return submit_impl(CGF, CodeLoc);
369367
#endif // __SYCL_USE_FALLBACK_ASSERT
370368
}
371369

@@ -400,12 +398,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
400398
}
401399
};
402400

403-
auto Event =
404-
submit_impl_and_postprocess(CGF, SecondaryQueue, CodeLoc, PostProcess);
405-
return discard_or_return(Event);
401+
return submit_impl_and_postprocess(CGF, SecondaryQueue, CodeLoc,
402+
PostProcess);
406403
#else
407-
auto Event = submit_impl(CGF, SecondaryQueue, CodeLoc);
408-
return discard_or_return(Event);
404+
return submit_impl(CGF, SecondaryQueue, CodeLoc);
409405
#endif // __SYCL_USE_FALLBACK_ASSERT
410406
}
411407

@@ -2819,6 +2815,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
28192815

28202816
/// Checks if the event needs to be discarded and if so, discards it and
28212817
/// returns a discarded event. Otherwise, it returns input event.
2818+
/// TODO: move to impl class in the next ABI Breaking window
28222819
event discard_or_return(const event &Event);
28232820

28242821
// Function to postprocess submitted command

sycl/source/detail/helpers.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@ namespace sycl {
2121
inline namespace _V1 {
2222
using ContextImplPtr = std::shared_ptr<sycl::detail::context_impl>;
2323
namespace detail {
24+
// TODO: remove from public header files and implementation during the next ABI
25+
// Breaking window. Not used any more.
2426
std::vector<sycl::detail::pi::PiEvent>
2527
getOrWaitEvents(std::vector<sycl::event> DepEvents, ContextImplPtr Context) {
2628
std::vector<sycl::detail::pi::PiEvent> Events;

sycl/source/detail/queue_impl.cpp

Lines changed: 141 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,19 @@
2626
namespace sycl {
2727
inline namespace _V1 {
2828
namespace detail {
29-
3029
std::atomic<unsigned long long> queue_impl::MNextAvailableQueueID = 0;
3130

31+
static std::vector<sycl::detail::pi::PiEvent>
32+
getPIEvents(const std::vector<sycl::event> &DepEvents) {
33+
std::vector<sycl::detail::pi::PiEvent> RetPiEvents;
34+
for (const sycl::event &Event : DepEvents) {
35+
const EventImplPtr &EventImpl = detail::getSyclObjImpl(Event);
36+
if (EventImpl->getHandleRef() != nullptr)
37+
RetPiEvents.push_back(EventImpl->getHandleRef());
38+
}
39+
return RetPiEvents;
40+
}
41+
3242
template <>
3343
uint32_t queue_impl::get_info<info::queue::reference_count>() const {
3444
sycl::detail::pi::PiResult result = PI_SUCCESS;
@@ -63,16 +73,25 @@ static event createDiscardedEvent() {
6373

6474
const std::vector<event> &
6575
queue_impl::getExtendDependencyList(const std::vector<event> &DepEvents,
66-
std::vector<event> &MutableVec) {
67-
if (isInOrder()) {
68-
std::optional<event> ExternalEvent = popExternalEvent();
69-
if (ExternalEvent) {
70-
MutableVec = DepEvents;
71-
MutableVec.push_back(*ExternalEvent);
72-
return MutableVec;
73-
}
74-
}
75-
return DepEvents;
76+
std::vector<event> &MutableVec,
77+
std::unique_lock<std::mutex> &QueueLock) {
78+
if (!isInOrder())
79+
return DepEvents;
80+
81+
QueueLock.lock();
82+
EventImplPtr ExtraEvent =
83+
MGraph.expired() ? MLastEventPtr : MGraphLastEventPtr;
84+
std::optional<event> ExternalEvent = popExternalEvent();
85+
86+
if (!ExternalEvent && !ExtraEvent)
87+
return DepEvents;
88+
89+
MutableVec = DepEvents;
90+
if (ExternalEvent)
91+
MutableVec.push_back(*ExternalEvent);
92+
if (ExtraEvent)
93+
MutableVec.push_back(detail::createSyclObjFromImpl<event>(ExtraEvent));
94+
return MutableVec;
7695
}
7796

7897
event queue_impl::memset(const std::shared_ptr<detail::queue_impl> &Self,
@@ -106,7 +125,7 @@ event queue_impl::memset(const std::shared_ptr<detail::queue_impl> &Self,
106125
}
107126

108127
return submitMemOpHelper(
109-
Self, DepEvents,
128+
Self, DepEvents, [&](handler &CGH) { CGH.memset(Ptr, Value, Count); },
110129
[](const auto &...Args) { MemoryManager::fill_usm(Args...); }, Ptr, Self,
111130
Count, Value);
112131
}
@@ -152,21 +171,17 @@ event queue_impl::memcpy(const std::shared_ptr<detail::queue_impl> &Self,
152171
#endif
153172
// If we have a command graph set we need to capture the copy through normal
154173
// queue submission rather than execute the copy directly.
155-
if (MGraph.lock()) {
156-
return submit(
157-
[&](handler &CGH) {
158-
CGH.depends_on(DepEvents);
159-
CGH.memcpy(Dest, Src, Count);
160-
},
161-
Self, {});
162-
}
174+
auto HandlerFunc = [&](handler &CGH) { CGH.memcpy(Dest, Src, Count); };
175+
if (MGraph.lock())
176+
return submitWithHandler(Self, DepEvents, HandlerFunc);
177+
163178
if ((!Src || !Dest) && Count != 0) {
164179
report(CodeLoc);
165180
throw runtime_error("NULL pointer argument in memory copy operation.",
166181
PI_ERROR_INVALID_VALUE);
167182
}
168183
return submitMemOpHelper(
169-
Self, DepEvents,
184+
Self, DepEvents, HandlerFunc,
170185
[](const auto &...Args) { MemoryManager::copy_usm(Args...); }, Src, Self,
171186
Count, Dest);
172187
}
@@ -177,17 +192,12 @@ event queue_impl::mem_advise(const std::shared_ptr<detail::queue_impl> &Self,
177192
const std::vector<event> &DepEvents) {
178193
// If we have a command graph set we need to capture the advise through normal
179194
// queue submission.
180-
if (MGraph.lock()) {
181-
return submit(
182-
[&](handler &CGH) {
183-
CGH.depends_on(DepEvents);
184-
CGH.mem_advise(Ptr, Length, Advice);
185-
},
186-
Self, {});
187-
}
195+
auto HandlerFunc = [&](handler &CGH) { CGH.mem_advise(Ptr, Length, Advice); };
196+
if (MGraph.lock())
197+
return submitWithHandler(Self, DepEvents, HandlerFunc);
188198

189199
return submitMemOpHelper(
190-
Self, DepEvents,
200+
Self, DepEvents, HandlerFunc,
191201
[](const auto &...Args) { MemoryManager::advise_usm(Args...); }, Ptr,
192202
Self, Length, Advice);
193203
}
@@ -198,6 +208,10 @@ event queue_impl::memcpyToDeviceGlobal(
198208
const std::vector<event> &DepEvents) {
199209
return submitMemOpHelper(
200210
Self, DepEvents,
211+
[&](handler &CGH) {
212+
CGH.memcpyToDeviceGlobal(DeviceGlobalPtr, Src, IsDeviceImageScope,
213+
NumBytes, Offset);
214+
},
201215
[](const auto &...Args) {
202216
MemoryManager::copy_to_device_global(Args...);
203217
},
@@ -210,15 +224,25 @@ event queue_impl::memcpyFromDeviceGlobal(
210224
size_t Offset, const std::vector<event> &DepEvents) {
211225
return submitMemOpHelper(
212226
Self, DepEvents,
227+
[&](handler &CGH) {
228+
CGH.memcpyFromDeviceGlobal(Dest, DeviceGlobalPtr, IsDeviceImageScope,
229+
NumBytes, Offset);
230+
},
213231
[](const auto &...Args) {
214232
MemoryManager::copy_from_device_global(Args...);
215233
},
216234
DeviceGlobalPtr, IsDeviceImageScope, Self, NumBytes, Offset, Dest);
217235
}
218236

219-
event queue_impl::getLastEvent() const {
220-
std::lock_guard<std::mutex> Lock{MLastEventMtx};
221-
return MDiscardEvents ? createDiscardedEvent() : MLastEvent;
237+
event queue_impl::getLastEvent() {
238+
std::lock_guard<std::mutex> Lock{MMutex};
239+
if (MDiscardEvents)
240+
return createDiscardedEvent();
241+
if (!MGraph.expired() && MGraphLastEventPtr)
242+
return detail::createSyclObjFromImpl<event>(MGraphLastEventPtr);
243+
if (!MLastEventPtr)
244+
MLastEventPtr = std::make_shared<event_impl>(std::nullopt);
245+
return detail::createSyclObjFromImpl<event>(MLastEventPtr);
222246
}
223247

224248
void queue_impl::addEvent(const event &Event) {
@@ -273,55 +297,90 @@ void queue_impl::addSharedEvent(const event &Event) {
273297
MEventsShared.push_back(Event);
274298
}
275299

276-
template <typename MemOpFuncT, typename... MemOpArgTs>
300+
static bool
301+
areEventsSafeForSchedulerBypass(const std::vector<sycl::event> &DepEvents,
302+
ContextImplPtr Context) {
303+
auto CheckEvent = [&Context](const sycl::event &Event) {
304+
const EventImplPtr &SyclEventImplPtr = detail::getSyclObjImpl(Event);
305+
// Events that don't have an initialized context are throwaway events that
306+
// don't represent actual dependencies. Calling getContextImpl() would set
307+
// their context, which we wish to avoid as it is expensive.
308+
if (!SyclEventImplPtr->isContextInitialized() &&
309+
!SyclEventImplPtr->is_host()) {
310+
return true;
311+
}
312+
if (SyclEventImplPtr->is_host()) {
313+
return SyclEventImplPtr->isCompleted();
314+
}
315+
// Cross-context dependencies can't be passed to the backend directly.
316+
if (SyclEventImplPtr->getContextImpl() != Context)
317+
return false;
318+
319+
// A nullptr here means that the commmand does not produce a PI event or it
320+
// hasn't been enqueued yet.
321+
return SyclEventImplPtr->getHandleRef() != nullptr;
322+
};
323+
324+
return std::all_of(DepEvents.begin(), DepEvents.end(),
325+
[&Context, &CheckEvent](const sycl::event &Event) {
326+
return CheckEvent(Event);
327+
});
328+
}
329+
330+
template <typename HandlerFuncT>
331+
event queue_impl::submitWithHandler(const std::shared_ptr<queue_impl> &Self,
332+
const std::vector<event> &DepEvents,
333+
HandlerFuncT HandlerFunc) {
334+
return submit(
335+
[&](handler &CGH) {
336+
CGH.depends_on(DepEvents);
337+
HandlerFunc(CGH);
338+
},
339+
Self, {});
340+
}
341+
342+
template <typename HandlerFuncT, typename MemOpFuncT, typename... MemOpArgTs>
277343
event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
278344
const std::vector<event> &DepEvents,
345+
HandlerFuncT HandlerFunc,
279346
MemOpFuncT MemOpFunc,
280347
MemOpArgTs... MemOpArgs) {
281-
if (MHasDiscardEventsSupport) {
282-
MemOpFunc(MemOpArgs..., getOrWaitEvents(DepEvents, MContext),
283-
/*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr);
284-
return createDiscardedEvent();
285-
}
286-
287-
event ResEvent = prepareSYCLEventAssociatedWithQueue(Self);
348+
// We need to submit command and update the last event under same lock if we
349+
// have in-order queue.
288350
{
289-
// We need to submit command and update the last event under same lock if we
290-
// have in-order queue.
291-
auto ScopeLock = isInOrder() ? std::unique_lock<std::mutex>(MLastEventMtx)
292-
: std::unique_lock<std::mutex>();
293-
// If the last submitted command in the in-order queue is host_task then
294-
// wait for it before submitting usm command.
295-
if (isInOrder() && MLastCGType == CG::CGTYPE::CodeplayHostTask)
296-
MLastEvent.wait();
351+
std::unique_lock<std::mutex> Lock(MMutex, std::defer_lock);
297352

298353
std::vector<event> MutableDepEvents;
299354
const std::vector<event> &ExpandedDepEvents =
300-
getExtendDependencyList(DepEvents, MutableDepEvents);
301-
302-
auto EventImpl = detail::getSyclObjImpl(ResEvent);
303-
MemOpFunc(MemOpArgs..., getOrWaitEvents(ExpandedDepEvents, MContext),
304-
&EventImpl->getHandleRef(), EventImpl);
305-
306-
if (MContext->is_host())
307-
return MDiscardEvents ? createDiscardedEvent() : event();
308-
309-
// When a queue is recorded by a graph, the dependencies are managed in the
310-
// graph implementaton. Additionally, CG recorded for a graph are outside of
311-
// the in-order queue execution sequence. Therefore, these CG must not
312-
// update MLastEvent.
313-
if (isInOrder() && (getCommandGraph() == nullptr)) {
314-
MLastEvent = ResEvent;
315-
// We don't create a command group for usm commands, so set it to None.
316-
// This variable is used to perform explicit dependency management when
317-
// required.
318-
MLastCGType = CG::CGTYPE::None;
355+
getExtendDependencyList(DepEvents, MutableDepEvents, Lock);
356+
357+
if (areEventsSafeForSchedulerBypass(ExpandedDepEvents, MContext)) {
358+
if (MHasDiscardEventsSupport) {
359+
MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents),
360+
/*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr);
361+
return createDiscardedEvent();
362+
}
363+
364+
event ResEvent = prepareSYCLEventAssociatedWithQueue(Self);
365+
auto EventImpl = detail::getSyclObjImpl(ResEvent);
366+
MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents),
367+
&EventImpl->getHandleRef(), EventImpl);
368+
369+
if (MContext->is_host())
370+
return MDiscardEvents ? createDiscardedEvent() : event();
371+
372+
if (isInOrder()) {
373+
auto &EventToStoreIn =
374+
MGraph.lock() ? MGraphLastEventPtr : MLastEventPtr;
375+
EventToStoreIn = EventImpl;
376+
}
377+
// Track only if we won't be able to handle it with piQueueFinish.
378+
if (MEmulateOOO)
379+
addSharedEvent(ResEvent);
380+
return discard_or_return(ResEvent);
319381
}
320382
}
321-
// Track only if we won't be able to handle it with piQueueFinish.
322-
if (MEmulateOOO)
323-
addSharedEvent(ResEvent);
324-
return MDiscardEvents ? createDiscardedEvent() : ResEvent;
383+
return submitWithHandler(Self, DepEvents, HandlerFunc);
325384
}
326385

327386
void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc,
@@ -495,9 +554,10 @@ bool queue_impl::ext_oneapi_empty() const {
495554
// If we have in-order queue where events are not discarded then just check
496555
// the status of the last event.
497556
if (isInOrder() && !MDiscardEvents) {
498-
std::lock_guard<std::mutex> Lock(MLastEventMtx);
499-
return MLastEvent.get_info<info::event::command_execution_status>() ==
500-
info::event_command_status::complete;
557+
std::lock_guard<std::mutex> Lock(MMutex);
558+
return !MLastEventPtr ||
559+
MLastEventPtr->get_info<info::event::command_execution_status>() ==
560+
info::event_command_status::complete;
501561
}
502562

503563
// Check the status of the backend queue if this is not a host queue.
@@ -533,6 +593,12 @@ bool queue_impl::ext_oneapi_empty() const {
533593
return true;
534594
}
535595

596+
event queue_impl::discard_or_return(const event &Event) {
597+
if (!(MDiscardEvents))
598+
return Event;
599+
return createDiscardedEvent();
600+
}
601+
536602
} // namespace detail
537603
} // namespace _V1
538604
} // namespace sycl

0 commit comments

Comments
 (0)