Skip to content

Commit f170c63

Browse files
[SYCL] Fix kernel shortcut path for inorder queue (#13333)
Commit #11758 added extra event as dependency to fix host task vs kernel dependencies for inorder queue. That logic missed part when we try to bypass scheduler for kernel with no dependencies. This commit fixes it and allows to bypass scheduler if previous commands are enqueued (no host task is involved). --------- Signed-off-by: Tikhomirova, Kseniya <[email protected]>
1 parent e6d9d4c commit f170c63

File tree

6 files changed

+146
-106
lines changed

6 files changed

+146
-106
lines changed

sycl/source/detail/queue_impl.cpp

Lines changed: 2 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -322,37 +322,6 @@ void queue_impl::addSharedEvent(const event &Event) {
322322
MEventsShared.push_back(Event);
323323
}
324324

325-
static bool
326-
areEventsSafeForSchedulerBypass(const std::vector<sycl::event> &DepEvents,
327-
ContextImplPtr Context) {
328-
auto CheckEvent = [&Context](const sycl::event &Event) {
329-
const EventImplPtr &SyclEventImplPtr = detail::getSyclObjImpl(Event);
330-
// Events that don't have an initialized context are throwaway events that
331-
// don't represent actual dependencies. Calling getContextImpl() would set
332-
// their context, which we wish to avoid as it is expensive.
333-
// NOP events also don't represent actual dependencies.
334-
if ((!SyclEventImplPtr->isContextInitialized() &&
335-
!SyclEventImplPtr->is_host()) ||
336-
SyclEventImplPtr->isNOP()) {
337-
return true;
338-
}
339-
if (SyclEventImplPtr->is_host()) {
340-
return SyclEventImplPtr->isCompleted();
341-
}
342-
// Cross-context dependencies can't be passed to the backend directly.
343-
if (SyclEventImplPtr->getContextImpl() != Context)
344-
return false;
345-
346-
// A nullptr here means that the command does not produce a PI event or it
347-
// hasn't been enqueued yet.
348-
return SyclEventImplPtr->getHandleRef() != nullptr;
349-
};
350-
351-
return std::all_of(
352-
DepEvents.begin(), DepEvents.end(),
353-
[&CheckEvent](const sycl::event &Event) { return CheckEvent(Event); });
354-
}
355-
356325
template <typename HandlerFuncT>
357326
event queue_impl::submitWithHandler(const std::shared_ptr<queue_impl> &Self,
358327
const std::vector<event> &DepEvents,
@@ -382,8 +351,8 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,
382351

383352
// If we have a command graph set we need to capture the op through the
384353
// handler rather than by-passing the scheduler.
385-
if (MGraph.expired() &&
386-
areEventsSafeForSchedulerBypass(ExpandedDepEvents, MContext)) {
354+
if (MGraph.expired() && Scheduler::areEventsSafeForSchedulerBypass(
355+
ExpandedDepEvents, MContext)) {
387356
if (MSupportsDiscardingPiEvents) {
388357
MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents),
389358
/*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr);

sycl/source/detail/queue_impl.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -783,7 +783,6 @@ class queue_impl {
783783
EventRet = Handler.finalize();
784784
}
785785

786-
protected:
787786
/// Performs command group submission to the queue.
788787
///
789788
/// \param CGF is a function object containing command group.

sycl/source/detail/scheduler/scheduler.cpp

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -716,6 +716,48 @@ EventImplPtr Scheduler::addCommandGraphUpdate(
716716
return NewCmdEvent;
717717
}
718718

719+
bool CheckEventReadiness(const ContextImplPtr &Context,
720+
const EventImplPtr &SyclEventImplPtr) {
721+
// Events that don't have an initialized context are throwaway events that
722+
// don't represent actual dependencies. Calling getContextImpl() would set
723+
// their context, which we wish to avoid as it is expensive.
724+
// NOP events also don't represent actual dependencies.
725+
if ((!SyclEventImplPtr->isContextInitialized() &&
726+
!SyclEventImplPtr->is_host()) ||
727+
SyclEventImplPtr->isNOP()) {
728+
return true;
729+
}
730+
if (SyclEventImplPtr->is_host()) {
731+
return SyclEventImplPtr->isCompleted();
732+
}
733+
// Cross-context dependencies can't be passed to the backend directly.
734+
if (SyclEventImplPtr->getContextImpl() != Context)
735+
return false;
736+
737+
// A nullptr here means that the command does not produce a PI event or it
738+
// hasn't been enqueued yet.
739+
return SyclEventImplPtr->getHandleRef() != nullptr;
740+
}
741+
742+
bool Scheduler::areEventsSafeForSchedulerBypass(
743+
const std::vector<sycl::event> &DepEvents, ContextImplPtr Context) {
744+
745+
return std::all_of(
746+
DepEvents.begin(), DepEvents.end(), [&Context](const sycl::event &Event) {
747+
const EventImplPtr &SyclEventImplPtr = detail::getSyclObjImpl(Event);
748+
return CheckEventReadiness(Context, SyclEventImplPtr);
749+
});
750+
}
751+
752+
bool Scheduler::areEventsSafeForSchedulerBypass(
753+
const std::vector<EventImplPtr> &DepEvents, ContextImplPtr Context) {
754+
755+
return std::all_of(DepEvents.begin(), DepEvents.end(),
756+
[&Context](const EventImplPtr &SyclEventImplPtr) {
757+
return CheckEventReadiness(Context, SyclEventImplPtr);
758+
});
759+
}
760+
719761
} // namespace detail
720762
} // namespace _V1
721763
} // namespace sycl

sycl/source/detail/scheduler/scheduler.hpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -491,6 +491,13 @@ class Scheduler {
491491
const QueueImplPtr &Queue, std::vector<Requirement *> Requirements,
492492
std::vector<detail::EventImplPtr> &Events);
493493

494+
static bool
495+
areEventsSafeForSchedulerBypass(const std::vector<sycl::event> &DepEvents,
496+
ContextImplPtr Context);
497+
static bool
498+
areEventsSafeForSchedulerBypass(const std::vector<EventImplPtr> &DepEvents,
499+
ContextImplPtr Context);
500+
494501
protected:
495502
using RWLockT = std::shared_timed_mutex;
496503
using ReadLockT = std::shared_lock<RWLockT>;

sycl/source/handler.cpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -241,10 +241,12 @@ event handler::finalize() {
241241
}
242242

243243
if (MQueue && !MGraph && !MSubgraphNode && !MQueue->getCommandGraph() &&
244-
!MQueue->is_in_fusion_mode() &&
245-
CGData.MRequirements.size() + CGData.MEvents.size() +
246-
MStreamStorage.size() ==
247-
0) {
244+
!MQueue->is_in_fusion_mode() && !CGData.MRequirements.size() &&
245+
!MStreamStorage.size() &&
246+
(!CGData.MEvents.size() ||
247+
(MQueue->isInOrder() &&
248+
detail::Scheduler::areEventsSafeForSchedulerBypass(
249+
CGData.MEvents, MQueue->getContextImplPtr())))) {
248250
// if user does not add a new dependency to the dependency graph, i.e.
249251
// the graph is not changed, and the queue is not in fusion mode, then
250252
// this faster path is used to submit kernel bypassing scheduler and

sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp

Lines changed: 89 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -312,74 +312,95 @@ TEST_F(CommandGraphTest, InOrderQueueWithPreviousHostTask) {
312312
}
313313

314314
TEST_F(CommandGraphTest, InOrderQueueHostTaskAndGraph) {
315-
sycl::property_list Properties{sycl::property::queue::in_order()};
316-
sycl::queue InOrderQueue{Dev, Properties};
317-
experimental::command_graph<experimental::graph_state::modifiable>
318-
InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
319-
320-
auto EventInitial =
321-
InOrderQueue.submit([&](handler &CGH) { CGH.host_task([=]() {}); });
322-
auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial);
323-
324-
// Record in-order queue with three nodes.
325-
InOrderGraph.begin_recording(InOrderQueue);
326-
auto Node1Graph = InOrderQueue.submit(
327-
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
328-
329-
auto PtrNode1 =
330-
sycl::detail::getSyclObjImpl(InOrderGraph)
331-
->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
332-
ASSERT_NE(PtrNode1, nullptr);
333-
ASSERT_TRUE(PtrNode1->MPredecessors.empty());
334-
335-
auto Node2Graph = InOrderQueue.submit(
336-
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
337-
338-
auto PtrNode2 =
339-
sycl::detail::getSyclObjImpl(InOrderGraph)
340-
->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
341-
ASSERT_NE(PtrNode2, nullptr);
342-
ASSERT_NE(PtrNode2, PtrNode1);
343-
ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
344-
ASSERT_EQ(PtrNode1->MSuccessors.front().lock(), PtrNode2);
345-
ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
346-
ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);
347-
348-
auto Node3Graph = InOrderQueue.submit(
349-
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
350-
351-
auto PtrNode3 =
352-
sycl::detail::getSyclObjImpl(InOrderGraph)
353-
->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
354-
ASSERT_NE(PtrNode3, nullptr);
355-
ASSERT_NE(PtrNode3, PtrNode2);
356-
ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
357-
ASSERT_EQ(PtrNode2->MSuccessors.front().lock(), PtrNode3);
358-
ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
359-
ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);
360-
361-
InOrderGraph.end_recording(InOrderQueue);
362-
363-
auto InOrderGraphExec = InOrderGraph.finalize();
364-
auto EventGraph = InOrderQueue.submit(
365-
[&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); });
366-
367-
auto EventGraphImpl = sycl::detail::getSyclObjImpl(EventGraph);
368-
auto EventGraphWaitList = EventGraphImpl->getWaitList();
369-
// Previous task is a host task. Explicit dependency is needed to enforce the
370-
// execution order.
371-
ASSERT_EQ(EventGraphWaitList.size(), 1lu);
372-
ASSERT_EQ(EventGraphWaitList[0], EventInitialImpl);
373-
374-
auto EventLast = InOrderQueue.submit(
375-
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
376-
auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast);
377-
auto EventLastWaitList = EventLastImpl->getWaitList();
378-
// Previous task is not a host task. Explicit dependency is still needed
379-
// to properly handle blocked tasks (the event will be filtered out before
380-
// submission to the backend).
381-
ASSERT_EQ(EventLastWaitList.size(), 1lu);
382-
ASSERT_EQ(EventLastWaitList[0], EventGraphImpl);
315+
auto TestBody = [&](bool BlockHostTask) {
316+
sycl::property_list Properties{sycl::property::queue::in_order()};
317+
sycl::queue InOrderQueue{Dev, Properties};
318+
experimental::command_graph<experimental::graph_state::modifiable>
319+
InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
320+
// Event dependency build depends on host task completion. Making it
321+
// predictable with mutex in host task.
322+
std::mutex HostTaskMutex;
323+
std::unique_lock<std::mutex> Lock(HostTaskMutex, std::defer_lock);
324+
if (BlockHostTask)
325+
Lock.lock();
326+
auto EventInitial = InOrderQueue.submit([&](handler &CGH) {
327+
CGH.host_task([&HostTaskMutex]() {
328+
std::lock_guard<std::mutex> HostTaskLock(HostTaskMutex);
329+
});
330+
});
331+
auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial);
332+
333+
// Record in-order queue with three nodes.
334+
InOrderGraph.begin_recording(InOrderQueue);
335+
auto Node1Graph = InOrderQueue.submit(
336+
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
337+
338+
auto PtrNode1 =
339+
sycl::detail::getSyclObjImpl(InOrderGraph)
340+
->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
341+
ASSERT_NE(PtrNode1, nullptr);
342+
ASSERT_TRUE(PtrNode1->MPredecessors.empty());
343+
344+
auto Node2Graph = InOrderQueue.submit(
345+
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
346+
347+
auto PtrNode2 =
348+
sycl::detail::getSyclObjImpl(InOrderGraph)
349+
->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
350+
ASSERT_NE(PtrNode2, nullptr);
351+
ASSERT_NE(PtrNode2, PtrNode1);
352+
ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
353+
ASSERT_EQ(PtrNode1->MSuccessors.front().lock(), PtrNode2);
354+
ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
355+
ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);
356+
357+
auto Node3Graph = InOrderQueue.submit(
358+
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
359+
360+
auto PtrNode3 =
361+
sycl::detail::getSyclObjImpl(InOrderGraph)
362+
->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
363+
ASSERT_NE(PtrNode3, nullptr);
364+
ASSERT_NE(PtrNode3, PtrNode2);
365+
ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
366+
ASSERT_EQ(PtrNode2->MSuccessors.front().lock(), PtrNode3);
367+
ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
368+
ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);
369+
370+
InOrderGraph.end_recording(InOrderQueue);
371+
372+
auto InOrderGraphExec = InOrderGraph.finalize();
373+
374+
if (!BlockHostTask)
375+
EventInitial.wait();
376+
auto EventGraph = InOrderQueue.submit(
377+
[&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); });
378+
379+
auto EventGraphImpl = sycl::detail::getSyclObjImpl(EventGraph);
380+
auto EventGraphWaitList = EventGraphImpl->getWaitList();
381+
// Previous task is a host task. Explicit dependency is needed to enforce
382+
// the execution order.
383+
ASSERT_EQ(EventGraphWaitList.size(), 1lu);
384+
ASSERT_EQ(EventGraphWaitList[0], EventInitialImpl);
385+
386+
auto EventLast = InOrderQueue.submit(
387+
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
388+
auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast);
389+
auto EventLastWaitList = EventLastImpl->getWaitList();
390+
// Previous task is not a host task. Explicit dependency is still needed
391+
// to properly handle blocked tasks (the event will be filtered out before
392+
// submission to the backend).
393+
if (BlockHostTask)
394+
Lock.unlock();
395+
ASSERT_EQ(EventLastWaitList.size(), size_t(BlockHostTask));
396+
if (EventLastWaitList.size()) {
397+
ASSERT_EQ(EventLastWaitList[0], EventGraphImpl);
398+
}
399+
EventLast.wait();
400+
};
401+
402+
TestBody(false);
403+
TestBody(true);
383404
}
384405

385406
TEST_F(CommandGraphTest, InOrderQueueMemsetAndGraph) {

0 commit comments

Comments
 (0)