[SYCL] Fix kernel shortcut path for inorder queue #13333

Merged (5 commits) on Apr 16, 2024
35 changes: 2 additions & 33 deletions sycl/source/detail/queue_impl.cpp
@@ -322,37 +322,6 @@ void queue_impl::addSharedEvent(const event &Event) {
MEventsShared.push_back(Event);
}

static bool
areEventsSafeForSchedulerBypass(const std::vector<sycl::event> &DepEvents,
ContextImplPtr Context) {
auto CheckEvent = [&Context](const sycl::event &Event) {
const EventImplPtr &SyclEventImplPtr = detail::getSyclObjImpl(Event);
// Events that don't have an initialized context are throwaway events that
// don't represent actual dependencies. Calling getContextImpl() would set
// their context, which we wish to avoid as it is expensive.
// NOP events also don't represent actual dependencies.
if ((!SyclEventImplPtr->isContextInitialized() &&
!SyclEventImplPtr->is_host()) ||
SyclEventImplPtr->isNOP()) {
return true;
}
if (SyclEventImplPtr->is_host()) {
return SyclEventImplPtr->isCompleted();
}
// Cross-context dependencies can't be passed to the backend directly.
if (SyclEventImplPtr->getContextImpl() != Context)
return false;

// A nullptr here means that the command does not produce a PI event or it
// hasn't been enqueued yet.
return SyclEventImplPtr->getHandleRef() != nullptr;
};

return std::all_of(
DepEvents.begin(), DepEvents.end(),
[&CheckEvent](const sycl::event &Event) { return CheckEvent(Event); });
}

template <typename HandlerFuncT>
event queue_impl::submitWithHandler(const std::shared_ptr<queue_impl> &Self,
const std::vector<event> &DepEvents,
@@ -382,8 +351,8 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr<queue_impl> &Self,

// If we have a command graph set we need to capture the op through the
// handler rather than by-passing the scheduler.
if (MGraph.expired() &&
areEventsSafeForSchedulerBypass(ExpandedDepEvents, MContext)) {
if (MGraph.expired() && Scheduler::areEventsSafeForSchedulerBypass(
ExpandedDepEvents, MContext)) {
if (MSupportsDiscardingPiEvents) {
MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents),
/*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr);
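For orientation, the memory-operation submission in submitMemOpHelper now reduces to a three-way decision: bypass the scheduler and discard the backend event, bypass but keep the event, or go through the handler. A minimal sketch follows; ChooseMemOpPath and SubmitPath are hypothetical names used for illustration, not part of the runtime.

enum class SubmitPath { BypassDiscardEvent, BypassKeepEvent, ViaHandler };

// Hypothetical helper mirroring the branch structure in submitMemOpHelper:
// the fast path requires an expired graph and scheduler-bypass-safe deps.
SubmitPath ChooseMemOpPath(bool GraphExpired, bool DepsSafeForBypass,
                           bool SupportsDiscardingPiEvents) {
  if (GraphExpired && DepsSafeForBypass) {
    // Discard the backend event only when the queue supports doing so.
    return SupportsDiscardingPiEvents ? SubmitPath::BypassDiscardEvent
                                      : SubmitPath::BypassKeepEvent;
  }
  // A recording graph or unsafe dependencies force the handler/scheduler path.
  return SubmitPath::ViaHandler;
}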
1 change: 0 additions & 1 deletion sycl/source/detail/queue_impl.hpp
@@ -783,7 +783,6 @@ class queue_impl {
EventRet = Handler.finalize();
}

protected:
/// Performs command group submission to the queue.
///
/// \param CGF is a function object containing command group.
42 changes: 42 additions & 0 deletions sycl/source/detail/scheduler/scheduler.cpp
@@ -716,6 +716,48 @@ EventImplPtr Scheduler::addCommandGraphUpdate(
return NewCmdEvent;
}

bool CheckEventReadiness(const ContextImplPtr &Context,
const EventImplPtr &SyclEventImplPtr) {
// Events that don't have an initialized context are throwaway events that
// don't represent actual dependencies. Calling getContextImpl() would set
// their context, which we wish to avoid as it is expensive.
// NOP events also don't represent actual dependencies.
if ((!SyclEventImplPtr->isContextInitialized() &&
!SyclEventImplPtr->is_host()) ||
SyclEventImplPtr->isNOP()) {
return true;
}
if (SyclEventImplPtr->is_host()) {
return SyclEventImplPtr->isCompleted();
}
// Cross-context dependencies can't be passed to the backend directly.
if (SyclEventImplPtr->getContextImpl() != Context)
return false;

// A nullptr here means that the command does not produce a PI event or it
// hasn't been enqueued yet.
return SyclEventImplPtr->getHandleRef() != nullptr;
}

bool Scheduler::areEventsSafeForSchedulerBypass(
const std::vector<sycl::event> &DepEvents, ContextImplPtr Context) {

return std::all_of(
DepEvents.begin(), DepEvents.end(), [&Context](const sycl::event &Event) {
const EventImplPtr &SyclEventImplPtr = detail::getSyclObjImpl(Event);
return CheckEventReadiness(Context, SyclEventImplPtr);
});
}

bool Scheduler::areEventsSafeForSchedulerBypass(
const std::vector<EventImplPtr> &DepEvents, ContextImplPtr Context) {

return std::all_of(DepEvents.begin(), DepEvents.end(),
[&Context](const EventImplPtr &SyclEventImplPtr) {
return CheckEventReadiness(Context, SyclEventImplPtr);
});
}

} // namespace detail
} // namespace _V1
} // namespace sycl
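The readiness rules factored into CheckEventReadiness can be read on their own. Below is a minimal standalone sketch, assuming invented stand-in types (FakeContext, FakeEvent, and their fields are not the real detail::event_impl API); it reproduces the same ordering of checks and the std::all_of reduction used by both overloads.

#include <algorithm>
#include <memory>
#include <vector>

struct FakeContext {};
using ContextPtr = std::shared_ptr<FakeContext>;

// Invented stand-in for the event state consulted by CheckEventReadiness.
struct FakeEvent {
  bool ContextInitialized = true;
  bool IsHost = false;
  bool IsNOP = false;
  bool Completed = false;
  ContextPtr Context;
  void *NativeHandle = nullptr;
};
using EventPtr = std::shared_ptr<FakeEvent>;

bool IsEventReady(const ContextPtr &Ctx, const EventPtr &Ev) {
  // Uninitialized-context (non-host) events and NOP events carry no real
  // dependency, so they never block the bypass.
  if ((!Ev->ContextInitialized && !Ev->IsHost) || Ev->IsNOP)
    return true;
  // Host events are safe only once they have completed.
  if (Ev->IsHost)
    return Ev->Completed;
  // Cross-context dependencies cannot be handed to the backend directly.
  if (Ev->Context != Ctx)
    return false;
  // A missing native handle means the producing command has not been
  // enqueued yet, or produces no backend event at all.
  return Ev->NativeHandle != nullptr;
}

bool AreEventsSafeForBypass(const std::vector<EventPtr> &Deps,
                            const ContextPtr &Ctx) {
  return std::all_of(Deps.begin(), Deps.end(), [&Ctx](const EventPtr &Ev) {
    return IsEventReady(Ctx, Ev);
  });
}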
7 changes: 7 additions & 0 deletions sycl/source/detail/scheduler/scheduler.hpp
@@ -491,6 +491,13 @@ class Scheduler {
const QueueImplPtr &Queue, std::vector<Requirement *> Requirements,
std::vector<detail::EventImplPtr> &Events);

static bool
areEventsSafeForSchedulerBypass(const std::vector<sycl::event> &DepEvents,
ContextImplPtr Context);
static bool
areEventsSafeForSchedulerBypass(const std::vector<EventImplPtr> &DepEvents,
ContextImplPtr Context);

protected:
using RWLockT = std::shared_timed_mutex;
using ReadLockT = std::shared_lock<RWLockT>;
10 changes: 6 additions & 4 deletions sycl/source/handler.cpp
@@ -241,10 +241,12 @@ event handler::finalize() {
}

if (MQueue && !MGraph && !MSubgraphNode && !MQueue->getCommandGraph() &&
!MQueue->is_in_fusion_mode() &&
CGData.MRequirements.size() + CGData.MEvents.size() +
MStreamStorage.size() ==
0) {
!MQueue->is_in_fusion_mode() && !CGData.MRequirements.size() &&
!MStreamStorage.size() &&
(!CGData.MEvents.size() ||
(MQueue->isInOrder() &&
detail::Scheduler::areEventsSafeForSchedulerBypass(
CGData.MEvents, MQueue->getContextImplPtr())))) {
// if user does not add a new dependency to the dependency graph, i.e.
// the graph is not changed, and the queue is not in fusion mode, then
// this faster path is used to submit kernel bypassing scheduler and
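This handler change is the core of the fix: previously any dependency event disabled the kernel shortcut, whereas now an in-order queue keeps it as long as every dependency passes the scheduler-bypass check. A condensed restatement of the new predicate, with hypothetical parameter names standing in for the real handler and queue members, might look like:

#include <cstddef>

// Hypothetical restatement of the widened fast-path condition in
// handler::finalize(); the parameters stand in for handler/queue state.
bool CanTakeKernelShortcut(bool HasGraph, bool InFusionMode,
                           std::size_t NumRequirements, std::size_t NumStreams,
                           bool QueueIsInOrder, bool DepsEmpty,
                           bool DepsSafeForBypass) {
  if (HasGraph || InFusionMode || NumRequirements != 0 || NumStreams != 0)
    return false;
  // Either there are no dependency events at all, or the queue is in-order
  // and every dependency can be bypassed safely.
  return DepsEmpty || (QueueIsInOrder && DepsSafeForBypass);
}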
157 changes: 89 additions & 68 deletions sycl/unittests/Extensions/CommandGraph/InOrderQueue.cpp
@@ -312,74 +312,95 @@ TEST_F(CommandGraphTest, InOrderQueueWithPreviousHostTask) {
}

TEST_F(CommandGraphTest, InOrderQueueHostTaskAndGraph) {
sycl::property_list Properties{sycl::property::queue::in_order()};
sycl::queue InOrderQueue{Dev, Properties};
experimental::command_graph<experimental::graph_state::modifiable>
InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};

auto EventInitial =
InOrderQueue.submit([&](handler &CGH) { CGH.host_task([=]() {}); });
auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial);

// Record in-order queue with three nodes.
InOrderGraph.begin_recording(InOrderQueue);
auto Node1Graph = InOrderQueue.submit(
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });

auto PtrNode1 =
sycl::detail::getSyclObjImpl(InOrderGraph)
->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
ASSERT_NE(PtrNode1, nullptr);
ASSERT_TRUE(PtrNode1->MPredecessors.empty());

auto Node2Graph = InOrderQueue.submit(
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });

auto PtrNode2 =
sycl::detail::getSyclObjImpl(InOrderGraph)
->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
ASSERT_NE(PtrNode2, nullptr);
ASSERT_NE(PtrNode2, PtrNode1);
ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
ASSERT_EQ(PtrNode1->MSuccessors.front().lock(), PtrNode2);
ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);

auto Node3Graph = InOrderQueue.submit(
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });

auto PtrNode3 =
sycl::detail::getSyclObjImpl(InOrderGraph)
->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
ASSERT_NE(PtrNode3, nullptr);
ASSERT_NE(PtrNode3, PtrNode2);
ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
ASSERT_EQ(PtrNode2->MSuccessors.front().lock(), PtrNode3);
ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);

InOrderGraph.end_recording(InOrderQueue);

auto InOrderGraphExec = InOrderGraph.finalize();
auto EventGraph = InOrderQueue.submit(
[&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); });

auto EventGraphImpl = sycl::detail::getSyclObjImpl(EventGraph);
auto EventGraphWaitList = EventGraphImpl->getWaitList();
// Previous task is a host task. Explicit dependency is needed to enforce the
// execution order.
ASSERT_EQ(EventGraphWaitList.size(), 1lu);
ASSERT_EQ(EventGraphWaitList[0], EventInitialImpl);

auto EventLast = InOrderQueue.submit(
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast);
auto EventLastWaitList = EventLastImpl->getWaitList();
// Previous task is not a host task. Explicit dependency is still needed
// to properly handle blocked tasks (the event will be filtered out before
// submission to the backend).
ASSERT_EQ(EventLastWaitList.size(), 1lu);
ASSERT_EQ(EventLastWaitList[0], EventGraphImpl);
auto TestBody = [&](bool BlockHostTask) {
sycl::property_list Properties{sycl::property::queue::in_order()};
sycl::queue InOrderQueue{Dev, Properties};
experimental::command_graph<experimental::graph_state::modifiable>
InOrderGraph{InOrderQueue.get_context(), InOrderQueue.get_device()};
// Whether an event dependency is recorded depends on host-task completion.
// Blocking the host task with a mutex makes that outcome deterministic.
std::mutex HostTaskMutex;
std::unique_lock<std::mutex> Lock(HostTaskMutex, std::defer_lock);
if (BlockHostTask)
Lock.lock();
auto EventInitial = InOrderQueue.submit([&](handler &CGH) {
CGH.host_task([&HostTaskMutex]() {
std::lock_guard<std::mutex> HostTaskLock(HostTaskMutex);
});
});
auto EventInitialImpl = sycl::detail::getSyclObjImpl(EventInitial);

// Record in-order queue with three nodes.
InOrderGraph.begin_recording(InOrderQueue);
auto Node1Graph = InOrderQueue.submit(
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });

auto PtrNode1 =
sycl::detail::getSyclObjImpl(InOrderGraph)
->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
ASSERT_NE(PtrNode1, nullptr);
ASSERT_TRUE(PtrNode1->MPredecessors.empty());

auto Node2Graph = InOrderQueue.submit(
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });

auto PtrNode2 =
sycl::detail::getSyclObjImpl(InOrderGraph)
->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
ASSERT_NE(PtrNode2, nullptr);
ASSERT_NE(PtrNode2, PtrNode1);
ASSERT_EQ(PtrNode1->MSuccessors.size(), 1lu);
ASSERT_EQ(PtrNode1->MSuccessors.front().lock(), PtrNode2);
ASSERT_EQ(PtrNode2->MPredecessors.size(), 1lu);
ASSERT_EQ(PtrNode2->MPredecessors.front().lock(), PtrNode1);

auto Node3Graph = InOrderQueue.submit(
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });

auto PtrNode3 =
sycl::detail::getSyclObjImpl(InOrderGraph)
->getLastInorderNode(sycl::detail::getSyclObjImpl(InOrderQueue));
ASSERT_NE(PtrNode3, nullptr);
ASSERT_NE(PtrNode3, PtrNode2);
ASSERT_EQ(PtrNode2->MSuccessors.size(), 1lu);
ASSERT_EQ(PtrNode2->MSuccessors.front().lock(), PtrNode3);
ASSERT_EQ(PtrNode3->MPredecessors.size(), 1lu);
ASSERT_EQ(PtrNode3->MPredecessors.front().lock(), PtrNode2);

InOrderGraph.end_recording(InOrderQueue);

auto InOrderGraphExec = InOrderGraph.finalize();

if (!BlockHostTask)
EventInitial.wait();
auto EventGraph = InOrderQueue.submit(
[&](sycl::handler &CGH) { CGH.ext_oneapi_graph(InOrderGraphExec); });

auto EventGraphImpl = sycl::detail::getSyclObjImpl(EventGraph);
auto EventGraphWaitList = EventGraphImpl->getWaitList();
// Previous task is a host task. Explicit dependency is needed to enforce
// the execution order.
ASSERT_EQ(EventGraphWaitList.size(), 1lu);
ASSERT_EQ(EventGraphWaitList[0], EventInitialImpl);

auto EventLast = InOrderQueue.submit(
[&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
auto EventLastImpl = sycl::detail::getSyclObjImpl(EventLast);
auto EventLastWaitList = EventLastImpl->getWaitList();
// Previous task is not a host task. Explicit dependency is still needed
// to properly handle blocked tasks (the event will be filtered out before
// submission to the backend).
if (BlockHostTask)
Lock.unlock();
ASSERT_EQ(EventLastWaitList.size(), size_t(BlockHostTask));
if (EventLastWaitList.size()) {
ASSERT_EQ(EventLastWaitList[0], EventGraphImpl);
}
EventLast.wait();
};

TestBody(false);
TestBody(true);
}

TEST_F(CommandGraphTest, InOrderQueueMemsetAndGraph) {