Skip to content

[SYCL] Lower queue::wait() to piQueueFinish when possible #4044

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jul 8, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions sycl/source/detail/event_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,8 +200,7 @@ void event_impl::wait(
waitInternal();
else if (MCommand)
detail::Scheduler::getInstance().waitForEvent(Self);
if (MCommand && !SYCLConfig<SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP>::get())
detail::Scheduler::getInstance().cleanupFinishedCommands(std::move(Self));
cleanupCommand(std::move(Self));

#ifdef XPTI_ENABLE_INSTRUMENTATION
instrumentationEpilog(TelemetryEvent, Name, StreamID, IId);
Expand All @@ -222,6 +221,12 @@ void event_impl::wait_and_throw(
Cmd->getQueue()->throw_asynchronous();
}

void event_impl::cleanupCommand(
std::shared_ptr<cl::sycl::detail::event_impl> Self) const {
if (MCommand && !SYCLConfig<SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP>::get())
detail::Scheduler::getInstance().cleanupFinishedCommands(std::move(Self));
}

template <>
cl_ulong
event_impl::get_profiling_info<info::event_profiling::command_submit>() const {
Expand Down
6 changes: 6 additions & 0 deletions sycl/source/detail/event_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,12 @@ class event_impl {
/// \param Self is a pointer to this event.
void wait_and_throw(std::shared_ptr<cl::sycl::detail::event_impl> Self);

/// Clean up the command associated with the event. Assumes that the task this
/// event is associated with has been completed.
///
/// \param Self is a pointer to this event.
void cleanupCommand(std::shared_ptr<cl::sycl::detail::event_impl> Self) const;

/// Queries this event for profiling information.
///
/// If the requested info is not available when this member function is
Expand Down
86 changes: 68 additions & 18 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,11 @@ event queue_impl::memset(const std::shared_ptr<detail::queue_impl> &Self,
return event();

event ResEvent = prepareUSMEvent(Self, NativeEvent);
addSharedEvent(ResEvent);
// Track only if we won't be able to handle it with piQueueFinish.
// FIXME these events are stored for level zero until as a workaround, remove
// once piEventRelease no longer calls wait on the event in the plugin.
if (!MSupportOOO || getPlugin().getBackend() == backend::level_zero)
addSharedEvent(ResEvent);
return ResEvent;
}

Expand All @@ -76,7 +80,11 @@ event queue_impl::memcpy(const std::shared_ptr<detail::queue_impl> &Self,
return event();

event ResEvent = prepareUSMEvent(Self, NativeEvent);
addSharedEvent(ResEvent);
// Track only if we won't be able to handle it with piQueueFinish.
// FIXME these events are stored for level zero until as a workaround, remove
// once piEventRelease no longer calls wait on the event in the plugin.
if (!MSupportOOO || getPlugin().getBackend() == backend::level_zero)
addSharedEvent(ResEvent);
return ResEvent;
}

Expand All @@ -92,7 +100,11 @@ event queue_impl::mem_advise(const std::shared_ptr<detail::queue_impl> &Self,
return event();

event ResEvent = prepareUSMEvent(Self, NativeEvent);
addSharedEvent(ResEvent);
// Track only if we won't be able to handle it with piQueueFinish.
// FIXME these events are stored for level zero until as a workaround, remove
// once piEventRelease no longer calls wait on the event in the plugin.
if (!MSupportOOO || getPlugin().getBackend() == backend::level_zero)
addSharedEvent(ResEvent);
return ResEvent;
}

Expand All @@ -101,8 +113,14 @@ void queue_impl::addEvent(const event &Event) {
Command *Cmd = (Command *)(Eimpl->getCommand());
if (!Cmd) {
// if there is no command on the event, we cannot track it with MEventsWeak
// as that will leave it with no owner. Track in MEventsShared
addSharedEvent(Event);
// as that will leave it with no owner. Track in MEventsShared only if we're
// unable to call piQueueFinish during wait.
// FIXME these events are stored for level zero until as a workaround,
// remove once piEventRelease no longer calls wait on the event in the
// plugin.
if (is_host() || !MSupportOOO ||
getPlugin().getBackend() == backend::level_zero)
addSharedEvent(Event);
} else {
std::weak_ptr<event_impl> EventWeakPtr{Eimpl};
std::lock_guard<std::mutex> Lock{MMutex};
Expand All @@ -114,6 +132,10 @@ void queue_impl::addEvent(const event &Event) {
/// but some events have no other owner. In this case,
/// addSharedEvent will have the queue track the events via a shared pointer.
void queue_impl::addSharedEvent(const event &Event) {
// FIXME The assertion should be corrected once the Level Zero workaround is
// removed.
assert(is_host() || !MSupportOOO ||
getPlugin().getBackend() == backend::level_zero);
std::lock_guard<std::mutex> Lock(MMutex);
// Events stored in MEventsShared are not released anywhere else aside from
// calls to queue::wait/wait_and_throw, which a user application might not
Expand Down Expand Up @@ -234,21 +256,49 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
TelemetryEvent = instrumentationProlog(CodeLoc, Name, StreamID, IId);
#endif

std::vector<std::weak_ptr<event_impl>> Events;
std::vector<event> USMEvents;
std::vector<std::weak_ptr<event_impl>> WeakEvents;
std::vector<event> SharedEvents;
{
std::lock_guard<std::mutex> Lock(MMutex);
Events.swap(MEventsWeak);
USMEvents.swap(MEventsShared);
std::lock_guard<mutex_class> Lock(MMutex);
WeakEvents.swap(MEventsWeak);
SharedEvents.swap(MEventsShared);
}
// If the queue is either a host one or does not support OOO (and we use
// multiple in-order queues as a result of that), wait for each event
// directly. Otherwise, only wait for unenqueued or host task events, starting
// from the latest submitted task in order to minimize total amount of calls,
// then handle the rest with piQueueFinish.
bool SupportsPiFinish = !is_host() && MSupportOOO;
for (auto EventImplWeakPtrIt = WeakEvents.rbegin();
EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) {
if (std::shared_ptr<event_impl> EventImplSharedPtr =
EventImplWeakPtrIt->lock()) {
// A nullptr PI event indicates that piQueueFinish will not cover it,
// either because it's a host task event or an unenqueued one.
if (!SupportsPiFinish || nullptr == EventImplSharedPtr->getHandleRef()) {
EventImplSharedPtr->wait(EventImplSharedPtr);
}
}
}
if (SupportsPiFinish) {
const detail::plugin &Plugin = getPlugin();
Plugin.call<detail::PiApiKind::piQueueFinish>(getHandleRef());
for (std::weak_ptr<event_impl> &EventImplWeakPtr : WeakEvents)
if (std::shared_ptr<event_impl> EventImplSharedPtr =
EventImplWeakPtr.lock())
EventImplSharedPtr->cleanupCommand(EventImplSharedPtr);
// FIXME these events are stored for level zero until as a workaround,
// remove once piEventRelease no longer calls wait on the event in the
// plugin.
if (Plugin.getBackend() == backend::level_zero) {
SharedEvents.clear();
}
assert(SharedEvents.empty() && "Queues that support calling piQueueFinish "
"shouldn't have shared events");
} else {
for (event &Event : SharedEvents)
Event.wait();
}

for (std::weak_ptr<event_impl> &EventImplWeakPtr : Events)
if (std::shared_ptr<event_impl> EventImplPtr = EventImplWeakPtr.lock())
EventImplPtr->wait(EventImplPtr);

for (event &Event : USMEvents)
Event.wait();

#ifdef XPTI_ENABLE_INSTRUMENTATION
instrumentationEpilog(TelemetryEvent, Name, StreamID, IId);
#endif
Expand Down
24 changes: 20 additions & 4 deletions sycl/source/detail/scheduler/commands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -486,10 +486,14 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep) {
const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr();

// 1. Async work is not supported for host device.
// 2. The event handle can be null in case of, for example, alloca command,
// which is currently synchronous, so don't generate OpenCL event.
// Though, this event isn't host one as it's context isn't host one.
if (DepEvent->is_host() || DepEvent->getHandleRef() == nullptr) {
// 2. Some types of commands do not produce PI events after they are enqueued
// (e.g. alloca). Note that we can't check the pi event to make that
// distinction since the command might still be unenqueued at this point.
bool PiEventExpected = !DepEvent->is_host();
if (auto *DepCmd = static_cast<Command *>(DepEvent->getCommand()))
PiEventExpected &= DepCmd->producesPiEvent();

if (!PiEventExpected) {
// call to waitInternal() is in waitForPreparedHostEvents() as it's called
// from enqueue process functions
MPreparedHostDepsEvents.push_back(DepEvent);
Expand Down Expand Up @@ -520,6 +524,8 @@ const ContextImplPtr &Command::getWorkerContext() const {

const QueueImplPtr &Command::getWorkerQueue() const { return MQueue; }

bool Command::producesPiEvent() const { return true; }

Command *Command::addDep(DepDesc NewDep) {
Command *ConnectionCmd = nullptr;

Expand Down Expand Up @@ -731,6 +737,8 @@ void AllocaCommandBase::emitInstrumentationData() {
#endif
}

bool AllocaCommandBase::producesPiEvent() const { return false; }

AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req,
bool InitFromUserData,
AllocaCommandBase *LinkedAllocaCmd)
Expand Down Expand Up @@ -998,6 +1006,8 @@ void ReleaseCommand::printDot(std::ostream &Stream) const {
}
}

bool ReleaseCommand::producesPiEvent() const { return false; }

MapMemObject::MapMemObject(AllocaCommandBase *SrcAllocaCmd, Requirement Req,
void **DstPtr, QueueImplPtr Queue,
access::mode MapMode)
Expand Down Expand Up @@ -1392,6 +1402,8 @@ void EmptyCommand::printDot(std::ostream &Stream) const {
}
}

bool EmptyCommand::producesPiEvent() const { return false; }

void MemCpyCommandHost::printDot(std::ostream &Stream) const {
Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\"";

Expand Down Expand Up @@ -2193,6 +2205,10 @@ cl_int ExecCGCommand::enqueueImp() {
return PI_INVALID_OPERATION;
}

bool ExecCGCommand::producesPiEvent() const {
return MCommandGroup->getType() != CG::CGTYPE::CODEPLAY_HOST_TASK;
}

} // namespace detail
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)
10 changes: 10 additions & 0 deletions sycl/source/detail/scheduler/commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,9 @@ class Command {
/// for memory copy commands.
virtual const QueueImplPtr &getWorkerQueue() const;

/// Returns true iff the command produces a PI event on non-host devices.
virtual bool producesPiEvent() const;

protected:
EventImplPtr MEvent;
QueueImplPtr MQueue;
Expand Down Expand Up @@ -306,6 +309,8 @@ class EmptyCommand : public Command {

void emitInstrumentationData() override;

bool producesPiEvent() const final;

private:
cl_int enqueueImp() final;

Expand All @@ -323,6 +328,7 @@ class ReleaseCommand : public Command {

void printDot(std::ostream &Stream) const final;
void emitInstrumentationData() override;
bool producesPiEvent() const final;

private:
cl_int enqueueImp() final;
Expand All @@ -347,6 +353,8 @@ class AllocaCommandBase : public Command {

void emitInstrumentationData() override;

bool producesPiEvent() const final;

void *MMemAllocation = nullptr;

/// Alloca command linked with current command.
Expand Down Expand Up @@ -518,6 +526,8 @@ class ExecCGCommand : public Command {
MCommandGroup.release();
}

bool producesPiEvent() const final;

private:
cl_int enqueueImp() final;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,10 +25,10 @@
// CHECK: ZE ---> zeCommandListClose
// CHECK: ZE ---> zeCommandQueueExecuteCommandLists
// CHECK: ---> piEventGetInfo
// CHECK-NOT: piEventsWait
// CHECK-NOT: piQueueFinish
// CHECK: ---> piEnqueueKernelLaunch
// CHECK: ZE ---> zeCommandListAppendLaunchKernel
// CHECK: ---> piEventsWait
// CHECK: ---> piQueueFinish
// Look for close and Execute after piEventsWait
// CHECK: ZE ---> zeCommandListClose
// CHECK: ZE ---> zeCommandQueueExecuteCommandLists
Expand Down
6 changes: 3 additions & 3 deletions sycl/test/on-device/plugins/level_zero_batch_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@
// CKB4: ZE ---> zeCommandQueueExecuteCommandLists(
// CKB8: ZE ---> zeCommandListClose(
// CKB8: ZE ---> zeCommandQueueExecuteCommandLists(
// CKALL: ---> piEventsWait(
// CKALL: ---> piQueueFinish(
// CKB3: ZE ---> zeCommandListClose(
// CKB3: ZE ---> zeCommandQueueExecuteCommandLists(
// CKB5: ZE ---> zeCommandListClose(
Expand Down Expand Up @@ -142,7 +142,7 @@
// CKB4: ZE ---> zeCommandQueueExecuteCommandLists(
// CKB8: ZE ---> zeCommandListClose(
// CKB8: ZE ---> zeCommandQueueExecuteCommandLists(
// CKALL: ---> piEventsWait(
// CKALL: ---> piQueueFinish(
// CKB3: ZE ---> zeCommandListClose(
// CKB3: ZE ---> zeCommandQueueExecuteCommandLists(
// CKB5: ZE ---> zeCommandListClose(
Expand Down Expand Up @@ -198,7 +198,7 @@
// CKB4: ZE ---> zeCommandQueueExecuteCommandLists(
// CKB8: ZE ---> zeCommandListClose(
// CKB8: ZE ---> zeCommandQueueExecuteCommandLists(
// CKALL: ---> piEventsWait(
// CKALL: ---> piQueueFinish(
// CKB3: ZE ---> zeCommandListClose(
// CKB3: ZE ---> zeCommandQueueExecuteCommandLists(
// CKB5: ZE ---> zeCommandListClose(
Expand Down
1 change: 1 addition & 0 deletions sycl/unittests/queue/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
add_sycl_unittest(QueueTests OBJECT
EventClear.cpp
Wait.cpp
)
15 changes: 15 additions & 0 deletions sycl/unittests/queue/EventClear.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,19 @@ std::unique_ptr<TestCtx> TestContext;

const int ExpectedEventThreshold = 128;

pi_result redefinedQueueCreate(pi_context context, pi_device device,
pi_queue_properties properties,
pi_queue *queue) {
// Use in-order queues to force storing events for calling wait on them,
// rather than calling piQueueFinish.
if (properties & PI_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) {
return PI_INVALID_QUEUE_PROPERTIES;
}
return PI_SUCCESS;
}

pi_result redefinedQueueRelease(pi_queue Queue) { return PI_SUCCESS; }

pi_result redefinedUSMEnqueueMemset(pi_queue queue, void *ptr, pi_int32 value,
size_t count,
pi_uint32 num_events_in_waitlist,
Expand Down Expand Up @@ -83,6 +96,8 @@ bool preparePiMock(platform &Plt) {
}

unittest::PiMock Mock{Plt};
Mock.redefine<detail::PiApiKind::piQueueCreate>(redefinedQueueCreate);
Mock.redefine<detail::PiApiKind::piQueueRelease>(redefinedQueueRelease);
Mock.redefine<detail::PiApiKind::piextUSMEnqueueMemset>(
redefinedUSMEnqueueMemset);
Mock.redefine<detail::PiApiKind::piEventsWait>(redefinedEventsWait);
Expand Down
Loading