[SYCL] Make host task blocking and detach empty command. Part 1 #6995

Merged: 14 commits, Dec 13, 2022
Changes from all commits
3 changes: 3 additions & 0 deletions sycl/source/detail/event_impl.hpp
@@ -243,6 +243,9 @@ class event_impl {
/// state.
bool isInitialized() const noexcept { return MIsInitialized; }

/// Checks if this event is complete.
///
/// \return true if this event is complete.
bool isCompleted();

void attachEventToComplete(const EventImplPtr &Event) {
96 changes: 46 additions & 50 deletions sycl/source/detail/scheduler/commands.cpp
@@ -321,11 +321,7 @@ class DispatchHostTask {

HostTask.MHostTask.reset();

// unblock user empty command here
EmptyCommand *EmptyCmd = MThisCmd->MEmptyCmd;
assert(EmptyCmd && "No empty command found");

Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd, EmptyCmd);
Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd);
}
};

@@ -349,11 +345,10 @@ void Command::waitForEvents(QueueImplPtr Queue,
// we will have two different contexts for the same CPU device: C1, C2.
// Also we have default host queue. This queue is accessible via
// Scheduler. Now, let's assume we have three different events: E1(C1),
// E2(C1), E3(C2). Also, we have an EmptyCommand which is to be executed
// on host queue. The command's MPreparedDepsEvents will contain all three
// events (E1, E2, E3). Now, if piEventsWait is called for all three
// events we'll experience failure with CL_INVALID_CONTEXT 'cause these
// events refer to different contexts.
// E2(C1), E3(C2). The command's MPreparedDepsEvents will contain all
// three events (E1, E2, E3). Now, if piEventsWait is called for all
// three events we'll experience failure with CL_INVALID_CONTEXT 'cause
// these events refer to different contexts.
std::map<context_impl *, std::vector<EventImplPtr>>
RequiredEventsPerContext;
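
The scenario in the comment above (E1 and E2 from context C1, E3 from C2) is why events are first grouped by context: waiting on each group separately avoids the CL_INVALID_CONTEXT failure. Below is a minimal sketch of that grouping using stand-in types, not the real event_impl/context_impl API.

```cpp
#include <map>
#include <memory>
#include <vector>

// Stand-ins for context_impl and event_impl; illustration only.
struct FakeContext {};
struct FakeEvent {
  FakeContext *Context = nullptr;
};
using FakeEventPtr = std::shared_ptr<FakeEvent>;

// Group events by their context and wait on each group with a separate
// call, so a single wait never mixes events from different contexts.
void waitGroupedByContext(const std::vector<FakeEventPtr> &Events) {
  std::map<FakeContext *, std::vector<FakeEventPtr>> RequiredEventsPerContext;
  for (const FakeEventPtr &Event : Events)
    RequiredEventsPerContext[Event->Context].push_back(Event);

  for (auto &CtxAndEvents : RequiredEventsPerContext) {
    // In the runtime this becomes one piEventsWait call per context.
    (void)CtxAndEvents;
  }
}
```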

@@ -419,19 +414,19 @@ void Command::emitInstrumentationDataProxy() {
/// Method takes in void * for the address as adding a template function to
/// the command group object maybe undesirable.
/// @param Cmd The command object of the source of the edge
/// @param ObjAddr The address that defines the edge dependency; it is the event
/// address when the edge is for an event and a memory object address if it is
/// due to an accessor
/// @param Prefix Contains "event" if the dependency is an edge and contains the
/// access mode to the buffer if it is due to an accessor
/// @param IsCommand True if the dependency has a command object as the source,
/// false otherwise
/// @param ObjAddr The address that defines the edge dependency; it is the
/// event address when the edge is for an event and a memory object address if
/// it is due to an accessor
/// @param Prefix Contains "event" if the dependency is an edge and contains
/// the access mode to the buffer if it is due to an accessor
/// @param IsCommand True if the dependency has a command object as the
/// source, false otherwise
void Command::emitEdgeEventForCommandDependence(
Command *Cmd, void *ObjAddr, bool IsCommand,
std::optional<access::mode> AccMode) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
// Bail early if either the source or the target node for the given dependency
// is undefined or NULL
// Bail early if either the source or the target node for the given
// dependency is undefined or NULL
if (!(xptiTraceEnabled() && MTraceEvent && Cmd && Cmd->MTraceEvent))
return;

@@ -583,11 +578,11 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep,

// 1. Async work is not supported for host device.
// 2. Non-host events can be ignored if they are not fully initialized.
// 3. Some types of commands do not produce PI events after they are enqueued
// 3. Some types of commands do not produce PI events after they are
// enqueued
// (e.g. alloca). Note that we can't check the pi event to make that
// distinction since the command might still be unenqueued at this point.
bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized()) ||
getType() == CommandType::HOST_TASK;
bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized());
if (auto *DepCmd = static_cast<Command *>(DepEvent->getCommand()))
PiEventExpected &= DepCmd->producesPiEvent();

@@ -776,10 +771,10 @@ void Command::resolveReleaseDependencies(std::set<Command *> &DepList) {
// nodes have to be completed first before the current node can begin to
// execute; these edges model control flow
xpti_td *TgtTraceEvent = static_cast<xpti_td *>(MTraceEvent);
// We have all the Commands that must be completed before the release command
// can be enqueued; here we'll find the command that is an Alloca with the
// same SYCLMemObject address and create a dependency line (edge) between them
// in our sematic modeling
// We have all the Commands that must be completed before the release
// command can be enqueued; here we'll find the command that is an Alloca
// with the same SYCLMemObject address and create a dependency line (edge)
// between them in our sematic modeling
for (auto &Item : DepList) {
if (Item->MTraceEvent && Item->MAddress == MAddress) {
xpti::utils::StringHelper SH;
@@ -862,8 +857,8 @@ AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req,
: AllocaCommandBase(CommandType::ALLOCA, std::move(Queue), std::move(Req),
LinkedAllocaCmd, IsConst),
MInitFromUserData(InitFromUserData) {
// Node event must be created before the dependent edge is added to this node,
// so this call must be before the addDep() call.
// Node event must be created before the dependent edge is added to this
// node, so this call must be before the addDep() call.
emitInstrumentationDataProxy();
// "Nothing to depend on"
std::vector<Command *> ToCleanUp;
@@ -1060,14 +1055,15 @@ pi_int32 ReleaseCommand::enqueueImp() {
bool NeedUnmap = false;
if (MAllocaCmd->MLinkedAllocaCmd) {

// When releasing one of the "linked" allocations special rules take place:
// When releasing one of the "linked" allocations special rules take
// place:
// 1. Device allocation should always be released.
// 2. Host allocation should be released if host allocation is "leader".
// 3. Device alloca in the pair should be in active state in order to be
// correctly released.

// There is no actual memory allocation if a host alloca command is created
// being linked to a device allocation.
// There is no actual memory allocation if a host alloca command is
// created being linked to a device allocation.
SkipRelease |= CurAllocaIsHost && !MAllocaCmd->MIsLeaderAlloca;

NeedUnmap |= CurAllocaIsHost == MAllocaCmd->MIsActive;
@@ -1555,7 +1551,8 @@ void EmptyCommand::addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd,
MRequirements.emplace_back(ReqRef);
const Requirement *const StoredReq = &MRequirements.back();

// EmptyCommand is always host one, so we believe that result of addDep is nil
// EmptyCommand is always host one, so we believe that result of addDep is
// nil
std::vector<Command *> ToCleanUp;
Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd}, ToCleanUp);
assert(Cmd == nullptr && "Conection command should be null for EmptyCommand");
@@ -1822,9 +1819,9 @@ void ExecCGCommand::emitInstrumentationData() {
auto KernelBundleImplPtr = KernelCG->getKernelBundle();

// Use kernel_bundle if available unless it is interop.
// Interop bundles can't be used in the first branch, because the kernels
// in interop kernel bundles (if any) do not have kernel_id
// and can therefore not be looked up, but since they are self-contained
// Interop bundles can't be used in the first branch, because the
// kernels in interop kernel bundles (if any) do not have kernel_id and
// can therefore not be looked up, but since they are self-contained
// they can simply be launched directly.
if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
kernel_id KernelID =
@@ -1913,16 +1910,15 @@ void ExecCGCommand::printDot(std::ostream &Stream) const {
}

// SYCL has a parallel_for_work_group variant where the only NDRange
// characteristics set by a user is the number of work groups. This does not map
// to the OpenCL clEnqueueNDRangeAPI, which requires global work size to be set
// as well. This function determines local work size based on the device
// characteristics and the number of work groups requested by the user, then
// calculates the global work size.
// SYCL specification (from 4.8.5.3):
// The member function handler::parallel_for_work_group is parameterized by the
// number of work - groups, such that the size of each group is chosen by the
// runtime, or by the number of work - groups and number of work - items for
// users who need more control.
// characteristics set by a user is the number of work groups. This does not
// map to the OpenCL clEnqueueNDRangeAPI, which requires global work size to
// be set as well. This function determines local work size based on the
// device characteristics and the number of work groups requested by the user,
// then calculates the global work size. SYCL specification (from 4.8.5.3):
// The member function handler::parallel_for_work_group is parameterized by
// the number of work - groups, such that the size of each group is chosen by
// the runtime, or by the number of work - groups and number of work - items
// for users who need more control.
static void adjustNDRangePerKernel(NDRDescT &NDR, RT::PiKernel Kernel,
const device_impl &DeviceImpl) {
if (NDR.GlobalSize[0] != 0)
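
A minimal sketch of the adjustment described in the comment above: the work-group-size query is represented by a plain parameter (the real code asks the device/kernel for a suitable size), and the NDRange type is a simplified stand-in rather than the runtime's NDRDescT.

```cpp
#include <array>
#include <cstddef>

// Simplified stand-in for NDRDescT: only the fields used in this sketch.
struct SimpleNDRDesc {
  std::array<size_t, 3> GlobalSize{0, 0, 0};
  std::array<size_t, 3> NumWorkGroups{0, 0, 0};
  std::array<size_t, 3> LocalSize{0, 0, 0};
};

// If the user only supplied the number of work groups, pick a local size
// (DeviceWGSize stands in for the device/kernel query) and derive the
// global size as LocalSize * NumWorkGroups, which is what
// clEnqueueNDRangeKernel-style APIs require.
void adjustNDRangePerWorkGroupCount(SimpleNDRDesc &NDR, size_t DeviceWGSize) {
  if (NDR.GlobalSize[0] != 0)
    return; // Global size is already set by the user; nothing to adjust.

  NDR.LocalSize = {DeviceWGSize, 1, 1};
  for (int I = 0; I < 3; ++I) {
    size_t WGCount = NDR.NumWorkGroups[I] ? NDR.NumWorkGroups[I] : 1;
    NDR.GlobalSize[I] = NDR.LocalSize[I] * WGCount;
  }
}
```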
@@ -2310,9 +2306,9 @@ pi_int32 ExecCGCommand::enqueueImp() {
}

std::vector<pi_mem> Buffers;
// piEnqueueNativeKernel requires additional array of pointers to args blob,
// values that pointers point to are replaced with actual pointers to the
// memory before execution of user function.
// piEnqueueNativeKernel requires additional array of pointers to args
// blob, values that pointers point to are replaced with actual pointers
// to the memory before execution of user function.
std::vector<void *> MemLocs;

for (ArgDesc &Arg : HostTask->MArgs) {
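
The comment above refers to the native-kernel convention (as in OpenCL's clEnqueueNativeKernel): argument values are copied into one blob, and a parallel array records the locations inside that blob that must be patched with real memory pointers before the user function runs. A small illustrative sketch of that packing step; the names and types here are placeholders, not the PI API.

```cpp
#include <cstring>
#include <vector>

// Illustration only: ArgsBlob holds copies of the argument values and
// MemLocs points at the slots inside ArgsBlob that hold placeholder
// pointers, so they can be overwritten with actual pointers at launch.
struct PackedNativeArgs {
  std::vector<char> ArgsBlob;
  std::vector<void *> MemLocs;
};

void packPointerArgs(const std::vector<void *> &Placeholders,
                     PackedNativeArgs &Out) {
  Out.ArgsBlob.resize(Placeholders.size() * sizeof(void *));
  Out.MemLocs.clear();
  for (size_t I = 0; I < Placeholders.size(); ++I) {
    char *Slot = Out.ArgsBlob.data() + I * sizeof(void *);
    std::memcpy(Slot, &Placeholders[I], sizeof(void *));
    Out.MemLocs.push_back(Slot); // patched with the real pointer at launch
  }
}
```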
@@ -2438,7 +2434,8 @@ pi_int32 ExecCGCommand::enqueueImp() {
const detail::plugin &Plugin = MQueue->getPlugin();
CGInteropTask *ExecInterop = (CGInteropTask *)MCommandGroup.get();
// Wait for dependencies to complete before dispatching work on the host
// TODO: Use a callback to dispatch the interop task instead of waiting for
// TODO: Use a callback to dispatch the interop task instead of waiting
// for
// the event
if (!RawEvents.empty()) {
Plugin.call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
@@ -2564,7 +2561,6 @@ bool ExecCGCommand::supportsPostEnqueueCleanup() const {
!static_cast<CGExecKernel *>(MCommandGroup.get())
->hasAuxiliaryResources()));
}

} // namespace detail
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl
28 changes: 21 additions & 7 deletions sycl/source/detail/scheduler/commands.hpp
@@ -142,8 +142,20 @@ class Command {
return MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess;
}

// Shows that command could not be enqueued, now it may be true for empty task
// only
bool isEnqueueBlocked() const {
return MEnqueueStatus == EnqueueResultT::SyclEnqueueBlocked;
return MIsBlockable && MEnqueueStatus == EnqueueResultT::SyclEnqueueBlocked;
}
// Shows that command could be enqueued, but is blocking enqueue of all
// commands depending on it. Regular usage - host task.
bool isBlocking() const { return isHostTask() && !MEvent->isCompleted(); }

void addBlockedUserUnique(const EventImplPtr &NewUser) {
if (std::find(MBlockedUsers.begin(), MBlockedUsers.end(), NewUser) !=
MBlockedUsers.end())
return;
MBlockedUsers.push_back(NewUser);
}

const QueueImplPtr &getQueue() const { return MQueue; }
@@ -325,6 +337,14 @@ class Command {
/// Indicates that the node will be freed by cleanup after enqueue. Such nodes
/// should be ignored by other cleanup mechanisms.
bool MPostEnqueueCleanup = false;

/// Contains list of commands that depends on the host command explicitly (by
/// depends_on). Not involved in the cleanup process since it is one-way link
/// and does not hold resources.
/// Using EventImplPtr since enqueueUnblockedCommands and event.wait may
/// intersect with command enqueue.
std::vector<EventImplPtr> MBlockedUsers;
std::mutex MBlockedUsersMutex;
};
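
The two new pieces above (isBlocking/addBlockedUserUnique and the MBlockedUsers list) replace the old EmptyCommand-based blocking. Below is a minimal self-contained sketch of the intended interplay, using stand-in types and an illustrative enqueue/notify pair; the actual registration and unblocking logic lives in the scheduler sources, which are not part of this diff.

```cpp
#include <algorithm>
#include <memory>
#include <mutex>
#include <vector>

// Stand-ins for event_impl and a host-task Command; illustration only.
struct FakeEvent { bool Completed = false; };
using FakeEventPtr = std::shared_ptr<FakeEvent>;

struct FakeHostTaskCmd {
  FakeEventPtr MEvent = std::make_shared<FakeEvent>();
  std::vector<FakeEventPtr> MBlockedUsers; // users registered via depends_on
  std::mutex MBlockedUsersMutex;

  // Same shape as Command::isBlocking(): a host task blocks its users
  // until its event completes.
  bool isBlocking() const { return !MEvent->Completed; }

  void addBlockedUserUnique(const FakeEventPtr &NewUser) {
    if (std::find(MBlockedUsers.begin(), MBlockedUsers.end(), NewUser) !=
        MBlockedUsers.end())
      return;
    MBlockedUsers.push_back(NewUser);
  }
};

// Registration side: a dependent command defers its own enqueue instead of
// hanging an EmptyCommand off the host task.
bool tryEnqueueUser(FakeHostTaskCmd &HostTask, const FakeEventPtr &User) {
  std::lock_guard<std::mutex> Lock(HostTask.MBlockedUsersMutex);
  if (HostTask.isBlocking()) {
    HostTask.addBlockedUserUnique(User);
    return false; // enqueued later, when the host task completes
  }
  return true;
}

// Completion side: once the host task finishes, its blocked users are
// handed back for enqueueing (roughly what NotifyHostTaskCompletion triggers).
std::vector<FakeEventPtr> onHostTaskCompleted(FakeHostTaskCmd &HostTask) {
  std::lock_guard<std::mutex> Lock(HostTask.MBlockedUsersMutex);
  HostTask.MEvent->Completed = true; // isBlocking() now returns false
  std::vector<FakeEventPtr> ToEnqueue;
  ToEnqueue.swap(HostTask.MBlockedUsers);
  return ToEnqueue;
}
```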

/// The empty command does nothing during enqueue. The task can be used to
@@ -561,12 +581,6 @@ class ExecCGCommand : public Command {

detail::CG &getCG() const { return *MCommandGroup; }

// MEmptyCmd is only employed if this command refers to host-task.
// The mechanism of lookup for single EmptyCommand amongst users of
// host-task-representing command is unreliable. This unreliability roots in
// the cleanup process.
EmptyCommand *MEmptyCmd = nullptr;

bool producesPiEvent() const final;

bool supportsPostEnqueueCleanup() const final;
52 changes: 7 additions & 45 deletions sycl/source/detail/scheduler/graph_builder.cpp
@@ -923,7 +923,6 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
std::vector<Command *> &ToEnqueue) {
std::vector<Requirement *> &Reqs = CommandGroup->MRequirements;
const std::vector<detail::EventImplPtr> &Events = CommandGroup->MEvents;
const CG::CGTYPE CGType = CommandGroup->getType();

auto NewCmd = std::make_unique<ExecCGCommand>(std::move(CommandGroup), Queue);
if (!NewCmd)
@@ -1019,11 +1018,6 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
ToEnqueue.push_back(ConnCmd);
}

if (CGType == CG::CGTYPE::CodeplayHostTask)
NewCmd->MEmptyCmd =
addEmptyCmd(NewCmd.get(), NewCmd->getCG().MRequirements, Queue,
Command::BlockReason::HostTask, ToEnqueue);

if (MPrintOptionsArray[AfterAddCG])
printGraphAsDot("after_addCG");

@@ -1323,8 +1317,6 @@ Command *Scheduler::GraphBuilder::connectDepEvent(
throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
}

EmptyCommand *EmptyCmd = nullptr;

if (Dep.MDepRequirement) {
// make ConnectCmd depend on requirement
// Dismiss the result here as it's not a connection now,
@@ -1333,57 +1325,27 @@ Command *Scheduler::GraphBuilder::connectDepEvent(
assert(reinterpret_cast<Command *>(DepEvent->getCommand()) ==
Dep.MDepCommand);
// add user to Dep.MDepCommand is already performed beyond this if branch

// ConnectCmd is added as dependency to Cmd
// We build the following structure Cmd->EmptyCmd/ConnectCmd->DepCmd
// No need to add ConnectCmd to leaves buffer since it is a dependency
// for command Cmd that will be added there

std::vector<Command *> ToEnqueue;
const std::vector<const Requirement *> Reqs(1, Dep.MDepRequirement);
EmptyCmd = addEmptyCmd(ConnectCmd, Reqs,
Scheduler::getInstance().getDefaultHostQueue(),
Command::BlockReason::HostTask, ToEnqueue, false);
assert(ToEnqueue.size() == 0);

// Depend Cmd on empty command
{
DepDesc CmdDep = Dep;
CmdDep.MDepCommand = EmptyCmd;
DepDesc DepOnConnect = Dep;
DepOnConnect.MDepCommand = ConnectCmd;

// Dismiss the result here as it's not a connection now,
// 'cause EmptyCmd is host one
(void)Cmd->addDep(CmdDep, ToCleanUp);
// 'cause ConnectCmd is host one
std::ignore = Cmd->addDep(DepOnConnect, ToCleanUp);
}
} else {
// It is required condition in another a path and addUser will be set in
// addDep
if (Command *DepCmd = reinterpret_cast<Command *>(DepEvent->getCommand()))
DepCmd->addUser(ConnectCmd);

std::vector<Command *> ToEnqueue;
EmptyCmd = addEmptyCmd<Requirement>(
ConnectCmd, {}, Scheduler::getInstance().getDefaultHostQueue(),
Command::BlockReason::HostTask, ToEnqueue);
assert(ToEnqueue.size() == 0);
std::ignore = ConnectCmd->addDep(DepEvent, ToCleanUp);

// There is no requirement thus, empty command will only depend on
// ConnectCmd via its event.
// Dismiss the result here as it's not a connection now,
// 'cause ConnectCmd is host one.
(void)EmptyCmd->addDep(ConnectCmd->getEvent(), ToCleanUp);
(void)ConnectCmd->addDep(DepEvent, ToCleanUp);
std::ignore = Cmd->addDep(ConnectCmd->getEvent(), ToCleanUp);

// Depend Cmd on empty command
// Dismiss the result here as it's not a connection now,
// 'cause EmptyCmd is host one
(void)Cmd->addDep(EmptyCmd->getEvent(), ToCleanUp);
// Added by addDep in another path
EmptyCmd->addUser(Cmd);
ConnectCmd->addUser(Cmd);
}

ConnectCmd->MEmptyCmd = EmptyCmd;

return ConnectCmd;
}
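
With the EmptyCommand gone, the dependency chain built by connectDepEvent is simply Cmd -> ConnectCmd -> DepCmd (or the dependency event when there is no requirement). A tiny stand-in sketch of that shape; the node type and helper below are illustrative, not the real Command API.

```cpp
#include <vector>

// Minimal stand-in for a graph node with explicit dependencies.
struct GraphNode {
  std::vector<GraphNode *> Deps;
  void addDep(GraphNode *Dep) { Deps.push_back(Dep); }
};

// ConnectCmd bridges the foreign-context dependency, and Cmd depends on
// ConnectCmd directly; no intermediate EmptyCommand node is created.
void connectDep(GraphNode &Cmd, GraphNode &ConnectCmd, GraphNode &DepCmd) {
  ConnectCmd.addDep(&DepCmd);
  Cmd.addDep(&ConnectCmd);
}
```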
