Skip to content

Commit c44050a

Browse files
[SYCL] Make host task blocking and detach empty command. Part 1 (#6995)
It is intended to solve existing gap in implementation that kernel enqueue depending on host task via depends_on will wait for host task completion in its own enqueue. I want to change this approach and make it similar with the one we use when we have memory dependencies. It also should help with cleanup process because now host task is represented with two instances - ExecCGCommand and EmptyCommand that introduces leaks in some scenarios. Signed-off-by: Tikhomirova, Kseniya <[email protected]>
1 parent d2ec964 commit c44050a

File tree

12 files changed

+352
-369
lines changed

12 files changed

+352
-369
lines changed

sycl/source/detail/event_impl.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,9 @@ class event_impl {
243243
/// state.
244244
bool isInitialized() const noexcept { return MIsInitialized; }
245245

246+
/// Checks if this event is complete.
247+
///
248+
/// \return true if this event is complete.
246249
bool isCompleted();
247250

248251
void attachEventToComplete(const EventImplPtr &Event) {

sycl/source/detail/scheduler/commands.cpp

Lines changed: 46 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -321,11 +321,7 @@ class DispatchHostTask {
321321

322322
HostTask.MHostTask.reset();
323323

324-
// unblock user empty command here
325-
EmptyCommand *EmptyCmd = MThisCmd->MEmptyCmd;
326-
assert(EmptyCmd && "No empty command found");
327-
328-
Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd, EmptyCmd);
324+
Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd);
329325
}
330326
};
331327

@@ -349,11 +345,10 @@ void Command::waitForEvents(QueueImplPtr Queue,
349345
// we will have two different contexts for the same CPU device: C1, C2.
350346
// Also we have default host queue. This queue is accessible via
351347
// Scheduler. Now, let's assume we have three different events: E1(C1),
352-
// E2(C1), E3(C2). Also, we have an EmptyCommand which is to be executed
353-
// on host queue. The command's MPreparedDepsEvents will contain all three
354-
// events (E1, E2, E3). Now, if piEventsWait is called for all three
355-
// events we'll experience failure with CL_INVALID_CONTEXT 'cause these
356-
// events refer to different contexts.
348+
// E2(C1), E3(C2). The command's MPreparedDepsEvents will contain all
349+
// three events (E1, E2, E3). Now, if piEventsWait is called for all
350+
// three events we'll experience failure with CL_INVALID_CONTEXT 'cause
351+
// these events refer to different contexts.
357352
std::map<context_impl *, std::vector<EventImplPtr>>
358353
RequiredEventsPerContext;
359354

@@ -419,19 +414,19 @@ void Command::emitInstrumentationDataProxy() {
419414
/// Method takes in void * for the address as adding a template function to
420415
/// the command group object maybe undesirable.
421416
/// @param Cmd The command object of the source of the edge
422-
/// @param ObjAddr The address that defines the edge dependency; it is the event
423-
/// address when the edge is for an event and a memory object address if it is
424-
/// due to an accessor
425-
/// @param Prefix Contains "event" if the dependency is an edge and contains the
426-
/// access mode to the buffer if it is due to an accessor
427-
/// @param IsCommand True if the dependency has a command object as the source,
428-
/// false otherwise
417+
/// @param ObjAddr The address that defines the edge dependency; it is the
418+
/// event address when the edge is for an event and a memory object address if
419+
/// it is due to an accessor
420+
/// @param Prefix Contains "event" if the dependency is an edge and contains
421+
/// the access mode to the buffer if it is due to an accessor
422+
/// @param IsCommand True if the dependency has a command object as the
423+
/// source, false otherwise
429424
void Command::emitEdgeEventForCommandDependence(
430425
Command *Cmd, void *ObjAddr, bool IsCommand,
431426
std::optional<access::mode> AccMode) {
432427
#ifdef XPTI_ENABLE_INSTRUMENTATION
433-
// Bail early if either the source or the target node for the given dependency
434-
// is undefined or NULL
428+
// Bail early if either the source or the target node for the given
429+
// dependency is undefined or NULL
435430
if (!(xptiTraceEnabled() && MTraceEvent && Cmd && Cmd->MTraceEvent))
436431
return;
437432

@@ -583,11 +578,11 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep,
583578

584579
// 1. Async work is not supported for host device.
585580
// 2. Non-host events can be ignored if they are not fully initialized.
586-
// 3. Some types of commands do not produce PI events after they are enqueued
581+
// 3. Some types of commands do not produce PI events after they are
582+
// enqueued
587583
// (e.g. alloca). Note that we can't check the pi event to make that
588584
// distinction since the command might still be unenqueued at this point.
589-
bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized()) ||
590-
getType() == CommandType::HOST_TASK;
585+
bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized());
591586
if (auto *DepCmd = static_cast<Command *>(DepEvent->getCommand()))
592587
PiEventExpected &= DepCmd->producesPiEvent();
593588

@@ -776,10 +771,10 @@ void Command::resolveReleaseDependencies(std::set<Command *> &DepList) {
776771
// nodes have to be completed first before the current node can begin to
777772
// execute; these edges model control flow
778773
xpti_td *TgtTraceEvent = static_cast<xpti_td *>(MTraceEvent);
779-
// We have all the Commands that must be completed before the release command
780-
// can be enqueued; here we'll find the command that is an Alloca with the
781-
// same SYCLMemObject address and create a dependency line (edge) between them
782-
// in our sematic modeling
774+
// We have all the Commands that must be completed before the release
775+
// command can be enqueued; here we'll find the command that is an Alloca
776+
// with the same SYCLMemObject address and create a dependency line (edge)
777+
// between them in our sematic modeling
783778
for (auto &Item : DepList) {
784779
if (Item->MTraceEvent && Item->MAddress == MAddress) {
785780
xpti::utils::StringHelper SH;
@@ -862,8 +857,8 @@ AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req,
862857
: AllocaCommandBase(CommandType::ALLOCA, std::move(Queue), std::move(Req),
863858
LinkedAllocaCmd, IsConst),
864859
MInitFromUserData(InitFromUserData) {
865-
// Node event must be created before the dependent edge is added to this node,
866-
// so this call must be before the addDep() call.
860+
// Node event must be created before the dependent edge is added to this
861+
// node, so this call must be before the addDep() call.
867862
emitInstrumentationDataProxy();
868863
// "Nothing to depend on"
869864
std::vector<Command *> ToCleanUp;
@@ -1060,14 +1055,15 @@ pi_int32 ReleaseCommand::enqueueImp() {
10601055
bool NeedUnmap = false;
10611056
if (MAllocaCmd->MLinkedAllocaCmd) {
10621057

1063-
// When releasing one of the "linked" allocations special rules take place:
1058+
// When releasing one of the "linked" allocations special rules take
1059+
// place:
10641060
// 1. Device allocation should always be released.
10651061
// 2. Host allocation should be released if host allocation is "leader".
10661062
// 3. Device alloca in the pair should be in active state in order to be
10671063
// correctly released.
10681064

1069-
// There is no actual memory allocation if a host alloca command is created
1070-
// being linked to a device allocation.
1065+
// There is no actual memory allocation if a host alloca command is
1066+
// created being linked to a device allocation.
10711067
SkipRelease |= CurAllocaIsHost && !MAllocaCmd->MIsLeaderAlloca;
10721068

10731069
NeedUnmap |= CurAllocaIsHost == MAllocaCmd->MIsActive;
@@ -1555,7 +1551,8 @@ void EmptyCommand::addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd,
15551551
MRequirements.emplace_back(ReqRef);
15561552
const Requirement *const StoredReq = &MRequirements.back();
15571553

1558-
// EmptyCommand is always host one, so we believe that result of addDep is nil
1554+
// EmptyCommand is always host one, so we believe that result of addDep is
1555+
// nil
15591556
std::vector<Command *> ToCleanUp;
15601557
Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd}, ToCleanUp);
15611558
assert(Cmd == nullptr && "Conection command should be null for EmptyCommand");
@@ -1822,9 +1819,9 @@ void ExecCGCommand::emitInstrumentationData() {
18221819
auto KernelBundleImplPtr = KernelCG->getKernelBundle();
18231820

18241821
// Use kernel_bundle if available unless it is interop.
1825-
// Interop bundles can't be used in the first branch, because the kernels
1826-
// in interop kernel bundles (if any) do not have kernel_id
1827-
// and can therefore not be looked up, but since they are self-contained
1822+
// Interop bundles can't be used in the first branch, because the
1823+
// kernels in interop kernel bundles (if any) do not have kernel_id and
1824+
// can therefore not be looked up, but since they are self-contained
18281825
// they can simply be launched directly.
18291826
if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
18301827
kernel_id KernelID =
@@ -1913,16 +1910,15 @@ void ExecCGCommand::printDot(std::ostream &Stream) const {
19131910
}
19141911

19151912
// SYCL has a parallel_for_work_group variant where the only NDRange
1916-
// characteristics set by a user is the number of work groups. This does not map
1917-
// to the OpenCL clEnqueueNDRangeAPI, which requires global work size to be set
1918-
// as well. This function determines local work size based on the device
1919-
// characteristics and the number of work groups requested by the user, then
1920-
// calculates the global work size.
1921-
// SYCL specification (from 4.8.5.3):
1922-
// The member function handler::parallel_for_work_group is parameterized by the
1923-
// number of work - groups, such that the size of each group is chosen by the
1924-
// runtime, or by the number of work - groups and number of work - items for
1925-
// users who need more control.
1913+
// characteristics set by a user is the number of work groups. This does not
1914+
// map to the OpenCL clEnqueueNDRangeAPI, which requires global work size to
1915+
// be set as well. This function determines local work size based on the
1916+
// device characteristics and the number of work groups requested by the user,
1917+
// then calculates the global work size. SYCL specification (from 4.8.5.3):
1918+
// The member function handler::parallel_for_work_group is parameterized by
1919+
// the number of work - groups, such that the size of each group is chosen by
1920+
// the runtime, or by the number of work - groups and number of work - items
1921+
// for users who need more control.
19261922
static void adjustNDRangePerKernel(NDRDescT &NDR, RT::PiKernel Kernel,
19271923
const device_impl &DeviceImpl) {
19281924
if (NDR.GlobalSize[0] != 0)
@@ -2310,9 +2306,9 @@ pi_int32 ExecCGCommand::enqueueImp() {
23102306
}
23112307

23122308
std::vector<pi_mem> Buffers;
2313-
// piEnqueueNativeKernel requires additional array of pointers to args blob,
2314-
// values that pointers point to are replaced with actual pointers to the
2315-
// memory before execution of user function.
2309+
// piEnqueueNativeKernel requires additional array of pointers to args
2310+
// blob, values that pointers point to are replaced with actual pointers
2311+
// to the memory before execution of user function.
23162312
std::vector<void *> MemLocs;
23172313

23182314
for (ArgDesc &Arg : HostTask->MArgs) {
@@ -2438,7 +2434,8 @@ pi_int32 ExecCGCommand::enqueueImp() {
24382434
const detail::plugin &Plugin = MQueue->getPlugin();
24392435
CGInteropTask *ExecInterop = (CGInteropTask *)MCommandGroup.get();
24402436
// Wait for dependencies to complete before dispatching work on the host
2441-
// TODO: Use a callback to dispatch the interop task instead of waiting for
2437+
// TODO: Use a callback to dispatch the interop task instead of waiting
2438+
// for
24422439
// the event
24432440
if (!RawEvents.empty()) {
24442441
Plugin.call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
@@ -2564,7 +2561,6 @@ bool ExecCGCommand::supportsPostEnqueueCleanup() const {
25642561
!static_cast<CGExecKernel *>(MCommandGroup.get())
25652562
->hasAuxiliaryResources()));
25662563
}
2567-
25682564
} // namespace detail
25692565
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
25702566
} // namespace sycl

sycl/source/detail/scheduler/commands.hpp

Lines changed: 21 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -142,8 +142,20 @@ class Command {
142142
return MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess;
143143
}
144144

145+
// Shows that command could not be enqueued, now it may be true for empty task
146+
// only
145147
bool isEnqueueBlocked() const {
146-
return MEnqueueStatus == EnqueueResultT::SyclEnqueueBlocked;
148+
return MIsBlockable && MEnqueueStatus == EnqueueResultT::SyclEnqueueBlocked;
149+
}
150+
// Shows that command could be enqueued, but is blocking enqueue of all
151+
// commands depending on it. Regular usage - host task.
152+
bool isBlocking() const { return isHostTask() && !MEvent->isCompleted(); }
153+
154+
void addBlockedUserUnique(const EventImplPtr &NewUser) {
155+
if (std::find(MBlockedUsers.begin(), MBlockedUsers.end(), NewUser) !=
156+
MBlockedUsers.end())
157+
return;
158+
MBlockedUsers.push_back(NewUser);
147159
}
148160

149161
const QueueImplPtr &getQueue() const { return MQueue; }
@@ -325,6 +337,14 @@ class Command {
325337
/// Indicates that the node will be freed by cleanup after enqueue. Such nodes
326338
/// should be ignored by other cleanup mechanisms.
327339
bool MPostEnqueueCleanup = false;
340+
341+
/// Contains list of commands that depends on the host command explicitly (by
342+
/// depends_on). Not involved in the cleanup process since it is one-way link
343+
/// and does not hold resources.
344+
/// Using EventImplPtr since enqueueUnblockedCommands and event.wait may
345+
/// intersect with command enqueue.
346+
std::vector<EventImplPtr> MBlockedUsers;
347+
std::mutex MBlockedUsersMutex;
328348
};
329349

330350
/// The empty command does nothing during enqueue. The task can be used to
@@ -561,12 +581,6 @@ class ExecCGCommand : public Command {
561581

562582
detail::CG &getCG() const { return *MCommandGroup; }
563583

564-
// MEmptyCmd is only employed if this command refers to host-task.
565-
// The mechanism of lookup for single EmptyCommand amongst users of
566-
// host-task-representing command is unreliable. This unreliability roots in
567-
// the cleanup process.
568-
EmptyCommand *MEmptyCmd = nullptr;
569-
570584
bool producesPiEvent() const final;
571585

572586
bool supportsPostEnqueueCleanup() const final;

sycl/source/detail/scheduler/graph_builder.cpp

Lines changed: 7 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -923,7 +923,6 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
923923
std::vector<Command *> &ToEnqueue) {
924924
std::vector<Requirement *> &Reqs = CommandGroup->MRequirements;
925925
const std::vector<detail::EventImplPtr> &Events = CommandGroup->MEvents;
926-
const CG::CGTYPE CGType = CommandGroup->getType();
927926

928927
auto NewCmd = std::make_unique<ExecCGCommand>(std::move(CommandGroup), Queue);
929928
if (!NewCmd)
@@ -1019,11 +1018,6 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
10191018
ToEnqueue.push_back(ConnCmd);
10201019
}
10211020

1022-
if (CGType == CG::CGTYPE::CodeplayHostTask)
1023-
NewCmd->MEmptyCmd =
1024-
addEmptyCmd(NewCmd.get(), NewCmd->getCG().MRequirements, Queue,
1025-
Command::BlockReason::HostTask, ToEnqueue);
1026-
10271021
if (MPrintOptionsArray[AfterAddCG])
10281022
printGraphAsDot("after_addCG");
10291023

@@ -1323,8 +1317,6 @@ Command *Scheduler::GraphBuilder::connectDepEvent(
13231317
throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
13241318
}
13251319

1326-
EmptyCommand *EmptyCmd = nullptr;
1327-
13281320
if (Dep.MDepRequirement) {
13291321
// make ConnectCmd depend on requirement
13301322
// Dismiss the result here as it's not a connection now,
@@ -1333,57 +1325,27 @@ Command *Scheduler::GraphBuilder::connectDepEvent(
13331325
assert(reinterpret_cast<Command *>(DepEvent->getCommand()) ==
13341326
Dep.MDepCommand);
13351327
// add user to Dep.MDepCommand is already performed beyond this if branch
1336-
1337-
// ConnectCmd is added as dependency to Cmd
1338-
// We build the following structure Cmd->EmptyCmd/ConnectCmd->DepCmd
1339-
// No need to add ConnectCmd to leaves buffer since it is a dependency
1340-
// for command Cmd that will be added there
1341-
1342-
std::vector<Command *> ToEnqueue;
1343-
const std::vector<const Requirement *> Reqs(1, Dep.MDepRequirement);
1344-
EmptyCmd = addEmptyCmd(ConnectCmd, Reqs,
1345-
Scheduler::getInstance().getDefaultHostQueue(),
1346-
Command::BlockReason::HostTask, ToEnqueue, false);
1347-
assert(ToEnqueue.size() == 0);
1348-
1349-
// Depend Cmd on empty command
13501328
{
1351-
DepDesc CmdDep = Dep;
1352-
CmdDep.MDepCommand = EmptyCmd;
1329+
DepDesc DepOnConnect = Dep;
1330+
DepOnConnect.MDepCommand = ConnectCmd;
13531331

13541332
// Dismiss the result here as it's not a connection now,
1355-
// 'cause EmptyCmd is host one
1356-
(void)Cmd->addDep(CmdDep, ToCleanUp);
1333+
// 'cause ConnectCmd is host one
1334+
std::ignore = Cmd->addDep(DepOnConnect, ToCleanUp);
13571335
}
13581336
} else {
13591337
// It is required condition in another a path and addUser will be set in
13601338
// addDep
13611339
if (Command *DepCmd = reinterpret_cast<Command *>(DepEvent->getCommand()))
13621340
DepCmd->addUser(ConnectCmd);
13631341

1364-
std::vector<Command *> ToEnqueue;
1365-
EmptyCmd = addEmptyCmd<Requirement>(
1366-
ConnectCmd, {}, Scheduler::getInstance().getDefaultHostQueue(),
1367-
Command::BlockReason::HostTask, ToEnqueue);
1368-
assert(ToEnqueue.size() == 0);
1342+
std::ignore = ConnectCmd->addDep(DepEvent, ToCleanUp);
13691343

1370-
// There is no requirement thus, empty command will only depend on
1371-
// ConnectCmd via its event.
1372-
// Dismiss the result here as it's not a connection now,
1373-
// 'cause ConnectCmd is host one.
1374-
(void)EmptyCmd->addDep(ConnectCmd->getEvent(), ToCleanUp);
1375-
(void)ConnectCmd->addDep(DepEvent, ToCleanUp);
1344+
std::ignore = Cmd->addDep(ConnectCmd->getEvent(), ToCleanUp);
13761345

1377-
// Depend Cmd on empty command
1378-
// Dismiss the result here as it's not a connection now,
1379-
// 'cause EmptyCmd is host one
1380-
(void)Cmd->addDep(EmptyCmd->getEvent(), ToCleanUp);
1381-
// Added by addDep in another path
1382-
EmptyCmd->addUser(Cmd);
1346+
ConnectCmd->addUser(Cmd);
13831347
}
13841348

1385-
ConnectCmd->MEmptyCmd = EmptyCmd;
1386-
13871349
return ConnectCmd;
13881350
}
13891351

0 commit comments

Comments
 (0)