[SYCL] Make host task blocking and detach empty command. Part 1 #6995

Merged: 14 commits, Dec 13, 2022
Changes from all commits
3 changes: 3 additions & 0 deletions sycl/source/detail/event_impl.hpp
@@ -243,6 +243,9 @@ class event_impl {
/// state.
bool isInitialized() const noexcept { return MIsInitialized; }

/// Checks if this event is complete.
///
/// \return true if this event is complete.
bool isCompleted();

void attachEventToComplete(const EventImplPtr &Event) {
96 changes: 46 additions & 50 deletions sycl/source/detail/scheduler/commands.cpp
@@ -321,11 +321,7 @@ class DispatchHostTask {

HostTask.MHostTask.reset();

// unblock user empty command here
EmptyCommand *EmptyCmd = MThisCmd->MEmptyCmd;
assert(EmptyCmd && "No empty command found");

Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd, EmptyCmd);
Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd);
}
};

@@ -349,11 +345,10 @@ void Command::waitForEvents(QueueImplPtr Queue,
// we will have two different contexts for the same CPU device: C1, C2.
// Also we have default host queue. This queue is accessible via
// Scheduler. Now, let's assume we have three different events: E1(C1),
// E2(C1), E3(C2). Also, we have an EmptyCommand which is to be executed
// on host queue. The command's MPreparedDepsEvents will contain all three
// events (E1, E2, E3). Now, if piEventsWait is called for all three
// events we'll experience failure with CL_INVALID_CONTEXT 'cause these
// events refer to different contexts.
// E2(C1), E3(C2). The command's MPreparedDepsEvents will contain all
// three events (E1, E2, E3). Now, if piEventsWait is called for all
// three events we'll experience failure with CL_INVALID_CONTEXT 'cause
// these events refer to different contexts.
std::map<context_impl *, std::vector<EventImplPtr>>
RequiredEventsPerContext;
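
The scenario in the comment above (E1 and E2 from context C1, E3 from C2) is why events are first grouped by context: waiting on each group separately avoids the CL_INVALID_CONTEXT failure. Below is a minimal sketch of that grouping using stand-in types, not the real event_impl/context_impl API.

```cpp
#include <map>
#include <memory>
#include <vector>

// Stand-ins for context_impl and event_impl; illustration only.
struct FakeContext {};
struct FakeEvent {
  FakeContext *Context = nullptr;
};
using FakeEventPtr = std::shared_ptr<FakeEvent>;

// Group events by their context and wait on each group with a separate
// call, so a single wait never mixes events from different contexts.
void waitGroupedByContext(const std::vector<FakeEventPtr> &Events) {
  std::map<FakeContext *, std::vector<FakeEventPtr>> RequiredEventsPerContext;
  for (const FakeEventPtr &Event : Events)
    RequiredEventsPerContext[Event->Context].push_back(Event);

  for (auto &CtxAndEvents : RequiredEventsPerContext) {
    // In the runtime this becomes one piEventsWait call per context.
    (void)CtxAndEvents;
  }
}
```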

@@ -419,19 +414,19 @@ void Command::emitInstrumentationDataProxy() {
/// Method takes in void * for the address as adding a template function to
/// the command group object maybe undesirable.
/// @param Cmd The command object of the source of the edge
/// @param ObjAddr The address that defines the edge dependency; it is the event
/// address when the edge is for an event and a memory object address if it is
/// due to an accessor
/// @param Prefix Contains "event" if the dependency is an edge and contains the
/// access mode to the buffer if it is due to an accessor
/// @param IsCommand True if the dependency has a command object as the source,
/// false otherwise
/// @param ObjAddr The address that defines the edge dependency; it is the
/// event address when the edge is for an event and a memory object address if
/// it is due to an accessor
/// @param Prefix Contains "event" if the dependency is an edge and contains
/// the access mode to the buffer if it is due to an accessor
/// @param IsCommand True if the dependency has a command object as the
/// source, false otherwise
void Command::emitEdgeEventForCommandDependence(
Command *Cmd, void *ObjAddr, bool IsCommand,
std::optional<access::mode> AccMode) {
#ifdef XPTI_ENABLE_INSTRUMENTATION
// Bail early if either the source or the target node for the given dependency
// is undefined or NULL
// Bail early if either the source or the target node for the given
// dependency is undefined or NULL
if (!(xptiTraceEnabled() && MTraceEvent && Cmd && Cmd->MTraceEvent))
return;

@@ -583,11 +578,11 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep,

// 1. Async work is not supported for host device.
// 2. Non-host events can be ignored if they are not fully initialized.
// 3. Some types of commands do not produce PI events after they are enqueued
// 3. Some types of commands do not produce PI events after they are
// enqueued
// (e.g. alloca). Note that we can't check the pi event to make that
// distinction since the command might still be unenqueued at this point.
bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized()) ||
getType() == CommandType::HOST_TASK;
bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized());
if (auto *DepCmd = static_cast<Command *>(DepEvent->getCommand()))
PiEventExpected &= DepCmd->producesPiEvent();

@@ -776,10 +771,10 @@ void Command::resolveReleaseDependencies(std::set<Command *> &DepList) {
// nodes have to be completed first before the current node can begin to
// execute; these edges model control flow
xpti_td *TgtTraceEvent = static_cast<xpti_td *>(MTraceEvent);
// We have all the Commands that must be completed before the release command
// can be enqueued; here we'll find the command that is an Alloca with the
// same SYCLMemObject address and create a dependency line (edge) between them
// in our sematic modeling
// We have all the Commands that must be completed before the release
// command can be enqueued; here we'll find the command that is an Alloca
// with the same SYCLMemObject address and create a dependency line (edge)
// between them in our sematic modeling
for (auto &Item : DepList) {
if (Item->MTraceEvent && Item->MAddress == MAddress) {
xpti::utils::StringHelper SH;
@@ -862,8 +857,8 @@ AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req,
: AllocaCommandBase(CommandType::ALLOCA, std::move(Queue), std::move(Req),
LinkedAllocaCmd, IsConst),
MInitFromUserData(InitFromUserData) {
// Node event must be created before the dependent edge is added to this node,
// so this call must be before the addDep() call.
// Node event must be created before the dependent edge is added to this
// node, so this call must be before the addDep() call.
emitInstrumentationDataProxy();
// "Nothing to depend on"
std::vector<Command *> ToCleanUp;
@@ -1060,14 +1055,15 @@ pi_int32 ReleaseCommand::enqueueImp() {
bool NeedUnmap = false;
if (MAllocaCmd->MLinkedAllocaCmd) {

// When releasing one of the "linked" allocations special rules take place:
// When releasing one of the "linked" allocations special rules take
// place:
// 1. Device allocation should always be released.
// 2. Host allocation should be released if host allocation is "leader".
// 3. Device alloca in the pair should be in active state in order to be
// correctly released.

// There is no actual memory allocation if a host alloca command is created
// being linked to a device allocation.
// There is no actual memory allocation if a host alloca command is
// created being linked to a device allocation.
SkipRelease |= CurAllocaIsHost && !MAllocaCmd->MIsLeaderAlloca;

NeedUnmap |= CurAllocaIsHost == MAllocaCmd->MIsActive;
@@ -1555,7 +1551,8 @@ void EmptyCommand::addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd,
MRequirements.emplace_back(ReqRef);
const Requirement *const StoredReq = &MRequirements.back();

// EmptyCommand is always host one, so we believe that result of addDep is nil
// EmptyCommand is always host one, so we believe that result of addDep is
// nil
std::vector<Command *> ToCleanUp;
Command *Cmd = addDep(DepDesc{DepCmd, StoredReq, AllocaCmd}, ToCleanUp);
assert(Cmd == nullptr && "Conection command should be null for EmptyCommand");
@@ -1822,9 +1819,9 @@ void ExecCGCommand::emitInstrumentationData() {
auto KernelBundleImplPtr = KernelCG->getKernelBundle();

// Use kernel_bundle if available unless it is interop.
// Interop bundles can't be used in the first branch, because the kernels
// in interop kernel bundles (if any) do not have kernel_id
// and can therefore not be looked up, but since they are self-contained
// Interop bundles can't be used in the first branch, because the
// kernels in interop kernel bundles (if any) do not have kernel_id and
// can therefore not be looked up, but since they are self-contained
// they can simply be launched directly.
if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop()) {
kernel_id KernelID =
@@ -1913,16 +1910,15 @@ void ExecCGCommand::printDot(std::ostream &Stream) const {
}

// SYCL has a parallel_for_work_group variant where the only NDRange
// characteristics set by a user is the number of work groups. This does not map
// to the OpenCL clEnqueueNDRangeAPI, which requires global work size to be set
// as well. This function determines local work size based on the device
// characteristics and the number of work groups requested by the user, then
// calculates the global work size.
// SYCL specification (from 4.8.5.3):
// The member function handler::parallel_for_work_group is parameterized by the
// number of work - groups, such that the size of each group is chosen by the
// runtime, or by the number of work - groups and number of work - items for
// users who need more control.
// characteristics set by a user is the number of work groups. This does not
// map to the OpenCL clEnqueueNDRangeAPI, which requires global work size to
// be set as well. This function determines local work size based on the
// device characteristics and the number of work groups requested by the user,
// then calculates the global work size. SYCL specification (from 4.8.5.3):
// The member function handler::parallel_for_work_group is parameterized by
// the number of work - groups, such that the size of each group is chosen by
// the runtime, or by the number of work - groups and number of work - items
// for users who need more control.
static void adjustNDRangePerKernel(NDRDescT &NDR, RT::PiKernel Kernel,
const device_impl &DeviceImpl) {
if (NDR.GlobalSize[0] != 0)
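
A minimal sketch of the adjustment described in the comment above: the work-group-size query is represented by a plain parameter (the real code asks the device/kernel for a suitable size), and the NDRange type is a simplified stand-in rather than the runtime's NDRDescT.

```cpp
#include <array>
#include <cstddef>

// Simplified stand-in for NDRDescT: only the fields used in this sketch.
struct SimpleNDRDesc {
  std::array<size_t, 3> GlobalSize{0, 0, 0};
  std::array<size_t, 3> NumWorkGroups{0, 0, 0};
  std::array<size_t, 3> LocalSize{0, 0, 0};
};

// If the user only supplied the number of work groups, pick a local size
// (DeviceWGSize stands in for the device/kernel query) and derive the
// global size as LocalSize * NumWorkGroups, which is what
// clEnqueueNDRangeKernel-style APIs require.
void adjustNDRangePerWorkGroupCount(SimpleNDRDesc &NDR, size_t DeviceWGSize) {
  if (NDR.GlobalSize[0] != 0)
    return; // Global size is already set by the user; nothing to adjust.

  NDR.LocalSize = {DeviceWGSize, 1, 1};
  for (int I = 0; I < 3; ++I) {
    size_t WGCount = NDR.NumWorkGroups[I] ? NDR.NumWorkGroups[I] : 1;
    NDR.GlobalSize[I] = NDR.LocalSize[I] * WGCount;
  }
}
```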
@@ -2310,9 +2306,9 @@ pi_int32 ExecCGCommand::enqueueImp() {
}

std::vector<pi_mem> Buffers;
// piEnqueueNativeKernel requires additional array of pointers to args blob,
// values that pointers point to are replaced with actual pointers to the
// memory before execution of user function.
// piEnqueueNativeKernel requires additional array of pointers to args
// blob, values that pointers point to are replaced with actual pointers
// to the memory before execution of user function.
std::vector<void *> MemLocs;

for (ArgDesc &Arg : HostTask->MArgs) {
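
The comment above refers to the native-kernel convention (as in OpenCL's clEnqueueNativeKernel): argument values are copied into one blob, and a parallel array records the locations inside that blob that must be patched with real memory pointers before the user function runs. A small illustrative sketch of that packing step; the names and types here are placeholders, not the PI API.

```cpp
#include <cstring>
#include <vector>

// Illustration only: ArgsBlob holds copies of the argument values and
// MemLocs points at the slots inside ArgsBlob that hold placeholder
// pointers, so they can be overwritten with actual pointers at launch.
struct PackedNativeArgs {
  std::vector<char> ArgsBlob;
  std::vector<void *> MemLocs;
};

void packPointerArgs(const std::vector<void *> &Placeholders,
                     PackedNativeArgs &Out) {
  Out.ArgsBlob.resize(Placeholders.size() * sizeof(void *));
  Out.MemLocs.clear();
  for (size_t I = 0; I < Placeholders.size(); ++I) {
    char *Slot = Out.ArgsBlob.data() + I * sizeof(void *);
    std::memcpy(Slot, &Placeholders[I], sizeof(void *));
    Out.MemLocs.push_back(Slot); // patched with the real pointer at launch
  }
}
```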
@@ -2438,7 +2434,8 @@ pi_int32 ExecCGCommand::enqueueImp() {
const detail::plugin &Plugin = MQueue->getPlugin();
CGInteropTask *ExecInterop = (CGInteropTask *)MCommandGroup.get();
// Wait for dependencies to complete before dispatching work on the host
// TODO: Use a callback to dispatch the interop task instead of waiting for
// TODO: Use a callback to dispatch the interop task instead of waiting
// for
// the event
if (!RawEvents.empty()) {
Plugin.call<PiApiKind::piEventsWait>(RawEvents.size(), &RawEvents[0]);
@@ -2564,7 +2561,6 @@ bool ExecCGCommand::supportsPostEnqueueCleanup() const {
!static_cast<CGExecKernel *>(MCommandGroup.get())
->hasAuxiliaryResources()));
}

} // namespace detail
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl
28 changes: 21 additions & 7 deletions sycl/source/detail/scheduler/commands.hpp
@@ -142,8 +142,20 @@ class Command {
return MEnqueueStatus == EnqueueResultT::SyclEnqueueSuccess;
}

// Shows that command could not be enqueued, now it may be true for empty task
// only
bool isEnqueueBlocked() const {
return MEnqueueStatus == EnqueueResultT::SyclEnqueueBlocked;
return MIsBlockable && MEnqueueStatus == EnqueueResultT::SyclEnqueueBlocked;
}
// Shows that command could be enqueued, but is blocking enqueue of all
// commands depending on it. Regular usage - host task.
bool isBlocking() const { return isHostTask() && !MEvent->isCompleted(); }

void addBlockedUserUnique(const EventImplPtr &NewUser) {
if (std::find(MBlockedUsers.begin(), MBlockedUsers.end(), NewUser) !=
MBlockedUsers.end())
return;
MBlockedUsers.push_back(NewUser);
}

const QueueImplPtr &getQueue() const { return MQueue; }
@@ -325,6 +337,14 @@ class Command {
/// Indicates that the node will be freed by cleanup after enqueue. Such nodes
/// should be ignored by other cleanup mechanisms.
bool MPostEnqueueCleanup = false;

/// Contains list of commands that depends on the host command explicitly (by
/// depends_on). Not involved in the cleanup process since it is one-way link
/// and does not hold resources.
/// Using EventImplPtr since enqueueUnblockedCommands and event.wait may
/// intersect with command enqueue.
std::vector<EventImplPtr> MBlockedUsers;
std::mutex MBlockedUsersMutex;
};
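
The two new pieces above (isBlocking/addBlockedUserUnique and the MBlockedUsers list) replace the old EmptyCommand-based blocking. Below is a minimal self-contained sketch of the intended interplay, using stand-in types and an illustrative enqueue/notify pair; the actual registration and unblocking logic lives in the scheduler sources, which are not part of this diff.

```cpp
#include <algorithm>
#include <memory>
#include <mutex>
#include <vector>

// Stand-ins for event_impl and a host-task Command; illustration only.
struct FakeEvent { bool Completed = false; };
using FakeEventPtr = std::shared_ptr<FakeEvent>;

struct FakeHostTaskCmd {
  FakeEventPtr MEvent = std::make_shared<FakeEvent>();
  std::vector<FakeEventPtr> MBlockedUsers; // users registered via depends_on
  std::mutex MBlockedUsersMutex;

  // Same shape as Command::isBlocking(): a host task blocks its users
  // until its event completes.
  bool isBlocking() const { return !MEvent->Completed; }

  void addBlockedUserUnique(const FakeEventPtr &NewUser) {
    if (std::find(MBlockedUsers.begin(), MBlockedUsers.end(), NewUser) !=
        MBlockedUsers.end())
      return;
    MBlockedUsers.push_back(NewUser);
  }
};

// Registration side: a dependent command defers its own enqueue instead of
// hanging an EmptyCommand off the host task.
bool tryEnqueueUser(FakeHostTaskCmd &HostTask, const FakeEventPtr &User) {
  std::lock_guard<std::mutex> Lock(HostTask.MBlockedUsersMutex);
  if (HostTask.isBlocking()) {
    HostTask.addBlockedUserUnique(User);
    return false; // enqueued later, when the host task completes
  }
  return true;
}

// Completion side: once the host task finishes, its blocked users are
// handed back for enqueueing (roughly what NotifyHostTaskCompletion triggers).
std::vector<FakeEventPtr> onHostTaskCompleted(FakeHostTaskCmd &HostTask) {
  std::lock_guard<std::mutex> Lock(HostTask.MBlockedUsersMutex);
  HostTask.MEvent->Completed = true; // isBlocking() now returns false
  std::vector<FakeEventPtr> ToEnqueue;
  ToEnqueue.swap(HostTask.MBlockedUsers);
  return ToEnqueue;
}
```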

/// The empty command does nothing during enqueue. The task can be used to
@@ -561,12 +581,6 @@ class ExecCGCommand : public Command {

detail::CG &getCG() const { return *MCommandGroup; }

// MEmptyCmd is only employed if this command refers to host-task.
// The mechanism of lookup for single EmptyCommand amongst users of
// host-task-representing command is unreliable. This unreliability roots in
// the cleanup process.
EmptyCommand *MEmptyCmd = nullptr;

bool producesPiEvent() const final;

bool supportsPostEnqueueCleanup() const final;
52 changes: 7 additions & 45 deletions sycl/source/detail/scheduler/graph_builder.cpp
@@ -923,7 +923,6 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
std::vector<Command *> &ToEnqueue) {
std::vector<Requirement *> &Reqs = CommandGroup->MRequirements;
const std::vector<detail::EventImplPtr> &Events = CommandGroup->MEvents;
const CG::CGTYPE CGType = CommandGroup->getType();

auto NewCmd = std::make_unique<ExecCGCommand>(std::move(CommandGroup), Queue);
if (!NewCmd)
@@ -1019,11 +1018,6 @@ Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
ToEnqueue.push_back(ConnCmd);
}

if (CGType == CG::CGTYPE::CodeplayHostTask)
NewCmd->MEmptyCmd =
addEmptyCmd(NewCmd.get(), NewCmd->getCG().MRequirements, Queue,
Command::BlockReason::HostTask, ToEnqueue);

if (MPrintOptionsArray[AfterAddCG])
printGraphAsDot("after_addCG");

@@ -1323,8 +1317,6 @@ Command *Scheduler::GraphBuilder::connectDepEvent(
throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY);
}

EmptyCommand *EmptyCmd = nullptr;

if (Dep.MDepRequirement) {
// make ConnectCmd depend on requirement
// Dismiss the result here as it's not a connection now,
@@ -1333,57 +1325,27 @@ Command *Scheduler::GraphBuilder::connectDepEvent(
assert(reinterpret_cast<Command *>(DepEvent->getCommand()) ==
Dep.MDepCommand);
// add user to Dep.MDepCommand is already performed beyond this if branch

// ConnectCmd is added as dependency to Cmd
// We build the following structure Cmd->EmptyCmd/ConnectCmd->DepCmd
// No need to add ConnectCmd to leaves buffer since it is a dependency
// for command Cmd that will be added there

std::vector<Command *> ToEnqueue;
const std::vector<const Requirement *> Reqs(1, Dep.MDepRequirement);
EmptyCmd = addEmptyCmd(ConnectCmd, Reqs,
Scheduler::getInstance().getDefaultHostQueue(),
Command::BlockReason::HostTask, ToEnqueue, false);
assert(ToEnqueue.size() == 0);

// Depend Cmd on empty command
{
DepDesc CmdDep = Dep;
CmdDep.MDepCommand = EmptyCmd;
DepDesc DepOnConnect = Dep;
DepOnConnect.MDepCommand = ConnectCmd;

// Dismiss the result here as it's not a connection now,
// 'cause EmptyCmd is host one
(void)Cmd->addDep(CmdDep, ToCleanUp);
// 'cause ConnectCmd is host one
std::ignore = Cmd->addDep(DepOnConnect, ToCleanUp);
}
} else {
// It is required condition in another a path and addUser will be set in
// addDep
if (Command *DepCmd = reinterpret_cast<Command *>(DepEvent->getCommand()))
DepCmd->addUser(ConnectCmd);

std::vector<Command *> ToEnqueue;
EmptyCmd = addEmptyCmd<Requirement>(
ConnectCmd, {}, Scheduler::getInstance().getDefaultHostQueue(),
Command::BlockReason::HostTask, ToEnqueue);
assert(ToEnqueue.size() == 0);
std::ignore = ConnectCmd->addDep(DepEvent, ToCleanUp);

// There is no requirement thus, empty command will only depend on
// ConnectCmd via its event.
// Dismiss the result here as it's not a connection now,
// 'cause ConnectCmd is host one.
(void)EmptyCmd->addDep(ConnectCmd->getEvent(), ToCleanUp);
(void)ConnectCmd->addDep(DepEvent, ToCleanUp);
std::ignore = Cmd->addDep(ConnectCmd->getEvent(), ToCleanUp);

// Depend Cmd on empty command
// Dismiss the result here as it's not a connection now,
// 'cause EmptyCmd is host one
(void)Cmd->addDep(EmptyCmd->getEvent(), ToCleanUp);
// Added by addDep in another path
EmptyCmd->addUser(Cmd);
ConnectCmd->addUser(Cmd);
}

ConnectCmd->MEmptyCmd = EmptyCmd;

return ConnectCmd;
}
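
With the EmptyCommand gone, the dependency chain built by connectDepEvent is simply Cmd -> ConnectCmd -> DepCmd (or the dependency event when there is no requirement). A tiny stand-in sketch of that shape; the node type and helper below are illustrative, not the real Command API.

```cpp
#include <vector>

// Minimal stand-in for a graph node with explicit dependencies.
struct GraphNode {
  std::vector<GraphNode *> Deps;
  void addDep(GraphNode *Dep) { Deps.push_back(Dep); }
};

// ConnectCmd bridges the foreign-context dependency, and Cmd depends on
// ConnectCmd directly; no intermediate EmptyCommand node is created.
void connectDep(GraphNode &Cmd, GraphNode &ConnectCmd, GraphNode &DepCmd) {
  ConnectCmd.addDep(&DepCmd);
  Cmd.addDep(&ConnectCmd);
}
```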
