@@ -321,11 +321,7 @@ class DispatchHostTask {
321
321
322
322
HostTask.MHostTask .reset ();
323
323
324
- // unblock user empty command here
325
- EmptyCommand *EmptyCmd = MThisCmd->MEmptyCmd ;
326
- assert (EmptyCmd && " No empty command found" );
327
-
328
- Scheduler::getInstance ().NotifyHostTaskCompletion (MThisCmd, EmptyCmd);
324
+ Scheduler::getInstance ().NotifyHostTaskCompletion (MThisCmd);
329
325
}
330
326
};
331
327
@@ -349,11 +345,10 @@ void Command::waitForEvents(QueueImplPtr Queue,
349
345
// we will have two different contexts for the same CPU device: C1, C2.
350
346
// Also we have default host queue. This queue is accessible via
351
347
// Scheduler. Now, let's assume we have three different events: E1(C1),
352
- // E2(C1), E3(C2). Also, we have an EmptyCommand which is to be executed
353
- // on host queue. The command's MPreparedDepsEvents will contain all three
354
- // events (E1, E2, E3). Now, if piEventsWait is called for all three
355
- // events we'll experience failure with CL_INVALID_CONTEXT 'cause these
356
- // events refer to different contexts.
348
+ // E2(C1), E3(C2). The command's MPreparedDepsEvents will contain all
349
+ // three events (E1, E2, E3). Now, if piEventsWait is called for all
350
+ // three events we'll experience failure with CL_INVALID_CONTEXT because
351
+ // these events refer to different contexts.
357
352
std::map<context_impl *, std::vector<EventImplPtr>>
358
353
RequiredEventsPerContext;
359
354
@@ -419,19 +414,19 @@ void Command::emitInstrumentationDataProxy() {
419
414
// / Method takes in void * for the address as adding a template function to
420
415
// / the command group object may be undesirable.
421
416
// / @param Cmd The command object of the source of the edge
422
- // / @param ObjAddr The address that defines the edge dependency; it is the event
423
- // / address when the edge is for an event and a memory object address if it is
424
- // / due to an accessor
425
- // / @param Prefix Contains "event" if the dependency is an edge and contains the
426
- // / access mode to the buffer if it is due to an accessor
427
- // / @param IsCommand True if the dependency has a command object as the source,
428
- // / false otherwise
417
+ // / @param ObjAddr The address that defines the edge dependency; it is the
418
+ // / event address when the edge is for an event and a memory object address if
419
+ // / it is due to an accessor
420
+ // / @param Prefix Contains "event" if the dependency is an edge and contains
421
+ // / the access mode to the buffer if it is due to an accessor
422
+ // / @param IsCommand True if the dependency has a command object as the
423
+ // / source, false otherwise
429
424
void Command::emitEdgeEventForCommandDependence (
430
425
Command *Cmd, void *ObjAddr, bool IsCommand,
431
426
std::optional<access::mode> AccMode) {
432
427
#ifdef XPTI_ENABLE_INSTRUMENTATION
433
- // Bail early if either the source or the target node for the given dependency
434
- // is undefined or NULL
428
+ // Bail early if either the source or the target node for the given
429
+ // dependency is undefined or NULL
435
430
if (!(xptiTraceEnabled () && MTraceEvent && Cmd && Cmd->MTraceEvent ))
436
431
return ;
437
432
@@ -583,11 +578,11 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep,
583
578
584
579
// 1. Async work is not supported for host device.
585
580
// 2. Non-host events can be ignored if they are not fully initialized.
586
- // 3. Some types of commands do not produce PI events after they are enqueued
581
+ // 3. Some types of commands do not produce PI events after they are
582
+ // enqueued
587
583
// (e.g. alloca). Note that we can't check the pi event to make that
588
584
// distinction since the command might still be unenqueued at this point.
589
- bool PiEventExpected = (!DepEvent->is_host () && DepEvent->isInitialized ()) ||
590
- getType () == CommandType::HOST_TASK;
585
+ bool PiEventExpected = (!DepEvent->is_host () && DepEvent->isInitialized ());
591
586
if (auto *DepCmd = static_cast <Command *>(DepEvent->getCommand ()))
592
587
PiEventExpected &= DepCmd->producesPiEvent ();
593
588
@@ -776,10 +771,10 @@ void Command::resolveReleaseDependencies(std::set<Command *> &DepList) {
776
771
// nodes have to be completed first before the current node can begin to
777
772
// execute; these edges model control flow
778
773
xpti_td *TgtTraceEvent = static_cast <xpti_td *>(MTraceEvent);
779
- // We have all the Commands that must be completed before the release command
780
- // can be enqueued; here we'll find the command that is an Alloca with the
781
- // same SYCLMemObject address and create a dependency line (edge) between them
782
- // in our sematic modeling
774
+ // We have all the Commands that must be completed before the release
775
+ // command can be enqueued; here we'll find the command that is an Alloca
776
+ // with the same SYCLMemObject address and create a dependency line (edge)
777
+ // between them in our semantic modeling
783
778
for (auto &Item : DepList) {
784
779
if (Item->MTraceEvent && Item->MAddress == MAddress) {
785
780
xpti::utils::StringHelper SH;
@@ -862,8 +857,8 @@ AllocaCommand::AllocaCommand(QueueImplPtr Queue, Requirement Req,
862
857
: AllocaCommandBase(CommandType::ALLOCA, std::move(Queue), std::move(Req),
863
858
LinkedAllocaCmd, IsConst),
864
859
MInitFromUserData(InitFromUserData) {
865
- // Node event must be created before the dependent edge is added to this node,
866
- // so this call must be before the addDep() call.
860
+ // Node event must be created before the dependent edge is added to this
861
+ // node, so this call must be before the addDep() call.
867
862
emitInstrumentationDataProxy ();
868
863
// "Nothing to depend on"
869
864
std::vector<Command *> ToCleanUp;
@@ -1060,14 +1055,15 @@ pi_int32 ReleaseCommand::enqueueImp() {
1060
1055
bool NeedUnmap = false ;
1061
1056
if (MAllocaCmd->MLinkedAllocaCmd ) {
1062
1057
1063
- // When releasing one of the "linked" allocations special rules take place:
1058
+ // When releasing one of the "linked" allocations special rules take
1059
+ // place:
1064
1060
// 1. Device allocation should always be released.
1065
1061
// 2. Host allocation should be released if host allocation is "leader".
1066
1062
// 3. Device alloca in the pair should be in active state in order to be
1067
1063
// correctly released.
1068
1064
1069
- // There is no actual memory allocation if a host alloca command is created
1070
- // being linked to a device allocation.
1065
+ // There is no actual memory allocation if a host alloca command is
1066
+ // created being linked to a device allocation.
1071
1067
SkipRelease |= CurAllocaIsHost && !MAllocaCmd->MIsLeaderAlloca ;
1072
1068
1073
1069
NeedUnmap |= CurAllocaIsHost == MAllocaCmd->MIsActive ;
@@ -1555,7 +1551,8 @@ void EmptyCommand::addRequirement(Command *DepCmd, AllocaCommandBase *AllocaCmd,
1555
1551
MRequirements.emplace_back (ReqRef);
1556
1552
const Requirement *const StoredReq = &MRequirements.back ();
1557
1553
1558
- // EmptyCommand is always host one, so we believe that result of addDep is nil
1554
+ // EmptyCommand is always a host command, so we expect the result of addDep to be
1555
+ // nil
1559
1556
std::vector<Command *> ToCleanUp;
1560
1557
Command *Cmd = addDep (DepDesc{DepCmd, StoredReq, AllocaCmd}, ToCleanUp);
1561
1558
assert (Cmd == nullptr && " Conection command should be null for EmptyCommand" );
@@ -1822,9 +1819,9 @@ void ExecCGCommand::emitInstrumentationData() {
1822
1819
auto KernelBundleImplPtr = KernelCG->getKernelBundle ();
1823
1820
1824
1821
// Use kernel_bundle if available unless it is interop.
1825
- // Interop bundles can't be used in the first branch, because the kernels
1826
- // in interop kernel bundles (if any) do not have kernel_id
1827
- // and can therefore not be looked up, but since they are self-contained
1822
+ // Interop bundles can't be used in the first branch, because the
1823
+ // kernels in interop kernel bundles (if any) do not have kernel_id and
1824
+ // can therefore not be looked up, but since they are self-contained
1828
1825
// they can simply be launched directly.
1829
1826
if (KernelBundleImplPtr && !KernelBundleImplPtr->isInterop ()) {
1830
1827
kernel_id KernelID =
@@ -1913,16 +1910,15 @@ void ExecCGCommand::printDot(std::ostream &Stream) const {
1913
1910
}
1914
1911
1915
1912
// SYCL has a parallel_for_work_group variant where the only NDRange
1916
- // characteristics set by a user is the number of work groups. This does not map
1917
- // to the OpenCL clEnqueueNDRangeAPI, which requires global work size to be set
1918
- // as well. This function determines local work size based on the device
1919
- // characteristics and the number of work groups requested by the user, then
1920
- // calculates the global work size.
1921
- // SYCL specification (from 4.8.5.3):
1922
- // The member function handler::parallel_for_work_group is parameterized by the
1923
- // number of work - groups, such that the size of each group is chosen by the
1924
- // runtime, or by the number of work - groups and number of work - items for
1925
- // users who need more control.
1913
+ // characteristics set by a user is the number of work groups. This does not
1914
+ // map to the OpenCL clEnqueueNDRangeKernel API, which requires global work size to
1915
+ // be set as well. This function determines local work size based on the
1916
+ // device characteristics and the number of work groups requested by the user,
1917
+ // then calculates the global work size. SYCL specification (from 4.8.5.3):
1918
+ // The member function handler::parallel_for_work_group is parameterized by
1919
+ // the number of work-groups, such that the size of each group is chosen by
1920
+ // the runtime, or by the number of work-groups and number of work-items
1921
+ // for users who need more control.
1926
1922
static void adjustNDRangePerKernel (NDRDescT &NDR, RT::PiKernel Kernel,
1927
1923
const device_impl &DeviceImpl) {
1928
1924
if (NDR.GlobalSize [0 ] != 0 )
@@ -2310,9 +2306,9 @@ pi_int32 ExecCGCommand::enqueueImp() {
2310
2306
}
2311
2307
2312
2308
std::vector<pi_mem> Buffers;
2313
- // piEnqueueNativeKernel requires additional array of pointers to args blob,
2314
- // values that pointers point to are replaced with actual pointers to the
2315
- // memory before execution of user function.
2309
+ // piEnqueueNativeKernel requires additional array of pointers to args
2310
+ // blob, values that pointers point to are replaced with actual pointers
2311
+ // to the memory before execution of user function.
2316
2312
std::vector<void *> MemLocs;
2317
2313
2318
2314
for (ArgDesc &Arg : HostTask->MArgs ) {
@@ -2438,7 +2434,8 @@ pi_int32 ExecCGCommand::enqueueImp() {
2438
2434
const detail::plugin &Plugin = MQueue->getPlugin ();
2439
2435
CGInteropTask *ExecInterop = (CGInteropTask *)MCommandGroup.get ();
2440
2436
// Wait for dependencies to complete before dispatching work on the host
2441
- // TODO: Use a callback to dispatch the interop task instead of waiting for
2437
+ // TODO: Use a callback to dispatch the interop task instead of waiting
2438
+ // for
2442
2439
// the event
2443
2440
if (!RawEvents.empty ()) {
2444
2441
Plugin.call <PiApiKind::piEventsWait>(RawEvents.size (), &RawEvents[0 ]);
@@ -2564,7 +2561,6 @@ bool ExecCGCommand::supportsPostEnqueueCleanup() const {
2564
2561
!static_cast <CGExecKernel *>(MCommandGroup.get ())
2565
2562
->hasAuxiliaryResources ()));
2566
2563
}
2567
-
2568
2564
} // namespace detail
2569
2565
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
2570
2566
} // namespace sycl
0 commit comments