Skip to content

Commit fd00434

Browse files
[SYCL] Discard events created with enqueue function submission (#14224)
This patch makes the [enqueue free functions](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc) create discarded events when they do not return events. This allows calls to the backend to pass `nullptr` when enqueuing the functions, avoiding the creation of native events. This only happens under certain scenarios and currently only applies to in-order queues. However, in-order queues relies on knowing the last event when enqueuing `host_task` commands. To address this case, the `host_task` will insert a barrier when the last event was discarded and use that event for synchronization instead. This case also applies to discarded events resulting from the use of the [sycl_ext_oneapi_discard_queue_events](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/supported/sycl_ext_oneapi_discard_queue_events.asciidoc) extension. --------- Signed-off-by: Larsen, Steffen <[email protected]>
1 parent 1f3f02b commit fd00434

33 files changed

+1040
-188
lines changed

sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp

Lines changed: 22 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88

99
#pragma once
1010

11-
#include <utility> // for std::forward
11+
#include <utility>
1212

1313
#include <sycl/detail/common.hpp>
1414
#include <sycl/event.hpp>
@@ -72,14 +72,20 @@ template <typename LCRangeT, typename LCPropertiesT> struct LaunchConfigAccess {
7272
return MLaunchConfig.getProperties();
7373
}
7474
};
75+
76+
template <typename CommandGroupFunc>
77+
void submit_impl(queue &Q, CommandGroupFunc &&CGF,
78+
const sycl::detail::code_location &CodeLoc) {
79+
Q.submit_without_event(std::forward<CommandGroupFunc>(CGF), CodeLoc);
80+
}
7581
} // namespace detail
7682

7783
template <typename CommandGroupFunc>
7884
void submit(queue Q, CommandGroupFunc &&CGF,
7985
const sycl::detail::code_location &CodeLoc =
8086
sycl::detail::code_location::current()) {
81-
// TODO: Use new submit without Events.
82-
Q.submit(std::forward<CommandGroupFunc>(CGF), CodeLoc);
87+
sycl::ext::oneapi::experimental::detail::submit_impl(
88+
Q, std::forward<CommandGroupFunc>(CGF), CodeLoc);
8389
}
8490

8591
template <typename CommandGroupFunc>
@@ -205,7 +211,8 @@ template <typename KernelName = sycl::detail::auto_name, int Dimensions,
205211
void nd_launch(queue Q, nd_range<Dimensions> Range, const KernelType &KernelObj,
206212
ReductionsT &&...Reductions) {
207213
submit(Q, [&](handler &CGH) {
208-
nd_launch(CGH, Range, KernelObj, std::forward<ReductionsT>(Reductions)...);
214+
nd_launch<KernelName>(CGH, Range, KernelObj,
215+
std::forward<ReductionsT>(Reductions)...);
209216
});
210217
}
211218

@@ -228,7 +235,8 @@ template <typename KernelName = sycl::detail::auto_name, int Dimensions,
228235
void nd_launch(queue Q, launch_config<nd_range<Dimensions>, Properties> Config,
229236
const KernelType &KernelObj, ReductionsT &&...Reductions) {
230237
submit(Q, [&](handler &CGH) {
231-
nd_launch(CGH, Config, KernelObj, std::forward<ReductionsT>(Reductions)...);
238+
nd_launch<KernelName>(CGH, Config, KernelObj,
239+
std::forward<ReductionsT>(Reductions)...);
232240
});
233241
}
234242

@@ -270,11 +278,9 @@ inline void memcpy(handler &CGH, void *Dest, const void *Src, size_t NumBytes) {
270278
CGH.memcpy(Dest, Src, NumBytes);
271279
}
272280

273-
inline void memcpy(queue Q, void *Dest, const void *Src, size_t NumBytes,
274-
const sycl::detail::code_location &CodeLoc =
275-
sycl::detail::code_location::current()) {
276-
submit(Q, [&](handler &CGH) { memcpy(CGH, Dest, Src, NumBytes); }, CodeLoc);
277-
}
281+
__SYCL_EXPORT void memcpy(queue Q, void *Dest, const void *Src, size_t NumBytes,
282+
const sycl::detail::code_location &CodeLoc =
283+
sycl::detail::code_location::current());
278284

279285
template <typename T>
280286
void copy(handler &CGH, const T *Src, T *Dest, size_t Count) {
@@ -292,11 +298,9 @@ inline void memset(handler &CGH, void *Ptr, int Value, size_t NumBytes) {
292298
CGH.memset(Ptr, Value, NumBytes);
293299
}
294300

295-
inline void memset(queue Q, void *Ptr, int Value, size_t NumBytes,
296-
const sycl::detail::code_location &CodeLoc =
297-
sycl::detail::code_location::current()) {
298-
submit(Q, [&](handler &CGH) { memset(CGH, Ptr, Value, NumBytes); }, CodeLoc);
299-
}
301+
__SYCL_EXPORT void memset(queue Q, void *Ptr, int Value, size_t NumBytes,
302+
const sycl::detail::code_location &CodeLoc =
303+
sycl::detail::code_location::current());
300304

301305
template <typename T>
302306
void fill(sycl::handler &CGH, T *Ptr, const T &Pattern, size_t Count) {
@@ -324,13 +328,9 @@ inline void mem_advise(handler &CGH, void *Ptr, size_t NumBytes, int Advice) {
324328
CGH.mem_advise(Ptr, NumBytes, Advice);
325329
}
326330

327-
inline void mem_advise(queue Q, void *Ptr, size_t NumBytes, int Advice,
328-
const sycl::detail::code_location &CodeLoc =
329-
sycl::detail::code_location::current()) {
330-
submit(
331-
Q, [&](handler &CGH) { mem_advise(CGH, Ptr, NumBytes, Advice); },
332-
CodeLoc);
333-
}
331+
__SYCL_EXPORT void mem_advise(queue Q, void *Ptr, size_t NumBytes, int Advice,
332+
const sycl::detail::code_location &CodeLoc =
333+
sycl::detail::code_location::current());
334334

335335
inline void barrier(handler &CGH) { CGH.ext_oneapi_barrier(); }
336336

sycl/include/sycl/handler.hpp

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -465,6 +465,7 @@ class __SYCL_EXPORT handler {
465465
///
466466
/// \param Queue is a SYCL queue.
467467
/// \param IsHost indicates if this handler is created for SYCL host device.
468+
/// TODO: Unused. Remove with ABI break.
468469
handler(std::shared_ptr<detail::queue_impl> Queue, bool IsHost);
469470

470471
/// Constructs SYCL handler from the associated queue and the submission's
@@ -476,10 +477,36 @@ class __SYCL_EXPORT handler {
476477
/// \param SecondaryQueue is the secondary SYCL queue of the submission. This
477478
/// is null if no secondary queue is associated with the submission.
478479
/// \param IsHost indicates if this handler is created for SYCL host device.
480+
/// TODO: Unused. Remove with ABI break.
479481
handler(std::shared_ptr<detail::queue_impl> Queue,
480482
std::shared_ptr<detail::queue_impl> PrimaryQueue,
481483
std::shared_ptr<detail::queue_impl> SecondaryQueue, bool IsHost);
482484

485+
/// Constructs SYCL handler from queue.
486+
///
487+
/// \param Queue is a SYCL queue.
488+
/// \param IsHost indicates if this handler is created for SYCL host device.
489+
/// \param CallerNeedsEvent indicates if the event resulting from this handler
490+
/// is needed by the caller.
491+
handler(std::shared_ptr<detail::queue_impl> Queue, bool IsHost,
492+
bool CallerNeedsEvent);
493+
494+
/// Constructs SYCL handler from the associated queue and the submission's
495+
/// primary and secondary queue.
496+
///
497+
/// \param Queue is a SYCL queue. This is equal to either PrimaryQueue or
498+
/// SecondaryQueue.
499+
/// \param PrimaryQueue is the primary SYCL queue of the submission.
500+
/// \param SecondaryQueue is the secondary SYCL queue of the submission. This
501+
/// is null if no secondary queue is associated with the submission.
502+
/// \param IsHost indicates if this handler is created for SYCL host device.
503+
/// \param CallerNeedsEvent indicates if the event resulting from this handler
504+
/// is needed by the caller.
505+
handler(std::shared_ptr<detail::queue_impl> Queue,
506+
std::shared_ptr<detail::queue_impl> PrimaryQueue,
507+
std::shared_ptr<detail::queue_impl> SecondaryQueue, bool IsHost,
508+
bool CallerNeedsEvent);
509+
483510
/// Constructs SYCL handler from Graph.
484511
///
485512
/// The hander will add the command-group as a node to the graph rather than
@@ -575,6 +602,16 @@ class __SYCL_EXPORT handler {
575602
/// \return a SYCL event object representing the command group
576603
event finalize();
577604

605+
/// Constructs CG object of specific type, passes it to Scheduler and
606+
/// returns sycl::event object representing the command group.
607+
/// It's expected that the method is the latest method executed before
608+
/// object destruction.
609+
/// \param CallerNeedsEvent Specifies if the caller needs an event
610+
/// representing the work related to this handler.
611+
///
612+
/// \return a SYCL event object representing the command group
613+
event finalize(bool CallerNeedsEvent);
614+
578615
/// Saves streams associated with this handler.
579616
///
580617
/// Streams are then forwarded to command group and flushed in the scheduler.
@@ -1184,6 +1221,8 @@ class __SYCL_EXPORT handler {
11841221
Size == 32 || Size == 64 || Size == 128;
11851222
}
11861223

1224+
bool eventNeeded() const;
1225+
11871226
template <int Dims, typename LambdaArgType> struct TransformUserItemType {
11881227
using type = std::conditional_t<
11891228
std::is_convertible_v<nd_item<Dims>, LambdaArgType>, nd_item<Dims>,

sycl/include/sycl/queue.hpp

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,12 @@ namespace ext ::oneapi ::experimental {
9595
// returned by info::queue::state
9696
enum class queue_state { executing, recording };
9797
struct image_descriptor;
98+
99+
namespace detail {
100+
template <typename CommandGroupFunc>
101+
void submit_impl(queue &Q, CommandGroupFunc &&CGF,
102+
const sycl::detail::code_location &CodeLoc);
103+
} // namespace detail
98104
} // namespace ext::oneapi::experimental
99105

100106
/// Encapsulates a single SYCL queue which schedules kernels on a SYCL device.
@@ -2689,13 +2695,40 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
26892695
const detail::code_location &);
26902696
#endif
26912697

2698+
template <typename CommandGroupFunc>
2699+
friend void ext::oneapi::experimental::detail::submit_impl(
2700+
queue &Q, CommandGroupFunc &&CGF,
2701+
const sycl::detail::code_location &CodeLoc);
2702+
26922703
/// A template-free version of submit.
26932704
event submit_impl(std::function<void(handler &)> CGH,
26942705
const detail::code_location &CodeLoc);
26952706
/// A template-free version of submit.
26962707
event submit_impl(std::function<void(handler &)> CGH, queue secondQueue,
26972708
const detail::code_location &CodeLoc);
26982709

2710+
/// A template-free version of submit_without_event.
2711+
void submit_without_event_impl(std::function<void(handler &)> CGH,
2712+
const detail::code_location &CodeLoc);
2713+
2714+
/// Submits a command group function object to the queue, in order to be
2715+
/// scheduled for execution on the device.
2716+
///
2717+
/// \param CGF is a function object containing command group.
2718+
/// \param CodeLoc is the code location of the submit call (default argument)
2719+
template <typename T>
2720+
std::enable_if_t<std::is_invocable_r_v<void, T, handler &>, void>
2721+
submit_without_event(T CGF, const detail::code_location &CodeLoc) {
2722+
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
2723+
#if __SYCL_USE_FALLBACK_ASSERT
2724+
// If post-processing is needed, fall back to the regular submit.
2725+
// TODO: Revisit whether we can avoid this.
2726+
submit(CGF, CodeLoc);
2727+
#else
2728+
submit_without_event_impl(CGF, CodeLoc);
2729+
#endif // __SYCL_USE_FALLBACK_ASSERT
2730+
}
2731+
26992732
/// Checks if the event needs to be discarded and if so, discards it and
27002733
/// returns a discarded event. Otherwise, it returns input event.
27012734
/// TODO: move to impl class in the next ABI Breaking window

sycl/include/sycl/reduction.hpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1178,8 +1178,9 @@ namespace reduction {
11781178
inline void finalizeHandler(handler &CGH) { CGH.finalize(); }
11791179
template <class FunctorTy> void withAuxHandler(handler &CGH, FunctorTy Func) {
11801180
event E = CGH.finalize();
1181-
handler AuxHandler(CGH.MQueue, CGH.MIsHost);
1182-
AuxHandler.depends_on(E);
1181+
handler AuxHandler(CGH.MQueue, CGH.MIsHost, CGH.eventNeeded());
1182+
if (!createSyclObjFromImpl<queue>(CGH.MQueue).is_in_order())
1183+
AuxHandler.depends_on(E);
11831184
AuxHandler.saveCodeLoc(CGH.MCodeLoc);
11841185
Func(AuxHandler);
11851186
CGH.MLastEvent = AuxHandler.finalize();

sycl/source/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ set(SYCL_COMMON_SOURCES
248248
"context.cpp"
249249
"device.cpp"
250250
"device_selector.cpp"
251+
"enqueue_functions.cpp"
251252
"event.cpp"
252253
"exception.cpp"
253254
"exception_list.cpp"

sycl/source/detail/graph_impl.cpp

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -690,7 +690,8 @@ sycl::detail::pi::PiExtSyncPoint exec_graph_impl::enqueueNode(
690690

691691
sycl::detail::EventImplPtr Event =
692692
sycl::detail::Scheduler::getInstance().addCG(
693-
Node->getCGCopy(), AllocaQueue, CommandBuffer, Deps);
693+
Node->getCGCopy(), AllocaQueue, /*EventNeeded=*/true, CommandBuffer,
694+
Deps);
694695

695696
MCommandMap[Node] = Event->getCommandBufferCommand();
696697
return Event->getSyncPoint();
@@ -928,7 +929,7 @@ exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
928929
CommandBuffer, nullptr, std::move(CGData));
929930

930931
NewEvent = sycl::detail::Scheduler::getInstance().addCG(
931-
std::move(CommandGroup), Queue);
932+
std::move(CommandGroup), Queue, /*EventNeeded=*/true);
932933
}
933934
NewEvent->setEventFromSubmittedExecCommandBuffer(true);
934935
} else if ((CurrentPartition->MSchedule.size() > 0) &&
@@ -946,7 +947,7 @@ exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
946947
.MQueue = Queue;
947948

948949
NewEvent = sycl::detail::Scheduler::getInstance().addCG(
949-
NodeImpl->getCGCopy(), Queue);
950+
NodeImpl->getCGCopy(), Queue, /*EventNeeded=*/true);
950951
} else {
951952
std::vector<std::shared_ptr<sycl::detail::event_impl>> ScheduledEvents;
952953
for (auto &NodeImpl : CurrentPartition->MSchedule) {
@@ -982,7 +983,7 @@ exec_graph_impl::enqueue(const std::shared_ptr<sycl::detail::queue_impl> &Queue,
982983
// dependencies are propagated in findRealDeps
983984
sycl::detail::EventImplPtr EventImpl =
984985
sycl::detail::Scheduler::getInstance().addCG(
985-
NodeImpl->getCGCopy(), Queue);
986+
NodeImpl->getCGCopy(), Queue, /*EventNeeded=*/true);
986987

987988
ScheduledEvents.push_back(EventImpl);
988989
}

sycl/source/detail/handler_impl.hpp

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,11 @@ enum class HandlerSubmissionState : std::uint8_t {
3131
class handler_impl {
3232
public:
3333
handler_impl(std::shared_ptr<queue_impl> SubmissionPrimaryQueue,
34-
std::shared_ptr<queue_impl> SubmissionSecondaryQueue)
34+
std::shared_ptr<queue_impl> SubmissionSecondaryQueue,
35+
bool EventNeeded)
3536
: MSubmissionPrimaryQueue(std::move(SubmissionPrimaryQueue)),
36-
MSubmissionSecondaryQueue(std::move(SubmissionSecondaryQueue)){};
37+
MSubmissionSecondaryQueue(std::move(SubmissionSecondaryQueue)),
38+
MEventNeeded(EventNeeded) {};
3739

3840
handler_impl() = default;
3941

@@ -74,6 +76,10 @@ class handler_impl {
7476
/// submission is a fallback from a previous submission.
7577
std::shared_ptr<queue_impl> MSubmissionSecondaryQueue;
7678

79+
/// Bool stores information about whether the event resulting from the
80+
/// corresponding work is required.
81+
bool MEventNeeded = true;
82+
7783
// Stores auxiliary resources used by internal operations.
7884
std::vector<std::shared_ptr<const void>> MAuxiliaryResources;
7985

0 commit comments

Comments
 (0)