Skip to content

Commit ae3fd5c

Browse files
authored
[SYCL] Host task implementation (#1471)
This patch is part 1 in a series of patches for host-interop-task proposal by Codeplay. See the proposal at [1]. This patch implements: - host-task execution mechanism; - enqueue of host-task without interop_handle argument; - spin-lock to await for host and synchronous device events completion. This patch reimplements glue/connection of events within different contexts to eliminate use of event callback in favor of host-task. Host-task execution mechanism involves: - thread-pool for queue to execute host-task's user lambda in; - explicit call to event_impl::setComplete() for host events and device-side synchronous events; - helper class DispatchHostTask which wraps call to host-task's user lambda. Thread pool's size is set via `SYCL_QUEUE_THREAD_POOL_SIZE` environment variable and defaults to 1. Even though host-task is enqueued to device queue it'll be executed on the default host queue. Host-task is represented via distinct ExecCGCommand paired with EmptyCommand. Any other command, which depends on host-task will really depend on it's EmptyCommand. The EmptyCommand is in blocked state initially. Class DispatchHostTask awaits for host-task's dependency events, then calls to host-task's user lambda and unblocks any dependent commands via unblocking it's EmptyCommand and enqueueing of leaves for requirements (i.e. host accessors required for execution of this host-task). [1] https://github.com/codeplaysoftware/standards-proposals/blob/master/host_task/host_task.md
1 parent 075361e commit ae3fd5c

23 files changed

+1070
-135
lines changed

sycl/doc/EnvironmentVariables.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ subject to change. Do not rely on these variables in production code.
2323
| SYCL_THROW_ON_BLOCK | Any(\*) | Throw an exception on attempt to wait for a blocked command. |
2424
| SYCL_DEVICELIB_INHIBIT_NATIVE | String of device library extensions (separated by a whitespace) | Do not rely on device native support for devicelib extensions listed in this option. |
2525
| SYCL_DEVICE_ALLOWLIST | A list of devices and their minimum driver version following the pattern: DeviceName:{{XXX}},DriverVersion:{{X.Y.Z.W}}. Also may contain PlatformName and PlatformVersion | Filter out devices that do not match the pattern specified. Regular expression can be passed and the DPC++ runtime will select only those devices which satisfy the regex. |
26+
| SYCL_QUEUE_THREAD_POOL_SIZE | Positive integer | Number of threads in thread pool of queue. |
2627
`(*) Note: Any means this environment variable is effective when set to any non-null value.`
2728

2829
### SYCL_PRINT_EXECUTION_GRAPH Options

sycl/include/CL/sycl/accessor.hpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -778,7 +778,7 @@ class accessor :
778778
template <int Dims = Dimensions, typename AllocatorT,
779779
typename = typename detail::enable_if_t<
780780
(Dims == 0) &&
781-
(!IsPlaceH && (IsGlobalBuf || IsConstantBuf))>
781+
(!IsPlaceH && (IsGlobalBuf || IsConstantBuf || IsHostBuf))>
782782
>
783783
accessor(buffer<DataT,1,AllocatorT> &BufferRef,
784784
handler &CommandGroupHandler)
@@ -817,9 +817,9 @@ class accessor :
817817
#endif
818818

819819
template <int Dims = Dimensions, typename AllocatorT,
820-
typename = detail::enable_if_t<(Dims > 0) && (Dims == Dimensions) &&
821-
(!IsPlaceH &&
822-
(IsGlobalBuf || IsConstantBuf))>>
820+
typename = detail::enable_if_t<
821+
(Dims > 0) && (Dims == Dimensions) &&
822+
(!IsPlaceH && (IsGlobalBuf || IsConstantBuf || IsHostBuf))>>
823823
accessor(buffer<DataT, Dims, AllocatorT> &BufferRef,
824824
handler &CommandGroupHandler)
825825
#ifdef __SYCL_DEVICE_ONLY__

sycl/include/CL/sycl/detail/cg.hpp

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,16 @@ class InteropTask {
215215
void call(cl::sycl::interop_handler &h) { MFunc(h); }
216216
};
217217

218+
class HostTask {
219+
std::function<void()> MHostTask;
220+
221+
public:
222+
HostTask() : MHostTask([]() {}) {}
223+
HostTask(std::function<void()> &&Func) : MHostTask(Func) {}
224+
225+
void call() { MHostTask(); }
226+
};
227+
218228
// Class which stores specific lambda object.
219229
template <class KernelType, class KernelArgType, int Dims>
220230
class HostKernel : public HostKernelBase {
@@ -391,7 +401,8 @@ class CG {
391401
COPY_USM,
392402
FILL_USM,
393403
PREFETCH_USM,
394-
INTEROP_TASK_CODEPLAY
404+
CODEPLAY_INTEROP_TASK,
405+
CODEPLAY_HOST_TASK
395406
};
396407

397408
CG(CGTYPE Type, vector_class<vector_class<char>> ArgsStorage,
@@ -631,6 +642,24 @@ class CGInteropTask : public CG {
631642
MInteropTask(std::move(InteropTask)) {}
632643
};
633644

645+
class CGHostTask : public CG {
646+
public:
647+
std::unique_ptr<HostTask> MHostTask;
648+
vector_class<ArgDesc> MArgs;
649+
650+
CGHostTask(std::unique_ptr<HostTask> HostTask, vector_class<ArgDesc> Args,
651+
std::vector<std::vector<char>> ArgsStorage,
652+
std::vector<detail::AccessorImplPtr> AccStorage,
653+
std::vector<std::shared_ptr<const void>> SharedPtrStorage,
654+
std::vector<Requirement *> Requirements,
655+
std::vector<detail::EventImplPtr> Events, CGTYPE Type,
656+
detail::code_location loc = {})
657+
: CG(Type, std::move(ArgsStorage), std::move(AccStorage),
658+
std::move(SharedPtrStorage), std::move(Requirements),
659+
std::move(Events), std::move(loc)),
660+
MHostTask(std::move(HostTask)), MArgs(std::move(Args)) {}
661+
};
662+
634663
} // namespace detail
635664
} // namespace sycl
636665
} // __SYCL_INLINE_NAMESPACE(cl)

sycl/include/CL/sycl/handler.hpp

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,27 @@ template <typename Type> struct get_kernel_name_t<detail::auto_name, Type> {
105105
using name = Type;
106106
};
107107

108+
template <typename, typename T> struct check_fn_signature {
109+
static_assert(std::integral_constant<T, false>::value,
110+
"Second template parameter is required to be of function type");
111+
};
112+
113+
template <typename F, typename RetT, typename... Args>
114+
struct check_fn_signature<F, RetT(Args...)> {
115+
private:
116+
template <typename T>
117+
static constexpr auto check(T *) -> typename std::is_same<
118+
decltype(std::declval<T>().operator()(std::declval<Args>()...)),
119+
RetT>::type;
120+
121+
template <typename> static constexpr std::false_type check(...);
122+
123+
using type = decltype(check<F>(0));
124+
125+
public:
126+
static constexpr bool value = type::value;
127+
};
128+
108129
__SYCL_EXPORT device getDeviceFromHandler(handler &);
109130

110131
} // namespace detail
@@ -789,6 +810,20 @@ class __SYCL_EXPORT handler {
789810
MCGType = detail::CG::RUN_ON_HOST_INTEL;
790811
}
791812

813+
template <typename FuncT>
814+
typename std::enable_if<detail::check_fn_signature<
815+
typename std::remove_reference<FuncT>::type, void()>::value>::type
816+
codeplay_host_task(FuncT Func) {
817+
throwIfActionIsCreated();
818+
819+
MNDRDesc.set(range<1>(1));
820+
MArgs = std::move(MAssociatedAccesors);
821+
822+
MHostTask.reset(new detail::HostTask(std::move(Func)));
823+
824+
MCGType = detail::CG::CODEPLAY_HOST_TASK;
825+
}
826+
792827
/// Defines and invokes a SYCL kernel function for the specified range and
793828
/// offset.
794829
///
@@ -1140,7 +1175,7 @@ class __SYCL_EXPORT handler {
11401175
template <typename FuncT> void interop_task(FuncT Func) {
11411176

11421177
MInteropTask.reset(new detail::InteropTask(std::move(Func)));
1143-
MCGType = detail::CG::INTEROP_TASK_CODEPLAY;
1178+
MCGType = detail::CG::CODEPLAY_INTEROP_TASK;
11441179
}
11451180

11461181
/// Defines and invokes a SYCL kernel function for the specified range.
@@ -1598,6 +1633,8 @@ class __SYCL_EXPORT handler {
15981633
vector_class<char> MPattern;
15991634
/// Storage for a lambda or function object.
16001635
unique_ptr_class<detail::HostKernelBase> MHostKernel;
1636+
/// Storage for lambda/function when using HostTask
1637+
unique_ptr_class<detail::HostTask> MHostTask;
16011638
detail::OSModuleHandle MOSModuleHandle;
16021639
// Storage for a lambda or function when using InteropTasks
16031640
std::unique_ptr<detail::InteropTask> MInteropTask;

sycl/source/detail/event_impl.cpp

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -50,11 +50,31 @@ event_impl::~event_impl() {
5050
}
5151

5252
void event_impl::waitInternal() const {
53-
if (!MHostEvent) {
53+
if (!MHostEvent && MEvent) {
5454
getPlugin().call<PiApiKind::piEventsWait>(1, &MEvent);
55+
return;
5556
}
56-
// Waiting of host events is NOP so far as all operations on host device
57-
// are blocking.
57+
58+
while (MState != HES_Complete)
59+
;
60+
}
61+
62+
void event_impl::setComplete() {
63+
if (MHostEvent || !MEvent) {
64+
#ifndef NDEBUG
65+
int Expected = HES_NotComplete;
66+
int Desired = HES_Complete;
67+
68+
bool Succeeded = MState.compare_exchange_strong(Expected, Desired);
69+
70+
assert(Succeeded && "Unexpected state of event");
71+
#else
72+
MState.store(static_cast<int>(HES_Complete));
73+
#endif
74+
return;
75+
}
76+
77+
assert(false && "setComplete is not supported for non-host event");
5878
}
5979

6080
const RT::PiEvent &event_impl::getHandleRef() const { return MEvent; }
@@ -68,11 +88,15 @@ void event_impl::setContextImpl(const ContextImplPtr &Context) {
6888
MHostEvent = Context->is_host();
6989
MOpenCLInterop = !MHostEvent;
7090
MContext = Context;
91+
92+
MState = HES_NotComplete;
7193
}
7294

95+
event_impl::event_impl() : MState(HES_Complete) {}
96+
7397
event_impl::event_impl(RT::PiEvent Event, const context &SyclContext)
7498
: MEvent(Event), MContext(detail::getSyclObjImpl(SyclContext)),
75-
MOpenCLInterop(true), MHostEvent(false) {
99+
MOpenCLInterop(true), MHostEvent(false), MState(HES_Complete) {
76100

77101
if (MContext->is_host()) {
78102
throw cl::sycl::invalid_parameter_error(
@@ -96,12 +120,19 @@ event_impl::event_impl(RT::PiEvent Event, const context &SyclContext)
96120
}
97121

98122
event_impl::event_impl(QueueImplPtr Queue) : MQueue(Queue) {
99-
if (Queue->is_host() &&
100-
Queue->has_property<property::queue::enable_profiling>()) {
101-
MHostProfilingInfo.reset(new HostProfilingInfo());
102-
if (!MHostProfilingInfo)
103-
throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY);
123+
if (Queue->is_host()) {
124+
MState.store(HES_NotComplete);
125+
126+
if (Queue->has_property<property::queue::enable_profiling>()) {
127+
MHostProfilingInfo.reset(new HostProfilingInfo());
128+
if (!MHostProfilingInfo)
129+
throw runtime_error("Out of host memory", PI_OUT_OF_HOST_MEMORY);
130+
}
131+
132+
return;
104133
}
134+
135+
MState.store(HES_Complete);
105136
}
106137

107138
void *event_impl::instrumentationProlog(string_class &Name, int32_t StreamID,

sycl/source/detail/event_impl.hpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include <CL/sycl/info/info_desc.hpp>
1515
#include <CL/sycl/stl.hpp>
1616

17+
#include <atomic>
1718
#include <cassert>
1819

1920
__SYCL_INLINE_NAMESPACE(cl) {
@@ -32,7 +33,7 @@ class event_impl {
3233
/// Constructs a ready SYCL event.
3334
///
3435
/// If the constructed SYCL event is waited on it will complete immediately.
35-
event_impl() = default;
36+
event_impl();
3637
/// Constructs an event instance from a plug-in event handle.
3738
///
3839
/// The SyclContext must match the plug-in context associated with the
@@ -166,6 +167,13 @@ class event_impl {
166167
bool MHostEvent = true;
167168
std::unique_ptr<HostProfilingInfo> MHostProfilingInfo;
168169
void *MCommand = nullptr;
170+
171+
enum HostEventState : int { HES_NotComplete = 0, HES_Complete };
172+
173+
// State of host event. Employed only for host events and event with no
174+
// backend's representation (e.g. alloca). Used values are listed in
175+
// HostEventState enum.
176+
std::atomic<int> MState;
169177
};
170178

171179
} // namespace detail

sycl/source/detail/queue_impl.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -195,6 +195,30 @@ void queue_impl::wait(const detail::code_location &CodeLoc) {
195195
#endif
196196
}
197197

198+
void queue_impl::initHostTaskAndEventCallbackThreadPool() {
199+
if (MHostTaskThreadPool)
200+
return;
201+
202+
int Size = 1;
203+
204+
if (const char *val = std::getenv("SYCL_QUEUE_THREAD_POOL_SIZE"))
205+
try {
206+
Size = std::stoi(val);
207+
} catch (...) {
208+
throw invalid_parameter_error(
209+
"Invalid value for SYCL_QUEUE_THREAD_POOL_SIZE environment variable",
210+
PI_INVALID_VALUE);
211+
}
212+
213+
if (Size < 1)
214+
throw invalid_parameter_error(
215+
"Invalid value for SYCL_QUEUE_THREAD_POOL_SIZE environment variable",
216+
PI_INVALID_VALUE);
217+
218+
MHostTaskThreadPool.reset(new ThreadPool(Size));
219+
MHostTaskThreadPool->start();
220+
}
221+
198222
pi_native_handle queue_impl::getNative() const {
199223
auto Plugin = getPlugin();
200224
pi_native_handle Handle;

sycl/source/detail/queue_impl.hpp

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
#include <detail/event_impl.hpp>
2222
#include <detail/plugin.hpp>
2323
#include <detail/scheduler/scheduler.hpp>
24+
#include <detail/thread_pool.hpp>
2425

2526
__SYCL_INLINE_NAMESPACE(cl) {
2627
namespace sycl {
@@ -348,6 +349,13 @@ class queue_impl {
348349
MExceptions.PushBack(ExceptionPtr);
349350
}
350351

352+
ThreadPool &getThreadPool() {
353+
if (!MHostTaskThreadPool)
354+
initHostTaskAndEventCallbackThreadPool();
355+
356+
return *MHostTaskThreadPool;
357+
}
358+
351359
/// Gets the native handle of the SYCL queue.
352360
///
353361
/// \return a native handle.
@@ -380,6 +388,8 @@ class queue_impl {
380388
void instrumentationEpilog(void *TelementryEvent, string_class &Name,
381389
int32_t StreamID, uint64_t IId);
382390

391+
void initHostTaskAndEventCallbackThreadPool();
392+
383393
/// Stores a USM operation event that should be associated with the queue
384394
///
385395
/// \param Event is the event to be stored
@@ -414,6 +424,10 @@ class queue_impl {
414424
const bool MOpenCLInterop = false;
415425
// Assume OOO support by default.
416426
bool MSupportOOO = true;
427+
428+
// Thread pool for host task and event callbacks execution.
429+
// The thread pool is instantiated upon the very first call to getThreadPool()
430+
std::unique_ptr<ThreadPool> MHostTaskThreadPool;
417431
};
418432

419433
} // namespace detail

0 commit comments

Comments
 (0)