Skip to content

Commit 5e16cf4

Browse files
committed
[SYCL] Add run_on_host_intel method to handler.
run_on_host_intel allows inserting task which runs regular host code into a SYCL DAG. Signed-off-by: Vlad Romanov <[email protected]>
1 parent 33e8977 commit 5e16cf4

File tree

9 files changed

+170
-12
lines changed

9 files changed

+170
-12
lines changed

sycl/include/CL/sycl/detail/cg.hpp

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -325,7 +325,8 @@ class CG {
325325
COPY_PTR_TO_ACC,
326326
COPY_ACC_TO_ACC,
327327
FILL,
328-
UPDATE_HOST
328+
UPDATE_HOST,
329+
RUN_ON_HOST_INTEL
329330
};
330331

331332
CG(CGTYPE Type, std::vector<std::vector<char>> ArgsStorage,
@@ -340,10 +341,6 @@ class CG {
340341

341342
CG(CG &&CommandGroup) = default;
342343

343-
std::vector<Requirement *> getRequirements() const { return MRequirements; }
344-
345-
std::vector<detail::EventImplPtr> getEvents() const { return MEvents; }
346-
347344
CGTYPE getType() { return MType; }
348345

349346
virtual ~CG() = default;
@@ -358,6 +355,8 @@ class CG {
358355
std::vector<detail::AccessorImplPtr> MAccStorage;
359356
// Storage for shared_ptrs.
360357
std::vector<std::shared_ptr<const void>> MSharedPtrStorage;
358+
359+
public:
361360
// List of requirements that specify which memory is needed for the command
362361
// group to be executed.
363362
std::vector<Requirement *> MRequirements;
@@ -385,14 +384,18 @@ class CGExecKernel : public CG {
385384
std::vector<detail::EventImplPtr> Events,
386385
std::vector<ArgDesc> Args, std::string KernelName,
387386
detail::OSModuleHandle OSModuleHandle,
388-
std::vector<std::shared_ptr<detail::stream_impl>> Streams)
389-
: CG(KERNEL, std::move(ArgsStorage), std::move(AccStorage),
387+
std::vector<std::shared_ptr<detail::stream_impl>> Streams,
388+
CGTYPE Type)
389+
: CG(Type, std::move(ArgsStorage), std::move(AccStorage),
390390
std::move(SharedPtrStorage), std::move(Requirements),
391391
std::move(Events)),
392392
MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)),
393393
MSyclKernel(std::move(SyclKernel)), MArgs(std::move(Args)),
394394
MKernelName(std::move(KernelName)), MOSModuleHandle(OSModuleHandle),
395-
MStreams(std::move(Streams)) {}
395+
MStreams(std::move(Streams)) {
396+
assert((getType() == RUN_ON_HOST_INTEL || getType() == KERNEL) &&
397+
"Wrong type of exec kernel CG.");
398+
}
396399

397400
std::vector<ArgDesc> getArguments() const { return MArgs; }
398401
std::string getKernelName() const { return MKernelName; }

sycl/include/CL/sycl/detail/pi.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ _PI_API(piSamplerRetain)
7878
_PI_API(piSamplerRelease)
7979
// Queue commands
8080
_PI_API(piEnqueueKernelLaunch)
81+
_PI_API(piEnqueueNativeKernel)
8182
_PI_API(piEnqueueEventsWait)
8283
_PI_API(piEnqueueMemBufferRead)
8384
_PI_API(piEnqueueMemBufferReadRect)

sycl/include/CL/sycl/detail/pi.h

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -691,6 +691,18 @@ pi_result piEnqueueKernelLaunch(
691691
const pi_event * event_wait_list,
692692
pi_event * event);
693693

694+
pi_result piEnqueueNativeKernel(
695+
pi_queue queue,
696+
void (*user_func)(void *),
697+
void * args,
698+
size_t cb_args,
699+
pi_uint32 num_mem_objects,
700+
const pi_mem * mem_list,
701+
const void ** args_mem_loc,
702+
pi_uint32 num_events_in_wait_list,
703+
const pi_event * event_wait_list,
704+
pi_event * event);
705+
694706
pi_result piEnqueueEventsWait(
695707
pi_queue command_queue,
696708
pi_uint32 num_events_in_wait_list,

sycl/include/CL/sycl/handler.hpp

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -355,12 +355,13 @@ class handler {
355355
std::unique_ptr<detail::CG> CommandGroup;
356356
switch (MCGType) {
357357
case detail::CG::KERNEL:
358+
case detail::CG::RUN_ON_HOST_INTEL:
358359
CommandGroup.reset(new detail::CGExecKernel(
359360
std::move(MNDRDesc), std::move(MHostKernel), std::move(MSyclKernel),
360361
std::move(MArgsStorage), std::move(MAccStorage),
361362
std::move(MSharedPtrStorage), std::move(MRequirements),
362363
std::move(MEvents), std::move(MArgs), std::move(MKernelName),
363-
std::move(MOSModuleHandle), std::move(MStreamStorage)));
364+
std::move(MOSModuleHandle), std::move(MStreamStorage), MCGType));
364365
break;
365366
case detail::CG::COPY_ACC_TO_PTR:
366367
case detail::CG::COPY_PTR_TO_ACC:
@@ -671,6 +672,16 @@ class handler {
671672
#endif
672673
}
673674

675+
// Similar to single_task, but passed lambda will be executed on host.
676+
template <typename FuncT> void run_on_host_intel(FuncT Func) {
677+
MNDRDesc.set(range<1>{1});
678+
679+
MArgs = std::move(MAssociatedAccesors);
680+
MHostKernel.reset(
681+
new detail::HostKernel<FuncT, void, 1>(std::move(Func)));
682+
MCGType = detail::CG::RUN_ON_HOST_INTEL;
683+
}
684+
674685
// parallel_for version with a kernel represented as a lambda + range and
675686
// offset that specify global size and global offset correspondingly.
676687
template <typename KernelName = csd::auto_name, typename KernelType, int Dims>

sycl/source/detail/pi_opencl.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,7 @@ _PI_CL(piSamplerRetain, clRetainSampler)
310310
_PI_CL(piSamplerRelease, clReleaseSampler)
311311
// Queue commands
312312
_PI_CL(piEnqueueKernelLaunch, clEnqueueNDRangeKernel)
313+
_PI_CL(piEnqueueNativeKernel, clEnqueueNativeKernel)
313314
_PI_CL(piEnqueueEventsWait, clEnqueueMarkerWithWaitList)
314315
_PI_CL(piEnqueueMemBufferRead, clEnqueueReadBuffer)
315316
_PI_CL(piEnqueueMemBufferReadRect, clEnqueueReadBufferRect)

sycl/source/detail/scheduler/commands.cpp

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -533,6 +533,20 @@ static void adjustNDRangePerKernel(NDRDescT &NDR, RT::PiKernel Kernel,
533533
NDR.set(NDR.Dims, nd_range<3>(NDR.NumWorkGroups * WGSize, WGSize));
534534
}
535535

536+
// The function initialize accessors and calls lambda.
537+
// The function is used as argument to piEnqueueNativeKernel which requires
538+
// that the passed function takes one void* argument.
539+
void DispatchNativeKernel(void *Blob) {
540+
// First value is a pointer to Corresponding CGExecKernel object.
541+
CGExecKernel *HostTask = *(CGExecKernel **)Blob;
542+
543+
// Other value are pointer to the buffers.
544+
void **NextArg = (void **)Blob + 1;
545+
for (detail::Requirement *Req : HostTask->MRequirements)
546+
Req->MData = *(NextArg++);
547+
HostTask->MHostKernel->call(HostTask->MNDRDesc);
548+
}
549+
536550
cl_int ExecCGCommand::enqueueImp() {
537551
std::vector<RT::PiEvent> RawEvents =
538552
Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context()));
@@ -606,6 +620,68 @@ cl_int ExecCGCommand::enqueueImp() {
606620
Event);
607621
return CL_SUCCESS;
608622
}
623+
case CG::CGTYPE::RUN_ON_HOST_INTEL: {
624+
CGExecKernel *HostTask = (CGExecKernel *)MCommandGroup.get();
625+
626+
// piEnqueueNativeKernel takes arguments blob which is passes to user
627+
// function.
628+
// Reserve extra space for the pointer to CGExecKernel to restore context.
629+
std::vector<void *> ArgsBlob(HostTask->MArgs.size() + 1);
630+
ArgsBlob[0] = (void *)HostTask;
631+
void **NextArg = ArgsBlob.data() + 1;
632+
633+
if (MQueue->is_host()) {
634+
for (ArgDesc &Arg : HostTask->MArgs) {
635+
assert(Arg.MType == kernel_param_kind_t::kind_accessor);
636+
637+
Requirement *Req = (Requirement *)(Arg.MPtr);
638+
AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
639+
640+
*NextArg = AllocaCmd->getMemAllocation();
641+
NextArg++;
642+
}
643+
644+
if (!RawEvents.empty())
645+
PI_CALL(RT::piEventsWait(RawEvents.size(), &RawEvents[0]));
646+
DispatchNativeKernel((void*)ArgsBlob.data());
647+
return CL_SUCCESS;
648+
}
649+
650+
std::vector<pi_mem> Buffers;
651+
// piEnqueueNativeKernel requires additional array of pointers to args blob,
652+
// values that pointers point to are replaced with actual pointers to the
653+
// memory before execution of user function.
654+
std::vector<void*> MemLocs;
655+
656+
for (ArgDesc &Arg : HostTask->MArgs) {
657+
assert(Arg.MType == kernel_param_kind_t::kind_accessor);
658+
659+
Requirement *Req = (Requirement *)(Arg.MPtr);
660+
AllocaCommandBase *AllocaCmd = getAllocaForReq(Req);
661+
pi_mem MemArg = (pi_mem)AllocaCmd->getMemAllocation();
662+
663+
Buffers.push_back(MemArg);
664+
MemLocs.push_back(NextArg);
665+
NextArg++;
666+
}
667+
668+
pi_result Error = PI_CALL_RESULT(RT::piEnqueueNativeKernel(
669+
MQueue->getHandleRef(), DispatchNativeKernel, (void *)ArgsBlob.data(),
670+
HostTask->MArgs[0].MSize, Buffers.size(), Buffers.data(),
671+
(const void **)MemLocs.data(), RawEvents.size(),
672+
RawEvents.empty() ? nullptr : RawEvents.data(), &Event));
673+
674+
switch (Error) {
675+
case PI_INVALID_OPERATION:
676+
throw cl::sycl::runtime_error(
677+
"Device doesn't support run_on_host_intel tasks.", Error);
678+
case PI_SUCCESS:
679+
return Error;
680+
default:
681+
throw cl::sycl::runtime_error(
682+
"Enqueueing run_on_host_intel task has failed.", Error);
683+
}
684+
}
609685
case CG::CGTYPE::KERNEL: {
610686
CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
611687

sycl/source/detail/scheduler/graph_builder.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -592,8 +592,8 @@ void Scheduler::GraphBuilder::markModifiedIfWrite(
592592
Command *
593593
Scheduler::GraphBuilder::addCG(std::unique_ptr<detail::CG> CommandGroup,
594594
QueueImplPtr Queue) {
595-
std::vector<Requirement *> Reqs = CommandGroup->getRequirements();
596-
std::vector<detail::EventImplPtr> Events = CommandGroup->getEvents();
595+
const std::vector<Requirement *> &Reqs = CommandGroup->MRequirements;
596+
const std::vector<detail::EventImplPtr> &Events = CommandGroup->MEvents;
597597
std::unique_ptr<ExecCGCommand> NewCmd(
598598
new ExecCGCommand(std::move(CommandGroup), Queue));
599599
if (!NewCmd)
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
// RUN: %clangxx -fsycl %s -o %t.out -lOpenCL
2+
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
3+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
4+
5+
//==-- run_on_host_intel.cpp -----------------------------------------------==//
6+
//
7+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
8+
// See https://llvm.org/LICENSE.txt for license information.
9+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#include "CL/sycl/access/access.hpp"
14+
#include <CL/sycl.hpp>
15+
16+
#include "../../helpers.hpp"
17+
18+
using namespace cl;
19+
20+
template <typename SrcAccType, typename DstAccType>
21+
void copyAndAdd(SrcAccType SrcAcc, DstAccType DstAcc, int Var) {
22+
for (int I = 0; I < (int)DstAcc.get_count(); ++I)
23+
DstAcc[I] = Var + SrcAcc[I];
24+
}
25+
26+
int main() {
27+
constexpr size_t BufSize = 4;
28+
int data1[BufSize] = {-1, -1, -1, -1};
29+
sycl::buffer<int, 1> SrcBuf(data1, sycl::range<1>{BufSize});
30+
sycl::buffer<int, 1> DstBuf(sycl::range<1>{BufSize});
31+
32+
TestQueue Queue{sycl::default_selector{}};
33+
Queue.submit([&](sycl::handler &CGH) {
34+
auto SrcAcc = SrcBuf.get_access<sycl::access::mode::read>(CGH);
35+
auto DstAcc = DstBuf.get_access<sycl::access::mode::write>(CGH);
36+
const int Var = 43;
37+
38+
CGH.run_on_host_intel([=]() { copyAndAdd(SrcAcc, DstAcc, Var); });
39+
});
40+
41+
auto DstAcc = DstBuf.template get_access<sycl::access::mode::read_write>();
42+
const int Expected = 42;
43+
for (int I = 0; I < DstAcc.get_count(); ++I)
44+
if (DstAcc[I] != Expected) {
45+
std::cerr << "Mismatch. Elem " << I << ". Expected: " << Expected
46+
<< ", Got: " << DstAcc[I] << std::endl;
47+
return 1;
48+
}
49+
50+
std::cout << "Success" << std::endl;
51+
52+
return 0;
53+
}

sycl/test/basic_tests/image_api.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,8 @@ int main() {
130130
std::move(Handler.MAccStorage), std::move(Handler.MSharedPtrStorage),
131131
std::move(Handler.MRequirements), /*DepsEvents*/ {},
132132
std::move(Handler.MArgs), std::move(Handler.MKernelName),
133-
std::move(Handler.MOSModuleHandle), std::move(Handler.MStreamStorage)));
133+
std::move(Handler.MOSModuleHandle), std::move(Handler.MStreamStorage),
134+
d::CG::KERNEL));
134135

135136
d::EventImplPtr Event = d::Scheduler::getInstance().addCG(
136137
std::move(CommandGroup), d::getSyclObjImpl(Queue));

0 commit comments

Comments
 (0)