Skip to content

Commit b7e60c1

Browse files
committed
[SYCL][ESIMD][EMU] single_task support.
[SYCL][ESIMD][EMU] misc clarifying plugin refactoring.
1 parent d47dda3 commit b7e60c1

File tree

4 files changed

+62
-89
lines changed

4 files changed

+62
-89
lines changed

sycl/include/CL/sycl/handler.hpp

Lines changed: 23 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -660,16 +660,28 @@ class __SYCL_EXPORT handler {
660660
KernelFunc);
661661
}
662662

663-
/* 'wrapper'-based approach using 'NormalizedKernelType' struct is
664-
* not applied for 'void(void)' type kernel and
665-
* 'void(sycl::group<Dims>)'. This is because 'void(void)' type does
666-
* not have argument to normalize and 'void(sycl::group<Dims>)' is
667-
* not supported in ESIMD.
668-
*/
669-
// For 'void' and 'sycl::group<Dims>' kernel argument
663+
// For 'void' kernel argument (single_task)
664+
template <class KernelType, typename ArgT, int Dims>
665+
typename std::enable_if_t<std::is_same<ArgT, void>::value, KernelType *>
666+
ResetHostKernel(const KernelType &KernelFunc) {
667+
struct NormalizedKernelType {
668+
KernelType MKernelFunc;
669+
NormalizedKernelType(const KernelType &KernelFunc)
670+
: MKernelFunc(KernelFunc) {}
671+
void operator()(const nd_item<Dims> &Arg) {
672+
detail::runKernelWithoutArg(MKernelFunc);
673+
}
674+
};
675+
return ResetHostKernelHelper<KernelType, struct NormalizedKernelType, Dims>(
676+
KernelFunc);
677+
}
678+
679+
// For 'sycl::group<Dims>' kernel argument
680+
// 'wrapper'-based approach using 'NormalizedKernelType' struct is not used
681+
// for 'void(sycl::group<Dims>)' since 'void(sycl::group<Dims>)' is not
682+
// supported in ESIMD.
670683
template <class KernelType, typename ArgT, int Dims>
671-
typename std::enable_if<std::is_same<ArgT, void>::value ||
672-
std::is_same<ArgT, sycl::group<Dims>>::value,
684+
typename std::enable_if<std::is_same<ArgT, sycl::group<Dims>>::value,
673685
KernelType *>::type
674686
ResetHostKernel(const KernelType &KernelFunc) {
675687
MHostKernel.reset(
@@ -1438,7 +1450,7 @@ class __SYCL_EXPORT handler {
14381450
// known constant.
14391451
MNDRDesc.set(range<1>{1});
14401452

1441-
StoreLambda<NameT, KernelType, /*Dims*/ 0, void>(KernelFunc);
1453+
StoreLambda<NameT, KernelType, /*Dims*/ 1, void>(KernelFunc);
14421454
setType(detail::CG::Kernel);
14431455
#endif
14441456
}
@@ -2046,7 +2058,7 @@ class __SYCL_EXPORT handler {
20462058
extractArgsAndReqs();
20472059
MKernelName = getKernelName();
20482060
} else
2049-
StoreLambda<NameT, KernelType, /*Dims*/ 0, void>(std::move(KernelFunc));
2061+
StoreLambda<NameT, KernelType, /*Dims*/ 1, void>(std::move(KernelFunc));
20502062
#else
20512063
detail::CheckDeviceCopyable<KernelType>();
20522064
#endif

sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp

Lines changed: 34 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -147,33 +147,15 @@ using KernelFunc = std::function<void(const sycl::nd_item<NDims> &)>;
147147

148148
// Struct to wrap dimension info and lambda function to be invoked by
149149
// CM Kernel launcher that only accepts raw function pointer for
150-
// kernel execution. Function instances of 'InvokeLambda' un-wrap this
151-
// struct instance and invoke lambda function ('Func')
152-
template <int NDims> struct LambdaWrapper {
150+
// kernel execution. Function instances of 'InvokeKernel' un-wrap
151+
// this struct instance and invoke lambda function ('Func')
152+
template <int NDims> struct KernelInvocationContext {
153153
KernelFunc<NDims> Func;
154154
const sycl::range<NDims> &LocalSize;
155155
const sycl::range<NDims> &GlobalSize;
156156
const sycl::id<NDims> &GlobalOffset;
157-
LambdaWrapper(KernelFunc<NDims> ArgFunc,
158-
const sycl::range<NDims> &ArgLocalSize,
159-
const sycl::range<NDims> &ArgGlobalSize,
160-
const sycl::id<NDims> &ArgGlobalOffset)
161-
: Func(ArgFunc), LocalSize(ArgLocalSize), GlobalSize(ArgGlobalSize),
162-
GlobalOffset(ArgGlobalOffset) {}
163157
};
164158

165-
// Function to generate a lambda wrapper object above
166-
template <int NDims>
167-
auto MakeLambdaWrapper(KernelFunc<NDims> ArgFunc,
168-
const sycl::range<NDims> &LocalSize,
169-
const sycl::range<NDims> &GlobalSize,
170-
const sycl::id<NDims> &GlobalOffset) {
171-
std::unique_ptr<LambdaWrapper<NDims>> Wrapper =
172-
std::make_unique<LambdaWrapper<NDims>>(LambdaWrapper<NDims>(
173-
KernelFunc<NDims>(ArgFunc), LocalSize, GlobalSize, GlobalOffset));
174-
return Wrapper;
175-
}
176-
177159
// A helper structure to create multi-dimensional range when
178160
// dimensionality is given as a template parameter. `create` function
179161
// in specializations accepts a template `Gen` function which
@@ -199,69 +181,65 @@ template <> struct RangeBuilder<3> {
199181
// Function template to generate entry point of kernel execution as
200182
// raw function pointer. CM kernel launcher executes one instance of
201183
// this function per 'NDims'
202-
template <int NDims> void InvokeLambda(void *Wrapper) {
203-
auto *WrappedLambda = reinterpret_cast<LambdaWrapper<NDims> *>(Wrapper);
204-
sycl::range<NDims> GroupSize(
205-
sycl::detail::InitializedVal<NDims, sycl::range>::template get<0>());
184+
template <int NDims> void InvokeKernel(KernelInvocationContext<NDims> *ctx) {
185+
186+
sycl::range<NDims> GroupSize{
187+
sycl::detail::InitializedVal<NDims, sycl::range>::template get<0>()};
206188

207-
for (int I = 0; I < NDims /*Dims*/; ++I) {
208-
GroupSize[I] = WrappedLambda->GlobalSize[I] / WrappedLambda->LocalSize[I];
189+
for (int i = 0; i < NDims; ++i) {
190+
GroupSize[i] = ctx->GlobalSize[i] / ctx->LocalSize[i];
209191
}
210192

211193
const sycl::id<NDims> LocalID = RangeBuilder<NDims>::create(
212194
[](int i) { return cm_support::get_thread_idx(i); });
213195

214196
const sycl::id<NDims> GroupID = RangeBuilder<NDims>::create(
215-
[](int Id) { return cm_support::get_group_idx(Id); });
197+
[](int i) { return cm_support::get_group_idx(i); });
216198

217199
const sycl::group<NDims> Group = IDBuilder::createGroup<NDims>(
218-
WrappedLambda->GlobalSize, WrappedLambda->LocalSize, GroupSize, GroupID);
200+
ctx->GlobalSize, ctx->LocalSize, GroupSize, GroupID);
201+
202+
const sycl::id<NDims> GlobalID =
203+
GroupID * ctx->LocalSize + LocalID + ctx->GlobalOffset;
219204

220-
const sycl::id<NDims> GlobalID = GroupID * WrappedLambda->LocalSize +
221-
LocalID + WrappedLambda->GlobalOffset;
222205
const sycl::item<NDims, /*Offset=*/true> GlobalItem =
223-
IDBuilder::createItem<NDims, true>(WrappedLambda->GlobalSize, GlobalID,
224-
WrappedLambda->GlobalOffset);
206+
IDBuilder::createItem<NDims, true>(ctx->GlobalSize, GlobalID,
207+
ctx->GlobalOffset);
208+
225209
const sycl::item<NDims, /*Offset=*/false> LocalItem =
226-
IDBuilder::createItem<NDims, false>(WrappedLambda->LocalSize, LocalID);
210+
IDBuilder::createItem<NDims, false>(ctx->LocalSize, LocalID);
227211

228212
const sycl::nd_item<NDims> NDItem =
229213
IDBuilder::createNDItem<NDims>(GlobalItem, LocalItem, Group);
230214

231-
WrappedLambda->Func(NDItem);
215+
ctx->Func(NDItem);
232216
}
233217

234-
// libCMBatch class defines interface for lauching kernels with
235-
// software multi-threads
218+
// Interface for lauching kernels using libcm from CM EMU project.
236219
template <int DIMS> class libCMBatch {
237220
private:
238-
// Kernel function
239-
KernelFunc<DIMS> MKernel;
240-
241-
// Space-dimension info
242-
std::vector<uint32_t> GroupDim;
243-
std::vector<uint32_t> SpaceDim;
221+
const KernelFunc<DIMS> &MKernel;
222+
std::vector<uint32_t> GroupDim, SpaceDim;
244223

245224
public:
246-
libCMBatch(KernelFunc<DIMS> Kernel)
225+
libCMBatch(const KernelFunc<DIMS> &Kernel)
247226
: MKernel(Kernel), GroupDim{1, 1, 1}, SpaceDim{1, 1, 1} {}
248227

249-
/// Invoking kernel lambda function wrapped by 'LambdaWrapper' using
250-
/// 'InvokeLambda' function.
251228
void runIterationSpace(const sycl::range<DIMS> &LocalSize,
252229
const sycl::range<DIMS> &GlobalSize,
253230
const sycl::id<DIMS> &GlobalOffset) {
254-
auto WrappedLambda =
255-
MakeLambdaWrapper<DIMS>(MKernel, LocalSize, GlobalSize, GlobalOffset);
256231

257232
for (int I = 0; I < DIMS; I++) {
258233
SpaceDim[I] = (uint32_t)LocalSize[I];
259234
GroupDim[I] = (uint32_t)(GlobalSize[I] / LocalSize[I]);
260235
}
261236

262-
EsimdemuKernel Esimdemu((fptrVoid)InvokeLambda<DIMS>, GroupDim, SpaceDim);
237+
const auto InvokeKernelArg = KernelInvocationContext<DIMS>{
238+
MKernel, LocalSize, GlobalSize, GlobalOffset};
263239

264-
Esimdemu.launchMT(sizeof(struct LambdaWrapper<DIMS>), WrappedLambda.get());
240+
EsimdemuKernel{reinterpret_cast<fptrVoid>(InvokeKernel<DIMS>), GroupDim,
241+
SpaceDim}
242+
.launchMT(sizeof(InvokeKernelArg), &InvokeKernelArg);
265243
}
266244
};
267245

@@ -389,17 +367,12 @@ template <int NDims> struct InvokeImpl {
389367
return sycl::range<NDims>{Array[0], Array[1], Array[2]};
390368
}
391369

392-
static void invoke(void *Fptr, const size_t *GlobalWorkOffset,
370+
static void invoke(pi_kernel Kernel, const size_t *GlobalWorkOffset,
393371
const size_t *GlobalWorkSize,
394372
const size_t *LocalWorkSize) {
395-
auto GlobalSize = get_range(GlobalWorkSize);
396-
auto LocalSize = get_range(LocalWorkSize);
397-
sycl::id<NDims> GlobalOffset = get_range(GlobalWorkOffset);
398-
399-
auto KFunc = reinterpret_cast<KernelFunc<NDims> *>(Fptr);
400-
libCMBatch<NDims> CmThreading(*KFunc);
401-
402-
CmThreading.runIterationSpace(LocalSize, GlobalSize, GlobalOffset);
373+
libCMBatch<NDims>{*reinterpret_cast<KernelFunc<NDims> *>(Kernel)}
374+
.runIterationSpace(get_range(LocalWorkSize), get_range(GlobalWorkSize),
375+
sycl::id<NDims>{get_range(GlobalWorkOffset)});
403376
}
404377
};
405378

@@ -1636,15 +1609,14 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
16361609
const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
16371610
pi_uint32 NumEventsInWaitList,
16381611
const pi_event *EventWaitList, pi_event *Event) {
1612+
16391613
const size_t LocalWorkSz[] = {1, 1, 1};
16401614

16411615
if (Kernel == nullptr) {
16421616
return PI_INVALID_KERNEL;
16431617
}
16441618

1645-
// WorkDim == 0 is reserved for 'single_task()' kernel with no
1646-
// argument
1647-
if (WorkDim > 3) {
1619+
if (WorkDim > 3 || WorkDim == 0) {
16481620
return PI_INVALID_WORK_GROUP_SIZE;
16491621
}
16501622

@@ -1666,27 +1638,18 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
16661638
}
16671639

16681640
switch (WorkDim) {
1669-
case 0:
1670-
// TODO : intel/llvm_test_suite
1671-
// single_task() support - void(*)(void)
1672-
DIE_NO_IMPLEMENTATION;
1673-
break;
1674-
16751641
case 1:
16761642
InvokeImpl<1>::invoke(Kernel, GlobalWorkOffset, GlobalWorkSize,
16771643
LocalWorkSize);
16781644
break;
1679-
16801645
case 2:
16811646
InvokeImpl<2>::invoke(Kernel, GlobalWorkOffset, GlobalWorkSize,
16821647
LocalWorkSize);
16831648
break;
1684-
16851649
case 3:
16861650
InvokeImpl<3>::invoke(Kernel, GlobalWorkOffset, GlobalWorkSize,
16871651
LocalWorkSize);
16881652
break;
1689-
16901653
default:
16911654
DIE_NO_IMPLEMENTATION;
16921655
break;

sycl/source/detail/scheduler/commands.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2292,12 +2292,11 @@ cl_int ExecCGCommand::enqueueImp() {
22922292
} else {
22932293
assert(MQueue->getPlugin().getBackend() ==
22942294
backend::ext_intel_esimd_emulator);
2295-
// Dims==0 for 'single_task() - void(void) type'
2296-
uint32_t Dims = (Args.size() > 0) ? NDRDesc.Dims : 0;
2295+
22972296
MQueue->getPlugin().call<PiApiKind::piEnqueueKernelLaunch>(
22982297
nullptr,
22992298
reinterpret_cast<pi_kernel>(ExecKernel->MHostKernel->getPtr()),
2300-
Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0],
2299+
NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0],
23012300
&NDRDesc.LocalSize[0], 0, nullptr, nullptr);
23022301
}
23032302

sycl/source/handler.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -230,18 +230,17 @@ event handler::finalize() {
230230
auto EnqueueKernel = [&]() {
231231
// 'Result' for single point of return
232232
cl_int Result = CL_INVALID_VALUE;
233+
233234
if (MQueue->is_host()) {
234235
MHostKernel->call(
235236
MNDRDesc, (NewEvent) ? NewEvent->getHostProfilingInfo() : nullptr);
236237
Result = CL_SUCCESS;
237238
} else {
238239
if (MQueue->getPlugin().getBackend() ==
239240
backend::ext_intel_esimd_emulator) {
240-
// Dims==0 for 'single_task() - void(void) type'
241-
uint32_t Dims = (MArgs.size() > 0) ? MNDRDesc.Dims : 0;
242241
MQueue->getPlugin().call<detail::PiApiKind::piEnqueueKernelLaunch>(
243-
nullptr, reinterpret_cast<pi_kernel>(MHostKernel->getPtr()), Dims,
244-
&MNDRDesc.GlobalOffset[0], &MNDRDesc.GlobalSize[0],
242+
nullptr, reinterpret_cast<pi_kernel>(MHostKernel->getPtr()),
243+
MNDRDesc.Dims, &MNDRDesc.GlobalOffset[0], &MNDRDesc.GlobalSize[0],
245244
&MNDRDesc.LocalSize[0], 0, nullptr, nullptr);
246245
Result = CL_SUCCESS;
247246
} else {

0 commit comments

Comments
 (0)