@@ -147,33 +147,15 @@ using KernelFunc = std::function<void(const sycl::nd_item<NDims> &)>;
147
147
148
148
// Struct to wrap dimension info and lambda function to be invoked by
149
149
// CM Kernel launcher that only accepts raw function pointer for
150
- // kernel execution. Function instances of 'InvokeLambda ' un-wrap this
151
- // struct instance and invoke lambda function ('Func')
152
- template <int NDims> struct LambdaWrapper {
150
+ // kernel execution. Function instances of 'InvokeKernel' unwrap
151
+ // this struct instance and invoke lambda function ('Func')
152
+ template <int NDims> struct KernelInvocationContext {
153
153
KernelFunc<NDims> Func;
154
154
const sycl::range<NDims> &LocalSize;
155
155
const sycl::range<NDims> &GlobalSize;
156
156
const sycl::id<NDims> &GlobalOffset;
157
- LambdaWrapper (KernelFunc<NDims> ArgFunc,
158
- const sycl::range<NDims> &ArgLocalSize,
159
- const sycl::range<NDims> &ArgGlobalSize,
160
- const sycl::id<NDims> &ArgGlobalOffset)
161
- : Func(ArgFunc), LocalSize(ArgLocalSize), GlobalSize(ArgGlobalSize),
162
- GlobalOffset (ArgGlobalOffset) {}
163
157
};
164
158
165
- // Function to generate a lambda wrapper object above
166
- template <int NDims>
167
- auto MakeLambdaWrapper (KernelFunc<NDims> ArgFunc,
168
- const sycl::range<NDims> &LocalSize,
169
- const sycl::range<NDims> &GlobalSize,
170
- const sycl::id<NDims> &GlobalOffset) {
171
- std::unique_ptr<LambdaWrapper<NDims>> Wrapper =
172
- std::make_unique<LambdaWrapper<NDims>>(LambdaWrapper<NDims>(
173
- KernelFunc<NDims>(ArgFunc), LocalSize, GlobalSize, GlobalOffset));
174
- return Wrapper;
175
- }
176
-
177
159
// A helper structure to create multi-dimensional range when
178
160
// dimensionality is given as a template parameter. `create` function
179
161
// in specializations accepts a template `Gen` function which
@@ -199,69 +181,65 @@ template <> struct RangeBuilder<3> {
199
181
// Function template to generate entry point of kernel execution as
200
182
// raw function pointer. CM kernel launcher executes one instance of
201
183
// this function per 'NDims'
202
- template <int NDims> void InvokeLambda ( void *Wrapper ) {
203
- auto *WrappedLambda = reinterpret_cast <LambdaWrapper<NDims> *>(Wrapper);
204
- sycl::range<NDims> GroupSize (
205
- sycl::detail::InitializedVal<NDims, sycl::range>::template get<0 >()) ;
184
+ template <int NDims> void InvokeKernel (KernelInvocationContext<NDims> *ctx ) {
185
+
186
+ sycl::range<NDims> GroupSize{
187
+ sycl::detail::InitializedVal<NDims, sycl::range>::template get<0 >()} ;
206
188
207
- for (int I = 0 ; I < NDims /* Dims */ ; ++I ) {
208
- GroupSize[I ] = WrappedLambda ->GlobalSize [I ] / WrappedLambda ->LocalSize [I ];
189
+ for (int i = 0 ; i < NDims; ++i ) {
190
+ GroupSize[i ] = ctx ->GlobalSize [i ] / ctx ->LocalSize [i ];
209
191
}
210
192
211
193
const sycl::id<NDims> LocalID = RangeBuilder<NDims>::create (
212
194
[](int i) { return cm_support::get_thread_idx (i); });
213
195
214
196
const sycl::id<NDims> GroupID = RangeBuilder<NDims>::create (
215
- [](int Id ) { return cm_support::get_group_idx (Id ); });
197
+ [](int i ) { return cm_support::get_group_idx (i ); });
216
198
217
199
const sycl::group<NDims> Group = IDBuilder::createGroup<NDims>(
218
- WrappedLambda->GlobalSize , WrappedLambda->LocalSize , GroupSize, GroupID);
200
+ ctx->GlobalSize , ctx->LocalSize , GroupSize, GroupID);
201
+
202
+ const sycl::id<NDims> GlobalID =
203
+ GroupID * ctx->LocalSize + LocalID + ctx->GlobalOffset ;
219
204
220
- const sycl::id<NDims> GlobalID = GroupID * WrappedLambda->LocalSize +
221
- LocalID + WrappedLambda->GlobalOffset ;
222
205
const sycl::item<NDims, /* Offset=*/ true > GlobalItem =
223
- IDBuilder::createItem<NDims, true >(WrappedLambda->GlobalSize , GlobalID,
224
- WrappedLambda->GlobalOffset );
206
+ IDBuilder::createItem<NDims, true >(ctx->GlobalSize , GlobalID,
207
+ ctx->GlobalOffset );
208
+
225
209
const sycl::item<NDims, /* Offset=*/ false > LocalItem =
226
- IDBuilder::createItem<NDims, false >(WrappedLambda ->LocalSize , LocalID);
210
+ IDBuilder::createItem<NDims, false >(ctx ->LocalSize , LocalID);
227
211
228
212
const sycl::nd_item<NDims> NDItem =
229
213
IDBuilder::createNDItem<NDims>(GlobalItem, LocalItem, Group);
230
214
231
- WrappedLambda ->Func (NDItem);
215
+ ctx ->Func (NDItem);
232
216
}
233
217
234
- // libCMBatch class defines interface for lauching kernels with
235
- // software multi-threads
218
+ // Interface for launching kernels using libcm from CM EMU project.
236
219
template <int DIMS> class libCMBatch {
237
220
private:
238
- // Kernel function
239
- KernelFunc<DIMS> MKernel;
240
-
241
- // Space-dimension info
242
- std::vector<uint32_t > GroupDim;
243
- std::vector<uint32_t > SpaceDim;
221
+ const KernelFunc<DIMS> &MKernel;
222
+ std::vector<uint32_t > GroupDim, SpaceDim;
244
223
245
224
public:
246
- libCMBatch (KernelFunc<DIMS> Kernel)
225
+ libCMBatch (const KernelFunc<DIMS> & Kernel)
247
226
: MKernel(Kernel), GroupDim{1 , 1 , 1 }, SpaceDim{1 , 1 , 1 } {}
248
227
249
- // / Invoking kernel lambda function wrapped by 'LambdaWrapper' using
250
- // / 'InvokeLambda' function.
251
228
void runIterationSpace (const sycl::range<DIMS> &LocalSize,
252
229
const sycl::range<DIMS> &GlobalSize,
253
230
const sycl::id<DIMS> &GlobalOffset) {
254
- auto WrappedLambda =
255
- MakeLambdaWrapper<DIMS>(MKernel, LocalSize, GlobalSize, GlobalOffset);
256
231
257
232
for (int I = 0 ; I < DIMS; I++) {
258
233
SpaceDim[I] = (uint32_t )LocalSize[I];
259
234
GroupDim[I] = (uint32_t )(GlobalSize[I] / LocalSize[I]);
260
235
}
261
236
262
- EsimdemuKernel Esimdemu ((fptrVoid)InvokeLambda<DIMS>, GroupDim, SpaceDim);
237
+ const auto InvokeKernelArg = KernelInvocationContext<DIMS>{
238
+ MKernel, LocalSize, GlobalSize, GlobalOffset};
263
239
264
- Esimdemu.launchMT (sizeof (struct LambdaWrapper <DIMS>), WrappedLambda.get ());
240
+ EsimdemuKernel{reinterpret_cast <fptrVoid>(InvokeKernel<DIMS>), GroupDim,
241
+ SpaceDim}
242
+ .launchMT (sizeof (InvokeKernelArg), &InvokeKernelArg);
265
243
}
266
244
};
267
245
@@ -389,17 +367,12 @@ template <int NDims> struct InvokeImpl {
389
367
return sycl::range<NDims>{Array[0 ], Array[1 ], Array[2 ]};
390
368
}
391
369
392
- static void invoke (void *Fptr , const size_t *GlobalWorkOffset,
370
+ static void invoke (pi_kernel Kernel , const size_t *GlobalWorkOffset,
393
371
const size_t *GlobalWorkSize,
394
372
const size_t *LocalWorkSize) {
395
- auto GlobalSize = get_range (GlobalWorkSize);
396
- auto LocalSize = get_range (LocalWorkSize);
397
- sycl::id<NDims> GlobalOffset = get_range (GlobalWorkOffset);
398
-
399
- auto KFunc = reinterpret_cast <KernelFunc<NDims> *>(Fptr);
400
- libCMBatch<NDims> CmThreading (*KFunc);
401
-
402
- CmThreading.runIterationSpace (LocalSize, GlobalSize, GlobalOffset);
373
+ libCMBatch<NDims>{*reinterpret_cast <KernelFunc<NDims> *>(Kernel)}
374
+ .runIterationSpace (get_range (LocalWorkSize), get_range (GlobalWorkSize),
375
+ sycl::id<NDims>{get_range (GlobalWorkOffset)});
403
376
}
404
377
};
405
378
@@ -1636,15 +1609,14 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
1636
1609
const size_t *GlobalWorkSize, const size_t *LocalWorkSize,
1637
1610
pi_uint32 NumEventsInWaitList,
1638
1611
const pi_event *EventWaitList, pi_event *Event) {
1612
+
1639
1613
const size_t LocalWorkSz[] = {1 , 1 , 1 };
1640
1614
1641
1615
if (Kernel == nullptr ) {
1642
1616
return PI_INVALID_KERNEL;
1643
1617
}
1644
1618
1645
- // WorkDim == 0 is reserved for 'single_task()' kernel with no
1646
- // argument
1647
- if (WorkDim > 3 ) {
1619
+ if (WorkDim > 3 || WorkDim == 0 ) {
1648
1620
return PI_INVALID_WORK_GROUP_SIZE;
1649
1621
}
1650
1622
@@ -1666,27 +1638,18 @@ piEnqueueKernelLaunch(pi_queue Queue, pi_kernel Kernel, pi_uint32 WorkDim,
1666
1638
}
1667
1639
1668
1640
switch (WorkDim) {
1669
- case 0 :
1670
- // TODO : intel/llvm_test_suite
1671
- // single_task() support - void(*)(void)
1672
- DIE_NO_IMPLEMENTATION;
1673
- break ;
1674
-
1675
1641
case 1 :
1676
1642
InvokeImpl<1 >::invoke (Kernel, GlobalWorkOffset, GlobalWorkSize,
1677
1643
LocalWorkSize);
1678
1644
break ;
1679
-
1680
1645
case 2 :
1681
1646
InvokeImpl<2 >::invoke (Kernel, GlobalWorkOffset, GlobalWorkSize,
1682
1647
LocalWorkSize);
1683
1648
break ;
1684
-
1685
1649
case 3 :
1686
1650
InvokeImpl<3 >::invoke (Kernel, GlobalWorkOffset, GlobalWorkSize,
1687
1651
LocalWorkSize);
1688
1652
break ;
1689
-
1690
1653
default :
1691
1654
DIE_NO_IMPLEMENTATION;
1692
1655
break ;
0 commit comments