Skip to content

Commit c5318c5

Browse files
jbrodmanbader
authored and committed
[SYCL][USM] Initial commit of flattening for kernel submission on queue (#911)
Add utility methods to flatten kernel submission to 1 lambda when using USM. Events for depends_on are passed as an extra function arg. Signed-off-by: James Brodman [email protected]
1 parent 1d55c86 commit c5318c5

File tree

2 files changed

+250
-2
lines changed

2 files changed

+250
-2
lines changed

sycl/include/CL/sycl/queue.hpp

Lines changed: 184 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -117,8 +117,190 @@ class queue {
117117
}
118118

119119
/// Submits a command group that issues a prefetch hint for the memory
/// range [Ptr, Ptr + Count).
///
/// @param Ptr is the start of the memory range to prefetch
/// @param Count is the number of bytes to prefetch
/// @return an event representing the submitted command group
event prefetch(const void* Ptr, size_t Count) {
  return submit([=](handler &H) { H.prefetch(Ptr, Count); });
}
124+
125+
/// single_task version with a kernel represented as a lambda.
126+
///
127+
/// @param KernelFunc is the Kernel functor or lambda
128+
template <typename KernelName = detail::auto_name, typename KernelType>
129+
event single_task(KernelType KernelFunc) {
130+
return submit([&](handler &CGH) {
131+
CGH.template single_task<KernelName, KernelType>(KernelFunc);
132+
});
133+
}
134+
135+
/// single_task version with a kernel represented as a lambda.
136+
///
137+
/// @param DepEvent is an event that specifies the kernel dependences
138+
/// @param KernelFunc is the Kernel functor or lambda
139+
template <typename KernelName = detail::auto_name, typename KernelType>
140+
event single_task(event DepEvent, KernelType KernelFunc) {
141+
return submit([&](handler &CGH) {
142+
CGH.depends_on(DepEvent);
143+
CGH.template single_task<KernelName, KernelType>(KernelFunc);
144+
});
145+
}
146+
147+
/// single_task version with a kernel represented as a lambda.
148+
///
149+
/// @param DepEvents is a vector of events that specify the kernel dependences
150+
/// @param KernelFunc is the Kernel functor or lambda
151+
template <typename KernelName = detail::auto_name, typename KernelType>
152+
event single_task(std::vector<event> DepEvents, KernelType KernelFunc) {
153+
return submit([&](handler &CGH) {
154+
CGH.depends_on(DepEvents);
155+
CGH.template single_task<KernelName, KernelType>(KernelFunc);
156+
});
157+
}
158+
159+
/// parallel_for version with a kernel represented as a lambda + range that
160+
/// specifies global size only.
161+
///
162+
/// @param NumWorkItems is a range that specifies the work space of the kernel
163+
/// @param KernelFunc is the Kernel functor or lambda
164+
template <typename KernelName = detail::auto_name, typename KernelType,
165+
int Dims>
166+
event parallel_for(range<Dims> NumWorkItems, KernelType KernelFunc) {
167+
return submit([&](handler &CGH) {
168+
CGH.template parallel_for<KernelName, KernelType, Dims>(NumWorkItems,
169+
KernelFunc);
170+
});
171+
}
172+
173+
/// parallel_for version with a kernel represented as a lambda + range that
174+
/// specifies global size only.
175+
///
176+
/// @param NumWorkItems is a range that specifies the work space of the kernel
177+
/// @param DepEvent is an event that specifies the kernel dependences
178+
/// @param KernelFunc is the Kernel functor or lambda
179+
template <typename KernelName = detail::auto_name, typename KernelType,
180+
int Dims>
181+
event parallel_for(range<Dims> NumWorkItems, event DepEvent,
182+
KernelType KernelFunc) {
183+
return submit([&](handler &CGH) {
184+
CGH.depends_on(DepEvent);
185+
CGH.template parallel_for<KernelName, KernelType, Dims>(NumWorkItems,
186+
KernelFunc);
187+
});
188+
}
189+
190+
/// parallel_for version with a kernel represented as a lambda + range that
191+
/// specifies global size only.
192+
///
193+
/// @param NumWorkItems is a range that specifies the work space of the kernel
194+
/// @param DepEvents is a vector of events that specifies the kernel dependences
195+
/// @param KernelFunc is the Kernel functor or lambda
196+
template <typename KernelName = detail::auto_name, typename KernelType,
197+
int Dims>
198+
event parallel_for(range<Dims> NumWorkItems, std::vector<event> DepEvents,
199+
KernelType KernelFunc) {
200+
return submit([&](handler &CGH) {
201+
CGH.depends_on(DepEvents);
202+
CGH.template parallel_for<KernelName, KernelType, Dims>(NumWorkItems,
203+
KernelFunc);
204+
});
205+
}
206+
207+
/// parallel_for version with a kernel represented as a lambda + range and
208+
/// offset that specify global size and global offset correspondingly.
209+
///
210+
/// @param NumWorkItems is a range that specifies the work space of the kernel
211+
/// @param WorkItemOffset specifies the offset for each work item id
212+
/// @param KernelFunc is the Kernel functor or lambda
213+
template <typename KernelName = detail::auto_name, typename KernelType,
214+
int Dims>
215+
event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
216+
KernelType KernelFunc) {
217+
return submit([&](handler &CGH) {
218+
CGH.template parallel_for<KernelName, KernelType, Dims>(
219+
NumWorkItems, WorkItemOffset, KernelFunc);
220+
});
221+
}
222+
223+
/// parallel_for version with a kernel represented as a lambda + range and
224+
/// offset that specify global size and global offset correspondingly.
225+
///
226+
/// @param NumWorkItems is a range that specifies the work space of the kernel
227+
/// @param WorkItemOffset specifies the offset for each work item id
228+
/// @param DepEvent is an event that specifies the kernel dependences
229+
/// @param KernelFunc is the Kernel functor or lambda
230+
template <typename KernelName = detail::auto_name, typename KernelType,
231+
int Dims>
232+
event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
233+
event DepEvent, KernelType KernelFunc) {
234+
return submit([&](handler &CGH) {
235+
CGH.depends_on(DepEvent);
236+
CGH.template parallel_for<KernelName, KernelType, Dims>(
237+
NumWorkItems, WorkItemOffset, KernelFunc);
238+
});
239+
}
240+
241+
/// parallel_for version with a kernel represented as a lambda + range and
242+
/// offset that specify global size and global offset correspondingly.
243+
///
244+
/// @param NumWorkItems is a range that specifies the work space of the kernel
245+
/// @param WorkItemOffset specifies the offset for each work item id
246+
/// @param DepEvents is a vector of events that specifies the kernel dependences
247+
/// @param KernelFunc is the Kernel functor or lambda
248+
template <typename KernelName = detail::auto_name, typename KernelType,
249+
int Dims>
250+
event parallel_for(range<Dims> NumWorkItems, id<Dims> WorkItemOffset,
251+
std::vector<event> DepEvents, KernelType KernelFunc) {
252+
return submit([&](handler &CGH) {
253+
CGH.depends_on(DepEvents);
254+
CGH.template parallel_for<KernelName, KernelType, Dims>(
255+
NumWorkItems, WorkItemOffset, KernelFunc);
256+
});
257+
}
258+
259+
/// parallel_for version with a kernel represented as a lambda + nd_range that
260+
/// specifies global, local sizes and offset.
261+
///
262+
/// @param ExecutionRange is a range that specifies the work space of the kernel
263+
/// @param KernelFunc is the Kernel functor or lambda
264+
template <typename KernelName = detail::auto_name, typename KernelType,
265+
int Dims>
266+
event parallel_for(nd_range<Dims> ExecutionRange, KernelType KernelFunc) {
267+
return submit([&](handler &CGH) {
268+
CGH.template parallel_for<KernelName, KernelType, Dims>(ExecutionRange,
269+
KernelFunc);
270+
});
271+
}
272+
273+
/// parallel_for version with a kernel represented as a lambda + nd_range that
274+
/// specifies global, local sizes and offset.
275+
///
276+
/// @param ExecutionRange is a range that specifies the work space of the kernel
277+
/// @param DepEvent is an event that specifies the kernel dependences
278+
/// @param KernelFunc is the Kernel functor or lambda
279+
template <typename KernelName = detail::auto_name, typename KernelType,
280+
int Dims>
281+
event parallel_for(nd_range<Dims> ExecutionRange,
282+
event DepEvent, KernelType KernelFunc) {
283+
return submit([&](handler &CGH) {
284+
CGH.depends_on(DepEvent);
285+
CGH.template parallel_for<KernelName, KernelType, Dims>(ExecutionRange,
286+
KernelFunc);
287+
});
288+
}
289+
290+
/// parallel_for version with a kernel represented as a lambda + nd_range that
291+
/// specifies global, local sizes and offset.
292+
///
293+
/// @param ExecutionRange is a range that specifies the work space of the kernel
294+
/// @param DepEvents is a vector of events that specifies the kernel dependences
295+
/// @param KernelFunc is the Kernel functor or lambda
296+
template <typename KernelName = detail::auto_name, typename KernelType,
297+
int Dims>
298+
event parallel_for(nd_range<Dims> ExecutionRange,
299+
std::vector<event> DepEvents, KernelType KernelFunc) {
300+
return submit([&](handler &CGH) {
301+
CGH.depends_on(DepEvents);
302+
CGH.template parallel_for<KernelName, KernelType, Dims>(ExecutionRange,
303+
KernelFunc);
122304
});
123305
}
124306

sycl/test/usm/pfor_flatten.cpp

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
// RUN: %clangxx -fsycl -fsycl-unnamed-lambda %s -o %t1.out
2+
// RUN: env SYCL_DEVICE_TYPE=HOST %t1.out
3+
// RUN: %CPU_RUN_PLACEHOLDER %t1.out
4+
// RUN: %GPU_RUN_PLACEHOLDER %t1.out
5+
6+
//==--------------- pfor_flatten.cpp - Kernel Launch Flattening test -------==//
7+
//
8+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
9+
// See https://llvm.org/LICENSE.txt for license information.
10+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
11+
//
12+
//===----------------------------------------------------------------------===//
13+
14+
#include <CL/sycl.hpp>
15+
16+
using namespace cl::sycl;
17+
18+
class foo;
19+
int main() {
20+
int *array = nullptr;
21+
const int N = 42;
22+
const int MAGIC_NUM = 42;
23+
24+
queue q;
25+
auto ctxt = q.get_context();
26+
27+
array = (int *)malloc_host(N * sizeof(int), q);
28+
if (array == nullptr) {
29+
return -1;
30+
}
31+
32+
range<1> R{N};
33+
auto e1 = q.parallel_for(R, [=](id<1> ID) {
34+
int i = ID[0];
35+
array[i] = MAGIC_NUM-4;
36+
});
37+
38+
39+
auto e2 = q.parallel_for(R, e1, [=](id<1> ID) {
40+
int i = ID[0];
41+
array[i] += 2;
42+
});
43+
44+
auto e3 =
45+
q.parallel_for(nd_range<1>{R, range<1>{1}}, {e1, e2}, [=](nd_item<1> ID) {
46+
int i = ID.get_global_id(0);
47+
array[i]++;
48+
});
49+
50+
q.single_task({e3}, [=]() {
51+
for (int i = 0; i < N; i++) {
52+
array[i]++;
53+
}
54+
});
55+
56+
q.wait();
57+
58+
for (int i = 0; i < N; i++) {
59+
if (array[i] != MAGIC_NUM) {
60+
return -1;
61+
}
62+
}
63+
free(array, ctxt);
64+
65+
return 0;
66+
}

0 commit comments

Comments
 (0)