Skip to content

Commit ffdadc2

Browse files
authored
[SYCL] Implement queue::parallel_for() accepting reduction (#2682)
Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent 08066b2 commit ffdadc2

File tree

2 files changed

+65
-0
lines changed

2 files changed

+65
-0
lines changed

sycl/include/CL/sycl/queue.hpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -719,6 +719,27 @@ class __SYCL_EXPORT queue {
719719
CodeLoc);
720720
}
721721

722+
/// parallel_for version with a kernel represented as a lambda + nd_range that
723+
/// specifies global, local sizes and offset.
724+
///
725+
/// \param ExecutionRange is a range that specifies the work space of the
726+
/// kernel
727+
/// \param Redu is a reduction operation
728+
/// \param KernelFunc is the Kernel functor or lambda
729+
/// \param CodeLoc contains the code location of user code
730+
template <typename KernelName = detail::auto_name, typename KernelType,
731+
int Dims, typename Reduction>
732+
event parallel_for(nd_range<Dims> ExecutionRange, Reduction Redu,
733+
_KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
734+
_CODELOCARG(&CodeLoc);
735+
return submit(
736+
[&](handler &CGH) {
737+
CGH.template parallel_for<KernelName, KernelType, Dims, Reduction>(
738+
ExecutionRange, Redu, KernelFunc);
739+
},
740+
CodeLoc);
741+
}
742+
722743
// Clean up CODELOC and KERNELFUNC macros.
723744
#undef _CODELOCPARAM
724745
#undef _CODELOCARG
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
4+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
5+
6+
// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
7+
// TODO: Enable the test for HOST when it supports ONEAPI::reduce() and
8+
// barrier()
9+
10+
// This test only checks that the queue::parallel_for() overload accepting a
11+
// reduction is properly translated into queue::submit + parallel_for().
12+
13+
#include <CL/sycl.hpp>
14+
using namespace sycl;
15+
16+
int main() {
17+
const size_t NElems = 1024;
18+
const size_t WGSize = 256;
19+
20+
queue Q;
21+
int *Data = malloc_shared<int>(NElems, Q);
22+
for (int I = 0; I < NElems; I++)
23+
Data[I] = I;
24+
25+
int *Sum = malloc_shared<int>(1, Q);
26+
*Sum = 0;
27+
28+
Q.parallel_for<class XYZ>(
29+
nd_range<1>{NElems, WGSize}, ONEAPI::reduction(Sum, ONEAPI::plus<>()),
30+
[=](nd_item<1> It, auto &Sum) { Sum += Data[It.get_global_id(0)]; })
31+
.wait();
32+
33+
int ExpectedSum = (NElems - 1) * NElems / 2;
34+
int Error = 0;
35+
if (*Sum != ExpectedSum) {
36+
std::cerr << "Error: Expected = " << ExpectedSum << ", Computed = " << *Sum
37+
<< std::endl;
38+
Error = 1;
39+
}
40+
41+
free(Data, Q);
42+
free(Sum, Q);
43+
return Error;
44+
}

0 commit comments

Comments
 (0)