[SYCL] Implement queue::parallel_for() accepting reduction (#2682)

v-klochkov · web-flow · commit ffdadc2ea64d · 2020-10-26T09:09:21.000-07:00
Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
diff --git a/sycl/include/CL/sycl/queue.hpp b/sycl/include/CL/sycl/queue.hpp
@@ -719,6 +719,27 @@ class __SYCL_EXPORT queue {
         CodeLoc);
   }
 
+  /// Shortcut that submits a command group running a parallel_for kernel over
+  /// an nd_range (global and local sizes plus offset) with a reduction.
+  ///
+  /// \param ExecutionRange is the nd_range that defines the kernel work space
+  /// \param Redu is the reduction object passed on to handler::parallel_for
+  /// \param KernelFunc is the kernel functor or lambda
+  /// \param CodeLoc contains the code location of user code
+  /// \return the event returned by submit() for the command group
+  template <typename KernelName = detail::auto_name, typename KernelType,
+            int Dims, typename Reduction>
+  event parallel_for(nd_range<Dims> ExecutionRange, Reduction Redu,
+                     _KERNELFUNCPARAM(KernelFunc) _CODELOCPARAM(&CodeLoc)) {
+    _CODELOCARG(&CodeLoc);
+    // The command group only forwards the arguments to the handler overload.
+    auto CommandGroup = [&](handler &CGH) {
+      CGH.template parallel_for<KernelName, KernelType, Dims, Reduction>(
+          ExecutionRange, Redu, KernelFunc);
+    };
+    return submit(CommandGroup, CodeLoc);
+  }
+
 // Clean up CODELOC and KERNELFUNC macros.
 #undef _CODELOCPARAM
 #undef _CODELOCARG
diff --git a/sycl/test/reduction/reduction_queue_parallel_for.cpp b/sycl/test/reduction/reduction_queue_parallel_for.cpp
@@ -0,0 +1,44 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+
+// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
+// TODO: Enable the test for HOST when it supports ONEAPI::reduce() and
+// barrier()
+
+// This test only checks that the queue::parallel_for() method accepting a
+// reduction can be properly translated into queue::submit + parallel_for().
+
+#include <CL/sycl.hpp>
+using namespace sycl;
+
+int main() {
+  const size_t NElems = 1024;
+  const size_t WGSize = 256;
+
+  queue Q;
+  int *Data = malloc_shared<int>(NElems, Q);
+  for (size_t I = 0; I < NElems; ++I) // size_t index matches NElems' type.
+    Data[I] = static_cast<int>(I);
+
+  int *Sum = malloc_shared<int>(1, Q);
+  *Sum = 0;
+  // Shortcut under test: no explicit queue::submit() / handler is written.
+  Q.parallel_for<class XYZ>(
+       nd_range<1>{NElems, WGSize}, ONEAPI::reduction(Sum, ONEAPI::plus<>()),
+       [=](nd_item<1> It, auto &Acc) { Acc += Data[It.get_global_id(0)]; })
+      .wait();
+  // Data[I] == I, so the expected result is 0 + 1 + ... + (NElems - 1).
+  const int ExpectedSum = static_cast<int>((NElems - 1) * NElems / 2);
+  int Error = 0;
+  if (*Sum != ExpectedSum) {
+    std::cerr << "Error: Expected = " << ExpectedSum << ", Computed = " << *Sum
+              << std::endl;
+    Error = 1;
+  }
+
+  free(Data, Q);
+  free(Sum, Q);
+  return Error;
+}