[SYCL] Fix types and transparent functors recognition in reduction (#1709)

v-klochkov · web-flow · commit 92e622616762 · 2020-05-20T00:35:07.000-07:00
* [SYCL] Fix types and transparent functors recognition in reduction

1. Enable operator*, operator+, operator|, operator&amp;, operator^= for
   corresponding transparent functors used in reduction.
2. Fixed the case when reduction object is passed to parallel_for
   an R-value.
3. Allow identity-less constructors for reductions with transparent functors.
4. Replaced some 'auto' declarations with Reduction::result_type
   and added intermediate assignments/casts to avoid type ambiguities
   caused by using sycl::half type, and which may also be caused by custom/user
   types as well.
5. Fix statically known identity values for MIN/MAX reductions

Signed-off-by: Vyacheslav N Klochkov &lt;vyacheslav.n.klochkov@intel.com&gt;
diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp
@@ -853,7 +853,7 @@ class __SYCL_EXPORT handler {
             int Dims, typename Reduction>
   detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
                       Reduction::has_fast_atomics>
-  parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
+  parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
     if (Reduction::is_usm)
       Redu.associateWithHandler(*this);
     shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
@@ -886,7 +886,7 @@ class __SYCL_EXPORT handler {
             int Dims, typename Reduction>
   detail::enable_if_t<Reduction::accessor_mode == access::mode::discard_write &&
                       Reduction::has_fast_atomics>
-  parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
+  parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
     shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
     auto RWAcc = Redu.getReadWriteScalarAcc(*this);
     intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
@@ -920,7 +920,7 @@ class __SYCL_EXPORT handler {
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims, typename Reduction>
   detail::enable_if_t<!Reduction::has_fast_atomics>
-  parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
+  parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
     size_t NWorkGroups = Range.get_group_range().size();
 
     // This parallel_for() is lowered to the following sequence:
diff --git a/sycl/include/CL/sycl/intel/reduction.hpp b/sycl/include/CL/sycl/intel/reduction.hpp
diff --git a/sycl/test/reduction/reduction_ctor.cpp b/sycl/test/reduction/reduction_ctor.cpp
@@ -1,48 +1,15 @@
 // RUN: %clangxx -fsycl %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
-//==----------------reduction_ctor.cpp - SYCL reduction basic test ---------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
 
 // This performs basic checks such as reduction creation, getIdentity() method,
 // and the combine() method of the aux class 'reducer'.
 
+#include "reduction_utils.hpp"
 #include <CL/sycl.hpp>
 #include <cassert>
 
 using namespace cl::sycl;
 
-template <typename T, class BinaryOperation, int N>
-struct init_data_t {
-  void initInputData(T IdentityVal,
-                     buffer<T, 1> &InBuf,
-                     T &ExpectedReduValue) {
-    ExpectedReduValue = IdentityVal;
-    BinaryOperation Op;
-    auto In = InBuf.template get_access<access::mode::write>();
-    for (int I = 0; I < N; ++I) {
-      In[I] = ((I + 1) % 5) + 1;
-      ExpectedReduValue = Op(ExpectedReduValue, In[I]);
-    }
-  }
-};
-
-template <typename T, int N>
-struct init_data_t<T, std::multiplies<T>, N> {
-  void initInputData(T IdentityVal, buffer<T, 1> &InBuf, T &ExpectedReduValue) {
-    ExpectedReduValue = IdentityVal;
-    std::multiplies<T> Op;
-    auto In = InBuf.template get_access<access::mode::write>();
-    for (int I = 0; I < N; ++I) {
-      In[I] = 1 + (((I % 37) == 0) ? 1 : 0);
-      ExpectedReduValue = Op(ExpectedReduValue, In[I]);
-    }
-  }
-};
 
 template <typename T, typename Reduction>
 void test_reducer(Reduction &Redu, T A, T B) {
@@ -157,15 +124,13 @@ int main() {
   testBoth<int, intel::bit_or<int>>(0, 1, 8);
   testBoth<int, intel::bit_xor<int>>(0, 7, 3);
   testBoth<int, intel::bit_and<int>>(~0, 7, 3);
-  testBoth<int, intel::minimum<int>>(std::numeric_limits<int>::max(), 7, 3);
-  testBoth<int, intel::maximum<int>>(std::numeric_limits<int>::min(), 7, 3);
+  testBoth<int, intel::minimum<int>>((std::numeric_limits<int>::max)(), 7, 3);
+  testBoth<int, intel::maximum<int>>((std::numeric_limits<int>::min)(), 7, 3);
 
   testBoth<float, intel::plus<float>>(0, 1, 7);
   testBoth<float, std::multiplies<float>>(1, 1, 7);
-  testBoth<float, intel::minimum<float>>(
-      std::numeric_limits<float>::max(), 7, 3);
-  testBoth<float, intel::maximum<float>>(
-      std::numeric_limits<float>::min(), 7, 3);
+  testBoth<float, intel::minimum<float>>(getMaximumFPValue<float>(), 7, 3);
+  testBoth<float, intel::maximum<float>>(getMinimumFPValue<float>(), 7, 3);
 
   testUnknown<Point<float>, 0, PointPlus<float>>(Point<float>(0), Point<float>(1), Point<float>(7));
   testUnknown<Point<float>, 1, PointPlus<float>>(Point<float>(0), Point<float>(1), Point<float>(7));
diff --git a/sycl/test/reduction/reduction_nd_s0_dw.cpp b/sycl/test/reduction/reduction_nd_s0_dw.cpp
@@ -79,17 +79,17 @@ int main() {
   test<int, 0, intel::bit_or<int>>(0, 8, 256);
   test<int, 0, intel::bit_xor<int>>(0, 8, 256);
   test<int, 0, intel::bit_and<int>>(~0, 8, 256);
-  test<int, 0, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
-  test<int, 0, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
+  test<int, 0, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
+  test<int, 0, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
 
   // Check with various types.
   test<float, 0, std::multiplies<float>>(1, 8, 256);
-  test<float, 0, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
-  test<float, 0, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
+  test<float, 0, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
+  test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
 
   test<double, 0, std::multiplies<double>>(1, 8, 256);
-  test<double, 0, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
-  test<double, 0, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
+  test<double, 0, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
+  test<double, 0, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
 
   // Check with CUSTOM type.
   test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
diff --git a/sycl/test/reduction/reduction_nd_s0_rw.cpp b/sycl/test/reduction/reduction_nd_s0_rw.cpp
@@ -81,17 +81,17 @@ int main() {
   test<int, 0, intel::bit_or<int>>(0, 8, 256);
   test<int, 0, intel::bit_xor<int>>(0, 8, 256);
   test<int, 0, intel::bit_and<int>>(~0, 8, 256);
-  test<int, 0, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
-  test<int, 0, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
+  test<int, 0, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
+  test<int, 0, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
 
   // Check with various types.
   test<float, 0, std::multiplies<float>>(1, 8, 256);
-  test<float, 0, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
-  test<float, 0, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
+  test<float, 0, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
+  test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
 
   test<double, 0, std::multiplies<double>>(1, 8, 256);
-  test<double, 0, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
-  test<double, 0, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
+  test<double, 0, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
+  test<double, 0, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
 
   // Check with CUSTOM type.
   test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
diff --git a/sycl/test/reduction/reduction_nd_s1_dw.cpp b/sycl/test/reduction/reduction_nd_s1_dw.cpp
@@ -80,17 +80,17 @@ int main() {
   test<int, 1, intel::bit_or<int>>(0, 8, 256);
   test<int, 1, intel::bit_xor<int>>(0, 8, 256);
   test<int, 1, intel::bit_and<int>>(~0, 8, 256);
-  test<int, 1, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
-  test<int, 1, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
+  test<int, 1, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
+  test<int, 1, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
 
   // Check with various types.
   test<float, 1, std::multiplies<float>>(1, 8, 256);
-  test<float, 1, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
-  test<float, 1, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
+  test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
+  test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
 
   test<double, 1, std::multiplies<double>>(1, 8, 256);
-  test<double, 1, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
-  test<double, 1, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
+  test<double, 1, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
+  test<double, 1, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
 
   // Check with CUSTOM type.
   test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
diff --git a/sycl/test/reduction/reduction_nd_s1_rw.cpp b/sycl/test/reduction/reduction_nd_s1_rw.cpp
@@ -82,17 +82,17 @@ int main() {
   test<int, 1, intel::bit_or<int>>(0, 8, 256);
   test<int, 1, intel::bit_xor<int>>(0, 8, 256);
   test<int, 1, intel::bit_and<int>>(~0, 8, 256);
-  test<int, 1, intel::minimum<int>>(std::numeric_limits<int>::max(), 8, 256);
-  test<int, 1, intel::maximum<int>>(std::numeric_limits<int>::min(), 8, 256);
+  test<int, 1, intel::minimum<int>>((std::numeric_limits<int>::max)(), 8, 256);
+  test<int, 1, intel::maximum<int>>((std::numeric_limits<int>::min)(), 8, 256);
 
   // Check with various types.
   test<float, 1, std::multiplies<float>>(1, 8, 256);
-  test<float, 1, intel::minimum<float>>(std::numeric_limits<float>::max(), 8, 256);
-  test<float, 1, intel::maximum<float>>(std::numeric_limits<float>::min(), 8, 256);
+  test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
+  test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
 
   test<double, 1, std::multiplies<double>>(1, 8, 256);
-  test<double, 1, intel::minimum<double>>(std::numeric_limits<double>::max(), 8, 256);
-  test<double, 1, intel::maximum<double>>(std::numeric_limits<double>::min(), 8, 256);
+  test<double, 1, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
+  test<double, 1, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
 
   // Check with CUSTOM type.
   test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
diff --git a/sycl/test/reduction/reduction_placeholder.cpp b/sycl/test/reduction/reduction_placeholder.cpp
@@ -72,8 +72,8 @@ int main() {
   test<int, 1, intel::bit_or<int>>(0, 4, 128);
 
   // fast reduce
-  test<float, 1, intel::minimum<float>>(std::numeric_limits<float>::max(), 5, 5 * 7);
-  test<float, 0, intel::maximum<float>>(std::numeric_limits<float>::min(), 4, 128);
+  test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 5, 5 * 7);
+  test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 4, 128);
 
   // generic algorithm
   test<int, 0, std::multiplies<int>>(1, 7, 7 * 5);
diff --git a/sycl/test/reduction/reduction_transparent.cpp b/sycl/test/reduction/reduction_transparent.cpp
@@ -1,8 +1,17 @@
+// UNSUPPORTED: cuda
+// Reductions use work-group builtins not yet supported by CUDA.
+
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// TODO: enable all checks for CPU/ACC when CPU/ACC RT supports intel::reduce()
+// for 'cl::sycl::half' type.
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DSKIP_FOR_HALF -o %t.no_half.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.no_half.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.no_half.out
+
+// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
+// TODO: Enable the test for HOST when it supports intel::reduce() and barrier()
 
 // This test performs basic checks of parallel_for(nd_range, reduction, func)
 // where func is a transparent functor.
@@ -14,10 +23,14 @@
 using namespace cl::sycl;
 
 template <typename T, int Dim, class BinaryOperation>
-class SomeClass;
+class SomeIdClass;
+template <typename T, int Dim, class BinaryOperation>
+class SomeNoIdClass;
 
+// Checks reductions initialized with transparent functor and explicitly set
+// identity value.
 template <typename T, int Dim, class BinaryOperation>
-void test(T Identity, size_t WGSize, size_t NWItems) {
+void testId(T Identity, size_t WGSize, size_t NWItems) {
   buffer<T, 1> InBuf(NWItems);
   buffer<T, 1> OutBuf(1);
 
@@ -32,13 +45,12 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
     auto In = InBuf.template get_access<access::mode::read>(CGH);
     accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
         Out(OutBuf, CGH);
-    auto Redu = intel::reduction(Out, Identity, BOp);
 
     range<1> GlobalRange(NWItems);
     range<1> LocalRange(WGSize);
     nd_range<1> NDRange(GlobalRange, LocalRange);
-    CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
-        NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
+    CGH.parallel_for<SomeIdClass<T, Dim, BinaryOperation>>(
+        NDRange, intel::reduction(Out, Identity, BOp), [=](nd_item<1> NDIt, auto &Sum) {
           Sum.combine(In[NDIt.get_global_linear_id()]);
         });
   });
@@ -54,12 +66,62 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
   }
 }
 
+// Checks reductions initialized with transparent functor and identity
+// value not explicitly specified. The parameter 'Identity' is passed here
+// only to pre-initialize input data correctly.
+template <typename T, int Dim, class BinaryOperation>
+void testNoId(T Identity, size_t WGSize, size_t NWItems) {
+  buffer<T, 1> InBuf(NWItems);
+  buffer<T, 1> OutBuf(1);
+
+  // Initialize.
+  BinaryOperation BOp;
+  T CorrectOut;
+  initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);
+
+  // Compute.
+  queue Q;
+  Q.submit([&](handler &CGH) {
+    auto In = InBuf.template get_access<access::mode::read>(CGH);
+    accessor<T, Dim, access::mode::discard_write, access::target::global_buffer>
+        Out(OutBuf, CGH);
+
+    range<1> GlobalRange(NWItems);
+    range<1> LocalRange(WGSize);
+    nd_range<1> NDRange(GlobalRange, LocalRange);
+    CGH.parallel_for<SomeNoIdClass<T, Dim, BinaryOperation>>(
+        NDRange, intel::reduction(Out, BOp), [=](nd_item<1> NDIt, auto &Sum) {
+          Sum.combine(In[NDIt.get_global_linear_id()]);
+        });
+  });
+
+  // Check correctness.
+  auto Out = OutBuf.template get_access<access::mode::read>();
+  T ComputedOut = *(Out.get_pointer());
+  if (ComputedOut != CorrectOut) {
+    std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n";
+    std::cout << "Computed value: " << ComputedOut
+              << ", Expected value: " << CorrectOut << "\n";
+    assert(0 && "Wrong value.");
+  }
+}
+
+template <typename T, int Dim, class BinaryOperation>
+void test(T Identity, size_t WGSize, size_t NWItems) {
+  testId<T, Dim, BinaryOperation>(Identity, WGSize, NWItems);
+  testNoId<T, Dim, BinaryOperation>(Identity, WGSize, NWItems);
+}
+
 int main() {
 #if __cplusplus >= 201402L
-  test<double, 0, intel::maximum<>>(std::numeric_limits<double>::min(), 7, 7 * 5);
+  test<double, 0, intel::maximum<>>(getMinimumFPValue<double>(), 7, 7 * 5);
   test<signed char, 0, intel::plus<>>(0, 7, 49);
   test<unsigned char, 1, std::multiplies<>>(1, 4, 16);
-#endif
+#ifndef SKIP_FOR_HALF
+  test<half, 1, intel::plus<>>(0, 4, 8);
+  test<half, 1, intel::minimum<>>(getMaximumFPValue<half>(), 8, 32);
+#endif // SKIP_FOR_HALF
+#endif // __cplusplus >= 201402L
 
   std::cout << "Test passed\n";
   return 0;
diff --git a/sycl/test/reduction/reduction_usm.cpp b/sycl/test/reduction/reduction_usm.cpp
@@ -113,10 +113,8 @@ int main() {
   testUSM<int, 1, intel::bit_or<int>>(0, 4, 128);
 
   // fast reduce
-  testUSM<float, 1, intel::minimum<float>>(
-      (std::numeric_limits<float>::max)(), 5, 5 * 7);
-  testUSM<float, 0, intel::maximum<float>>(
-      (std::numeric_limits<float>::min)(), 4, 128);
+  testUSM<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 5, 5 * 7);
+  testUSM<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 4, 128);
 
   // generic algorithm
   testUSM<int, 0, std::multiplies<int>>(1, 7, 7 * 5);
diff --git a/sycl/test/reduction/reduction_utils.hpp b/sycl/test/reduction/reduction_utils.hpp
@@ -52,3 +52,17 @@ struct CustomVecPlus {
     return CV(A.X + B.X, A.Y + B.Y);
   }
 };
+
+template <typename T>
+T getMinimumFPValue() {
+  return std::numeric_limits<T>::has_infinity
+             ? -std::numeric_limits<T>::infinity()
+             : std::numeric_limits<T>::lowest();
+}
+
+template <typename T>
+T getMaximumFPValue() {
+  return std::numeric_limits<T>::has_infinity
+             ? std::numeric_limits<T>::infinity()
+             : (std::numeric_limits<T>::max)();
+}