[SYCL] Fix errors caused by half/double types in reduction (#1912)

v-klochkov · web-flow · commit 1b8b26ae19b4 · 2020-06-18T10:54:45.000-07:00
1) Added a fix/workaround for an issue in half type support that caused
   a compilation error when unary minus was applied to a half-type operand.
2) Moved the test cases for half and double types into two new
   reduction LIT tests. This makes them easier to handle
   on devices where the 'half' and/or 'double' extensions are not supported.

Signed-off-by: Vyacheslav N Klochkov <vyacheslav.n.klochkov@intel.com>
diff --git a/sycl/include/CL/sycl/intel/reduction.hpp b/sycl/include/CL/sycl/intel/reduction.hpp
@@ -219,7 +219,7 @@ class reducer<T, BinaryOperation,
   static enable_if_t<IsMaximumIdentityOp<_T, _BinaryOperation>::value, _T>
   getIdentity() {
     return std::numeric_limits<_T>::has_infinity
-               ? -std::numeric_limits<_T>::infinity()
+               ? static_cast<_T>(-std::numeric_limits<_T>::infinity())
                : std::numeric_limits<_T>::lowest();
   }
 
diff --git a/sycl/test/reduction/reduction_nd_ext_double.cpp b/sycl/test/reduction/reduction_nd_ext_double.cpp
@@ -0,0 +1,19 @@
+// UNSUPPORTED: cuda
+// OpenCL C 2.x-like work-group functions are not yet supported by CUDA.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// TODO: Enable the test for HOST when it supports intel::reduce() and barrier()
+// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
+
+// This test performs basic checks of parallel_for(nd_range, reduction, func)
+// used with 'double' type.
+
+#include "reduction_nd_ext_type.hpp"
+
+int main() { // entry point of the 'double' variant of the reduction test
+  return runTests<double>("cl_khr_fp64"); // cl_khr_fp64 is the OpenCL extension name for double precision
+}
diff --git a/sycl/test/reduction/reduction_nd_ext_half.cpp b/sycl/test/reduction/reduction_nd_ext_half.cpp
@@ -0,0 +1,21 @@
+// UNSUPPORTED: cuda
+// OpenCL C 2.x-like work-group functions are not yet supported by CUDA.
+//
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: Enable the test for CPU/ACC when they support half type.
+// RUNx: %CPU_RUN_PLACEHOLDER %t.out
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// TODO: Enable the test for HOST when it supports intel::reduce() and barrier()
+// RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
+
+// This test performs basic checks of parallel_for(nd_range, reduction, func)
+// used with 'half' type.
+
+#include "reduction_nd_ext_type.hpp"
+
+int main() { // entry point of the 'half' variant of the reduction test
+  return runTests<half>("cl_khr_fp16"); // extension name is forwarded to runTests()
+}
diff --git a/sycl/test/reduction/reduction_nd_ext_type.hpp b/sycl/test/reduction/reduction_nd_ext_type.hpp
@@ -0,0 +1,80 @@
+// This test performs basic checks of parallel_for(nd_range, reduction, func)
+// with types that may require additional runtime checks for extensions
+// supported by the device, e.g. 'half' or 'double'
+
+#include "reduction_utils.hpp"
+#include <CL/sycl.hpp>
+#include <cassert>
+
+using namespace cl::sycl;
+
+template <typename T, int Dim, class BinaryOperation>
+class SomeClass;
+
+template <typename T, int Dim, access::mode Mode, class BinaryOperation> // Dim and Mode configure the output accessor
+void test(T Identity, size_t WGSize, size_t NWItems) { // runs one reduction over NWItems inputs with work-groups of WGSize and checks the result
+  buffer<T, 1> InBuf(NWItems);
+  buffer<T, 1> OutBuf(1);
+
+  // Initialize: initInputData() is defined outside this file; presumably it fills InBuf and sets CorrectOut to the expected result (confirm there).
+  BinaryOperation BOp;
+  T CorrectOut;
+  initInputData(InBuf, CorrectOut, Identity, BOp, NWItems);
+
+  if (Mode == access::mode::read_write) // for read_write mode, seed the single output element with Identity
+    (OutBuf.template get_access<access::mode::write>())[0] = Identity;
+
+  // Compute: submit one nd_range kernel in which every work-item combines its own input element into the reduction.
+  queue Q;
+  Q.submit([&](handler &CGH) {
+    auto In = InBuf.template get_access<access::mode::read>(CGH);
+    accessor<T, Dim, Mode, access::target::global_buffer>
+        Out(OutBuf, CGH);
+    auto Redu = intel::reduction(Out, Identity, BOp);
+
+    range<1> GlobalRange(NWItems);
+    range<1> LocalRange(WGSize);
+    nd_range<1> NDRange(GlobalRange, LocalRange);
+    CGH.parallel_for<SomeClass<T, Dim, BinaryOperation>>(
+        NDRange, Redu, [=](nd_item<1> NDIt, auto &Sum) {
+          Sum.combine(In[NDIt.get_global_linear_id()]);
+        });
+  });
+
+  // Check correctness: the allowed absolute difference is 3 * epsilon scaled by |ComputedOut + CorrectOut|.
+  auto Out = OutBuf.template get_access<access::mode::read>();
+  T ComputedOut = *(Out.get_pointer());
+  T MaxDiff = 3 * std::numeric_limits<T>::epsilon() * std::fabs(ComputedOut + CorrectOut);
+  if (std::fabs(static_cast<T>(ComputedOut - CorrectOut)) > MaxDiff) {
+    std::cout << "NWItems = " << NWItems << ", WGSize = " << WGSize << "\n";
+    std::cout << "Computed value: " << ComputedOut
+              << ", Expected value: " << CorrectOut
+              << ", MaxDiff = " << MaxDiff << "\n";
+    assert(0 && "Wrong value."); // the mismatch details are printed above before the assertion fires
+  }
+}
+
+template <typename T> // T is the element type used for every reduction below
+int runTests(const string_class &ExtensionName) { // returns 0 both after running the tests and when they are skipped
+  device D = default_selector().select_device();
+  if (!D.is_host() && !D.has_extension(ExtensionName)) { // skip on a non-host device that does not report the extension
+    std::cout << "Test skipped\n";
+    return 0;
+  }
+
+  // Check some less standard WG sizes and corner cases first. Call arguments are (Identity, WGSize, NWItems).
+  test<T, 1, access::mode::read_write, std::multiplies<T>>(0, 4, 4); // NOTE(review): 0 is not the identity of multiplication; confirm this corner case is intentional
+  test<T, 0, access::mode::discard_write, intel::plus<T>>(0, 4, 64);
+
+  test<T, 0, access::mode::read_write, intel::minimum<T>>(getMaximumFPValue<T>(), 7, 7);
+  test<T, 1, access::mode::discard_write, intel::maximum<T>>(getMinimumFPValue<T>(), 7, 7 * 5);
+
+#if __cplusplus >= 201402L // the transparent functors (plus<>, minimum<>, maximum<>) are exercised only for C++14 and later
+  test<T, 1, access::mode::read_write, intel::plus<>>(1, 3, 3 * 5); // NOTE(review): 1 is not the identity of addition; confirm this corner case is intentional
+  test<T, 1, access::mode::discard_write, intel::minimum<>>(getMaximumFPValue<T>(), 3, 3);
+  test<T, 0, access::mode::discard_write, intel::maximum<>>(getMinimumFPValue<T>(), 3, 3);
+#endif // __cplusplus >= 201402L
+
+  std::cout << "Test passed\n";
+  return 0;
+}
diff --git a/sycl/test/reduction/reduction_nd_s0_dw.cpp b/sycl/test/reduction/reduction_nd_s0_dw.cpp
@@ -87,10 +87,6 @@ int main() {
   test<float, 0, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
   test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
 
-  test<double, 0, std::multiplies<double>>(1, 8, 256);
-  test<double, 0, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
-  test<double, 0, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
-
   // Check with CUSTOM type.
   test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
 
diff --git a/sycl/test/reduction/reduction_nd_s0_rw.cpp b/sycl/test/reduction/reduction_nd_s0_rw.cpp
@@ -89,10 +89,6 @@ int main() {
   test<float, 0, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
   test<float, 0, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
 
-  test<double, 0, std::multiplies<double>>(1, 8, 256);
-  test<double, 0, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
-  test<double, 0, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
-
   // Check with CUSTOM type.
   test<CustomVec<long long>, 0, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
 
diff --git a/sycl/test/reduction/reduction_nd_s1_dw.cpp b/sycl/test/reduction/reduction_nd_s1_dw.cpp
@@ -88,10 +88,6 @@ int main() {
   test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
   test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
 
-  test<double, 1, std::multiplies<double>>(1, 8, 256);
-  test<double, 1, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
-  test<double, 1, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
-
   // Check with CUSTOM type.
   test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
 
diff --git a/sycl/test/reduction/reduction_nd_s1_rw.cpp b/sycl/test/reduction/reduction_nd_s1_rw.cpp
@@ -90,10 +90,6 @@ int main() {
   test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
   test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);
 
-  test<double, 1, std::multiplies<double>>(1, 8, 256);
-  test<double, 1, intel::minimum<double>>(getMaximumFPValue<double>(), 8, 256);
-  test<double, 1, intel::maximum<double>>(getMinimumFPValue<double>(), 8, 256);
-
   // Check with CUSTOM type.
   test<CustomVec<long long>, 1, CustomVecPlus<long long>>(CustomVec<long long>(0), 8, 256);
 
diff --git a/sycl/test/reduction/reduction_transparent.cpp b/sycl/test/reduction/reduction_transparent.cpp
@@ -3,12 +3,8 @@
 
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
-
-// TODO: enable all checks for CPU/ACC when CPU/ACC RT supports intel::reduce()
-// for 'cl::sycl::half' type.
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -DSKIP_FOR_HALF -o %t.no_half.out
-// RUN: %ACC_RUN_PLACEHOLDER %t.no_half.out
-// RUN: %CPU_RUN_PLACEHOLDER %t.no_half.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
 
 // RUNx: env SYCL_DEVICE_TYPE=HOST %t.out
 // TODO: Enable the test for HOST when it supports intel::reduce() and barrier()
@@ -114,13 +110,9 @@ void test(T Identity, size_t WGSize, size_t NWItems) {
 
 int main() {
 #if __cplusplus >= 201402L
-  test<double, 0, intel::maximum<>>(getMinimumFPValue<double>(), 7, 7 * 5);
+  test<float, 0, intel::maximum<>>(getMinimumFPValue<float>(), 7, 7 * 5);
   test<signed char, 0, intel::plus<>>(0, 7, 49);
   test<unsigned char, 1, std::multiplies<>>(1, 4, 16);
-#ifndef SKIP_FOR_HALF
-  test<half, 1, intel::plus<>>(0, 4, 8);
-  test<half, 1, intel::minimum<>>(getMaximumFPValue<half>(), 8, 32);
-#endif // SKIP_FOR_HALF
 #endif // __cplusplus >= 201402L
 
   std::cout << "Test passed\n";
diff --git a/sycl/test/reduction/reduction_utils.hpp b/sycl/test/reduction/reduction_utils.hpp
@@ -56,7 +56,7 @@ struct CustomVecPlus {
 template <typename T>
 T getMinimumFPValue() {
   return std::numeric_limits<T>::has_infinity
-             ? -std::numeric_limits<T>::infinity()
+             ? static_cast<T>(-std::numeric_limits<T>::infinity())
              : std::numeric_limits<T>::lowest();
 }
 

Original file line number	Diff line number	Diff line change
`@@ -219,7 +219,7 @@ class reducer<T, BinaryOperation,`
`219`	`219`	`static enable_if_t<IsMaximumIdentityOp<_T, _BinaryOperation>::value, _T>`
`220`	`220`	`getIdentity() {`
`221`	`221`	`return std::numeric_limits<_T>::has_infinity`
`222`		`- ? -std::numeric_limits<_T>::infinity()`
	`222`	`+ ? static_cast<_T>(-std::numeric_limits<_T>::infinity())`
`223`	`223`	`: std::numeric_limits<_T>::lowest();`
`224`	`224`	`}`
`225`	`225`
Original file line number	Diff line number	Diff line change
`@@ -56,7 +56,7 @@ struct CustomVecPlus {`
`56`	`56`	`template <typename T>`
`57`	`57`	`T getMinimumFPValue() {`
`58`	`58`	`return std::numeric_limits<T>::has_infinity`
`59`		`- ? -std::numeric_limits<T>::infinity()`
	`59`	`+ ? static_cast<T>(-std::numeric_limits<T>::infinity())`
`60`	`60`	`: std::numeric_limits<T>::lowest();`
`61`	`61`	`}`
`62`	`62`