Additional fixes for CUDA: move spir-v 1.3 test cases to separate files

v-klochkov · v-klochkov · commit c3f7aeb251a3 · 2021-03-01T13:32:05.000-08:00
Signed-off-by: Vyacheslav N Klochkov &lt;vyacheslav.n.klochkov@intel.com&gt;
diff --git a/SYCL/SubGroup/reduce.hpp b/SYCL/SubGroup/reduce.hpp
@@ -107,17 +107,52 @@ void check(queue &Queue, size_t G = 256, size_t L = 64) {
   check_op<
       sycl_subgr<SpecializationKernelName, class KernelName_BaCGaWDMFeMFqvotbk>,
       T>(Queue, T(0), ONEAPI::maximum<>(), true, G, L);
+}
 
-  // Use small sub-groups to avoid overflow effects for int multiply operations
-  // and avoid rounding issues for FP multiply.
-  L = 4;
+template <typename SpecializationKernelName, typename T>
+void check_mul(queue &Queue, size_t G = 256, size_t L = 4) {
   check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulF>, T>(
       Queue, T(G), ONEAPI::multiplies<T>(), false, G, L);
   check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulT>, T>(
       Queue, T(1), ONEAPI::multiplies<T>(), true, G, L);
 
+  // Transparent operator functors.
   check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulFV>, T>(
       Queue, T(G), ONEAPI::multiplies<>(), false, G, L);
   check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulTV>, T>(
       Queue, T(1), ONEAPI::multiplies<>(), true, G, L);
 }
+
+template <typename SpecializationKernelName, typename T>
+void check_bit_ops(queue &Queue, size_t G = 256, size_t L = 4) {
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORF>, T>(
+      Queue, T(G), ONEAPI::bit_or<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORT>, T>(
+      Queue, T(0), ONEAPI::bit_or<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORF>, T>(
+      Queue, T(G), ONEAPI::bit_xor<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORT>, T>(
+      Queue, T(0), ONEAPI::bit_xor<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDF>, T>(
+      Queue, T(G), ONEAPI::bit_and<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDT>, T>(
+      Queue, ~T(0), ONEAPI::bit_and<T>(), true, G, L);
+
+  // Transparent operator functors
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORFV>, T>(
+      Queue, T(G), ONEAPI::bit_or<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORTV>, T>(
+      Queue, T(0), ONEAPI::bit_or<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORFV>, T>(
+      Queue, T(G), ONEAPI::bit_xor<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORTV>, T>(
+      Queue, T(0), ONEAPI::bit_xor<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDFV>, T>(
+      Queue, T(G), ONEAPI::bit_and<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDTV>, T>(
+      Queue, ~T(0), ONEAPI::bit_and<T>(), true, G, L);
+}
diff --git a/SYCL/SubGroup/reduce_bit_ops.cpp b/SYCL/SubGroup/reduce_bit_ops.cpp
diff --git a/SYCL/SubGroup/reduce_spirv13.cpp b/SYCL/SubGroup/reduce_spirv13.cpp
@@ -0,0 +1,39 @@
+// UNSUPPORTED: cpu
+// #2252 Disable until all variants of built-ins are available in OpenCL CPU
+// runtime for every supported ISA
+
+// UNSUPPORTED: cuda
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test verifies the correct work of SPIR-V 1.3 reduce algorithm
+// used with the operation MUL, bitwise OR, XOR, AND.
+
+#include "reduce.hpp"
+
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  check_mul<class MulA, int>(Queue);
+  check_mul<class MulB, unsigned int>(Queue);
+  check_mul<class MulC, long>(Queue);
+  check_mul<class MulD, unsigned long>(Queue);
+  check_mul<class MulE, float>(Queue);
+
+  check_bit_ops<class A, int>(Queue);
+  check_bit_ops<class B, unsigned int>(Queue);
+  check_bit_ops<class C, unsigned>(Queue);
+  check_bit_ops<class D, long>(Queue);
+  check_bit_ops<class E, unsigned long>(Queue);
+  check_bit_ops<class F, long long>(Queue);
+  check_bit_ops<class G, unsigned long long>(Queue);
+  return 0;
+}
diff --git a/SYCL/SubGroup/reduce_spirv13_fp16.cpp b/SYCL/SubGroup/reduce_spirv13_fp16.cpp
@@ -0,0 +1,20 @@
+// UNSUPPORTED: cuda
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// This test verifies the correct work of SPIR-V 1.3 reduce algorithm
+// used with MUL operation.
+
+#include "reduce.hpp"
+
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check_mul<class MulHalf, cl::sycl::half>(Queue);
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/SubGroup/reduce_spirv13_fp64.cpp b/SYCL/SubGroup/reduce_spirv13_fp64.cpp
@@ -0,0 +1,27 @@
+// UNSUPPORTED: cpu
+// #2252 Disable until all variants of built-ins are available in OpenCL CPU
+// runtime for every supported ISA
+
+// UNSUPPORTED: cuda
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test verifies the correct work of SPIR-V 1.3 reduce algorithm
+// used with MUL operation.
+
+#include "reduce.hpp"
+
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check_mul<class MulDouble, double>(Queue);
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/SubGroup/scan.hpp b/SYCL/SubGroup/scan.hpp
@@ -150,10 +150,10 @@ void check(queue &Queue, size_t G = 256, size_t L = 64) {
         T>(Queue, std::numeric_limits<T>::min(), ONEAPI::maximum<>(), true, G,
            L);
   }
+}
 
-  // Use small sub-groups to avoid overflow effects for int multiply operations
-  // and avoid rounding issues for FP multiply.
-  L = 4;
+template <typename SpecializationKernelName, typename T>
+void check_mul(queue &Queue, size_t G = 256, size_t L = 4) {
   check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulF>, T>(
       Queue, T(L), ONEAPI::multiplies<T>(), false, G, L);
   check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulT>, T>(
@@ -164,3 +164,37 @@ void check(queue &Queue, size_t G = 256, size_t L = 64) {
   check_op<sycl_subgr<SpecializationKernelName, class KernelName_MulTV>, T>(
       Queue, T(1), ONEAPI::multiplies<>(), true, G, L);
 }
+
+template <typename SpecializationKernelName, typename T>
+void check_bit_ops(queue &Queue, size_t G = 256, size_t L = 4) {
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORF>, T>(
+      Queue, T(L), ONEAPI::bit_or<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORT>, T>(
+      Queue, T(0), ONEAPI::bit_or<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORF>, T>(
+      Queue, T(L), ONEAPI::bit_xor<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORT>, T>(
+      Queue, T(0), ONEAPI::bit_xor<T>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDF>, T>(
+      Queue, T(L), ONEAPI::bit_and<T>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDT>, T>(
+      Queue, ~T(0), ONEAPI::bit_and<T>(), true, G, L);
+
+  // Transparent operator functors.
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORFV>, T>(
+      Queue, T(L), ONEAPI::bit_or<>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ORTV>, T>(
+      Queue, T(0), ONEAPI::bit_or<>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORFV>, T>(
+      Queue, T(L), ONEAPI::bit_xor<>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_XORTV>, T>(
+      Queue, T(0), ONEAPI::bit_xor<>(), true, G, L);
+
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDFV>, T>(
+      Queue, T(L), ONEAPI::bit_and<>(), false, G, L);
+  check_op<sycl_subgr<SpecializationKernelName, class KernelName_ANDTV>, T>(
+      Queue, ~T(0), ONEAPI::bit_and<>(), true, G, L);
+}
diff --git a/SYCL/SubGroup/scan_bit_ops.cpp b/SYCL/SubGroup/scan_bit_ops.cpp
diff --git a/SYCL/SubGroup/scan_spirv13.cpp b/SYCL/SubGroup/scan_spirv13.cpp
@@ -0,0 +1,39 @@
+// UNSUPPORTED: cpu
+// #2252 Disable until all variants of built-ins are available in OpenCL CPU
+// runtime for every supported ISA
+
+// UNSUPPORTED: cuda
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %HOST_RUN_PLACEHOLDER %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test verifies the correct work of SPIR-V 1.3 exclusive_scan() and
+// inclusive_scan() algoriths used with the operation MUL, bitwise OR, XOR, AND.
+
+#include "scan.hpp"
+
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check_mul<class MulA, int>(Queue);
+  check_mul<class MulB, unsigned int>(Queue);
+  check_mul<class MulC, long>(Queue);
+  check_mul<class MulD, unsigned long>(Queue);
+  check_mul<class MulE, float>(Queue);
+
+  check_bit_ops<class A, int>(Queue);
+  check_bit_ops<class B, unsigned int>(Queue);
+  check_bit_ops<class C, unsigned>(Queue);
+  check_bit_ops<class D, long>(Queue);
+  check_bit_ops<class E, unsigned long>(Queue);
+  check_bit_ops<class F, long long>(Queue);
+  check_bit_ops<class G, unsigned long long>(Queue);
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/SubGroup/scan_spirv13_fp16.cpp b/SYCL/SubGroup/scan_spirv13_fp16.cpp
@@ -0,0 +1,20 @@
+// UNSUPPORTED: cuda
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// This test verifies the correct work of SPIR-V 1.3 exclusive_scan() and
+// inclusive_scan() algoriths used with the MUL operation.
+
+#include "scan.hpp"
+
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check_mul<class MulHalf, cl::sycl::half>(Queue);
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
diff --git a/SYCL/SubGroup/scan_spirv13_fp64.cpp b/SYCL/SubGroup/scan_spirv13_fp64.cpp