intel
diff --git a/‎sycl/test/sub_group/broadcast.cpp
Lines changed: 3 additions & 57 deletions b/‎sycl/test/sub_group/broadcast.cpp
Lines changed: 3 additions & 57 deletions
diff --git a/‎sycl/test/sub_group/broadcast.hpp
Lines changed: 54 additions & 0 deletions b/‎sycl/test/sub_group/broadcast.hpp
Lines changed: 54 additions & 0 deletions
diff --git a/‎sycl/test/sub_group/broadcast_fp16.cpp
Lines changed: 25 additions & 0 deletions b/‎sycl/test/sub_group/broadcast_fp16.cpp
Lines changed: 25 additions & 0 deletions
diff --git a/‎sycl/test/sub_group/broadcast_fp64.cpp
Lines changed: 29 additions & 0 deletions b/‎sycl/test/sub_group/broadcast_fp64.cpp
Lines changed: 29 additions & 0 deletions
diff --git a/‎sycl/test/sub_group/reduce.cpp
Lines changed: 3 additions & 101 deletions b/‎sycl/test/sub_group/reduce.cpp
Lines changed: 3 additions & 101 deletions
@@ -2,10 +2,9 @@
 // CUDA compilation and runtime do not yet support sub-groups.
 
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -D SG_GPU %s -o %t_gpu.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t_gpu.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
 //==--------- broadcast.cpp - SYCL sub_group broadcast test ----*- C++ -*---==//
@@ -16,52 +15,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "helper.hpp"
-#include <CL/sycl.hpp>
-template <typename T>
-class sycl_subgr;
-using namespace cl::sycl;
-template <typename T>
-void check(queue &Queue) {
-  const int G = 240, L = 60;
-  try {
-    nd_range<1> NdRange(G, L);
-    buffer<T> syclbuf(G);
-    buffer<size_t> sgsizebuf(1);
-    Queue.submit([&](handler &cgh) {
-      auto syclacc = syclbuf.template get_access<access::mode::read_write>(cgh);
-      auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
-      cgh.parallel_for<sycl_subgr<T>>(NdRange, [=](nd_item<1> NdItem) {
-        intel::sub_group SG = NdItem.get_sub_group();
-        /*Broadcast GID of element with SGLID == SGID */
-        syclacc[NdItem.get_global_id()] =
-            broadcast(SG, T(NdItem.get_global_id(0)), SG.get_group_id());
-        if (NdItem.get_global_id(0) == 0)
-          sgsizeacc[0] = SG.get_max_local_range()[0];
-      });
-    });
-    auto syclacc = syclbuf.template get_access<access::mode::read_write>();
-    auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>();
-    size_t sg_size = sgsizeacc[0];
-    if (sg_size == 0)
-      sg_size = L;
-    int WGid = -1, SGid = 0;
-    for (int j = 0; j < G; j++) {
-      if (j % L % sg_size == 0) {
-        SGid++;
-      }
-      if (j % L == 0) {
-        WGid++;
-        SGid = 0;
-      }
-      exit_if_not_equal<T>(syclacc[j], L * WGid + SGid + SGid * sg_size,
-                           "broadcasted value");
-    }
-  } catch (exception e) {
-    std::cout << "SYCL exception caught: " << e.what();
-    exit(1);
-  }
-}
+#include "broadcast.hpp"
+
 int main() {
   queue Queue;
   if (!core_sg_supported(Queue.get_device())) {
@@ -73,15 +28,6 @@ int main() {
   check<long>(Queue);
   check<unsigned long>(Queue);
   check<float>(Queue);
-  // broadcast half type is not supported in OCL CPU RT
-#ifdef SG_GPU
-  if (Queue.get_device().has_extension("cl_khr_fp16")) {
-    check<cl::sycl::half>(Queue);
-  }
-#endif
-  if (Queue.get_device().has_extension("cl_khr_fp64")) {
-    check<double>(Queue);
-  }
   std::cout << "Test passed." << std::endl;
   return 0;
 }
@@ -0,0 +1,82 @@
+//==--------- broadcast.hpp - SYCL sub_group broadcast test ----*- C++ -*---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "helper.hpp"
+#include <CL/sycl.hpp>
+#include <cstdlib>
+#include <iostream>
+
+// Kernel name type passed to parallel_for; it is parameterized on the element
+// type so that each check<T> instantiation uses its own kernel name.
+template <typename T>
+class sycl_subgr;
+
+using namespace cl::sycl;
+
+// Runs the sub-group broadcast on a 1-D nd_range (240 work-items in
+// work-groups of 60) and verifies the result on the host.
+//
+// Every work-item stores broadcast(SG, T(global id), SG.get_group_id()), i.e.
+// the global id of the work-item whose sub-group local id equals the id of
+// its sub-group. Work-item 0 additionally records the maximum sub-group size
+// so that the host can recompute the expected values.
+//
+// Results are compared with exit_if_not_equal(); a caught SYCL
+// exception is printed and the process exits with code 1.
+template <typename T>
+void check(queue &Queue) {
+  const int G = 240, L = 60;
+  try {
+    nd_range<1> NdRange(G, L);
+    buffer<T> syclbuf(G);
+    buffer<size_t> sgsizebuf(1);
+    Queue.submit([&](handler &cgh) {
+      auto syclacc = syclbuf.template get_access<access::mode::read_write>(cgh);
+      auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
+      cgh.parallel_for<sycl_subgr<T>>(NdRange, [=](nd_item<1> NdItem) {
+        intel::sub_group SG = NdItem.get_sub_group();
+        /* Broadcast the global id of the work-item whose sub-group local id
+           equals the sub-group id. */
+        syclacc[NdItem.get_global_id()] =
+            broadcast(SG, T(NdItem.get_global_id(0)), SG.get_group_id());
+        if (NdItem.get_global_id(0) == 0)
+          sgsizeacc[0] = SG.get_max_local_range()[0];
+      });
+    });
+    // Read the results back on the host.
+    auto syclacc = syclbuf.template get_access<access::mode::read_write>();
+    auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>();
+    size_t sg_size = sgsizeacc[0];
+    // Fall back to the work-group size if no sub-group size was reported.
+    if (sg_size == 0)
+      sg_size = L;
+    // Walk the global ids in order, tracking the work-group (WGid) and the
+    // sub-group within it (SGid) that each id belongs to.
+    int WGid = -1, SGid = 0;
+    for (int j = 0; j < G; j++) {
+      if (j % L % sg_size == 0) {
+        SGid++;
+      }
+      if (j % L == 0) {
+        WGid++;
+        SGid = 0;
+      }
+      // L * WGid is the first global id of the work-group, SGid * sg_size the
+      // first local id of the sub-group, and SGid the broadcast source
+      // within that sub-group.
+      exit_if_not_equal<T>(syclacc[j], L * WGid + SGid + SGid * sg_size,
+                           "broadcasted value");
+    }
+  } catch (const exception &e) {
+    // Catch by const reference to avoid copying/slicing the exception.
+    std::cout << "SYCL exception caught: " << e.what();
+    exit(1);
+  }
+}
@@ -0,0 +1,33 @@
+// UNSUPPORTED: cuda
+// CUDA compilation and runtime do not yet support sub-groups.
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+//==--------- broadcast_fp16.cpp - SYCL sub_group broadcast test ----*- C++ -*---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------------===//
+
+#include "broadcast.hpp"
+
+// Runs the sub-group broadcast check for cl::sycl::half. Only the GPU RUN line
+// is used because broadcast of half is not supported in the OCL CPU RT.
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  // half is an optional type: skip devices that do not report cl_khr_fp16.
+  if (!Queue.get_device().has_extension("cl_khr_fp16")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check<cl::sycl::half>(Queue);
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
@@ -0,0 +1,35 @@
+// UNSUPPORTED: cuda
+// CUDA compilation and runtime do not yet support sub-groups.
+
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+
+//==--------- broadcast_fp64.cpp - SYCL sub_group broadcast test ----*- C++ -*---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------------===//
+
+#include "broadcast.hpp"
+
+// Runs the sub-group broadcast check for double.
+int main() {
+  queue Queue;
+  if (!core_sg_supported(Queue.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  // double is an optional type: skip devices that do not report cl_khr_fp64.
+  if (!Queue.get_device().has_extension("cl_khr_fp64")) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+  check<double>(Queue);
+  std::cout << "Test passed." << std::endl;
+  return 0;
+}
@@ -1,13 +1,11 @@
 // UNSUPPORTED: cuda
 // CUDA compilation and runtime do not yet support sub-groups.
 //
-// RUN: %clangxx -fsycl -std=c++14 %s -o %t.out
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple -std=c++14 -D SG_GPU %s -o %t_gpu.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t_gpu.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
-
 //==--------------- reduce.cpp - SYCL sub_group reduce test ----*- C++ -*---==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
@@ -16,115 +14,19 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "helper.hpp"
-#include <CL/sycl.hpp>
-
-template <typename T, class BinaryOperation>
-class sycl_subgr;
-
-using namespace cl::sycl;
-
-template <typename T, class BinaryOperation>
-void check_op(queue &Queue, T init, BinaryOperation op, bool skip_init = false,
-              size_t G = 240, size_t L = 60) {
-  try {
-    nd_range<1> NdRange(G, L);
-    buffer<T> buf(G);
-    buffer<size_t> sgsizebuf(1);
-    Queue.submit([&](handler &cgh) {
-      auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
-      auto acc = buf.template get_access<access::mode::read_write>(cgh);
-      cgh.parallel_for<sycl_subgr<T, BinaryOperation>>(
-          NdRange, [=](nd_item<1> NdItem) {
-            intel::sub_group sg = NdItem.get_sub_group();
-            if (skip_init) {
-              acc[NdItem.get_global_id(0)] =
-                  reduce(sg, T(NdItem.get_global_id(0)), op);
-            } else {
-              acc[NdItem.get_global_id(0)] =
-                  reduce(sg, T(NdItem.get_global_id(0)), init, op);
-            }
-            if (NdItem.get_global_id(0) == 0)
-              sgsizeacc[0] = sg.get_max_local_range()[0];
-          });
-    });
-    auto acc = buf.template get_access<access::mode::read_write>();
-    auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>();
-    size_t sg_size = sgsizeacc[0];
-    int WGid = -1, SGid = 0;
-    T result = init;
-    for (int j = 0; j < G; j++) {
-      if (j % L % sg_size == 0) {
-        SGid++;
-        result = init;
-        for (int i = j; (i % L && i % L % sg_size) || (i == j); i++) {
-          result = op(result, T(i));
-        }
-      }
-      if (j % L == 0) {
-        WGid++;
-        SGid = 0;
-      }
-      std::string name =
-          std::string("reduce_") + typeid(BinaryOperation).name();
-      exit_if_not_equal<T>(acc[j], result, name.c_str());
-    }
-  } catch (exception e) {
-    std::cout << "SYCL exception caught: " << e.what();
-    exit(1);
-  }
-}
-
-template <typename T>
-void check(queue &Queue, size_t G = 240, size_t L = 60) {
-  // limit data range for half to avoid rounding issues
-  if (std::is_same<T, cl::sycl::half>::value) {
-    G = 64;
-    L = 32;
-  }
-
-  check_op<T>(Queue, T(L), intel::plus<T>(), false, G, L);
-  check_op<T>(Queue, T(0), intel::plus<T>(), true, G, L);
-
-  check_op<T>(Queue, T(0), intel::minimum<T>(), false, G, L);
-  check_op<T>(Queue, T(G), intel::minimum<T>(), true, G, L);
-
-  check_op<T>(Queue, T(G), intel::maximum<T>(), false, G, L);
-  check_op<T>(Queue, T(0), intel::maximum<T>(), true, G, L);
-
-#if __cplusplus >= 201402L
-  check_op<T>(Queue, T(L), intel::plus<>(), false, G, L);
-  check_op<T>(Queue, T(0), intel::plus<>(), true, G, L);
-
-  check_op<T>(Queue, T(0), intel::minimum<>(), false, G, L);
-  check_op<T>(Queue, T(G), intel::minimum<>(), true, G, L);
-
-  check_op<T>(Queue, T(G), intel::maximum<>(), false, G, L);
-  check_op<T>(Queue, T(0), intel::maximum<>(), true, G, L);
-#endif
-}
+#include "reduce.hpp"
 
 int main() {
   queue Queue;
   if (!core_sg_supported(Queue.get_device())) {
     std::cout << "Skipping test\n";
     return 0;
   }
-
   check<int>(Queue);
   check<unsigned int>(Queue);
   check<long>(Queue);
   check<unsigned long>(Queue);
   check<float>(Queue);
-  // reduce half type is not supported in OCL CPU RT
-#ifdef SG_GPU
-  if (Queue.get_device().has_extension("cl_khr_fp16")) {
-    check<cl::sycl::half>(Queue);
-  }
-#endif
-  if (Queue.get_device().has_extension("cl_khr_fp64")) {
-    check<double>(Queue);
-  }
   std::cout << "Test passed." << std::endl;
   return 0;
 }