intel · bader · Aug 5, 2020 · Aug 4, 2020 · Aug 4, 2020 · Aug 4, 2020
@@ -36,7 +36,7 @@
 config.test_exec_root = os.path.join(config.sycl_obj_root, 'test')
 
 # Propagate some variables from the host environment.
-llvm_config.with_system_environment(['PATH', 'OCL_ICD_FILENAME', 'SYCL_DEVICE_ALLOWLIST', 'SYCL_CONFIG_FILE_NAME'])
+llvm_config.with_system_environment(['PATH', 'OCL_ICD_FILENAMES', 'SYCL_DEVICE_ALLOWLIST', 'SYCL_CONFIG_FILE_NAME'])
 
 # Configure LD_LIBRARY_PATH or corresponding os-specific alternatives
 if platform.system() == "Linux":

@@ -8,12 +8,10 @@
 
 #include "helper.hpp"
 #include <CL/sycl.hpp>
-template <typename T>
-class sycl_subgr;
+template <typename T> class sycl_subgr;
 using namespace cl::sycl;
-template <typename T>
-void check(queue &Queue) {
-  const int G = 240, L = 60;
+template <typename T> void check(queue &Queue) {
+  const int G = 256, L = 64;
   try {
     nd_range<1> NdRange(G, L);
     buffer<T> syclbuf(G);
@@ -23,9 +21,10 @@ void check(queue &Queue) {
       auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>(cgh);
       cgh.parallel_for<sycl_subgr<T>>(NdRange, [=](nd_item<1> NdItem) {
         intel::sub_group SG = NdItem.get_sub_group();
-        /*Broadcast GID of element with SGLID == SGID */
+        /*Broadcast GID of element with SGLID == SGID % SGMLR*/
         syclacc[NdItem.get_global_id()] =
-            broadcast(SG, T(NdItem.get_global_id(0)), SG.get_group_id());
+            broadcast(SG, T(NdItem.get_global_id(0)),
+                      SG.get_group_id() % SG.get_max_local_range()[0]);
         if (NdItem.get_global_id(0) == 0)
           sgsizeacc[0] = SG.get_max_local_range()[0];
       });
@@ -44,7 +43,8 @@ void check(queue &Queue) {
         WGid++;
         SGid = 0;
       }
-      exit_if_not_equal<T>(syclacc[j], L * WGid + SGid + SGid * sg_size,
+      exit_if_not_equal<T>(syclacc[j],
+                           L * WGid + SGid % sg_size + SGid * sg_size,
                            "broadcasted value");
     }
   } catch (exception e) {

@@ -1,6 +1,5 @@
-// UNSUPPORTED: cuda || cpu
+// UNSUPPORTED: cuda
 // CUDA compilation and runtime do not yet support sub-groups.
-// #2245 failed on OpenCL CPU (2020.10.7.0.15) with avx2 instruction set
 //
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
@@ -24,7 +23,7 @@ template <typename T> class pointer_kernel;
 using namespace cl::sycl;
 
 template <typename T>
-void check_pointer(queue &Queue, size_t G = 240, size_t L = 60) {
+void check_pointer(queue &Queue, size_t G = 256, size_t L = 64) {
   try {
     nd_range<1> NdRange(G, L);
     buffer<T *> buf(G);
@@ -118,7 +117,7 @@ void check_pointer(queue &Queue, size_t G = 240, size_t L = 60) {
 }
 
 template <typename T, typename Generator>
-void check_struct(queue &Queue, Generator &Gen, size_t G = 240, size_t L = 60) {
+void check_struct(queue &Queue, Generator &Gen, size_t G = 256, size_t L = 64) {
 
   // Fill a vector with values that will be shuffled
   std::vector<T> values(G);

@@ -1,5 +1,7 @@
-// UNSUPPORTED: cuda
+// UNSUPPORTED: cuda || cpu
 // CUDA compilation and runtime do not yet support sub-groups.
+// #2252 Disable until all variants of built-ins are available in OpenCL CPU
+// runtime for every supported ISA
 //
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out

@@ -1,5 +1,7 @@
-// UNSUPPORTED: cuda
+// UNSUPPORTED: cuda || cpu
 // CUDA compilation and runtime do not yet support sub-groups.
+// #2252 Disable until all variants of built-ins are available in OpenCL CPU
+// runtime for every supported ISA
 //
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out

@@ -9,14 +9,13 @@
 #include "helper.hpp"
 #include <CL/sycl.hpp>
 
-template <typename T, class BinaryOperation>
-class sycl_subgr;
+template <typename T, class BinaryOperation> class sycl_subgr;
 
 using namespace cl::sycl;
 
 template <typename T, class BinaryOperation>
 void check_op(queue &Queue, T init, BinaryOperation op, bool skip_init = false,
-              size_t G = 240, size_t L = 60) {
+              size_t G = 256, size_t L = 64) {
   try {
     nd_range<1> NdRange(G, L);
     buffer<T> buf(G);
@@ -65,8 +64,7 @@ void check_op(queue &Queue, T init, BinaryOperation op, bool skip_init = false,
   }
 }
 
-template <typename T>
-void check(queue &Queue, size_t G = 240, size_t L = 60) {
+template <typename T> void check(queue &Queue, size_t G = 256, size_t L = 64) {
   // limit data range for half to avoid rounding issues
   if (std::is_same<T, cl::sycl::half>::value) {
     G = 64;

@@ -1,5 +1,7 @@
-// UNSUPPORTED: cuda
+// UNSUPPORTED: cuda || cpu
 // CUDA compilation and runtime do not yet support sub-groups.
+// #2252 Disable until all variants of built-ins are available in OpenCL CPU
+// runtime for every supported ISA
 //
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out

@@ -1,5 +1,7 @@
-// UNSUPPORTED: cuda
+// UNSUPPORTED: cuda || cpu
 // CUDA compilation and runtime do not yet support sub-groups.
+// #2252 Disable until all variants of built-ins are available in OpenCL CPU
+// runtime for every supported ISA
 //
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out

@@ -10,14 +10,13 @@
 #include <CL/sycl.hpp>
 #include <limits>
 
-template <typename T, class BinaryOperation>
-class sycl_subgr;
+template <typename T, class BinaryOperation> class sycl_subgr;
 
 using namespace cl::sycl;
 
 template <typename T, class BinaryOperation>
 void check_op(queue &Queue, T init, BinaryOperation op, bool skip_init = false,
-              size_t G = 120, size_t L = 60) {
+              size_t G = 256, size_t L = 64) {
   try {
     nd_range<1> NdRange(G, L);
     buffer<T> exbuf(G), inbuf(G);
@@ -73,8 +72,7 @@ void check_op(queue &Queue, T init, BinaryOperation op, bool skip_init = false,
   }
 }
 
-template <typename T>
-void check(queue &Queue, size_t G = 120, size_t L = 60) {
+template <typename T> void check(queue &Queue, size_t G = 256, size_t L = 64) {
   // limit data range for half to avoid rounding issues
   if (std::is_same<T, cl::sycl::half>::value) {
     G = 64;

@@ -1,5 +1,7 @@
-// UNSUPPORTED: cuda
+// UNSUPPORTED: cuda || cpu
 // CUDA compilation and runtime do not yet support sub-groups.
+// #2252 Disable until all variants of built-ins are available in OpenCL CPU
+// runtime for every supported ISA
 //
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out

@@ -17,23 +17,15 @@ using namespace cl::sycl;
 struct wa_half;
 
 template <typename T, int N>
-void check(queue &Queue, size_t G = 240, size_t L = 60) {
+void check(queue &Queue, size_t G = 256, size_t L = 64) {
   try {
     nd_range<1> NdRange(G, L);
-    buffer<vec<T, N>> buf2(G);
-    buffer<vec<T, N>> buf2_up(G);
-    buffer<vec<T, N>> buf2_down(G);
     buffer<vec<T, N>> buf(G);
     buffer<vec<T, N>> buf_up(G);
     buffer<vec<T, N>> buf_down(G);
     buffer<vec<T, N>> buf_xor(G);
     buffer<size_t> sgsizebuf(1);
     Queue.submit([&](handler &cgh) {
-      auto acc2 = buf2.template get_access<access::mode::read_write>(cgh);
-      auto acc2_up = buf2_up.template get_access<access::mode::read_write>(cgh);
-      auto acc2_down =
-          buf2_down.template get_access<access::mode::read_write>(cgh);
-
       auto acc = buf.template get_access<access::mode::read_write>(cgh);
       auto acc_up = buf_up.template get_access<access::mode::read_write>(cgh);
       auto acc_down =
@@ -48,15 +40,6 @@ void check(queue &Queue, size_t G = 240, size_t L = 60) {
         vec<T, N> vwggid(wggid), vsgid(sgid);
         if (wggid == 0)
           sgsizeacc[0] = SG.get_max_local_range()[0];
-        /* 1 for odd subgroups and 2 for even*/
-        acc2[NdItem.get_global_id()] =
-            SG.shuffle(vec<T, N>(1), vec<T, N>(2),
-                       (sgid % 2) ? 1 : SG.get_max_local_range()[0]);
-        /* GID-SGID */
-        acc2_up[NdItem.get_global_id()] = SG.shuffle_up(vwggid, vwggid, sgid);
-        /* GID-SGID or SGLID if GID+SGID > SGsize*/
-        acc2_down[NdItem.get_global_id()] =
-            SG.shuffle_down(vwggid, vec<T, N>(SG.get_local_id().get(0)), sgid);
 
         /*GID of middle element in every subgroup*/
         acc[NdItem.get_global_id()] =
@@ -73,9 +56,6 @@ void check(queue &Queue, size_t G = 240, size_t L = 60) {
     auto acc = buf.template get_access<access::mode::read_write>();
     auto acc_up = buf_up.template get_access<access::mode::read_write>();
     auto acc_down = buf_down.template get_access<access::mode::read_write>();
-    auto acc2 = buf2.template get_access<access::mode::read_write>();
-    auto acc2_up = buf2_up.template get_access<access::mode::read_write>();
-    auto acc2_down = buf2_down.template get_access<access::mode::read_write>();
     auto acc_xor = buf_xor.template get_access<access::mode::read_write>();
     auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>();
 
@@ -98,28 +78,15 @@ void check(queue &Queue, size_t G = 240, size_t L = 60) {
       exit_if_not_equal_vec<T, N>(
           acc[j], vec<T, N>(j / L * L + SGid * sg_size + sg_size / 2),
           "shuffle");
-      /* 1 for odd subgroups and 2 for even*/
-      exit_if_not_equal_vec<T, N>(acc2[j], vec<T, N>((SGid % 2) ? 1 : 2),
-                                  "shuffle2");
       /* Value GID+SGID for all element except last SGID in SG*/
       if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) {
-        exit_if_not_equal_vec(acc_down[j], vec<T, N>(j + SGid), "shuffle_down");
-        exit_if_not_equal_vec(acc2_down[j], vec<T, N>(j + SGid),
-                              "shuffle2_down");
-      } else {                /* SGLID for GID+SGid */
-        if (j % L + SGid < L) /* Do not go out  LG*/
-          exit_if_not_equal_vec<T, N>(acc2_down[j],
-                                      vec<T, N>((j + SGid) % L % sg_size),
-                                      "shuffle2_down");
+        exit_if_not_equal_vec(acc_down[j], vec<T, N>(j + SGid % sg_size),
+                              "shuffle_down");
       }
       /* Value GID-SGID for all element except first SGID in SG*/
       if (j % L % sg_size >= SGid) {
-        exit_if_not_equal_vec(acc_up[j], vec<T, N>(j - SGid), "shuffle_up");
-        exit_if_not_equal_vec(acc2_up[j], vec<T, N>(j - SGid), "shuffle2_up");
-      } else {                          /* SGLID for GID-SGid */
-        if (j % L - SGid + sg_size < L) /* Do not go out  LG*/
-          exit_if_not_equal_vec(acc2_up[j], vec<T, N>(j - SGid + sg_size),
-                                "shuffle2_up");
+        exit_if_not_equal_vec(acc_up[j], vec<T, N>(j - SGid % sg_size),
+                              "shuffle_up");
       }
       /* Value GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */
       exit_if_not_equal_vec(acc_xor[j],
@@ -133,23 +100,15 @@ void check(queue &Queue, size_t G = 240, size_t L = 60) {
   }
 }
 
-template <typename T> void check(queue &Queue, size_t G = 240, size_t L = 60) {
+template <typename T> void check(queue &Queue, size_t G = 256, size_t L = 64) {
   try {
     nd_range<1> NdRange(G, L);
-    buffer<T> buf2(G);
-    buffer<T> buf2_up(G);
-    buffer<T> buf2_down(G);
     buffer<T> buf(G);
     buffer<T> buf_up(G);
     buffer<T> buf_down(G);
     buffer<T> buf_xor(G);
     buffer<size_t> sgsizebuf(1);
     Queue.submit([&](handler &cgh) {
-      auto acc2 = buf2.template get_access<access::mode::read_write>(cgh);
-      auto acc2_up = buf2_up.template get_access<access::mode::read_write>(cgh);
-      auto acc2_down =
-          buf2_down.template get_access<access::mode::read_write>(cgh);
-
       auto acc = buf.template get_access<access::mode::read_write>(cgh);
       auto acc_up = buf_up.template get_access<access::mode::read_write>(cgh);
       auto acc_down =
@@ -163,14 +122,6 @@ template <typename T> void check(queue &Queue, size_t G = 240, size_t L = 60) {
         uint32_t sgid = SG.get_group_id().get(0);
         if (wggid == 0)
           sgsizeacc[0] = SG.get_max_local_range()[0];
-        /* 1 for odd subgroups and 2 for even*/
-        acc2[NdItem.get_global_id()] =
-            SG.shuffle<T>(1, 2, (sgid % 2) ? 1 : SG.get_max_local_range()[0]);
-        /* GID-SGID */
-        acc2_up[NdItem.get_global_id()] = SG.shuffle_up<T>(wggid, wggid, sgid);
-        /* GID-SGID or SGLID if GID+SGID > SGsize*/
-        acc2_down[NdItem.get_global_id()] =
-            SG.shuffle_down<T>(wggid, SG.get_local_id().get(0), sgid);
 
         /*GID of middle element in every subgroup*/
         acc[NdItem.get_global_id()] =
@@ -187,9 +138,6 @@ template <typename T> void check(queue &Queue, size_t G = 240, size_t L = 60) {
     auto acc = buf.template get_access<access::mode::read_write>();
     auto acc_up = buf_up.template get_access<access::mode::read_write>();
     auto acc_down = buf_down.template get_access<access::mode::read_write>();
-    auto acc2 = buf2.template get_access<access::mode::read_write>();
-    auto acc2_up = buf2_up.template get_access<access::mode::read_write>();
-    auto acc2_down = buf2_down.template get_access<access::mode::read_write>();
     auto acc_xor = buf_xor.template get_access<access::mode::read_write>();
     auto sgsizeacc = sgsizebuf.get_access<access::mode::read_write>();
 
@@ -212,24 +160,13 @@ template <typename T> void check(queue &Queue, size_t G = 240, size_t L = 60) {
       /*GID of middle element in every subgroup*/
       exit_if_not_equal<T>(acc[j], j / L * L + SGid * sg_size + sg_size / 2,
                            "shuffle");
-      /* 1 for odd subgroups and 2 for even*/
-      exit_if_not_equal<T>(acc2[j], (SGid % 2) ? 1 : 2, "shuffle2");
       /* Value GID+SGID for all element except last SGID in SG*/
       if (j % L % sg_size + SGid < sg_size && j % L + SGid < L) {
         exit_if_not_equal<T>(acc_down[j], j + SGid, "shuffle_down");
-        exit_if_not_equal<T>(acc2_down[j], j + SGid, "shuffle2_down");
-      } else {                /* SGLID for GID+SGid */
-        if (j % L + SGid < L) /* Do not go out  LG*/
-          exit_if_not_equal<T>(acc2_down[j], (j + SGid) % L % sg_size,
-                               "shuffle2_down");
       }
       /* Value GID-SGID for all element except first SGID in SG*/
       if (j % L % sg_size >= SGid) {
         exit_if_not_equal<T>(acc_up[j], j - SGid, "shuffle_up");
-        exit_if_not_equal<T>(acc2_up[j], j - SGid, "shuffle2_up");
-      } else {                          /* SGLID for GID-SGid */
-        if (j % L - SGid + sg_size < L) /* Do not go out  LG*/
-          exit_if_not_equal<T>(acc2_up[j], j - SGid + sg_size, "shuffle2_up");
       }
       /* Value GID with SGLID = ( SGLID XOR SGID ) % SGMaxSize */
       exit_if_not_equal<T>(acc_xor[j], SGBeginGid + (SGLid ^ (SGid % sg_size)),