[SYCL][CUDA] Add group algorithms #2647

Merged: 8 commits, Oct 19, 2020
6 changes: 6 additions & 0 deletions libclc/generic/include/spirv/spirv_types.h
@@ -40,4 +40,10 @@ enum FPRoundingMode {
SPV_RTN = 3,
};

enum GroupOperation {
Reduce = 0,
InclusiveScan = 1,
ExclusiveScan = 2,
};

#endif // CLC_SPIRV_TYPES
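The three enumerators mirror the SPIR-V GroupOperation semantics: Reduce combines every work-item's value into a single result, InclusiveScan gives each work-item the combined value of all items up to and including itself, and ExclusiveScan excludes the item's own contribution. A minimal host-side illustration of the three semantics with integer addition (plain C++, for reference only; not part of the patch):

```cpp
#include <array>
#include <cstddef>
#include <cstdio>

int main() {
  // Values contributed by four hypothetical work-items, combined with '+'.
  std::array<int, 4> x = {1, 2, 3, 4};

  int reduce = 0;
  for (int v : x)
    reduce += v;        // Reduce        -> 10 for every item

  std::array<int, 4> incl{}, excl{};
  int running = 0;
  for (std::size_t i = 0; i < x.size(); ++i) {
    excl[i] = running;  // ExclusiveScan -> 0 1 3 6
    running += x[i];
    incl[i] = running;  // InclusiveScan -> 1 3 6 10
  }

  std::printf("reduce=%d incl=%d excl=%d\n", reduce, incl[3], excl[3]);
}
```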
2 changes: 2 additions & 0 deletions libclc/ptx-nvidiacl/libspirv/SOURCES
@@ -83,3 +83,5 @@ workitem/get_sub_group_local_id.cl
workitem/get_sub_group_size.cl
images/image_helpers.ll
images/image.cl
group/collectives_helpers.ll
group/collectives.cl
417 changes: 417 additions & 0 deletions libclc/ptx-nvidiacl/libspirv/group/collectives.cl

Large diffs are not rendered by default.

61 changes: 61 additions & 0 deletions libclc/ptx-nvidiacl/libspirv/group/collectives_helpers.ll
@@ -0,0 +1,61 @@
; 64 storage locations is sufficient for all current-generation NVIDIA GPUs
; 64 bits per warp is sufficient for all fundamental data types
; Reducing storage for small data types or increasing it for user-defined types
; will likely require an additional pass to track group algorithm usage
@__clc__group_scratch = internal addrspace(3) global [64 x i64] undef, align 1

define i8 addrspace(3)* @__clc__get_group_scratch_bool() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
ret i8 addrspace(3)* %cast
}

define i8 addrspace(3)* @__clc__get_group_scratch_char() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
ret i8 addrspace(3)* %cast
}

define i16 addrspace(3)* @__clc__get_group_scratch_short() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to i16 addrspace(3)*
ret i16 addrspace(3)* %cast
}

define i32 addrspace(3)* @__clc__get_group_scratch_int() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to i32 addrspace(3)*
ret i32 addrspace(3)* %cast
}

define i64 addrspace(3)* @__clc__get_group_scratch_long() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to i64 addrspace(3)*
ret i64 addrspace(3)* %cast
}

define half addrspace(3)* @__clc__get_group_scratch_half() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to half addrspace(3)*
ret half addrspace(3)* %cast
}

define float addrspace(3)* @__clc__get_group_scratch_float() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to float addrspace(3)*
ret float addrspace(3)* %cast
}

define double addrspace(3)* @__clc__get_group_scratch_double() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to double addrspace(3)*
ret double addrspace(3)* %cast
}
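The 417-line collectives.cl is collapsed in this view, but the helpers above hint at its shape: values are combined within each warp using shuffle operations, lane 0 of every warp spills its partial result into one of the 64 scratch slots, and the partials are then combined and broadcast. A rough CUDA-style sketch of that pattern for an addition reduction follows; it is illustrative only, assumes the block size is a multiple of 32, and none of the names below come from the actual libclc sources:

```cuda
// Illustrative block-wide sum; NOT the libclc implementation.
__device__ long block_reduce_add(long val) {
  // 64 shared slots, mirroring @__clc__group_scratch in the helpers above.
  __shared__ long scratch[64];

  unsigned lane = threadIdx.x % 32;
  unsigned warp = threadIdx.x / 32;

  // Step 1: reduce within each warp using register shuffles.
  for (int offset = 16; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffff, val, offset);

  // Step 2: lane 0 of each warp publishes its partial result.
  if (lane == 0)
    scratch[warp] = val;
  __syncthreads();

  // Step 3: the first warp combines the per-warp partials.
  unsigned num_warps = blockDim.x / 32;
  if (warp == 0) {
    val = (lane < num_warps) ? scratch[lane] : 0;
    for (int offset = 16; offset > 0; offset /= 2)
      val += __shfl_down_sync(0xffffffff, val, offset);
    if (lane == 0)
      scratch[0] = val;
  }
  __syncthreads();

  // Step 4: every thread reads back the same final value.
  return scratch[0];
}
```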
4 changes: 2 additions & 2 deletions sycl/doc/extensions/README.md
@@ -14,15 +14,15 @@ DPC++ extensions status:
| [SYCL_INTEL_device_specific_kernel_queries](DeviceSpecificKernelQueries/SYCL_INTEL_device_specific_kernel_queries.asciidoc) | Proposal | |
| [SYCL_INTEL_enqueue_barrier](EnqueueBarrier/enqueue_barrier.asciidoc) | Supported(OpenCL, Level Zero) | |
| [SYCL_INTEL_extended_atomics](ExtendedAtomics/SYCL_INTEL_extended_atomics.asciidoc) | Supported(OpenCL: CPU, GPU) | |
| [SYCL_INTEL_group_algorithms](GroupAlgorithms/SYCL_INTEL_group_algorithms.asciidoc) | Supported(OpenCL) | |
| [SYCL_INTEL_group_algorithms](GroupAlgorithms/SYCL_INTEL_group_algorithms.asciidoc) | Supported(OpenCL; CUDA) | |
| [SYCL_INTEL_group_mask](./GroupMask/SYCL_INTEL_group_mask.asciidoc) | Proposal | |
| [FPGA selector](IntelFPGA/FPGASelector.md) | Supported | |
| [FPGA reg](IntelFPGA/FPGAReg.md) | Supported(OpenCL: ACCELERATOR) | |
| [SYCL_INTEL_kernel_restrict_all](KernelRestrictAll/SYCL_INTEL_kernel_restrict_all.asciidoc) | Supported(OpenCL) | |
| [SYCL_INTEL_attribute_style](KernelRHSAttributes/SYCL_INTEL_attribute_style.asciidoc) | Proposal | |
| [Queue Order Properties](OrderedQueue/OrderedQueue_v2.adoc) | Supported | |
| [Queue Shortcuts](QueueShortcuts/QueueShortcuts.adoc) | Supported | |
| [Reductions for ND-Range Parallelism](Reduction/Reduction.md) | Partially supported(OpenCL: CPU, GPU) | Not supported: multiple reduction vars, multi-dimensional reduction vars |
| [Reductions for ND-Range Parallelism](Reduction/Reduction.md) | Partially supported(OpenCL: CPU, GPU; CUDA) | Not supported: multiple reduction vars, multi-dimensional reduction vars |
| [SYCL_INTEL_relax_standard_layout](RelaxStdLayout/SYCL_INTEL_relax_standard_layout.asciidoc) | Supported | |
| [SYCL_INTEL_reqd_work_group_size](ReqdWorkGroupSize/SYCL_INTEL_reqd_work_group_size.asciidoc) | Supported(OpenCL: CPU, GPU) | |
| [SPV_INTEL_function_pointers](SPIRV/SPV_INTEL_function_pointers.asciidoc) | Supported(OpenCL: CPU, GPU; HOST) | |
13 changes: 5 additions & 8 deletions sycl/test/group-algorithm/all_of.cpp
@@ -1,12 +1,10 @@
// UNSUPPORTED: cuda
// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

#include "support.h"
#include <CL/sycl.hpp>
#include <algorithm>
#include <cassert>
@@ -32,7 +30,7 @@ void test(queue q, InputContainer input, OutputContainer output,
Predicate pred) {
typedef class all_of_kernel<Predicate> kernel_name;
size_t N = input.size();
size_t G = 16;
size_t G = 64;
{
buffer<int> in_buf(input.data(), input.size());
buffer<bool> out_buf(output.data(), output.size());
@@ -57,13 +55,12 @@ void test(queue q, InputContainer input, OutputContainer output,

int main() {
queue q;
std::string version = q.get_device().get_info<info::device::version>();
if (version < std::string("2.0")) {
if (!isSupportedDevice(q.get_device())) {
std::cout << "Skipping test\n";
return 0;
}

constexpr int N = 32;
constexpr int N = 128;
std::array<int, N> input;
std::array<bool, 3> output;
std::iota(input.begin(), input.end(), 0);
13 changes: 5 additions & 8 deletions sycl/test/group-algorithm/any_of.cpp
@@ -1,12 +1,10 @@
// UNSUPPORTED: cuda
// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

#include "support.h"
#include <CL/sycl.hpp>
#include <algorithm>
#include <cassert>
@@ -34,7 +32,7 @@ void test(queue q, InputContainer input, OutputContainer output,
typedef typename OutputContainer::value_type OutputT;
typedef class any_of_kernel<Predicate> kernel_name;
size_t N = input.size();
size_t G = 16;
size_t G = 64;
{
buffer<InputT> in_buf(input.data(), input.size());
buffer<OutputT> out_buf(output.data(), output.size());
@@ -59,13 +57,12 @@ void test(queue q, InputContainer input, OutputContainer output,

int main() {
queue q;
std::string version = q.get_device().get_info<info::device::version>();
if (version < std::string("2.0")) {
if (!isSupportedDevice(q.get_device())) {
std::cout << "Skipping test\n";
return 0;
}

constexpr int N = 32;
constexpr int N = 128;
std::array<int, N> input;
std::array<bool, 3> output;
std::iota(input.begin(), input.end(), 0);
7 changes: 2 additions & 5 deletions sycl/test/group-algorithm/broadcast.cpp
@@ -1,12 +1,10 @@
// UNSUPPORTED: cuda
// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

#include "support.h"
#include <CL/sycl.hpp>
#include <algorithm>
#include <cassert>
@@ -46,8 +44,7 @@ void test(queue q, InputContainer input, OutputContainer output) {

int main() {
queue q;
std::string version = q.get_device().get_info<info::device::version>();
if (version < std::string("2.0")) {
if (!isSupportedDevice(q.get_device())) {
std::cout << "Skipping test\n";
return 0;
}
31 changes: 6 additions & 25 deletions sycl/test/group-algorithm/exclusive_scan.cpp
@@ -1,7 +1,4 @@
// UNSUPPORTED: cuda
// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
@@ -13,8 +10,10 @@
// unconditionally. Using operators specific for spirv 1.3 and higher with
// -spirv-max-version=1.1 being set by default causes assert/check fails
// in spirv translator.
// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -o %t13.out
// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -I . -o \
%t13.out

#include "support.h"
#include <CL/sycl.hpp>
#include <algorithm>
#include <cassert>
@@ -57,7 +56,7 @@ void test(queue q, InputContainer input, OutputContainer output,
typedef class exclusive_scan_kernel<SpecializationKernelName, 3> kernel_name3;
OutputT init = 42;
size_t N = input.size();
size_t G = 16;
size_t G = 64;
std::vector<OutputT> expected(N);
{
buffer<InputT> in_buf(input.data(), input.size());
@@ -128,32 +127,14 @@ void test(queue q, InputContainer input, OutputContainer output,
assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
}

bool isSupportedDevice(device D) {
std::string PlatformName = D.get_platform().get_info<info::platform::name>();
if (PlatformName.find("Level-Zero") != std::string::npos)
return true;

if (PlatformName.find("OpenCL") != std::string::npos) {
std::string Version = D.get_info<info::device::version>();
size_t Offset = Version.find("OpenCL");
if (Offset == std::string::npos)
return false;
Version = Version.substr(Offset + 7, 3);
if (Version >= std::string("2.0"))
return true;
}

return false;
}

int main() {
queue q;
if (!isSupportedDevice(q.get_device())) {
std::cout << "Skipping test\n";
return 0;
}

constexpr int N = 32;
constexpr int N = 128;
std::array<int, N> input;
std::array<int, N> output;
std::iota(input.begin(), input.end(), 0);
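The isSupportedDevice helper deleted above (and from the other tests in this PR) moves into the shared support.h picked up via the new -I . flag; that header itself is not rendered here. A hedged reconstruction based on the removed code, with a CUDA branch that the PR presumably adds (the exact contents of support.h may differ):

```cpp
// Hypothetical sketch of sycl/test/group-algorithm/support.h;
// the real header is not shown in this diff.
#pragma once

#include <CL/sycl.hpp>
#include <string>

inline bool isSupportedDevice(cl::sycl::device D) {
  using namespace cl::sycl;
  std::string PlatformName =
      D.get_platform().get_info<info::platform::name>();
  // Assumption: CUDA devices are now accepted, since this PR enables
  // the group algorithms on the CUDA backend.
  if (PlatformName.find("CUDA") != std::string::npos)
    return true;
  if (PlatformName.find("Level-Zero") != std::string::npos)
    return true;
  if (PlatformName.find("OpenCL") != std::string::npos) {
    std::string Version = D.get_info<info::device::version>();
    size_t Offset = Version.find("OpenCL");
    if (Offset == std::string::npos)
      return false;
    Version = Version.substr(Offset + 7, 3);
    return Version >= std::string("2.0");
  }
  return false;
}
```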
31 changes: 6 additions & 25 deletions sycl/test/group-algorithm/inclusive_scan.cpp
@@ -1,7 +1,4 @@
// UNSUPPORTED: cuda
// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
@@ -13,8 +10,10 @@
// unconditionally. Using operators specific for spirv 1.3 and higher with
// -spirv-max-version=1.1 being set by default causes assert/check fails
// in spirv translator.
// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -o %t13.out
// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -I . -o \
%t13.out

#include "support.h"
#include <CL/sycl.hpp>
#include <algorithm>
#include <cassert>
@@ -57,7 +56,7 @@ void test(queue q, InputContainer input, OutputContainer output,
typedef class inclusive_scan_kernel<SpecializationKernelName, 3> kernel_name3;
OutputT init = 42;
size_t N = input.size();
size_t G = 16;
size_t G = 64;
std::vector<OutputT> expected(N);
{
buffer<InputT> in_buf(input.data(), input.size());
@@ -128,32 +127,14 @@ void test(queue q, InputContainer input, OutputContainer output,
assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
}

bool isSupportedDevice(device D) {
std::string PlatformName = D.get_platform().get_info<info::platform::name>();
if (PlatformName.find("Level-Zero") != std::string::npos)
return true;

if (PlatformName.find("OpenCL") != std::string::npos) {
std::string Version = D.get_info<info::device::version>();
size_t Offset = Version.find("OpenCL");
if (Offset == std::string::npos)
return false;
Version = Version.substr(Offset + 7, 3);
if (Version >= std::string("2.0"))
return true;
}

return false;
}

int main() {
queue q;
if (!isSupportedDevice(q.get_device())) {
std::cout << "Skipping test\n";
return 0;
}

constexpr int N = 32;
constexpr int N = 128;
std::array<int, N> input;
std::array<int, N> output;
std::iota(input.begin(), input.end(), 0);
13 changes: 5 additions & 8 deletions sycl/test/group-algorithm/none_of.cpp
@@ -1,12 +1,10 @@
// UNSUPPORTED: cuda
// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

#include "support.h"
#include <CL/sycl.hpp>
#include <algorithm>
#include <cassert>
@@ -32,7 +30,7 @@ void test(queue q, InputContainer input, OutputContainer output,
Predicate pred) {
typedef class none_of_kernel<Predicate> kernel_name;
size_t N = input.size();
size_t G = 16;
size_t G = 64;
{
buffer<int> in_buf(input.data(), input.size());
buffer<bool> out_buf(output.data(), output.size());
@@ -57,13 +55,12 @@ void test(queue q, InputContainer input, OutputContainer output,

int main() {
queue q;
std::string version = q.get_device().get_info<info::device::version>();
if (version < std::string("2.0")) {
if (!isSupportedDevice(q.get_device())) {
std::cout << "Skipping test\n";
return 0;
}

constexpr int N = 32;
constexpr int N = 128;
std::array<int, N> input;
std::array<bool, 3> output;
std::iota(input.begin(), input.end(), 0);