[SYCL][CUDA] Add group algorithms #2647

Merged: 8 commits, Oct 19, 2020
6 changes: 6 additions & 0 deletions libclc/generic/include/spirv/spirv_types.h
@@ -40,4 +40,10 @@ enum FPRoundingMode {
SPV_RTN = 3,
};

enum GroupOperation {
Reduce = 0,
InclusiveScan = 1,
ExclusiveScan = 2,
};

#endif // CLC_SPIRV_TYPES
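The three enumerators mirror the SPIR-V GroupOperation semantics: Reduce combines every work-item's value into a single result, InclusiveScan gives each work-item the combined value of all items up to and including itself, and ExclusiveScan excludes the item's own contribution. A minimal host-side illustration of the three semantics with integer addition (plain C++, for reference only; not part of the patch):

```cpp
#include <array>
#include <cstddef>
#include <cstdio>

int main() {
  // Values contributed by four hypothetical work-items, combined with '+'.
  std::array<int, 4> x = {1, 2, 3, 4};

  int reduce = 0;
  for (int v : x)
    reduce += v;        // Reduce        -> 10 for every item

  std::array<int, 4> incl{}, excl{};
  int running = 0;
  for (std::size_t i = 0; i < x.size(); ++i) {
    excl[i] = running;  // ExclusiveScan -> 0 1 3 6
    running += x[i];
    incl[i] = running;  // InclusiveScan -> 1 3 6 10
  }

  std::printf("reduce=%d incl=%d excl=%d\n", reduce, incl[3], excl[3]);
}
```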
2 changes: 2 additions & 0 deletions libclc/ptx-nvidiacl/libspirv/SOURCES
@@ -83,3 +83,5 @@ workitem/get_sub_group_local_id.cl
workitem/get_sub_group_size.cl
images/image_helpers.ll
images/image.cl
group/collectives_helpers.ll
group/collectives.cl
417 changes: 417 additions & 0 deletions libclc/ptx-nvidiacl/libspirv/group/collectives.cl

Large diffs are not rendered by default.

61 changes: 61 additions & 0 deletions libclc/ptx-nvidiacl/libspirv/group/collectives_helpers.ll
@@ -0,0 +1,61 @@
; 64 storage locations is sufficient for all current-generation NVIDIA GPUs
; 64 bits per warp is sufficient for all fundamental data types
; Reducing storage for small data types or increasing it for user-defined types
; will likely require an additional pass to track group algorithm usage
@__clc__group_scratch = internal addrspace(3) global [64 x i64] undef, align 1

define i8 addrspace(3)* @__clc__get_group_scratch_bool() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
ret i8 addrspace(3)* %cast
}

define i8 addrspace(3)* @__clc__get_group_scratch_char() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
ret i8 addrspace(3)* %cast
}

define i16 addrspace(3)* @__clc__get_group_scratch_short() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to i16 addrspace(3)*
ret i16 addrspace(3)* %cast
}

define i32 addrspace(3)* @__clc__get_group_scratch_int() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to i32 addrspace(3)*
ret i32 addrspace(3)* %cast
}

define i64 addrspace(3)* @__clc__get_group_scratch_long() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to i64 addrspace(3)*
ret i64 addrspace(3)* %cast
}

define half addrspace(3)* @__clc__get_group_scratch_half() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to half addrspace(3)*
ret half addrspace(3)* %cast
}

define float addrspace(3)* @__clc__get_group_scratch_float() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to float addrspace(3)*
ret float addrspace(3)* %cast
}

define double addrspace(3)* @__clc__get_group_scratch_double() nounwind alwaysinline {
entry:
%ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
%cast = bitcast i64 addrspace(3)* %ptr to double addrspace(3)*
ret double addrspace(3)* %cast
}
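The 417-line collectives.cl is collapsed in this view, but the helpers above hint at its shape: values are combined within each warp using shuffle operations, lane 0 of every warp spills its partial result into one of the 64 scratch slots, and the partials are then combined and broadcast. A rough CUDA-style sketch of that pattern for an addition reduction follows; it is illustrative only, assumes the block size is a multiple of 32, and none of the names below come from the actual libclc sources:

```cuda
// Illustrative block-wide sum; NOT the libclc implementation.
__device__ long block_reduce_add(long val) {
  // 64 shared slots, mirroring @__clc__group_scratch in the helpers above.
  __shared__ long scratch[64];

  unsigned lane = threadIdx.x % 32;
  unsigned warp = threadIdx.x / 32;

  // Step 1: reduce within each warp using register shuffles.
  for (int offset = 16; offset > 0; offset /= 2)
    val += __shfl_down_sync(0xffffffff, val, offset);

  // Step 2: lane 0 of each warp publishes its partial result.
  if (lane == 0)
    scratch[warp] = val;
  __syncthreads();

  // Step 3: the first warp combines the per-warp partials.
  unsigned num_warps = blockDim.x / 32;
  if (warp == 0) {
    val = (lane < num_warps) ? scratch[lane] : 0;
    for (int offset = 16; offset > 0; offset /= 2)
      val += __shfl_down_sync(0xffffffff, val, offset);
    if (lane == 0)
      scratch[0] = val;
  }
  __syncthreads();

  // Step 4: every thread reads back the same final value.
  return scratch[0];
}
```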
4 changes: 2 additions & 2 deletions sycl/doc/extensions/README.md
@@ -14,15 +14,15 @@ DPC++ extensions status:
| [SYCL_INTEL_device_specific_kernel_queries](DeviceSpecificKernelQueries/SYCL_INTEL_device_specific_kernel_queries.asciidoc) | Proposal | |
| [SYCL_INTEL_enqueue_barrier](EnqueueBarrier/enqueue_barrier.asciidoc) | Supported(OpenCL, Level Zero) | |
| [SYCL_INTEL_extended_atomics](ExtendedAtomics/SYCL_INTEL_extended_atomics.asciidoc) | Supported(OpenCL: CPU, GPU) | |
| [SYCL_INTEL_group_algorithms](GroupAlgorithms/SYCL_INTEL_group_algorithms.asciidoc) | Supported(OpenCL) | |
| [SYCL_INTEL_group_algorithms](GroupAlgorithms/SYCL_INTEL_group_algorithms.asciidoc) | Supported(OpenCL; CUDA) | |
| [SYCL_INTEL_group_mask](./GroupMask/SYCL_INTEL_group_mask.asciidoc) | Proposal | |
| [FPGA selector](IntelFPGA/FPGASelector.md) | Supported | |
| [FPGA reg](IntelFPGA/FPGAReg.md) | Supported(OpenCL: ACCELERATOR) | |
| [SYCL_INTEL_kernel_restrict_all](KernelRestrictAll/SYCL_INTEL_kernel_restrict_all.asciidoc) | Supported(OpenCL) | |
| [SYCL_INTEL_attribute_style](KernelRHSAttributes/SYCL_INTEL_attribute_style.asciidoc) | Proposal | |
| [Queue Order Properties](OrderedQueue/OrderedQueue_v2.adoc) | Supported | |
| [Queue Shortcuts](QueueShortcuts/QueueShortcuts.adoc) | Supported | |
| [Reductions for ND-Range Parallelism](Reduction/Reduction.md) | Partially supported(OpenCL: CPU, GPU) | Not supported: multiple reduction vars, multi-dimensional reduction vars |
| [Reductions for ND-Range Parallelism](Reduction/Reduction.md) | Partially supported(OpenCL: CPU, GPU; CUDA) | Not supported: multiple reduction vars, multi-dimensional reduction vars |
| [SYCL_INTEL_relax_standard_layout](RelaxStdLayout/SYCL_INTEL_relax_standard_layout.asciidoc) | Supported | |
| [SYCL_INTEL_reqd_work_group_size](ReqdWorkGroupSize/SYCL_INTEL_reqd_work_group_size.asciidoc) | Supported(OpenCL: CPU, GPU) | |
| [SPV_INTEL_function_pointers](SPIRV/SPV_INTEL_function_pointers.asciidoc) | Supported(OpenCL: CPU, GPU; HOST) | |
13 changes: 5 additions & 8 deletions sycl/test/group-algorithm/all_of.cpp
@@ -1,12 +1,10 @@
// UNSUPPORTED: cuda
// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

#include "support.h"
#include <CL/sycl.hpp>
#include <algorithm>
#include <cassert>
@@ -32,7 +30,7 @@ void test(queue q, InputContainer input, OutputContainer output,
Predicate pred) {
typedef class all_of_kernel<Predicate> kernel_name;
size_t N = input.size();
size_t G = 16;
size_t G = 64;
{
buffer<int> in_buf(input.data(), input.size());
buffer<bool> out_buf(output.data(), output.size());
@@ -57,13 +55,12 @@ void test(queue q, InputContainer input, OutputContainer output,

int main() {
queue q;
std::string version = q.get_device().get_info<info::device::version>();
if (version < std::string("2.0")) {
if (!isSupportedDevice(q.get_device())) {
std::cout << "Skipping test\n";
return 0;
}

constexpr int N = 32;
constexpr int N = 128;
std::array<int, N> input;
std::array<bool, 3> output;
std::iota(input.begin(), input.end(), 0);
13 changes: 5 additions & 8 deletions sycl/test/group-algorithm/any_of.cpp
@@ -1,12 +1,10 @@
// UNSUPPORTED: cuda
// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

#include "support.h"
#include <CL/sycl.hpp>
#include <algorithm>
#include <cassert>
@@ -34,7 +32,7 @@ void test(queue q, InputContainer input, OutputContainer output,
typedef typename OutputContainer::value_type OutputT;
typedef class any_of_kernel<Predicate> kernel_name;
size_t N = input.size();
size_t G = 16;
size_t G = 64;
{
buffer<InputT> in_buf(input.data(), input.size());
buffer<OutputT> out_buf(output.data(), output.size());
@@ -59,13 +57,12 @@ void test(queue q, InputContainer input, OutputContainer output,

int main() {
queue q;
std::string version = q.get_device().get_info<info::device::version>();
if (version < std::string("2.0")) {
if (!isSupportedDevice(q.get_device())) {
std::cout << "Skipping test\n";
return 0;
}

constexpr int N = 32;
constexpr int N = 128;
std::array<int, N> input;
std::array<bool, 3> output;
std::iota(input.begin(), input.end(), 0);
7 changes: 2 additions & 5 deletions sycl/test/group-algorithm/broadcast.cpp
@@ -1,12 +1,10 @@
// UNSUPPORTED: cuda
// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

#include "support.h"
#include <CL/sycl.hpp>
#include <algorithm>
#include <cassert>
@@ -46,8 +44,7 @@ void test(queue q, InputContainer input, OutputContainer output) {

int main() {
queue q;
std::string version = q.get_device().get_info<info::device::version>();
if (version < std::string("2.0")) {
if (!isSupportedDevice(q.get_device())) {
std::cout << "Skipping test\n";
return 0;
}
31 changes: 6 additions & 25 deletions sycl/test/group-algorithm/exclusive_scan.cpp
@@ -1,7 +1,4 @@
// UNSUPPORTED: cuda
// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
@@ -13,8 +10,10 @@
// unconditionally. Using operators specific for spirv 1.3 and higher with
// -spirv-max-version=1.1 being set by default causes assert/check fails
// in spirv translator.
// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -o %t13.out
// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -I . -o \
%t13.out

#include "support.h"
#include <CL/sycl.hpp>
#include <algorithm>
#include <cassert>
@@ -57,7 +56,7 @@ void test(queue q, InputContainer input, OutputContainer output,
typedef class exclusive_scan_kernel<SpecializationKernelName, 3> kernel_name3;
OutputT init = 42;
size_t N = input.size();
size_t G = 16;
size_t G = 64;
std::vector<OutputT> expected(N);
{
buffer<InputT> in_buf(input.data(), input.size());
@@ -128,32 +127,14 @@ void test(queue q, InputContainer input, OutputContainer output,
assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
}

bool isSupportedDevice(device D) {
std::string PlatformName = D.get_platform().get_info<info::platform::name>();
if (PlatformName.find("Level-Zero") != std::string::npos)
return true;

if (PlatformName.find("OpenCL") != std::string::npos) {
std::string Version = D.get_info<info::device::version>();
size_t Offset = Version.find("OpenCL");
if (Offset == std::string::npos)
return false;
Version = Version.substr(Offset + 7, 3);
if (Version >= std::string("2.0"))
return true;
}

return false;
}

int main() {
queue q;
if (!isSupportedDevice(q.get_device())) {
std::cout << "Skipping test\n";
return 0;
}

constexpr int N = 32;
constexpr int N = 128;
std::array<int, N> input;
std::array<int, N> output;
std::iota(input.begin(), input.end(), 0);
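The isSupportedDevice helper deleted above (and from the other tests in this PR) moves into the shared support.h picked up via the new -I . flag; that header itself is not rendered here. A hedged reconstruction based on the removed code, with a CUDA branch that the PR presumably adds (the exact contents of support.h may differ):

```cpp
// Hypothetical sketch of sycl/test/group-algorithm/support.h;
// the real header is not shown in this diff.
#pragma once

#include <CL/sycl.hpp>
#include <string>

inline bool isSupportedDevice(cl::sycl::device D) {
  using namespace cl::sycl;
  std::string PlatformName =
      D.get_platform().get_info<info::platform::name>();
  // Assumption: CUDA devices are now accepted, since this PR enables
  // the group algorithms on the CUDA backend.
  if (PlatformName.find("CUDA") != std::string::npos)
    return true;
  if (PlatformName.find("Level-Zero") != std::string::npos)
    return true;
  if (PlatformName.find("OpenCL") != std::string::npos) {
    std::string Version = D.get_info<info::device::version>();
    size_t Offset = Version.find("OpenCL");
    if (Offset == std::string::npos)
      return false;
    Version = Version.substr(Offset + 7, 3);
    return Version >= std::string("2.0");
  }
  return false;
}
```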
31 changes: 6 additions & 25 deletions sycl/test/group-algorithm/inclusive_scan.cpp
@@ -1,7 +1,4 @@
// UNSUPPORTED: cuda
// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
@@ -13,8 +10,10 @@
// unconditionally. Using operators specific for spirv 1.3 and higher with
// -spirv-max-version=1.1 being set by default causes assert/check fails
// in spirv translator.
// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -o %t13.out
// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -I . -o \
%t13.out

#include "support.h"
#include <CL/sycl.hpp>
#include <algorithm>
#include <cassert>
@@ -57,7 +56,7 @@ void test(queue q, InputContainer input, OutputContainer output,
typedef class inclusive_scan_kernel<SpecializationKernelName, 3> kernel_name3;
OutputT init = 42;
size_t N = input.size();
size_t G = 16;
size_t G = 64;
std::vector<OutputT> expected(N);
{
buffer<InputT> in_buf(input.data(), input.size());
@@ -128,32 +127,14 @@ void test(queue q, InputContainer input, OutputContainer output,
assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
}

bool isSupportedDevice(device D) {
std::string PlatformName = D.get_platform().get_info<info::platform::name>();
if (PlatformName.find("Level-Zero") != std::string::npos)
return true;

if (PlatformName.find("OpenCL") != std::string::npos) {
std::string Version = D.get_info<info::device::version>();
size_t Offset = Version.find("OpenCL");
if (Offset == std::string::npos)
return false;
Version = Version.substr(Offset + 7, 3);
if (Version >= std::string("2.0"))
return true;
}

return false;
}

int main() {
queue q;
if (!isSupportedDevice(q.get_device())) {
std::cout << "Skipping test\n";
return 0;
}

constexpr int N = 32;
constexpr int N = 128;
std::array<int, N> input;
std::array<int, N> output;
std::iota(input.begin(), input.end(), 0);
13 changes: 5 additions & 8 deletions sycl/test/group-algorithm/none_of.cpp
@@ -1,12 +1,10 @@
// UNSUPPORTED: cuda
// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
//
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
// RUN: env SYCL_DEVICE_TYPE=HOST %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out
// RUN: %ACC_RUN_PLACEHOLDER %t.out

#include "support.h"
#include <CL/sycl.hpp>
#include <algorithm>
#include <cassert>
@@ -32,7 +30,7 @@ void test(queue q, InputContainer input, OutputContainer output,
Predicate pred) {
typedef class none_of_kernel<Predicate> kernel_name;
size_t N = input.size();
size_t G = 16;
size_t G = 64;
{
buffer<int> in_buf(input.data(), input.size());
buffer<bool> out_buf(output.data(), output.size());
@@ -57,13 +55,12 @@ void test(queue q, InputContainer input, OutputContainer output,

int main() {
queue q;
std::string version = q.get_device().get_info<info::device::version>();
if (version < std::string("2.0")) {
if (!isSupportedDevice(q.get_device())) {
std::cout << "Skipping test\n";
return 0;
}

constexpr int N = 32;
constexpr int N = 128;
std::array<int, N> input;
std::array<bool, 3> output;
std::iota(input.begin(), input.end(), 0);