
Commit 909459b

[SYCL][CUDA] Add group algorithms (#2647)
* [SYCL][CUDA] Add group algorithms

  Adds support for the following SPIR-V instructions to libclc:
  - OpGroupAll, OpGroupAny
  - OpGroupBroadcast
  - OpGroupIAdd, OpGroupFAdd
  - OpGroupFMin, OpGroupUMin, OpGroupSMin
  - OpGroupFMax, OpGroupUMax, OpGroupSMax

  At sub-group scope, these operations employ shuffles and other warp
  instructions. At work-group scope, partial results from each sub-group are
  combined via shared memory. The current implementation reserves 512 bytes of
  shared memory for any kernel using a group algorithm, which is sufficient to
  cover the worst case. Determining the correct amount of shared memory to
  reserve for a specific kernel will likely require a dedicated compiler pass.

* [SYCL][CUDA] Enable group algorithm tests

  Moves isSupportedDevice into support.h and adds a check for CUDA.
  Increases the work-group size for some tests to ensure more than one warp.

* [SYCL][CUDA] Enable reduction tests

  Reductions previously failed only because of missing group algorithm support.

* [SYCL][CUDA] Add half overloads to libclc

  Requires additional mangled entry points:
  - OpenCL mangles "half" to "h"
  - SYCL mangles "half" to "DF16_"

* [SYCL][CUDA][Doc] Update extension support docs

Signed-off-by: John Pennycook <[email protected]>
1 parent b37a234 commit 909459b
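
The two-level scheme described in the first bullet — warp shuffles within each sub-group, then shared memory to combine the sub-group partials — can be pictured with a small CUDA sketch. This is an illustration only, not the libclc code (which lives in collectives.cl below); blockSum and kWarpSize are made-up names, and the sketch assumes blockDim.x is a multiple of 32.

#include <cuda_runtime.h>

// Illustrative two-level work-group sum; not the libclc implementation.
__device__ float blockSum(float x) {
  constexpr int kWarpSize = 32;
  // One slot per warp. libclc reserves 64 x i64 (512 bytes) to cover the
  // worst case for any fundamental type; a float-only sketch needs less.
  __shared__ float scratch[64];

  // Level 1: sub-group (warp) reduction via shuffles.
  for (int offset = kWarpSize / 2; offset > 0; offset /= 2)
    x += __shfl_down_sync(0xffffffff, x, offset);

  // Level 2: lane 0 of each warp publishes its partial result.
  const int lane = threadIdx.x % kWarpSize;
  const int warp = threadIdx.x / kWarpSize;
  if (lane == 0)
    scratch[warp] = x;
  __syncthreads();

  // The first warp combines the partials; thread 0 ends up with the total.
  const int numWarps = blockDim.x / kWarpSize; // assumes a multiple of 32
  if (warp == 0) {
    x = (lane < numWarps) ? scratch[lane] : 0.0f;
    for (int offset = kWarpSize / 2; offset > 0; offset /= 2)
      x += __shfl_down_sync(0xffffffff, x, offset);
  }
  return x;
}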

24 files changed: +547 −140 lines

libclc/generic/include/spirv/spirv_types.h

Lines changed: 6 additions & 0 deletions

@@ -40,4 +40,10 @@ enum FPRoundingMode {
   SPV_RTN = 3,
 };
 
+enum GroupOperation {
+  Reduce = 0,
+  InclusiveScan = 1,
+  ExclusiveScan = 2,
+};
+
 #endif // CLC_SPIRV_TYPES
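
For reference, the three GroupOperation values correspond to the C++17 <numeric> algorithms. A small host-side illustration with + over {1, 2, 3, 4} (plain host code, unrelated to the libclc sources):

#include <cassert>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> in{1, 2, 3, 4}, out(4);

  // Reduce: a single combined value.
  assert(std::reduce(in.begin(), in.end(), 0) == 10);

  // InclusiveScan: element i combines in[0..i].
  std::inclusive_scan(in.begin(), in.end(), out.begin());
  assert((out == std::vector<int>{1, 3, 6, 10}));

  // ExclusiveScan: element i combines in[0..i-1], starting from the identity.
  std::exclusive_scan(in.begin(), in.end(), out.begin(), 0);
  assert((out == std::vector<int>{0, 1, 3, 6}));
  return 0;
}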

libclc/ptx-nvidiacl/libspirv/SOURCES

Lines changed: 2 additions & 0 deletions

@@ -83,3 +83,5 @@ workitem/get_sub_group_local_id.cl
 workitem/get_sub_group_size.cl
 images/image_helpers.ll
 images/image.cl
+group/collectives_helpers.ll
+group/collectives.cl

libclc/ptx-nvidiacl/libspirv/group/collectives.cl

Lines changed: 417 additions & 0 deletions
(Large diff not rendered by default.)

libclc/ptx-nvidiacl/libspirv/group/collectives_helpers.ll

Lines changed: 61 additions & 0 deletions
@@ -0,0 +1,61 @@
+; 64 storage locations is sufficient for all current-generation NVIDIA GPUs
+; 64 bits per warp is sufficient for all fundamental data types
+; Reducing storage for small data types or increasing it for user-defined types
+; will likely require an additional pass to track group algorithm usage
+@__clc__group_scratch = internal addrspace(3) global [64 x i64] undef, align 1
+
+define i8 addrspace(3)* @__clc__get_group_scratch_bool() nounwind alwaysinline {
+entry:
+  %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
+  ret i8 addrspace(3)* %cast
+}
+
+define i8 addrspace(3)* @__clc__get_group_scratch_char() nounwind alwaysinline {
+entry:
+  %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %cast = bitcast i64 addrspace(3)* %ptr to i8 addrspace(3)*
+  ret i8 addrspace(3)* %cast
+}
+
+define i16 addrspace(3)* @__clc__get_group_scratch_short() nounwind alwaysinline {
+entry:
+  %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %cast = bitcast i64 addrspace(3)* %ptr to i16 addrspace(3)*
+  ret i16 addrspace(3)* %cast
+}
+
+define i32 addrspace(3)* @__clc__get_group_scratch_int() nounwind alwaysinline {
+entry:
+  %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %cast = bitcast i64 addrspace(3)* %ptr to i32 addrspace(3)*
+  ret i32 addrspace(3)* %cast
+}
+
+define i64 addrspace(3)* @__clc__get_group_scratch_long() nounwind alwaysinline {
+entry:
+  %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %cast = bitcast i64 addrspace(3)* %ptr to i64 addrspace(3)*
+  ret i64 addrspace(3)* %cast
+}
+
+define half addrspace(3)* @__clc__get_group_scratch_half() nounwind alwaysinline {
+entry:
+  %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %cast = bitcast i64 addrspace(3)* %ptr to half addrspace(3)*
+  ret half addrspace(3)* %cast
+}
+
+define float addrspace(3)* @__clc__get_group_scratch_float() nounwind alwaysinline {
+entry:
+  %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %cast = bitcast i64 addrspace(3)* %ptr to float addrspace(3)*
+  ret float addrspace(3)* %cast
+}
+
+define double addrspace(3)* @__clc__get_group_scratch_double() nounwind alwaysinline {
+entry:
+  %ptr = getelementptr inbounds [64 x i64], [64 x i64] addrspace(3)* @__clc__group_scratch, i64 0, i64 0
+  %cast = bitcast i64 addrspace(3)* %ptr to double addrspace(3)*
+  ret double addrspace(3)* %cast
+}
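
All eight helpers return a differently-typed pointer to the same [64 x i64] buffer; only the bitcast differs. The sizing is what produces the 512-byte figure in the commit message — restated as a host-side check (kScratchSlots and kBytesPerSlot are made-up names):

#include <cstdint>

// 64 slots, one partial result per sub-group, each wide enough (i64) for the
// largest fundamental data type.
constexpr int kScratchSlots = 64;
constexpr std::size_t kBytesPerSlot = sizeof(std::int64_t);
static_assert(kScratchSlots * kBytesPerSlot == 512,
              "matches the 512 bytes of shared memory reserved per kernel");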

sycl/doc/extensions/README.md

Lines changed: 2 additions & 2 deletions

@@ -14,15 +14,15 @@ DPC++ extensions status:
 | [SYCL_INTEL_device_specific_kernel_queries](DeviceSpecificKernelQueries/SYCL_INTEL_device_specific_kernel_queries.asciidoc) | Proposal | |
 | [SYCL_INTEL_enqueue_barrier](EnqueueBarrier/enqueue_barrier.asciidoc) | Supported(OpenCL, Level Zero) | |
 | [SYCL_INTEL_extended_atomics](ExtendedAtomics/SYCL_INTEL_extended_atomics.asciidoc) | Supported(OpenCL: CPU, GPU) | |
-| [SYCL_INTEL_group_algorithms](GroupAlgorithms/SYCL_INTEL_group_algorithms.asciidoc) | Supported(OpenCL) | |
+| [SYCL_INTEL_group_algorithms](GroupAlgorithms/SYCL_INTEL_group_algorithms.asciidoc) | Supported(OpenCL; CUDA) | |
 | [SYCL_INTEL_group_mask](./GroupMask/SYCL_INTEL_group_mask.asciidoc) | Proposal | |
 | [FPGA selector](IntelFPGA/FPGASelector.md) | Supported | |
 | [FPGA reg](IntelFPGA/FPGAReg.md) | Supported(OpenCL: ACCELERATOR) | |
 | [SYCL_INTEL_kernel_restrict_all](KernelRestrictAll/SYCL_INTEL_kernel_restrict_all.asciidoc) | Supported(OpenCL) | |
 | [SYCL_INTEL_attribute_style](KernelRHSAttributes/SYCL_INTEL_attribute_style.asciidoc) | Proposal | |
 | [Queue Order Properties](OrderedQueue/OrderedQueue_v2.adoc) | Supported | |
 | [Queue Shortcuts](QueueShortcuts/QueueShortcuts.adoc) | Supported | |
-| [Reductions for ND-Range Parallelism](Reduction/Reduction.md) | Partially supported(OpenCL: CPU, GPU) | Not supported: multiple reduction vars, multi-dimensional reduction vars |
+| [Reductions for ND-Range Parallelism](Reduction/Reduction.md) | Partially supported(OpenCL: CPU, GPU; CUDA) | Not supported: multiple reduction vars, multi-dimensional reduction vars |
 | [SYCL_INTEL_relax_standard_layout](RelaxStdLayout/SYCL_INTEL_relax_standard_layout.asciidoc) | Supported | |
 | [SYCL_INTEL_reqd_work_group_size](ReqdWorkGroupSize/SYCL_INTEL_reqd_work_group_size.asciidoc) | Supported(OpenCL: CPU, GPU) | |
 | [SPV_INTEL_function_pointers](SPIRV/SPV_INTEL_function_pointers.asciidoc) | Supported(OpenCL: CPU, GPU; HOST) | |

sycl/test/group-algorithm/all_of.cpp

Lines changed: 5 additions & 8 deletions

@@ -1,12 +1,10 @@
-// UNSUPPORTED: cuda
-// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
-//
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
+#include "support.h"
 #include <CL/sycl.hpp>
 #include <algorithm>
 #include <cassert>
@@ -32,7 +30,7 @@ void test(queue q, InputContainer input, OutputContainer output,
           Predicate pred) {
   typedef class all_of_kernel<Predicate> kernel_name;
   size_t N = input.size();
-  size_t G = 16;
+  size_t G = 64;
   {
     buffer<int> in_buf(input.data(), input.size());
     buffer<bool> out_buf(output.data(), output.size());
@@ -57,13 +55,12 @@ void test(queue q, InputContainer input, OutputContainer output,
 
 int main() {
   queue q;
-  std::string version = q.get_device().get_info<info::device::version>();
-  if (version < std::string("2.0")) {
+  if (!isSupportedDevice(q.get_device())) {
     std::cout << "Skipping test\n";
     return 0;
   }
 
-  constexpr int N = 32;
+  constexpr int N = 128;
   std::array<int, N> input;
   std::array<bool, 3> output;
   std::iota(input.begin(), input.end(), 0);
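
The version-string check removed here (and in the tests below) is replaced by isSupportedDevice from the new support.h, which this commit page does not render. A plausible sketch of that header, based on the function body removed from exclusive_scan.cpp further down plus the CUDA check the commit message mentions — the actual header may differ:

// support.h (sketch, not the actual header): accept Level Zero, CUDA, and
// OpenCL >= 2.0 devices.
#include <CL/sycl.hpp>
#include <string>

using namespace cl::sycl;

bool isSupportedDevice(device D) {
  std::string PlatformName = D.get_platform().get_info<info::platform::name>();
  if (PlatformName.find("Level-Zero") != std::string::npos)
    return true;

  if (PlatformName.find("CUDA") != std::string::npos) // assumed CUDA check
    return true;

  if (PlatformName.find("OpenCL") != std::string::npos) {
    std::string Version = D.get_info<info::device::version>();
    size_t Offset = Version.find("OpenCL");
    if (Offset == std::string::npos)
      return false;
    Version = Version.substr(Offset + 7, 3);
    if (Version >= std::string("2.0"))
      return true;
  }

  return false;
}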

sycl/test/group-algorithm/any_of.cpp

Lines changed: 5 additions & 8 deletions

@@ -1,12 +1,10 @@
-// UNSUPPORTED: cuda
-// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
-//
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
+#include "support.h"
 #include <CL/sycl.hpp>
 #include <algorithm>
 #include <cassert>
@@ -34,7 +32,7 @@ void test(queue q, InputContainer input, OutputContainer output,
   typedef typename OutputContainer::value_type OutputT;
   typedef class any_of_kernel<Predicate> kernel_name;
   size_t N = input.size();
-  size_t G = 16;
+  size_t G = 64;
   {
     buffer<InputT> in_buf(input.data(), input.size());
     buffer<OutputT> out_buf(output.data(), output.size());
@@ -59,13 +57,12 @@ void test(queue q, InputContainer input, OutputContainer output,
 
 int main() {
   queue q;
-  std::string version = q.get_device().get_info<info::device::version>();
-  if (version < std::string("2.0")) {
+  if (!isSupportedDevice(q.get_device())) {
    std::cout << "Skipping test\n";
    return 0;
  }
 
-  constexpr int N = 32;
+  constexpr int N = 128;
   std::array<int, N> input;
   std::array<bool, 3> output;
   std::iota(input.begin(), input.end(), 0);

sycl/test/group-algorithm/broadcast.cpp

Lines changed: 2 additions & 5 deletions

@@ -1,12 +1,10 @@
-// UNSUPPORTED: cuda
-// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
-//
 // RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
+#include "support.h"
 #include <CL/sycl.hpp>
 #include <algorithm>
 #include <cassert>
@@ -46,8 +44,7 @@ void test(queue q, InputContainer input, OutputContainer output) {
 
 int main() {
   queue q;
-  std::string version = q.get_device().get_info<info::device::version>();
-  if (version < std::string("2.0")) {
+  if (!isSupportedDevice(q.get_device())) {
     std::cout << "Skipping test\n";
     return 0;
   }

sycl/test/group-algorithm/exclusive_scan.cpp

Lines changed: 6 additions & 25 deletions

@@ -1,7 +1,4 @@
-// UNSUPPORTED: cuda
-// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
-//
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
@@ -13,8 +10,10 @@
 // unconditionally. Using operators specific for spirv 1.3 and higher with
 // -spirv-max-version=1.1 being set by default causes assert/check fails
 // in spirv translator.
-// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -o %t13.out
+// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -I . -o \
+%t13.out
 
+#include "support.h"
 #include <CL/sycl.hpp>
 #include <algorithm>
 #include <cassert>
@@ -57,7 +56,7 @@ void test(queue q, InputContainer input, OutputContainer output,
   typedef class exclusive_scan_kernel<SpecializationKernelName, 3> kernel_name3;
   OutputT init = 42;
   size_t N = input.size();
-  size_t G = 16;
+  size_t G = 64;
   std::vector<OutputT> expected(N);
   {
     buffer<InputT> in_buf(input.data(), input.size());
@@ -128,32 +127,14 @@ void test(queue q, InputContainer input, OutputContainer output,
   assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
 }
 
-bool isSupportedDevice(device D) {
-  std::string PlatformName = D.get_platform().get_info<info::platform::name>();
-  if (PlatformName.find("Level-Zero") != std::string::npos)
-    return true;
-
-  if (PlatformName.find("OpenCL") != std::string::npos) {
-    std::string Version = D.get_info<info::device::version>();
-    size_t Offset = Version.find("OpenCL");
-    if (Offset == std::string::npos)
-      return false;
-    Version = Version.substr(Offset + 7, 3);
-    if (Version >= std::string("2.0"))
-      return true;
-  }
-
-  return false;
-}
-
 int main() {
   queue q;
   if (!isSupportedDevice(q.get_device())) {
     std::cout << "Skipping test\n";
     return 0;
   }
 
-  constexpr int N = 32;
+  constexpr int N = 128;
   std::array<int, N> input;
   std::array<int, N> output;
   std::iota(input.begin(), input.end(), 0);

sycl/test/group-algorithm/inclusive_scan.cpp

Lines changed: 6 additions & 25 deletions

@@ -1,7 +1,4 @@
-// UNSUPPORTED: cuda
-// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
-//
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
@@ -13,8 +10,10 @@
 // unconditionally. Using operators specific for spirv 1.3 and higher with
 // -spirv-max-version=1.1 being set by default causes assert/check fails
 // in spirv translator.
-// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -o %t13.out
+// RUNx: %clangxx -fsycl -fsycl-targets=%sycl_triple -DSPIRV_1_3 %s -I . -o \
+%t13.out
 
+#include "support.h"
 #include <CL/sycl.hpp>
 #include <algorithm>
 #include <cassert>
@@ -57,7 +56,7 @@ void test(queue q, InputContainer input, OutputContainer output,
   typedef class inclusive_scan_kernel<SpecializationKernelName, 3> kernel_name3;
   OutputT init = 42;
   size_t N = input.size();
-  size_t G = 16;
+  size_t G = 64;
   std::vector<OutputT> expected(N);
   {
     buffer<InputT> in_buf(input.data(), input.size());
@@ -128,32 +127,14 @@ void test(queue q, InputContainer input, OutputContainer output,
   assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
 }
 
-bool isSupportedDevice(device D) {
-  std::string PlatformName = D.get_platform().get_info<info::platform::name>();
-  if (PlatformName.find("Level-Zero") != std::string::npos)
-    return true;
-
-  if (PlatformName.find("OpenCL") != std::string::npos) {
-    std::string Version = D.get_info<info::device::version>();
-    size_t Offset = Version.find("OpenCL");
-    if (Offset == std::string::npos)
-      return false;
-    Version = Version.substr(Offset + 7, 3);
-    if (Version >= std::string("2.0"))
-      return true;
-  }
-
-  return false;
-}
-
 int main() {
   queue q;
   if (!isSupportedDevice(q.get_device())) {
     std::cout << "Skipping test\n";
     return 0;
   }
 
-  constexpr int N = 32;
+  constexpr int N = 128;
   std::array<int, N> input;
   std::array<int, N> output;
   std::iota(input.begin(), input.end(), 0);

sycl/test/group-algorithm/none_of.cpp

Lines changed: 5 additions & 8 deletions

@@ -1,12 +1,10 @@
-// UNSUPPORTED: cuda
-// OpenCL C 2.x alike work-group functions not yet supported by CUDA.
-//
-// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -I . -o %t.out
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
 // RUN: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 
+#include "support.h"
 #include <CL/sycl.hpp>
 #include <algorithm>
 #include <cassert>
@@ -32,7 +30,7 @@ void test(queue q, InputContainer input, OutputContainer output,
           Predicate pred) {
   typedef class none_of_kernel<Predicate> kernel_name;
   size_t N = input.size();
-  size_t G = 16;
+  size_t G = 64;
   {
     buffer<int> in_buf(input.data(), input.size());
     buffer<bool> out_buf(output.data(), output.size());
@@ -57,13 +55,12 @@ void test(queue q, InputContainer input, OutputContainer output,
 
 int main() {
   queue q;
-  std::string version = q.get_device().get_info<info::device::version>();
-  if (version < std::string("2.0")) {
+  if (!isSupportedDevice(q.get_device())) {
     std::cout << "Skipping test\n";
     return 0;
   }
 
-  constexpr int N = 32;
+  constexpr int N = 128;
   std::array<int, N> input;
   std::array<bool, 3> output;
   std::iota(input.begin(), input.end(), 0);
