Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 9cf8258

Browse files
authored
[SYCL] Add tests for span reductions (#1009)
* [SYCL] Add tests for span reductions Tests the following: - Only spans with static extents are available - Kernels with a single reduction span work - Kernels with a reduction pack containing a span work * [SYCL] Reenable sporadically failing tests
1 parent c9e9d5f commit 9cf8258

File tree

3 files changed

+380
-0
lines changed

3 files changed

+380
-0
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// RUN: not %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out
2+
3+
#include <CL/sycl.hpp>
4+
5+
using namespace sycl;
6+
7+
int main(int argc, char *argv[]) {
8+
9+
// SYCL 2020 reductions cannot be created from spans with dynamic extents
10+
auto Span = span<int, dynamic_extent>(nullptr, 1);
11+
auto Redu = reduction(Span, plus<>());
12+
}

SYCL/Reduction/reduction_span.cpp

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,142 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
5+
//
6+
// `Group algorithms are not supported on host device.` on Nvidia.
7+
// XFAIL: hip_nvidia
8+
9+
// This test performs basic checks of reductions initialized with a sycl::span
10+
11+
#include <CL/sycl.hpp>
12+
using namespace sycl;
13+
14+
int NumErrors = 0;
15+
16+
template <int Dimensions> size_t getLinearSize(range<Dimensions> Range) {
17+
return Range.size();
18+
}
19+
20+
template <int Dimensions> size_t getLinearSize(nd_range<Dimensions> NDRange) {
21+
return NDRange.get_global_range().size();
22+
}
23+
24+
template <int Dimensions>
25+
size_t getLinearId(nd_range<Dimensions>, nd_item<Dimensions> Item) {
26+
return Item.get_global_linear_id();
27+
}
28+
29+
size_t getLinearId(range<1>, id<1> Id) { return Id[0]; }
30+
31+
size_t getLinearId(range<2> Range, id<2> Id) {
32+
return Id[0] * Range[1] + Id[1];
33+
}
34+
35+
size_t getLinearId(range<3> Range, id<3> Id) {
36+
return Id[0] * Range[1] * Range[2] + Id[1] * Range[2] + Id[2];
37+
}
38+
39+
enum class submission_mode {
40+
handler,
41+
queue,
42+
};
43+
44+
template <size_t N, typename T, typename BinaryOperation, typename Range,
45+
submission_mode SubmissionMode>
46+
void test(queue Q, Range Rng, T Identity, T Value) {
47+
48+
// Initialize output to identity value
49+
T *Output = malloc_shared<T>(N, Q);
50+
Q.parallel_for(range<1>{N}, [=](id<1> I) { Output[I] = Identity; }).wait();
51+
52+
// Perform generalized "histogram" with N bins
53+
auto Redu = reduction(span<T, N>(Output, N), Identity, BinaryOperation());
54+
auto Kern = [=](auto It, auto &Reducer) {
55+
size_t Index = getLinearId(Rng, It) % N;
56+
Reducer[Index].combine(Value);
57+
};
58+
if constexpr (SubmissionMode == submission_mode::handler) {
59+
Q.submit([&](handler &CGH) { CGH.parallel_for(Rng, Redu, Kern); }).wait();
60+
} else /*if (SubmissionMode == submission_mode::queue) */ {
61+
Q.parallel_for(Rng, Redu, Kern).wait();
62+
}
63+
64+
size_t Size = getLinearSize(Rng);
65+
66+
// Each bin should have the same value unless B doesn't divide N
67+
T Expected = Identity;
68+
T ExpectedRemainder;
69+
for (size_t I = 0; I < Size; I += N) {
70+
ExpectedRemainder = Expected;
71+
Expected = BinaryOperation()(Expected, Value);
72+
}
73+
74+
bool Passed = true;
75+
for (size_t I = 0; I < N; ++I) {
76+
if (I < Size % N) {
77+
Passed &= (Output[I] == Expected);
78+
} else {
79+
Passed &= (Output[I] == ExpectedRemainder);
80+
}
81+
}
82+
83+
free(Output, Q);
84+
NumErrors += (Passed) ? 0 : 1;
85+
}
86+
87+
struct CustomType {
88+
int x;
89+
bool operator==(const CustomType &o) const { return (x == o.x); }
90+
};
91+
92+
struct CustomBinaryOperation {
93+
CustomType operator()(const CustomType &lhs, const CustomType &rhs) const {
94+
return CustomType{lhs.x + rhs.x};
95+
}
96+
};
97+
98+
int main() {
99+
queue Q;
100+
101+
// Tests for small spans that can be privatized efficiently
102+
// Each combination tests a different sycl::reduction implementation
103+
test<16, int, std::plus<int>, sycl::range<1>, submission_mode::handler>(Q, 24,
104+
0, 1);
105+
test<16, float, std::plus<float>, sycl::range<1>, submission_mode::handler>(
106+
Q, 24, 0, 1);
107+
test<16, int, std::multiplies<int>, sycl::range<1>, submission_mode::handler>(
108+
Q, 24, 1, 2);
109+
test<16, CustomType, CustomBinaryOperation, sycl::range<1>,
110+
submission_mode::handler>(Q, 24, CustomType{0}, CustomType{1});
111+
test<16, int, std::plus<int>, sycl::range<1>, submission_mode::queue>(Q, 24,
112+
0, 1);
113+
test<16, float, std::plus<float>, sycl::range<1>, submission_mode::queue>(
114+
Q, 24, 0, 1);
115+
test<16, int, std::multiplies<int>, sycl::range<1>, submission_mode::queue>(
116+
Q, 24, 1, 2);
117+
test<16, CustomType, CustomBinaryOperation, sycl::range<1>,
118+
submission_mode::queue>(Q, 24, CustomType{0}, CustomType{1});
119+
120+
test<16, int, std::plus<int>, sycl::nd_range<1>, submission_mode::handler>(
121+
Q, {24, 8}, 0, 1);
122+
test<16, float, std::plus<float>, sycl::nd_range<1>,
123+
submission_mode::handler>(Q, {24, 8}, 0, 1);
124+
test<16, int, std::multiplies<int>, sycl::nd_range<1>,
125+
submission_mode::handler>(Q, {24, 8}, 1, 2);
126+
test<16, int, std::bit_or<int>, sycl::nd_range<1>, submission_mode::handler>(
127+
Q, {24, 8}, 0, 1);
128+
test<16, CustomType, CustomBinaryOperation, sycl::nd_range<1>,
129+
submission_mode::handler>(Q, {24, 8}, CustomType{0}, CustomType{1});
130+
test<16, int, std::plus<int>, sycl::nd_range<1>, submission_mode::queue>(
131+
Q, {24, 8}, 0, 1);
132+
test<16, float, std::plus<float>, sycl::nd_range<1>, submission_mode::queue>(
133+
Q, {24, 8}, 0, 1);
134+
test<16, int, std::multiplies<int>, sycl::nd_range<1>,
135+
submission_mode::queue>(Q, {24, 8}, 1, 2);
136+
test<16, int, std::bit_or<int>, sycl::nd_range<1>, submission_mode::queue>(
137+
Q, {24, 8}, 0, 1);
138+
test<16, CustomType, CustomBinaryOperation, sycl::nd_range<1>,
139+
submission_mode::queue>(Q, {24, 8}, CustomType{0}, CustomType{1});
140+
141+
return NumErrors;
142+
}
Lines changed: 226 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,226 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
5+
//
6+
// `Group algorithms are not supported on host device.` on Nvidia.
7+
// XFAIL: hip_nvidia
8+
9+
// This test performs basic checks of reductions initialized with a pack
10+
// containing at least one sycl::span
11+
12+
#include <CL/sycl.hpp>
13+
using namespace sycl;
14+
15+
int NumErrors = 0;
16+
17+
template <int Dimensions> size_t getLinearSize(range<Dimensions> Range) {
18+
return Range.size();
19+
}
20+
21+
template <int Dimensions> size_t getLinearSize(nd_range<Dimensions> NDRange) {
22+
return NDRange.get_global_range().size();
23+
}
24+
25+
template <int Dimensions>
26+
size_t getLinearId(nd_range<Dimensions>, nd_item<Dimensions> Item) {
27+
return Item.get_global_linear_id();
28+
}
29+
30+
size_t getLinearId(range<1>, id<1> Id) { return Id[0]; }
31+
32+
size_t getLinearId(range<2> Range, id<2> Id) {
33+
return Id[0] * Range[1] + Id[1];
34+
}
35+
36+
size_t getLinearId(range<3> Range, id<3> Id) {
37+
return Id[0] * Range[1] * Range[2] + Id[1] * Range[2] + Id[2];
38+
}
39+
40+
enum class submission_mode {
41+
handler,
42+
queue,
43+
};
44+
45+
// Test a span and a regular sum
46+
template <size_t N, typename T, typename BinaryOperation, typename Range,
47+
submission_mode SubmissionMode>
48+
void test1(queue Q, Range Rng, T Identity, T Value) {
49+
50+
// Initialize output to identity value
51+
int *Sum = malloc_shared<int>(1, Q);
52+
Q.single_task([=]() { *Sum = 0; }).wait();
53+
T *Output = malloc_shared<T>(N, Q);
54+
Q.parallel_for(range<1>{N}, [=](id<1> I) { Output[I] = Identity; }).wait();
55+
56+
// Perform generalized "histogram" with N bins
57+
auto ScalarRedu = reduction(Sum, plus<>());
58+
auto SpanRedu = reduction(span<T, N>(Output, N), Identity, BinaryOperation());
59+
auto Kern = [=](auto It, auto &ScalarReducer, auto &SpanReducer) {
60+
ScalarReducer++;
61+
size_t Index = getLinearId(Rng, It) % N;
62+
SpanReducer[Index].combine(Value);
63+
};
64+
if constexpr (SubmissionMode == submission_mode::handler) {
65+
Q.submit([&](handler &CGH) {
66+
CGH.parallel_for(Rng, ScalarRedu, SpanRedu, Kern);
67+
}).wait();
68+
} else /*if (SubmissionMode == submission_mode::queue) */ {
69+
Q.parallel_for(Rng, ScalarRedu, SpanRedu, Kern).wait();
70+
}
71+
72+
size_t Size = getLinearSize(Rng);
73+
74+
// Each bin should have the same value unless B doesn't divide N
75+
T Expected = Identity;
76+
T ExpectedRemainder;
77+
for (size_t I = 0; I < Size; I += N) {
78+
ExpectedRemainder = Expected;
79+
Expected = BinaryOperation()(Expected, Value);
80+
}
81+
82+
bool Passed = true;
83+
for (size_t I = 0; I < N; ++I) {
84+
if (I < Size % N) {
85+
Passed &= (Output[I] == Expected);
86+
} else {
87+
Passed &= (Output[I] == ExpectedRemainder);
88+
}
89+
}
90+
Passed &= (*Sum == Size);
91+
92+
free(Output, Q);
93+
free(Sum, Q);
94+
NumErrors += (Passed) ? 0 : 1;
95+
}
96+
97+
// Test two spans
98+
template <size_t N, typename T, typename BinaryOperation, typename Range,
99+
submission_mode SubmissionMode>
100+
void test2(queue Q, Range Rng, T Identity, T Value) {
101+
102+
// Initialize output to identity value
103+
int *Output1 = malloc_shared<int>(N, Q);
104+
Q.parallel_for(range<1>{N}, [=](id<1> I) { Output1[I] = 0; }).wait();
105+
T *Output2 = malloc_shared<T>(N, Q);
106+
Q.parallel_for(range<1>{N}, [=](id<1> I) { Output2[I] = Identity; }).wait();
107+
108+
// Perform generalized "histogram" with N bins
109+
auto Redu1 = reduction(span<int, N>(Output1, N), plus<>());
110+
auto Redu2 = reduction(span<T, N>(Output2, N), Identity, BinaryOperation());
111+
auto Kern = [=](auto It, auto &Reducer1, auto &Reducer2) {
112+
size_t Index = getLinearId(Rng, It) % N;
113+
Reducer1[Index]++;
114+
Reducer2[Index].combine(Value);
115+
};
116+
if constexpr (SubmissionMode == submission_mode::handler) {
117+
Q.submit([&](handler &CGH) {
118+
CGH.parallel_for(Rng, Redu1, Redu2, Kern);
119+
}).wait();
120+
} else /*if (SubmissionMode == submission_mode::queue) */ {
121+
Q.parallel_for(Rng, Redu1, Redu2, Kern).wait();
122+
}
123+
124+
size_t Size = getLinearSize(Rng);
125+
bool Passed = true;
126+
// Span1
127+
{
128+
int Expected = 0;
129+
int ExpectedRemainder;
130+
for (size_t I = 0; I < Size; I += N) {
131+
ExpectedRemainder = Expected;
132+
Expected += 1;
133+
}
134+
135+
for (size_t I = 0; I < N; ++I) {
136+
if (I < Size % N) {
137+
Passed &= (Output1[I] == Expected);
138+
} else {
139+
Passed &= (Output1[I] == ExpectedRemainder);
140+
}
141+
}
142+
}
143+
144+
// Span2
145+
{
146+
T Expected = Identity;
147+
T ExpectedRemainder;
148+
for (size_t I = 0; I < Size; I += N) {
149+
ExpectedRemainder = Expected;
150+
Expected = BinaryOperation()(Expected, Value);
151+
}
152+
153+
for (size_t I = 0; I < N; ++I) {
154+
if (I < Size % N) {
155+
Passed &= (Output2[I] == Expected);
156+
} else {
157+
Passed &= (Output2[I] == ExpectedRemainder);
158+
}
159+
}
160+
}
161+
162+
free(Output2, Q);
163+
free(Output1, Q);
164+
NumErrors += (Passed) ? 0 : 1;
165+
}
166+
167+
struct CustomType {
168+
int x;
169+
bool operator==(const CustomType &o) const { return (x == o.x); }
170+
};
171+
172+
struct CustomBinaryOperation {
173+
CustomType operator()(const CustomType &lhs, const CustomType &rhs) const {
174+
return CustomType{lhs.x + rhs.x};
175+
}
176+
};
177+
178+
template <size_t N, typename T, typename BinaryOperation, typename Range,
179+
submission_mode SubmissionMode>
180+
void test(queue Q, Range Rng, T Identity, T Value) {
181+
test1<N, T, BinaryOperation, Range, SubmissionMode>(Q, Rng, Identity, Value);
182+
test2<N, T, BinaryOperation, Range, SubmissionMode>(Q, Rng, Identity, Value);
183+
}
184+
185+
int main() {
186+
queue Q;
187+
188+
// Tests for small spans that can be privatized efficiently
189+
// Each combination tests a different sycl::reduction implementation
190+
// TODO: Enable range<> tests once parallel_for accepts pack
191+
/*test<16, int, std::plus<int>, sycl::range<1>, submission_mode::handler>(Q,
192+
24, 0, 1); test<16, float, std::plus<float>, sycl::range<1>,
193+
submission_mode::handler>(Q, 24, 0, 1); test<16, int, std::multiplies<int>,
194+
sycl::range<1>, submission_mode::handler>(Q, 24, 1, 2); test<16, CustomType,
195+
CustomBinaryOperation, sycl::range<1>, submission_mode::handler>(Q, 24,
196+
CustomType{0}, CustomType{1});
197+
test<16, int, std::plus<int>, sycl::range<1>, submission_mode::queue>(Q, 24,
198+
0, 1); test<16, float, std::plus<float>, sycl::range<1>,
199+
submission_mode::queue>(Q, 24, 0, 1); test<16, int, std::multiplies<int>,
200+
sycl::range<1>, submission_mode::queue>(Q, 24, 1, 2); test<16, CustomType,
201+
CustomBinaryOperation, sycl::range<1>, submission_mode::queue>(Q, 24,
202+
CustomType{0}, CustomType{1});*/
203+
204+
test<16, int, std::plus<int>, sycl::nd_range<1>, submission_mode::handler>(
205+
Q, {24, 8}, 0, 1);
206+
test<16, float, std::plus<float>, sycl::nd_range<1>,
207+
submission_mode::handler>(Q, {24, 8}, 0, 1);
208+
test<16, int, std::multiplies<int>, sycl::nd_range<1>,
209+
submission_mode::handler>(Q, {24, 8}, 1, 2);
210+
test<16, int, std::bit_or<int>, sycl::nd_range<1>, submission_mode::handler>(
211+
Q, {24, 8}, 0, 1);
212+
test<16, CustomType, CustomBinaryOperation, sycl::nd_range<1>,
213+
submission_mode::handler>(Q, {24, 8}, CustomType{0}, CustomType{1});
214+
test<16, int, std::plus<int>, sycl::nd_range<1>, submission_mode::queue>(
215+
Q, {24, 8}, 0, 1);
216+
test<16, float, std::plus<float>, sycl::nd_range<1>, submission_mode::queue>(
217+
Q, {24, 8}, 0, 1);
218+
test<16, int, std::multiplies<int>, sycl::nd_range<1>,
219+
submission_mode::queue>(Q, {24, 8}, 1, 2);
220+
test<16, int, std::bit_or<int>, sycl::nd_range<1>, submission_mode::queue>(
221+
Q, {24, 8}, 0, 1);
222+
test<16, CustomType, CustomBinaryOperation, sycl::nd_range<1>,
223+
submission_mode::queue>(Q, {24, 8}, CustomType{0}, CustomType{1});
224+
225+
return NumErrors;
226+
}

0 commit comments

Comments
 (0)