Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit 8ca8249

Browse files
committed
[SYCL] Add tests for span reductions
Tests the following:
- Only spans with static extents are available
- Kernels with a single reduction span work
- Kernels with a reduction pack containing a span work

Signed-off-by: John Pennycook <[email protected]>
1 parent 58b6aa0 commit 8ca8249

File tree

3 files changed

+319
-0
lines changed

3 files changed

+319
-0
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
// RUN: not %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out
2+
3+
#include <CL/sycl.hpp>
4+
5+
using namespace sycl;
6+
7+
int main(int argc, char *argv[]) {
8+
9+
// SYCL 2020 reductions cannot be created from spans with dynamic extents
10+
auto Span = span<int, dynamic_extent>(nullptr, 1);
11+
auto Redu = reduction(Span, plus<>());
12+
}

SYCL/Reduction/reduction_span.cpp

Lines changed: 113 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,113 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
5+
//
6+
// `Group algorithms are not supported on host device.` on Nvidia.
7+
// XFAIL: hip_nvidia
8+
9+
// TODO: test disabled due to sporadic fails in level_zero:gpu RT.
10+
// UNSUPPORTED: linux && level_zero
11+
12+
// This test performs basic checks of reductions initialized with a sycl::span
13+
14+
#include <CL/sycl.hpp>
15+
using namespace sycl;
16+
17+
int NumErrors = 0;
18+
19+
template <int Dimensions> size_t getLinearSize(range<Dimensions> Range) {
20+
return Range.size();
21+
}
22+
23+
template <int Dimensions> size_t getLinearSize(nd_range<Dimensions> NDRange) {
24+
return NDRange.get_global_range().size();
25+
}
26+
27+
template <int Dimensions>
28+
size_t getLinearId(nd_range<Dimensions>, nd_item<Dimensions> Item) {
29+
return Item.get_global_linear_id();
30+
}
31+
32+
size_t getLinearId(range<1>, id<1> Id) { return Id[0]; }
33+
34+
size_t getLinearId(range<2> Range, id<2> Id) {
35+
return Id[0] * Range[1] + Id[1];
36+
}
37+
38+
size_t getLinearId(range<3> Range, id<3> Id) {
39+
return Id[0] * Range[1] * Range[2] + Id[1] * Range[2] + Id[2];
40+
}
41+
42+
template <size_t N, typename T, typename BinaryOperation, typename Range>
43+
void test(queue Q, Range Rng, T Identity, T Value) {
44+
45+
// Initialize output to identity value
46+
T *Output = malloc_shared<T>(N, Q);
47+
Q.parallel_for(range<1>{N}, [=](id<1> I) { Output[I] = Identity; }).wait();
48+
49+
// Perform generalized "histogram" with N bins
50+
// TODO: Test Q.parallel_for when code_location is fixed
51+
Q.submit([&](handler &CGH) {
52+
CGH.parallel_for(
53+
Rng, reduction(span<T, N>(Output, N), Identity, BinaryOperation()),
54+
[=](auto It, auto &Reducer) {
55+
size_t Index = getLinearId(Rng, It) % N;
56+
Reducer[Index].combine(Value);
57+
});
58+
}).wait();
59+
60+
size_t Size = getLinearSize(Rng);
61+
62+
// Each bin should have the same value unless B doesn't divide N
63+
T Expected = Identity;
64+
T ExpectedRemainder;
65+
for (size_t I = 0; I < Size; I += N) {
66+
ExpectedRemainder = Expected;
67+
Expected = BinaryOperation()(Expected, Value);
68+
}
69+
70+
bool Passed = true;
71+
for (size_t I = 0; I < N; ++I) {
72+
if (I < Size % N) {
73+
Passed &= (Output[I] == Expected);
74+
} else {
75+
Passed &= (Output[I] == ExpectedRemainder);
76+
}
77+
}
78+
79+
free(Output, Q);
80+
NumErrors += (Passed) ? 0 : 1;
81+
}
82+
83+
// Minimal user-defined type used to exercise reductions over non-arithmetic
// element types. Equality compares the single payload field.
struct CustomType {
  int x;
  bool operator==(const CustomType &other) const { return x == other.x; }
};

// Addition-like combiner over CustomType, usable as a reduction operator.
struct CustomBinaryOperation {
  CustomType operator()(const CustomType &a, const CustomType &b) const {
    return CustomType{a.x + b.x};
  }
};
93+
94+
int main() {
95+
queue Q;
96+
97+
// Tests for small spans that can be privatized efficiently
98+
// Each combination tests a different sycl::reduction implementation
99+
test<16, int, std::plus<int>, sycl::range<1>>(Q, 24, 0, 1);
100+
test<16, float, std::plus<float>, sycl::range<1>>(Q, 24, 0, 1);
101+
test<16, int, std::multiplies<int>, sycl::range<1>>(Q, 24, 1, 2);
102+
test<16, CustomType, CustomBinaryOperation, sycl::range<1>>(
103+
Q, 24, CustomType{0}, CustomType{1});
104+
105+
test<16, int, std::plus<int>, sycl::nd_range<1>>(Q, {24, 8}, 0, 1);
106+
test<16, float, std::plus<float>, sycl::nd_range<1>>(Q, {24, 8}, 0, 1);
107+
test<16, int, std::multiplies<int>, sycl::nd_range<1>>(Q, {24, 8}, 1, 2);
108+
test<16, int, std::bit_or<int>, sycl::nd_range<1>>(Q, {24, 8}, 0, 1);
109+
test<16, CustomType, CustomBinaryOperation, sycl::nd_range<1>>(
110+
Q, {24, 8}, CustomType{0}, CustomType{1});
111+
112+
return NumErrors;
113+
}
Lines changed: 194 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,194 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
// RUN: %ACC_RUN_PLACEHOLDER %t.out
5+
//
6+
// `Group algorithms are not supported on host device.` on Nvidia.
7+
// XFAIL: hip_nvidia
8+
9+
// TODO: test disabled due to sporadic fails in level_zero:gpu RT.
10+
// UNSUPPORTED: linux && level_zero
11+
12+
// This test performs basic checks of reductions initialized with a pack
13+
// containing at least one sycl::span
14+
15+
#include <CL/sycl.hpp>
16+
using namespace sycl;
17+
18+
int NumErrors = 0;
19+
20+
template <int Dimensions> size_t getLinearSize(range<Dimensions> Range) {
21+
return Range.size();
22+
}
23+
24+
template <int Dimensions> size_t getLinearSize(nd_range<Dimensions> NDRange) {
25+
return NDRange.get_global_range().size();
26+
}
27+
28+
template <int Dimensions>
29+
size_t getLinearId(nd_range<Dimensions>, nd_item<Dimensions> Item) {
30+
return Item.get_global_linear_id();
31+
}
32+
33+
size_t getLinearId(range<1>, id<1> Id) { return Id[0]; }
34+
35+
size_t getLinearId(range<2> Range, id<2> Id) {
36+
return Id[0] * Range[1] + Id[1];
37+
}
38+
39+
size_t getLinearId(range<3> Range, id<3> Id) {
40+
return Id[0] * Range[1] * Range[2] + Id[1] * Range[2] + Id[2];
41+
}
42+
43+
// Test a span and a regular sum
44+
template <size_t N, typename T, typename BinaryOperation, typename Range>
45+
void test1(queue Q, Range Rng, T Identity, T Value) {
46+
47+
// Initialize output to identity value
48+
int *Sum = malloc_shared<int>(1, Q);
49+
Q.single_task([=]() { *Sum = 0; }).wait();
50+
T *Output = malloc_shared<T>(N, Q);
51+
Q.parallel_for(range<1>{N}, [=](id<1> I) { Output[I] = Identity; }).wait();
52+
53+
// Perform generalized "histogram" with N bins
54+
// TODO: Test Q.parallel_for when code_location is fixed
55+
Q.submit([&](handler &CGH) {
56+
CGH.parallel_for(
57+
Rng, reduction(Sum, plus<>()),
58+
reduction(span<T, N>(Output, N), Identity, BinaryOperation()),
59+
[=](auto It, auto &ScalarReducer, auto &SpanReducer) {
60+
ScalarReducer++;
61+
size_t Index = getLinearId(Rng, It) % N;
62+
SpanReducer[Index].combine(Value);
63+
});
64+
}).wait();
65+
66+
size_t Size = getLinearSize(Rng);
67+
68+
// Each bin should have the same value unless B doesn't divide N
69+
T Expected = Identity;
70+
T ExpectedRemainder;
71+
for (size_t I = 0; I < Size; I += N) {
72+
ExpectedRemainder = Expected;
73+
Expected = BinaryOperation()(Expected, Value);
74+
}
75+
76+
bool Passed = true;
77+
for (size_t I = 0; I < N; ++I) {
78+
if (I < Size % N) {
79+
Passed &= (Output[I] == Expected);
80+
} else {
81+
Passed &= (Output[I] == ExpectedRemainder);
82+
}
83+
}
84+
Passed &= (*Sum == Size);
85+
86+
free(Output, Q);
87+
free(Sum, Q);
88+
NumErrors += (Passed) ? 0 : 1;
89+
}
90+
91+
// Test two spans
92+
template <size_t N, typename T, typename BinaryOperation, typename Range>
93+
void test2(queue Q, Range Rng, T Identity, T Value) {
94+
95+
// Initialize output to identity value
96+
int *Output1 = malloc_shared<int>(N, Q);
97+
Q.parallel_for(range<1>{N}, [=](id<1> I) { Output1[I] = 0; }).wait();
98+
T *Output2 = malloc_shared<T>(N, Q);
99+
Q.parallel_for(range<1>{N}, [=](id<1> I) { Output2[I] = Identity; }).wait();
100+
101+
// Perform generalized "histogram" with N bins
102+
// TODO: Test Q.parallel_for when code_location is fixed
103+
Q.submit([&](handler &CGH) {
104+
CGH.parallel_for(
105+
Rng, reduction(span<int, N>(Output1, N), plus<>()),
106+
reduction(span<T, N>(Output2, N), Identity, BinaryOperation()),
107+
[=](auto It, auto &Reducer1, auto &Reducer2) {
108+
size_t Index = getLinearId(Rng, It) % N;
109+
Reducer1[Index]++;
110+
Reducer2[Index].combine(Value);
111+
});
112+
}).wait();
113+
114+
size_t Size = getLinearSize(Rng);
115+
bool Passed = true;
116+
// Span1
117+
{
118+
int Expected = 0;
119+
int ExpectedRemainder;
120+
for (size_t I = 0; I < Size; I += N) {
121+
ExpectedRemainder = Expected;
122+
Expected += 1;
123+
}
124+
125+
for (size_t I = 0; I < N; ++I) {
126+
if (I < Size % N) {
127+
Passed &= (Output1[I] == Expected);
128+
} else {
129+
Passed &= (Output1[I] == ExpectedRemainder);
130+
}
131+
}
132+
}
133+
134+
// Span2
135+
{
136+
T Expected = Identity;
137+
T ExpectedRemainder;
138+
for (size_t I = 0; I < Size; I += N) {
139+
ExpectedRemainder = Expected;
140+
Expected = BinaryOperation()(Expected, Value);
141+
}
142+
143+
for (size_t I = 0; I < N; ++I) {
144+
if (I < Size % N) {
145+
Passed &= (Output2[I] == Expected);
146+
} else {
147+
Passed &= (Output2[I] == ExpectedRemainder);
148+
}
149+
}
150+
}
151+
152+
free(Output2, Q);
153+
free(Output1, Q);
154+
NumErrors += (Passed) ? 0 : 1;
155+
}
156+
157+
// Minimal user-defined type used to exercise reductions over non-arithmetic
// element types. Equality compares the single payload field.
struct CustomType {
  int x;
  bool operator==(const CustomType &other) const { return x == other.x; }
};

// Addition-like combiner over CustomType, usable as a reduction operator.
struct CustomBinaryOperation {
  CustomType operator()(const CustomType &a, const CustomType &b) const {
    return CustomType{a.x + b.x};
  }
};
167+
168+
template <size_t N, typename T, typename BinaryOperation, typename Range>
169+
void test(queue Q, Range Rng, T Identity, T Value) {
170+
test1<N, T, BinaryOperation, Range>(Q, Rng, Identity, Value);
171+
test2<N, T, BinaryOperation, Range>(Q, Rng, Identity, Value);
172+
}
173+
174+
int main() {
175+
queue Q;
176+
177+
// Tests for small spans that can be privatized efficiently
178+
// Each combination tests a different sycl::reduction implementation
179+
// TODO: Enable range<> tests once parallel_for accepts pack
180+
/*test<16, int, std::plus<int>, sycl::range<1>>(Q, 24, 0, 1);
181+
test<16, float, std::plus<float>, sycl::range<1>>(Q, 24, 0, 1);
182+
test<16, int, std::multiplies<int>, sycl::range<1>>(Q, 24, 1, 2);
183+
test<16, CustomType, CustomBinaryOperation, sycl::range<1>>(Q, 24,
184+
CustomType{0}, CustomType{1});*/
185+
186+
test<16, int, std::plus<int>, sycl::nd_range<1>>(Q, {24, 8}, 0, 1);
187+
test<16, float, std::plus<float>, sycl::nd_range<1>>(Q, {24, 8}, 0, 1);
188+
test<16, int, std::multiplies<int>, sycl::nd_range<1>>(Q, {24, 8}, 1, 2);
189+
test<16, int, std::bit_or<int>, sycl::nd_range<1>>(Q, {24, 8}, 0, 1);
190+
test<16, CustomType, CustomBinaryOperation, sycl::nd_range<1>>(
191+
Q, {24, 8}, CustomType{0}, CustomType{1});
192+
193+
return NumErrors;
194+
}

0 commit comments

Comments
 (0)