Skip to content

Commit a0fdbde

Browse files
committed
[SYCL][Fusion][Tests] Test group functions, group algoriths and reductions
Add tests for fusing kernels with the same ND-range using group functions, group algorithms and reductions. Signed-off-by: Victor Perez <[email protected]>
1 parent 89327e0 commit a0fdbde

File tree

7 files changed

+515
-0
lines changed

7 files changed

+515
-0
lines changed
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
// RUN: %{build} -fsycl-embed-ir -I . -o %t.out
2+
// RUN: %{run} %t.out
3+
4+
#include "../helpers.hpp"
5+
#include "support.h"
6+
#include <algorithm>
7+
#include <cassert>
8+
#include <iostream>
9+
#include <numeric>
10+
#include <sycl/sycl.hpp>
11+
12+
// COM: Check all_of works with kernel fusion.
13+
14+
using namespace sycl;
15+
16+
template <class Predicate> class all_of_kernel;
17+
18+
struct IsEven {
19+
bool operator()(int i) const { return (i % 2) == 0; }
20+
};
21+
22+
template <typename InputContainer, typename OutputContainer, class Predicate>
23+
void test(queue q, InputContainer input, OutputContainer output,
24+
Predicate pred) {
25+
typedef class all_of_kernel<Predicate> kernel_name;
26+
size_t N = input.size();
27+
size_t G = 64;
28+
{
29+
buffer<int> in_buf(input.data(), input.size());
30+
buffer<bool> out_buf(output.data(), output.size());
31+
32+
ext::codeplay::experimental::fusion_wrapper fw{q};
33+
fw.start_fusion();
34+
35+
iota(q, in_buf, 0);
36+
37+
q.submit([&](handler &cgh) {
38+
accessor in{in_buf, cgh, sycl::read_only};
39+
accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
40+
cgh.parallel_for<kernel_name>(nd_range<1>(G, G), [=](nd_item<1> it) {
41+
group<1> g = it.get_group();
42+
int lid = it.get_local_id(0);
43+
out[0] = all_of_group(g, pred(in[lid]));
44+
out[1] = all_of_group(g, in[lid], pred);
45+
out[2] = joint_all_of(
46+
g, in.template get_multi_ptr<access::decorated::no>(),
47+
in.template get_multi_ptr<access::decorated::no>() + N, pred);
48+
});
49+
});
50+
51+
complete_fusion_with_check(
52+
fw, ext::codeplay::experimental::property::no_barriers{});
53+
}
54+
bool expected = std::all_of(input.begin(), input.end(), pred);
55+
assert(output[0] == expected);
56+
assert(output[1] == expected);
57+
assert(output[2] == expected);
58+
}
59+
60+
int main() {
61+
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
62+
if (!isSupportedDevice(q.get_device())) {
63+
std::cout << "Skipping test\n";
64+
return 0;
65+
}
66+
67+
constexpr int N = 128;
68+
std::array<int, N> input;
69+
std::array<bool, 3> output;
70+
std::fill(output.begin(), output.end(), false);
71+
72+
test(q, input, output, IsEven());
73+
74+
std::cout << "Test passed." << std::endl;
75+
}
Lines changed: 183 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,183 @@
1+
// RUN: %{build} -fsycl-embed-ir -I . -o %t.out
2+
// RUN: %{run} %t.out
3+
4+
#include "../../helpers.hpp"
5+
#include "../helpers.hpp"
6+
#include "support.h"
7+
#include <algorithm>
8+
#include <cassert>
9+
#include <iostream>
10+
#include <limits>
11+
#include <numeric>
12+
#include <sycl/sycl.hpp>
13+
#include <vector>
14+
using namespace sycl;
15+
16+
// COM: Check exclusive_scan works with fusion
17+
18+
template <class SpecializationKernelName, int TestNumber>
19+
class exclusive_scan_kernel;
20+
21+
template <typename SpecializationKernelName, typename InputContainer,
22+
typename OutputContainer, class BinaryOperation>
23+
void test(queue q, InputContainer input, OutputContainer output,
24+
BinaryOperation binary_op,
25+
typename OutputContainer::value_type identity) {
26+
typedef typename InputContainer::value_type InputT;
27+
typedef typename OutputContainer::value_type OutputT;
28+
typedef class exclusive_scan_kernel<SpecializationKernelName, 0> kernel_name0;
29+
typedef class exclusive_scan_kernel<SpecializationKernelName, 1> kernel_name1;
30+
typedef class exclusive_scan_kernel<SpecializationKernelName, 2> kernel_name2;
31+
typedef class exclusive_scan_kernel<SpecializationKernelName, 3> kernel_name3;
32+
OutputT init = 42;
33+
size_t N = input.size();
34+
size_t G = 64;
35+
std::vector<OutputT> expected(N);
36+
{
37+
buffer<InputT> in_buf(input.data(), input.size());
38+
buffer<OutputT> out_buf(output.data(), output.size());
39+
ext::codeplay::experimental::fusion_wrapper fw{q};
40+
fw.start_fusion();
41+
42+
iota(q, in_buf, 0);
43+
44+
q.submit([&](handler &cgh) {
45+
accessor in{in_buf, cgh, sycl::read_only};
46+
accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
47+
cgh.parallel_for<kernel_name0>(nd_range<1>(G, G), [=](nd_item<1> it) {
48+
group<1> g = it.get_group();
49+
int lid = it.get_local_id(0);
50+
out[lid] = exclusive_scan_over_group(g, in[lid], binary_op);
51+
});
52+
});
53+
54+
complete_fusion_with_check(
55+
fw, ext::codeplay::experimental::property::no_barriers{});
56+
}
57+
emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(),
58+
identity, binary_op);
59+
assert(std::equal(output.begin(), output.begin() + G, expected.begin()));
60+
61+
// Fill to test fusion again
62+
std::fill(input.begin(), input.end(), 0);
63+
64+
{
65+
buffer<InputT> in_buf(input.data(), input.size());
66+
buffer<OutputT> out_buf(output.data(), output.size());
67+
68+
ext::codeplay::experimental::fusion_wrapper fw{q};
69+
fw.start_fusion();
70+
71+
iota(q, in_buf, 0);
72+
73+
q.submit([&](handler &cgh) {
74+
accessor in{in_buf, cgh, sycl::read_only};
75+
accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
76+
cgh.parallel_for<kernel_name1>(nd_range<1>(G, G), [=](nd_item<1> it) {
77+
group<1> g = it.get_group();
78+
int lid = it.get_local_id(0);
79+
out[lid] = exclusive_scan_over_group(g, in[lid], init, binary_op);
80+
});
81+
});
82+
83+
complete_fusion_with_check(
84+
fw, ext::codeplay::experimental::property::no_barriers{});
85+
}
86+
emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(), init,
87+
binary_op);
88+
assert(std::equal(output.begin(), output.begin() + G, expected.begin()));
89+
90+
// Fill to test fusion again
91+
std::fill(input.begin(), input.end(), 0);
92+
93+
{
94+
buffer<InputT> in_buf(input.data(), input.size());
95+
buffer<OutputT> out_buf(output.data(), output.size());
96+
97+
ext::codeplay::experimental::fusion_wrapper fw{q};
98+
fw.start_fusion();
99+
100+
iota(q, in_buf, 0);
101+
102+
q.submit([&](handler &cgh) {
103+
accessor in{in_buf, cgh, sycl::read_only};
104+
accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
105+
cgh.parallel_for<kernel_name2>(nd_range<1>(G, G), [=](nd_item<1> it) {
106+
group<1> g = it.get_group();
107+
joint_exclusive_scan(
108+
g, in.template get_multi_ptr<access::decorated::no>(),
109+
in.template get_multi_ptr<access::decorated::no>() + N,
110+
out.template get_multi_ptr<access::decorated::no>(), binary_op);
111+
});
112+
});
113+
complete_fusion_with_check(
114+
fw, ext::codeplay::experimental::property::no_barriers{});
115+
}
116+
emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(),
117+
identity, binary_op);
118+
assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
119+
120+
// Fill to test fusion again
121+
std::fill(input.begin(), input.end(), 0);
122+
123+
{
124+
buffer<InputT> in_buf(input.data(), input.size());
125+
buffer<OutputT> out_buf(output.data(), output.size());
126+
ext::codeplay::experimental::fusion_wrapper fw{q};
127+
fw.start_fusion();
128+
129+
iota(q, in_buf, 0);
130+
131+
q.submit([&](handler &cgh) {
132+
accessor in{in_buf, cgh, sycl::read_only};
133+
accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
134+
cgh.parallel_for<kernel_name3>(nd_range<1>(G, G), [=](nd_item<1> it) {
135+
group<1> g = it.get_group();
136+
joint_exclusive_scan(
137+
g, in.template get_multi_ptr<access::decorated::no>(),
138+
in.template get_multi_ptr<access::decorated::no>() + N,
139+
out.template get_multi_ptr<access::decorated::no>(), init,
140+
binary_op);
141+
});
142+
});
143+
complete_fusion_with_check(
144+
fw, ext::codeplay::experimental::property::no_barriers{});
145+
}
146+
emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(), init,
147+
binary_op);
148+
assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
149+
}
150+
151+
int main() {
152+
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
153+
if (!isSupportedDevice(q.get_device())) {
154+
std::cout << "Skipping test\n";
155+
return 0;
156+
}
157+
158+
constexpr int N = 128;
159+
std::array<int, N> input;
160+
std::array<int, N> output;
161+
std::fill(output.begin(), output.end(), 0);
162+
163+
test<class KernelNamePlusV>(q, input, output, sycl::plus<>(), 0);
164+
test<class KernelNameMinimumV>(q, input, output, sycl::minimum<>(),
165+
std::numeric_limits<int>::max());
166+
test<class KernelNameMaximumV>(q, input, output, sycl::maximum<>(),
167+
std::numeric_limits<int>::lowest());
168+
169+
test<class KernelNamePlusI>(q, input, output, sycl::plus<int>(), 0);
170+
test<class KernelNameMinimumI>(q, input, output, sycl::minimum<int>(),
171+
std::numeric_limits<int>::max());
172+
test<class KernelNameMaximumI>(q, input, output, sycl::maximum<int>(),
173+
std::numeric_limits<int>::lowest());
174+
test<class KernelName_VzAPutpBRRJrQPB>(q, input, output,
175+
sycl::multiplies<int>(), 1);
176+
test<class KernelName_UXdGbr>(q, input, output, sycl::bit_or<int>(), 0);
177+
test<class KernelName_saYaodNyJknrPW>(q, input, output, sycl::bit_xor<int>(),
178+
0);
179+
test<class KernelName_GPcuAlvAOjrDyP>(q, input, output, sycl::bit_and<int>(),
180+
~0);
181+
182+
std::cout << "Test passed." << std::endl;
183+
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#include <iostream>
2+
#include <sycl/sycl.hpp>
3+
4+
using namespace sycl;
5+
using namespace sycl::ext::oneapi;
6+
7+
bool isSupportedDevice(device D) {
8+
std::string PlatformName =
9+
D.get_platform().get_info<sycl::info::platform::name>();
10+
if (PlatformName.find("CUDA") != std::string::npos)
11+
return true;
12+
13+
if (PlatformName.find("Level-Zero") != std::string::npos)
14+
return true;
15+
16+
if (PlatformName.find("OpenCL") != std::string::npos) {
17+
std::string Version = D.get_info<sycl::info::device::version>();
18+
19+
// Group collectives are mandatory in OpenCL 2.0 but optional in 3.0.
20+
Version = Version.substr(7, 3);
21+
if (Version >= "2.0" && Version < "3.0")
22+
return true;
23+
}
24+
25+
return false;
26+
}
27+
28+
template <typename T, typename S> bool equal(const T &x, const S &y) {
29+
// vec equal returns a vector of which components were equal
30+
if constexpr (sycl::detail::is_vec<T>::value) {
31+
for (int i = 0; i < x.size(); ++i)
32+
if (x[i] != y[i])
33+
return false;
34+
return true;
35+
} else
36+
return x == y;
37+
}
38+
39+
template <typename T1, typename T2>
40+
bool ranges_equal(T1 begin1, T1 end1, T2 begin2) {
41+
for (; begin1 != end1; ++begin1, ++begin2)
42+
if (!equal(*begin1, *begin2))
43+
return false;
44+
return true;
45+
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
// RUN: %{build} -fsycl-embed-ir -o %t.out
2+
// RUN: %{run} %t.out
3+
4+
// Test complete_fusion preserves barriers by launching a kernel that requires a
5+
// barrier for correctness.
6+
7+
#include <sycl/sycl.hpp>
8+
9+
#include "../helpers.hpp"
10+
11+
using namespace sycl;
12+
13+
class Kernel;
14+
15+
int main() {
16+
constexpr size_t dataSize = 512;
17+
constexpr size_t localSize = 64;
18+
std::array<int, dataSize> in;
19+
std::array<int, dataSize> out;
20+
out.fill(0);
21+
22+
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
23+
{
24+
buffer<int> buff_in{in};
25+
buffer<int> buff_out{out};
26+
27+
ext::codeplay::experimental::fusion_wrapper fw{q};
28+
fw.start_fusion();
29+
30+
assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
31+
32+
iota(q, buff_in, 0);
33+
34+
// Needed implicit group barrier
35+
36+
q.submit([&](handler &cgh) {
37+
accessor in(buff_in, cgh, read_only);
38+
accessor out(buff_out, cgh, write_only, no_init);
39+
local_accessor<int> lacc(localSize, cgh);
40+
cgh.parallel_for<Kernel>(
41+
nd_range<1>{{dataSize}, {localSize}}, [=](nd_item<1> i) {
42+
auto group = i.get_group();
43+
if (i.get_local_id() == 0) {
44+
auto begin = in.begin() + static_cast<int64_t>(
45+
localSize * group.get_group_id(0));
46+
auto end = begin + localSize;
47+
std::copy(begin, end, lacc.begin());
48+
}
49+
// Test following barrier is preserved
50+
group_barrier(i.get_group());
51+
out[i.get_global_id()] = lacc[i.get_local_id()];
52+
});
53+
});
54+
55+
complete_fusion_with_check(fw);
56+
}
57+
58+
// Check the results
59+
for (int i = 0, end = dataSize; i < end; ++i) {
60+
assert(out[i] == i && "Computation error");
61+
}
62+
63+
return 0;
64+
}

0 commit comments

Comments
 (0)