Skip to content

Commit f62def0

Browse files
authored
[SYCL][Fusion][Tests] Test group functions, group algorithms and reduction (#12402)
Add tests for fusing kernels with the same ND-range using group functions, group algorithms and reductions. --------- Signed-off-by: Victor Perez <[email protected]>
1 parent 402e228 commit f62def0

File tree

7 files changed

+513
-0
lines changed

7 files changed

+513
-0
lines changed
Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
// RUN: %{build} -fsycl-embed-ir -I . -o %t.out
2+
// RUN: %{run} %t.out
3+
4+
#include "../helpers.hpp"
5+
#include "support.h"
6+
#include <algorithm>
7+
#include <cassert>
8+
#include <iostream>
9+
#include <numeric>
10+
#include <sycl/sycl.hpp>
11+
12+
// COM: Check all_of works with kernel fusion.
13+
14+
using namespace sycl;
15+
16+
template <class Predicate> class all_of_kernel;
17+
18+
struct IsEven {
19+
bool operator()(int i) const { return (i % 2) == 0; }
20+
};
21+
22+
template <typename InputContainer, typename OutputContainer, class Predicate>
23+
void test(queue q, InputContainer input, OutputContainer output,
24+
Predicate pred) {
25+
typedef class all_of_kernel<Predicate> kernel_name;
26+
size_t N = input.size();
27+
size_t G = 64;
28+
{
29+
buffer<int> in_buf(input.data(), input.size());
30+
buffer<bool> out_buf(output.data(), output.size());
31+
32+
ext::codeplay::experimental::fusion_wrapper fw{q};
33+
fw.start_fusion();
34+
35+
iota(q, in_buf, 0);
36+
37+
q.submit([&](handler &cgh) {
38+
accessor in{in_buf, cgh, sycl::read_only};
39+
accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
40+
cgh.parallel_for<kernel_name>(nd_range<1>(G, G), [=](nd_item<1> it) {
41+
group<1> g = it.get_group();
42+
int lid = it.get_local_id(0);
43+
out[0] = all_of_group(g, pred(in[lid]));
44+
out[1] = all_of_group(g, in[lid], pred);
45+
out[2] = joint_all_of(
46+
g, in.template get_multi_ptr<access::decorated::no>(),
47+
in.template get_multi_ptr<access::decorated::no>() + N, pred);
48+
});
49+
});
50+
51+
complete_fusion_with_check(
52+
fw, ext::codeplay::experimental::property::no_barriers{});
53+
}
54+
bool expected = std::all_of(input.begin(), input.end(), pred);
55+
assert(output[0] == expected);
56+
assert(output[1] == expected);
57+
assert(output[2] == expected);
58+
}
59+
60+
int main() {
61+
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
62+
if (!isSupportedDevice(q.get_device())) {
63+
std::cout << "Skipping test\n";
64+
return 0;
65+
}
66+
67+
constexpr int N = 128;
68+
std::array<int, N> input;
69+
std::array<bool, 3> output;
70+
std::fill(output.begin(), output.end(), false);
71+
72+
test(q, input, output, IsEven());
73+
74+
std::cout << "Test passed." << std::endl;
75+
}
Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
// RUN: %{build} -fsycl-embed-ir -I . -o %t.out
2+
// RUN: %{run} %t.out
3+
4+
#include "../../helpers.hpp"
5+
#include "../helpers.hpp"
6+
#include "support.h"
7+
#include <algorithm>
8+
#include <cassert>
9+
#include <iostream>
10+
#include <limits>
11+
#include <numeric>
12+
#include <sycl/sycl.hpp>
13+
#include <vector>
14+
using namespace sycl;
15+
16+
// COM: Check exclusive_scan works with fusion
17+
18+
template <class SpecializationKernelName, int TestNumber>
19+
class exclusive_scan_kernel;
20+
21+
template <typename BinaryOperation> class K0;
22+
template <typename BinaryOperation> class K1;
23+
template <typename BinaryOperation> class K2;
24+
template <typename BinaryOperation> class K3;
25+
26+
template <typename InputContainer, typename OutputContainer,
27+
class BinaryOperation>
28+
void test(queue q, InputContainer input, OutputContainer output,
29+
BinaryOperation binary_op,
30+
typename OutputContainer::value_type identity) {
31+
typedef typename InputContainer::value_type InputT;
32+
typedef typename OutputContainer::value_type OutputT;
33+
OutputT init = 42;
34+
size_t N = input.size();
35+
size_t G = 64;
36+
std::vector<OutputT> expected(N);
37+
{
38+
buffer<InputT> in_buf(input.data(), input.size());
39+
buffer<OutputT> out_buf(output.data(), output.size());
40+
ext::codeplay::experimental::fusion_wrapper fw{q};
41+
fw.start_fusion();
42+
43+
iota(q, in_buf, 0);
44+
45+
q.submit([&](handler &cgh) {
46+
accessor in{in_buf, cgh, sycl::read_only};
47+
accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
48+
cgh.parallel_for<K0<BinaryOperation>>(
49+
nd_range<1>(G, G), [=](nd_item<1> it) {
50+
group<1> g = it.get_group();
51+
int lid = it.get_local_id(0);
52+
out[lid] = exclusive_scan_over_group(g, in[lid], binary_op);
53+
});
54+
});
55+
56+
complete_fusion_with_check(
57+
fw, ext::codeplay::experimental::property::no_barriers{});
58+
}
59+
emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(),
60+
identity, binary_op);
61+
assert(std::equal(output.begin(), output.begin() + G, expected.begin()));
62+
63+
// Fill to test fusion again
64+
std::fill(input.begin(), input.end(), 0);
65+
66+
{
67+
buffer<InputT> in_buf(input.data(), input.size());
68+
buffer<OutputT> out_buf(output.data(), output.size());
69+
70+
ext::codeplay::experimental::fusion_wrapper fw{q};
71+
fw.start_fusion();
72+
73+
iota(q, in_buf, 0);
74+
75+
q.submit([&](handler &cgh) {
76+
accessor in{in_buf, cgh, sycl::read_only};
77+
accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
78+
cgh.parallel_for<K1<BinaryOperation>>(
79+
nd_range<1>(G, G), [=](nd_item<1> it) {
80+
group<1> g = it.get_group();
81+
int lid = it.get_local_id(0);
82+
out[lid] = exclusive_scan_over_group(g, in[lid], init, binary_op);
83+
});
84+
});
85+
86+
complete_fusion_with_check(
87+
fw, ext::codeplay::experimental::property::no_barriers{});
88+
}
89+
emu::exclusive_scan(input.begin(), input.begin() + G, expected.begin(), init,
90+
binary_op);
91+
assert(std::equal(output.begin(), output.begin() + G, expected.begin()));
92+
93+
// Fill to test fusion again
94+
std::fill(input.begin(), input.end(), 0);
95+
96+
{
97+
buffer<InputT> in_buf(input.data(), input.size());
98+
buffer<OutputT> out_buf(output.data(), output.size());
99+
100+
ext::codeplay::experimental::fusion_wrapper fw{q};
101+
fw.start_fusion();
102+
103+
iota(q, in_buf, 0);
104+
105+
q.submit([&](handler &cgh) {
106+
accessor in{in_buf, cgh, sycl::read_only};
107+
accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
108+
cgh.parallel_for<K2<BinaryOperation>>(
109+
nd_range<1>(G, G), [=](nd_item<1> it) {
110+
group<1> g = it.get_group();
111+
joint_exclusive_scan(
112+
g, in.template get_multi_ptr<access::decorated::no>(),
113+
in.template get_multi_ptr<access::decorated::no>() + N,
114+
out.template get_multi_ptr<access::decorated::no>(), binary_op);
115+
});
116+
});
117+
complete_fusion_with_check(
118+
fw, ext::codeplay::experimental::property::no_barriers{});
119+
}
120+
emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(),
121+
identity, binary_op);
122+
assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
123+
124+
// Fill to test fusion again
125+
std::fill(input.begin(), input.end(), 0);
126+
127+
{
128+
buffer<InputT> in_buf(input.data(), input.size());
129+
buffer<OutputT> out_buf(output.data(), output.size());
130+
ext::codeplay::experimental::fusion_wrapper fw{q};
131+
fw.start_fusion();
132+
133+
iota(q, in_buf, 0);
134+
135+
q.submit([&](handler &cgh) {
136+
accessor in{in_buf, cgh, sycl::read_only};
137+
accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
138+
cgh.parallel_for<K3<BinaryOperation>>(
139+
nd_range<1>(G, G), [=](nd_item<1> it) {
140+
group<1> g = it.get_group();
141+
joint_exclusive_scan(
142+
g, in.template get_multi_ptr<access::decorated::no>(),
143+
in.template get_multi_ptr<access::decorated::no>() + N,
144+
out.template get_multi_ptr<access::decorated::no>(), init,
145+
binary_op);
146+
});
147+
});
148+
complete_fusion_with_check(
149+
fw, ext::codeplay::experimental::property::no_barriers{});
150+
}
151+
emu::exclusive_scan(input.begin(), input.begin() + N, expected.begin(), init,
152+
binary_op);
153+
assert(std::equal(output.begin(), output.begin() + N, expected.begin()));
154+
}
155+
156+
int main() {
157+
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
158+
if (!isSupportedDevice(q.get_device())) {
159+
std::cout << "Skipping test\n";
160+
return 0;
161+
}
162+
163+
constexpr int N = 128;
164+
std::array<int, N> input;
165+
std::array<int, N> output;
166+
std::fill(output.begin(), output.end(), 0);
167+
168+
test(q, input, output, sycl::plus<>(), 0);
169+
test(q, input, output, sycl::minimum<>(), std::numeric_limits<int>::max());
170+
test(q, input, output, sycl::maximum<>(), std::numeric_limits<int>::lowest());
171+
test(q, input, output, sycl::plus<int>(), 0);
172+
test(q, input, output, sycl::minimum<int>(), std::numeric_limits<int>::max());
173+
test(q, input, output, sycl::maximum<int>(),
174+
std::numeric_limits<int>::lowest());
175+
test(q, input, output, sycl::multiplies<int>(), 1);
176+
test(q, input, output, sycl::bit_or<int>(), 0);
177+
test(q, input, output, sycl::bit_xor<int>(), 0);
178+
test(q, input, output, sycl::bit_and<int>(), ~0);
179+
180+
std::cout << "Test passed." << std::endl;
181+
}
Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,45 @@
1+
#include <iostream>
2+
#include <sycl/sycl.hpp>
3+
4+
using namespace sycl;
5+
using namespace sycl::ext::oneapi;
6+
7+
bool isSupportedDevice(device D) {
8+
std::string PlatformName =
9+
D.get_platform().get_info<sycl::info::platform::name>();
10+
if (PlatformName.find("CUDA") != std::string::npos)
11+
return true;
12+
13+
if (PlatformName.find("Level-Zero") != std::string::npos)
14+
return true;
15+
16+
if (PlatformName.find("OpenCL") != std::string::npos) {
17+
std::string Version = D.get_info<sycl::info::device::version>();
18+
19+
// Group collectives are mandatory in OpenCL 2.0 but optional in 3.0.
20+
Version = Version.substr(7, 3);
21+
if (Version >= "2.0" && Version < "3.0")
22+
return true;
23+
}
24+
25+
return false;
26+
}
27+
28+
template <typename T, typename S> bool equal(const T &x, const S &y) {
29+
// vec equal returns a vector of which components were equal
30+
if constexpr (sycl::detail::is_vec<T>::value) {
31+
for (int i = 0; i < x.size(); ++i)
32+
if (x[i] != y[i])
33+
return false;
34+
return true;
35+
} else
36+
return x == y;
37+
}
38+
39+
template <typename T1, typename T2>
40+
bool ranges_equal(T1 begin1, T1 end1, T2 begin2) {
41+
for (; begin1 != end1; ++begin1, ++begin2)
42+
if (!equal(*begin1, *begin2))
43+
return false;
44+
return true;
45+
}
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
// RUN: %{build} -fsycl-embed-ir -o %t.out
2+
// RUN: %{run} %t.out
3+
4+
// Test complete_fusion preserves barriers by launching a kernel that requires a
5+
// barrier for correctness.
6+
7+
#include <sycl/sycl.hpp>
8+
9+
#include "../helpers.hpp"
10+
11+
using namespace sycl;
12+
13+
class Kernel;
14+
15+
int main() {
16+
constexpr size_t dataSize = 512;
17+
constexpr size_t localSize = 64;
18+
std::array<int, dataSize> in;
19+
std::array<int, dataSize> out;
20+
out.fill(0);
21+
22+
queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
23+
{
24+
buffer<int> buff_in{in};
25+
buffer<int> buff_out{out};
26+
27+
ext::codeplay::experimental::fusion_wrapper fw{q};
28+
fw.start_fusion();
29+
30+
assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
31+
32+
iota(q, buff_in, 0);
33+
34+
// Needed implicit group barrier
35+
36+
q.submit([&](handler &cgh) {
37+
accessor in(buff_in, cgh, read_only);
38+
accessor out(buff_out, cgh, write_only, no_init);
39+
local_accessor<int> lacc(localSize, cgh);
40+
cgh.parallel_for<Kernel>(
41+
nd_range<1>{{dataSize}, {localSize}}, [=](nd_item<1> i) {
42+
auto group = i.get_group();
43+
if (i.get_local_id() == 0) {
44+
auto begin = in.begin() + static_cast<int64_t>(
45+
localSize * group.get_group_id(0));
46+
auto end = begin + localSize;
47+
std::copy(begin, end, lacc.begin());
48+
}
49+
// Test following barrier is preserved
50+
group_barrier(i.get_group());
51+
out[i.get_global_id()] = lacc[i.get_local_id()];
52+
});
53+
});
54+
55+
complete_fusion_with_check(fw);
56+
}
57+
58+
// Check the results
59+
for (int i = 0, end = dataSize; i < end; ++i) {
60+
assert(out[i] == i && "Computation error");
61+
}
62+
63+
return 0;
64+
}

0 commit comments

Comments
 (0)