Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit a795f30

Browse files
[SYCL] Add test case for ext::oneapi::experimental::joint_reduce (#1452)
This patch adds a new test that exercises user-defined reductions when the WG size is bigger than the input data size.
1 parent 8ee6b8a commit a795f30

File tree

1 file changed

+80
-0
lines changed

1 file changed

+80
-0
lines changed
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
5+
#include <iostream>
6+
#include <numeric>
7+
8+
#include <sycl/ext/oneapi/experimental/user_defined_reductions.hpp>
9+
#include <sycl/sycl.hpp>
10+
11+
// 1. Allocate a buffer of 16 elements where the first 8 elements are filled with 1,
12+
// ..., 8 and the second 8 elements filled with 0.
13+
// 2. Submit a kernel with one wg of size 16.
14+
// 3. Invoke joint_reduce with first == start of the buffer and last == start
15+
// of the buffer + 8 elems.
16+
// 4. The result should be equal to 1, as 1 is the minimum number in the
17+
// selection.
18+
19+
// Binary function object that returns the smaller of its two arguments,
// using std::less<T> for the comparison. When neither argument compares less
// than the other, the right-hand operand is returned.
//
// NOTE(review): the default argument T = void cannot actually be instantiated,
// because operator() would then take `const void &` parameters; callers are
// expected to name T explicitly (e.g. UserDefinedMinimum<int>).
template <typename T = void> struct UserDefinedMinimum {
  T operator()(const T &lhs, const T &rhs) const {
    return std::less<T>()(lhs, rhs) ? lhs : rhs;
  }
};
24+
25+
// Number of leading input elements handed to joint_reduce; also the number of
// elements main() fills with 1, 2, ... before zero-filling the remainder.
constexpr int segment_size = 8;
26+
27+
// Lets the test refer to queue, buffer, accessor, handler, nd_range and
// nd_item without the sycl:: qualifier.
using namespace sycl;
28+
29+
template <typename InputContainer, typename OutputContainer,
30+
class BinaryOperation>
31+
void test(queue q, InputContainer input, OutputContainer output,
32+
BinaryOperation binary_op, size_t workgroup_size,
33+
typename OutputContainer::value_type init) {
34+
using InputT = typename InputContainer::value_type;
35+
using OutputT = typename OutputContainer::value_type;
36+
constexpr size_t N = input.size();
37+
{
38+
buffer<InputT> in_buf(input.data(), input.size());
39+
buffer<OutputT> out_buf(output.data(), output.size());
40+
41+
q.submit([&](handler &cgh) {
42+
accessor in{in_buf, cgh, sycl::read_only};
43+
accessor out{out_buf, cgh, sycl::write_only, sycl::no_init};
44+
45+
size_t temp_memory_size = workgroup_size * sizeof(InputT);
46+
auto scratch = sycl::local_accessor<std::byte, 1>(temp_memory_size, cgh);
47+
cgh.parallel_for(
48+
nd_range<1>(workgroup_size, workgroup_size), [=](nd_item<1> it) {
49+
InputT *segment_begin = in.get_pointer();
50+
InputT *segment_end = in.get_pointer() + segment_size;
51+
auto handle =
52+
sycl::ext::oneapi::experimental::group_with_scratchpad(
53+
it.get_group(), sycl::span(&scratch[0], temp_memory_size));
54+
OutputT group_aggregate =
55+
sycl::ext::oneapi::experimental::joint_reduce(
56+
handle, segment_begin, segment_end, init, binary_op);
57+
if (it.get_local_linear_id() == 0) {
58+
out[it.get_group_linear_id()] = group_aggregate;
59+
}
60+
});
61+
});
62+
q.wait();
63+
}
64+
assert(output[0] == 1);
65+
}
66+
67+
// Builds a 16-element input whose first `segment_size` elements are
// 1, 2, ..., segment_size and whose remaining elements are 0, then runs the
// reduction test once with sycl::minimum and once with UserDefinedMinimum,
// using a work-group size (N) larger than the reduced segment.
int main() {
  queue q;

  constexpr int N = 16;
  std::array<int, N> input;
  std::iota(input.begin(), input.begin() + segment_size, 1);
  std::fill(input.begin() + segment_size, input.end(), 0);
  // Value-initialized because test() takes this array by value; copying a
  // default-initialized array would read indeterminate ints.
  std::array<int, 1> output{};

  // Arguments: queue, input array, output array, binary_op, WG size, init.
  test(q, input, output, sycl::minimum<int>{}, N, INT_MAX);
  test(q, input, output, UserDefinedMinimum<int>{}, N, INT_MAX);
  return 0;
}

0 commit comments

Comments
 (0)