Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

Commit a9bc1ad

Browse files
committed
[SYCL] Add lit tests for reduction + range (#4101)
Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent 1226e5c commit a9bc1ad

11 files changed

+670
-3
lines changed
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
5+
// TODO: accelerator may not support atomics required by the current
6+
// implementation. Enable testing when implementation is fixed.
7+
// RUNx: %ACC_RUN_PLACEHOLDER %t.out
8+
9+
// This test performs basic checks of parallel_for(range<1>, reduction, func)
10+
// with reductions initialized with 0-dimensional discard_write accessor
11+
// accessing 1 element buffer.
12+
13+
#include "reduction_range_scalar.hpp"
14+
15+
using namespace cl::sycl;
16+
17+
// Per-file test driver: runs one reduction check over a 1-D range of NWItems
// elements using access::mode::discard_write.
//   Name     - forwarded to test<> as its first template argument; callers in
//              main() pass a distinct class for every call.
//   Identity - identity value forwarded to test().
//   Init     - initial value forwarded to test().
//   BOp      - binary operation forwarded to test().
//   NWItems  - extent of the range<1> passed to test().
// NOTE(review): test<> comes from reduction_range_scalar.hpp (not visible
// here). Per the file header the trailing `0` presumably selects a
// 0-dimensional accessor; the meaning of `false` should be confirmed there.
template <typename Name, typename T, class BinaryOperation>
18+
void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) {
19+
constexpr access::mode DW = access::mode::discard_write;
20+
test<Name, false, DW, T, 0>(Q, Identity, Init, BOp, range<1>{NWItems});
21+
}
22+
23+
// Entry point: creates a default-constructed queue, prints its device info,
// and calls tests() for a series of (type, identity, init, operation, size)
// combinations. Several sizes are derived from the device's
// max_work_group_size. main() does not inspect any results itself; it prints
// "Test passed" and returns 0 once the last tests() call has returned.
int main() {
24+
queue Q;
25+
printDeviceInfo(Q);
26+
size_t MaxWGSize =
27+
Q.get_device().get_info<info::device::max_work_group_size>();
28+
29+
// Fast-reduce and Fast-atomics. Try various range types/sizes.
30+
tests<class A1, int>(Q, 0, 99, std::plus<>{}, 1);
31+
tests<class A2, int>(Q, 0, 99, std::plus<>{}, 2);
32+
tests<class A3, int64_t>(Q, 0, 99, std::plus<>{}, 7);
33+
tests<class A4, int64_t>(Q, 0, 99, std::plus<>{}, 64);
34+
tests<class A5, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2);
35+
tests<class A6, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5);
36+
37+
// Try various types & ranges.
38+
tests<class B1, int>(Q, ~0, ~0, std::bit_and<>{}, 8);
39+
tests<class B2, int>(Q, 0, 0x12340000, std::bit_xor<>{}, 16);
40+
tests<class B3, int>(Q, 0, 0x3400, std::bit_or<>{}, MaxWGSize * 4);
41+
tests<class B4, int>(Q, 1, 2, std::multiplies<>{}, 256);
42+
tests<class B5, int>(Q, 1, 3, std::multiplies<>{}, MaxWGSize + 1);
43+
tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
44+
ext::oneapi::minimum<>{}, MaxWGSize * 2);
45+
tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
46+
ext::oneapi::maximum<>{}, 8);
47+
tests<class B8, uint64_t>(Q, 1, 99, std::multiplies<>{}, 37);
48+
49+
// Check with CUSTOM type.
50+
using CV = CustomVec<long long>;
51+
tests<class C1>(Q, CV(0), CV(99), CustomVecPlus<long long>{}, 64);
52+
tests<class C2>(Q, CV(0), CV(99), CustomVecPlus<long long>{}, MaxWGSize * 3);
53+
54+
std::cout << "Test passed\n";
55+
return 0;
56+
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
5+
// TODO: accelerator may not support atomics required by the current
6+
// implementation. Enable testing when implementation is fixed.
7+
// RUNx: %ACC_RUN_PLACEHOLDER %t.out
8+
9+
// This test performs basic checks of parallel_for(range<1>, reduction, func)
10+
// with reductions initialized with 0-dimensional read_write accessor
11+
// accessing 1 element buffer.
12+
13+
#include "reduction_range_scalar.hpp"
14+
15+
using namespace cl::sycl;
16+
17+
// Per-file test driver: runs one reduction check over a 1-D range of NWItems
// elements using access::mode::read_write.
//   Name     - forwarded to test<> as its first template argument; callers in
//              main() pass a distinct class for every call.
//   Identity - identity value forwarded to test().
//   Init     - initial value forwarded to test().
//   BOp      - binary operation forwarded to test().
//   NWItems  - extent of the range<1> passed to test().
// NOTE(review): test<> comes from reduction_range_scalar.hpp (not visible
// here). Per the file header the trailing `0` presumably selects a
// 0-dimensional accessor; the meaning of `false` should be confirmed there.
template <typename Name, typename T, class BinaryOperation>
18+
void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) {
19+
constexpr access::mode RW = access::mode::read_write;
20+
test<Name, false, RW, T, 0>(Q, Identity, Init, BOp, range<1>{NWItems});
21+
}
22+
23+
// Entry point: creates a default-constructed queue, prints its device info,
// and calls tests() for a series of (type, identity, init, operation, size)
// combinations. Several sizes are derived from the device's
// max_work_group_size. main() does not inspect any results itself; it prints
// "Test passed" and returns 0 once the last tests() call has returned.
int main() {
24+
queue Q;
25+
printDeviceInfo(Q);
26+
size_t MaxWGSize =
27+
Q.get_device().get_info<info::device::max_work_group_size>();
28+
29+
// Fast-reduce and Fast-atomics. Try various range types/sizes.
30+
tests<class A1, int>(Q, 0, 99, std::plus<>{}, 1);
31+
tests<class A2, int>(Q, 0, 99, std::plus<>{}, 2);
32+
tests<class A3, int64_t>(Q, 0, 99, std::plus<>{}, 7);
33+
tests<class A4, int64_t>(Q, 0, 99, std::plus<>{}, 64);
34+
tests<class A5, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2);
35+
tests<class A6, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5);
36+
37+
// Try various types & ranges.
38+
tests<class B1, int>(Q, ~0, ~0, std::bit_and<>{}, 8);
39+
tests<class B2, int>(Q, 0, 0x12340000, std::bit_xor<>{}, 16);
40+
tests<class B3, int>(Q, 0, 0x3400, std::bit_or<>{}, MaxWGSize * 4);
41+
tests<class B4, int>(Q, 1, 2, std::multiplies<>{}, 256);
42+
tests<class B5, int>(Q, 1, 3, std::multiplies<>{}, MaxWGSize + 1);
43+
tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
44+
ext::oneapi::minimum<>{}, MaxWGSize * 2);
45+
tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
46+
ext::oneapi::maximum<>{}, 8);
47+
tests<class B8, float>(Q, 1, 99, std::multiplies<>{}, MaxWGSize);
48+
49+
// Check with CUSTOM type.
50+
tests<class C1>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
51+
CustomVecPlus<long long>{}, 64);
52+
tests<class C2>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
53+
CustomVecPlus<long long>{}, MaxWGSize * 3);
54+
55+
std::cout << "Test passed\n";
56+
return 0;
57+
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
5+
// TODO: accelerator may not support atomics required by the current
6+
// implementation. Enable testing when implementation is fixed.
7+
// RUNx: %ACC_RUN_PLACEHOLDER %t.out
8+
9+
#include "reduction_range_scalar.hpp"
10+
11+
// This test performs basic checks of parallel_for(range<1>, reduction, func)
12+
// with reductions initialized with 1-dimensional discard_write accessor
13+
// accessing 1 element buffer.
14+
15+
using namespace cl::sycl;
16+
17+
// Per-file test driver: runs the reduction check over a 1-D range of NWItems
// elements using access::mode::discard_write.
//   Name     - forwarded to testBoth<> as its first template argument;
//              callers in main() pass a distinct class for every call.
//   Identity - identity value forwarded to testBoth().
//   Init     - initial value forwarded to testBoth().
//   BOp      - binary operation forwarded to testBoth().
//   NWItems  - extent of the range<1> passed to testBoth().
// NOTE(review): testBoth<> comes from reduction_range_scalar.hpp (not visible
// here); presumably it runs the check in two configurations - confirm there.
template <typename Name, typename T, class BinaryOperation>
18+
void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) {
19+
constexpr access::mode DW = access::mode::discard_write;
20+
testBoth<Name, DW, T>(Q, Identity, Init, BOp, range<1>{NWItems});
21+
}
22+
23+
// Entry point: creates a default-constructed queue, prints its device info,
// and calls tests() for a series of (type, identity, init, operation, size)
// combinations. Several sizes are derived from the device's
// max_work_group_size. main() does not inspect any results itself; it prints
// "Test passed" and returns 0 once the last tests() call has returned.
int main() {
24+
queue Q;
25+
printDeviceInfo(Q);
26+
size_t MaxWGSize =
27+
Q.get_device().get_info<info::device::max_work_group_size>();
28+
29+
// Fast-reduce and Fast-atomics. Try various range types/sizes.
30+
tests<class A1, int>(Q, 0, 99, std::plus<>{}, 1);
31+
tests<class A2, int>(Q, 0, 99, std::plus<>{}, 2);
32+
tests<class A3, int64_t>(Q, 0, 99, std::plus<>{}, 7);
33+
tests<class A4, int64_t>(Q, 0, 99, std::plus<>{}, 64);
34+
tests<class A5, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2);
35+
tests<class A6, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5);
36+
37+
// Try various types & ranges.
38+
tests<class B1, int>(Q, ~0, 99, std::bit_and<>{}, 7);
39+
tests<class B2, int>(Q, 0, 0xff99, std::bit_xor<>{}, MaxWGSize);
40+
tests<class B3, int>(Q, 0, 0xff99, std::bit_or<>{}, 3);
41+
tests<class B4, int>(Q, 1, 3, std::multiplies<>{}, 32);
42+
tests<class B5, int>(Q, 1, 3, std::multiplies<>{}, MaxWGSize * 4);
43+
tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
44+
ext::oneapi::minimum<>{}, MaxWGSize * 2);
45+
tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
46+
ext::oneapi::maximum<>{}, 8);
47+
tests<class B8, float>(Q, 1, 99, std::multiplies<>{}, MaxWGSize);
48+
49+
// Check with CUSTOM type.
50+
tests<class C1>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
51+
CustomVecPlus<long long>{}, 256);
52+
tests<class C2>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
53+
CustomVecPlus<long long>{}, MaxWGSize * 3);
54+
55+
std::cout << "Test passed\n";
56+
return 0;
57+
}
Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
5+
// TODO: accelerator may not support atomics required by the current
6+
// implementation. Enable testing when implementation is fixed.
7+
// RUNx: %ACC_RUN_PLACEHOLDER %t.out
8+
9+
// This test performs basic checks of parallel_for(range<1>, reduction, func)
10+
// with reductions initialized with 1-dimensional read_write accessor
11+
// accessing 1 element buffer.
12+
13+
#include "reduction_range_scalar.hpp"
14+
15+
using namespace cl::sycl;
16+
17+
// Per-file test driver: runs the reduction check over a 1-D range of NWItems
// elements using access::mode::read_write.
//   Name     - forwarded to testBoth<> as its first template argument;
//              callers in main() pass a distinct class for every call.
//   Identity - identity value forwarded to testBoth().
//   Init     - initial value forwarded to testBoth().
//   BOp      - binary operation forwarded to testBoth().
//   NWItems  - extent of the range<1> passed to testBoth().
// NOTE(review): testBoth<> comes from reduction_range_scalar.hpp (not visible
// here); presumably it runs the check in two configurations - confirm there.
template <typename Name, typename T, class BinaryOperation>
18+
void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) {
19+
constexpr access::mode RW = access::mode::read_write;
20+
testBoth<Name, RW, T>(Q, Identity, Init, BOp, range<1>{NWItems});
21+
}
22+
23+
// Entry point: creates a default-constructed queue, prints its device info,
// and calls tests() for a series of (type, identity, init, operation, size)
// combinations. Several sizes are derived from the device's
// max_work_group_size. main() does not inspect any results itself; it prints
// "Test passed" and returns 0 once the last tests() call has returned.
int main() {
24+
queue Q;
25+
printDeviceInfo(Q);
26+
size_t MaxWGSize =
27+
Q.get_device().get_info<info::device::max_work_group_size>();
28+
29+
// Fast-reduce and Fast-atomics. Try various range types/sizes.
30+
tests<class A1, int>(Q, 0, 99, std::plus<int>{}, 1);
31+
tests<class A2, int>(Q, 0, 99, std::plus<int>{}, 2);
32+
tests<class A3, int64_t>(Q, 0, 99, std::plus<>{}, 7);
33+
tests<class A4, int64_t>(Q, 0, 99, std::plus<>{}, 64);
34+
tests<class A5, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2);
35+
tests<class A6, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5);
36+
37+
// Try various types & ranges.
38+
tests<class B1, int>(Q, ~0, ~0, std::bit_and<>{}, 8);
39+
tests<class B2, int>(Q, 0, 0x12340000, std::bit_xor<>{}, 16);
40+
tests<class B3, int>(Q, 0, 0x3400, std::bit_or<>{}, MaxWGSize * 4);
41+
tests<class B4, int>(Q, 1, 2, std::multiplies<>{}, 256);
42+
tests<class B5, int>(Q, 1, 3, std::multiplies<>{}, MaxWGSize * 4);
43+
tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
44+
ext::oneapi::minimum<>{}, MaxWGSize * 2);
45+
tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
46+
ext::oneapi::maximum<>{}, 8);
47+
tests<class B8, float>(Q, 1, 99, std::multiplies<>{}, MaxWGSize);
48+
49+
// Check with CUSTOM type.
50+
tests<class C1>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
51+
CustomVecPlus<long long>{}, 256);
52+
tests<class C2>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
53+
CustomVecPlus<long long>{}, MaxWGSize * 3);
54+
55+
std::cout << "Test passed\n";
56+
return 0;
57+
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
5+
// TODO: accelerator may not support atomics required by the current
6+
// implementation. Enable testing when implementation is fixed.
7+
// RUNx: %ACC_RUN_PLACEHOLDER %t.out
8+
9+
// This test performs basic checks of parallel_for(range<2>, reduction, func)
10+
// with reductions initialized with 1-dimensional discard_write accessor
11+
// accessing 1 element buffer.
12+
13+
#include "reduction_range_scalar.hpp"
14+
15+
using namespace cl::sycl;
16+
17+
// Per-file test driver: runs the reduction check over the given 2-D Range
// using access::mode::discard_write.
//   Name     - forwarded to testBoth<> as its first template argument;
//              callers in main() pass a distinct class for every call.
//   Identity - identity value forwarded to testBoth().
//   Init     - initial value forwarded to testBoth().
//   BOp      - binary operation forwarded to testBoth().
//   Range    - range<2> forwarded unchanged to testBoth().
// NOTE(review): testBoth<> comes from reduction_range_scalar.hpp (not visible
// here); presumably it runs the check in two configurations - confirm there.
template <typename Name, typename T, class BinaryOperation>
18+
void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<2> Range) {
19+
constexpr access::mode DW = access::mode::discard_write;
20+
testBoth<Name, DW, T>(Q, Identity, Init, BOp, Range);
21+
}
22+
23+
// Entry point: creates a default-constructed queue, prints its device info,
// and calls tests() for a series of (type, identity, init, operation,
// range<2>) combinations. Several extents are derived from the device's
// max_work_group_size. main() does not inspect any results itself; it prints
// "Test passed" and returns 0 once the last tests() call has returned.
int main() {
24+
queue Q;
25+
printDeviceInfo(Q);
26+
size_t MaxWGSize =
27+
Q.get_device().get_info<info::device::max_work_group_size>();
28+
29+
tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<2>{1, 1});
30+
tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 2});
31+
tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 3});
32+
tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize, 1});
33+
tests<class A5, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{1, MaxWGSize});
34+
tests<class A6, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{2, MaxWGSize * 2});
35+
tests<class A7, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize * 3, 7});
36+
tests<class A8, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{3, MaxWGSize * 3});
37+
38+
tests<class B1, int>(Q, 0, 0x2021ff99, std::bit_xor<>{}, range<2>{3, 3});
39+
tests<class B2, int>(Q, ~0, 99, std::bit_and<>{}, range<2>{4, 3});
40+
tests<class B3, int>(Q, 0, 99, std::bit_or<>{}, range<2>{2, 2});
41+
tests<class B4, uint64_t>(Q, 1, 3, std::multiplies<>{}, range<2>{16, 3});
42+
tests<class B5, uint64_t>(Q, 1, 3, std::multiplies<>{},
43+
range<2>{3, MaxWGSize});
44+
tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
45+
ext::oneapi::minimum<>{}, range<2>{8, 3});
46+
tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
47+
ext::oneapi::maximum<>{}, range<2>{3, 3});
48+
tests<class B8, float>(Q, 1, 99, std::multiplies<>{}, range<2>{3, 3});
49+
50+
tests<class C1>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
51+
CustomVecPlus<long long>{}, range<2>{33, MaxWGSize});
52+
53+
std::cout << "Test passed\n";
54+
return 0;
55+
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
2+
// RUN: %CPU_RUN_PLACEHOLDER %t.out
3+
// RUN: %GPU_RUN_PLACEHOLDER %t.out
4+
5+
// TODO: accelerator may not support atomics required by the current
6+
// implementation. Enable testing when implementation is fixed.
7+
// RUNx: %ACC_RUN_PLACEHOLDER %t.out
8+
9+
// This test performs basic checks of parallel_for(range<2>, reduction, func)
10+
// with reductions initialized with 1-dimensional read_write accessor
11+
// accessing 1 element buffer.
12+
13+
#include "reduction_range_scalar.hpp"
14+
15+
using namespace cl::sycl;
16+
17+
// Per-file test driver: runs the reduction check over the given 2-D Range
// using access::mode::read_write.
//   Name     - forwarded to testBoth<> as its first template argument;
//              callers in main() pass a distinct class for every call.
//   Identity - identity value forwarded to testBoth().
//   Init     - initial value forwarded to testBoth().
//   BOp      - binary operation forwarded to testBoth().
//   Range    - range<2> forwarded unchanged to testBoth().
// NOTE(review): testBoth<> comes from reduction_range_scalar.hpp (not visible
// here); presumably it runs the check in two configurations - confirm there.
template <typename Name, typename T, class BinaryOperation>
18+
void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<2> Range) {
19+
constexpr access::mode RW = access::mode::read_write;
20+
testBoth<Name, RW, T>(Q, Identity, Init, BOp, Range);
21+
}
22+
23+
// Entry point: creates a default-constructed queue, prints its device info,
// and calls tests() for a series of (type, identity, init, operation,
// range<2>) combinations. Several extents are derived from the device's
// max_work_group_size. main() does not inspect any results itself; it prints
// "Test passed" and returns 0 once the last tests() call has returned.
int main() {
24+
queue Q;
25+
printDeviceInfo(Q);
26+
size_t MaxWGSize =
27+
Q.get_device().get_info<info::device::max_work_group_size>();
28+
29+
tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<2>{1, 1});
30+
tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 2});
31+
tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 3});
32+
tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize, 1});
33+
tests<class A5, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{1, MaxWGSize});
34+
tests<class A6, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{2, MaxWGSize * 2});
35+
tests<class A7, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize * 3, 7});
36+
tests<class A8, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{3, MaxWGSize * 3});
37+
38+
tests<class B1, int>(Q, 0, 0x2021ff99, std::bit_xor<>{}, range<2>{3, 3});
39+
tests<class B2, int>(Q, ~0, 99, std::bit_and<>{}, range<2>{4, 3});
40+
tests<class B3, int>(Q, 0, 99, std::bit_or<>{}, range<2>{2, 2});
41+
tests<class B4, uint64_t>(Q, 1, 3, std::multiplies<>{}, range<2>{16, 3});
42+
tests<class B5, uint64_t>(Q, 1, 3, std::multiplies<>{},
43+
range<2>{3, MaxWGSize});
44+
tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
45+
ext::oneapi::minimum<>{}, range<2>{8, 3});
46+
tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
47+
ext::oneapi::maximum<>{}, range<2>{3, 3});
48+
tests<class B8, float>(Q, 1, 99, std::multiplies<>{}, range<2>{3, 3});
49+
50+
tests<class C1>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
51+
CustomVecPlus<long long>{}, range<2>{33, MaxWGSize});
52+
53+
std::cout << "Test passed\n";
54+
return 0;
55+
}

0 commit comments

Comments
 (0)