Skip to content
This repository was archived by the owner on Mar 28, 2023. It is now read-only.

[SYCL] Add lit tests for reduction + range (#4101) #366

Merged
merged 2 commits into from
Jul 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions SYCL/Reduction/reduction_range_1d_s0_dw.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// TODO: accelerator may not support atomics required by the current
// implementation. Enable testing when implementation is fixed.
// RUNx: %ACC_RUN_PLACEHOLDER %t.out

// This test performs basic checks of parallel_for(range<1>, reduction, func)
// with reductions initialized with 0-dimensional discard_write accessor
// accessing 1 element buffer.

#include "reduction_range_scalar.hpp"

using namespace cl::sycl;

// Runs a single reduction case through test<> (reduction_range_scalar.hpp)
// with a discard_write accessor of dimension 0 over a 1-D range of NWItems
// work-items. Q, Identity, Init and BOp are passed through unchanged.
template <typename Name, typename T, class BinaryOperation>
void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) {
  constexpr int AccessorDims = 0;
  constexpr auto Mode = access::mode::discard_write;
  test<Name, false, Mode, T, AccessorDims>(Q, Identity, Init, BOp,
                                           range<1>(NWItems));
}

int main() {
queue Q;
printDeviceInfo(Q);
size_t MaxWGSize =
Q.get_device().get_info<info::device::max_work_group_size>();

// Fast-reduce and Fast-atomics. Try various range types/sizes.
tests<class A1, int>(Q, 0, 99, std::plus<>{}, 1);
tests<class A2, int>(Q, 0, 99, std::plus<>{}, 2);
tests<class A3, int64_t>(Q, 0, 99, std::plus<>{}, 7);
tests<class A4, int64_t>(Q, 0, 99, std::plus<>{}, 64);
tests<class A5, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2);
tests<class A6, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5);

// Try various types & ranges.
tests<class B1, int>(Q, ~0, ~0, std::bit_and<>{}, 8);
tests<class B2, int>(Q, 0, 0x12340000, std::bit_xor<>{}, 16);
tests<class B3, int>(Q, 0, 0x3400, std::bit_or<>{}, MaxWGSize * 4);
tests<class B4, int>(Q, 1, 2, std::multiplies<>{}, 256);
tests<class B5, int>(Q, 1, 3, std::multiplies<>{}, MaxWGSize + 1);
tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
ext::oneapi::minimum<>{}, MaxWGSize * 2);
tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
ext::oneapi::maximum<>{}, 8);
tests<class B8, uint64_t>(Q, 1, 99, std::multiplies<>{}, 37);

// Check with CUSTOM type.
using CV = CustomVec<long long>;
tests<class C1>(Q, CV(0), CV(99), CustomVecPlus<long long>{}, 64);
tests<class C2>(Q, CV(0), CV(99), CustomVecPlus<long long>{}, MaxWGSize * 3);

std::cout << "Test passed\n";
return 0;
}
57 changes: 57 additions & 0 deletions SYCL/Reduction/reduction_range_1d_s0_rw.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// TODO: accelerator may not support atomics required by the current
// implementation. Enable testing when implementation is fixed.
// RUNx: %ACC_RUN_PLACEHOLDER %t.out

// This test performs basic checks of parallel_for(range<1>, reduction, func)
// with reductions initialized with 0-dimensional read_write accessor
// accessing 1 element buffer.

#include "reduction_range_scalar.hpp"

using namespace cl::sycl;

// Runs a single reduction case through test<> (reduction_range_scalar.hpp)
// with a read_write accessor of dimension 0 over a 1-D range of NWItems
// work-items. Q, Identity, Init and BOp are passed through unchanged.
template <typename Name, typename T, class BinaryOperation>
void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) {
  constexpr int AccessorDims = 0;
  constexpr auto Mode = access::mode::read_write;
  test<Name, false, Mode, T, AccessorDims>(Q, Identity, Init, BOp,
                                           range<1>(NWItems));
}

int main() {
queue Q;
printDeviceInfo(Q);
size_t MaxWGSize =
Q.get_device().get_info<info::device::max_work_group_size>();

// Fast-reduce and Fast-atomics. Try various range types/sizes.
tests<class A1, int>(Q, 0, 99, std::plus<>{}, 1);
tests<class A2, int>(Q, 0, 99, std::plus<>{}, 2);
tests<class A3, int64_t>(Q, 0, 99, std::plus<>{}, 7);
tests<class A4, int64_t>(Q, 0, 99, std::plus<>{}, 64);
tests<class A5, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2);
tests<class A6, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5);

// Try various types & ranges.
tests<class B1, int>(Q, ~0, ~0, std::bit_and<>{}, 8);
tests<class B2, int>(Q, 0, 0x12340000, std::bit_xor<>{}, 16);
tests<class B3, int>(Q, 0, 0x3400, std::bit_or<>{}, MaxWGSize * 4);
tests<class B4, int>(Q, 1, 2, std::multiplies<>{}, 256);
tests<class B5, int>(Q, 1, 3, std::multiplies<>{}, MaxWGSize + 1);
tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
ext::oneapi::minimum<>{}, MaxWGSize * 2);
tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
ext::oneapi::maximum<>{}, 8);
tests<class B8, float>(Q, 1, 99, std::multiplies<>{}, MaxWGSize);

// Check with CUSTOM type.
tests<class C1>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
CustomVecPlus<long long>{}, 64);
tests<class C2>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
CustomVecPlus<long long>{}, MaxWGSize * 3);

std::cout << "Test passed\n";
return 0;
}
57 changes: 57 additions & 0 deletions SYCL/Reduction/reduction_range_1d_s1_dw.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// TODO: accelerator may not support atomics required by the current
// implementation. Enable testing when implementation is fixed.
// RUNx: %ACC_RUN_PLACEHOLDER %t.out

#include "reduction_range_scalar.hpp"

// This test performs basic checks of parallel_for(range<1>, reduction, func)
// with reductions initialized with 1-dimensional discard_write accessor
// accessing 1 element buffer.

using namespace cl::sycl;

// Runs a single reduction case by delegating to testBoth<>
// (reduction_range_scalar.hpp) with discard_write access mode over a 1-D
// range of NWItems work-items. Q, Identity, Init and BOp are passed through
// unchanged.
template <typename Name, typename T, class BinaryOperation>
void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) {
  testBoth<Name, access::mode::discard_write, T>(Q, Identity, Init, BOp,
                                                 range<1>(NWItems));
}

int main() {
queue Q;
printDeviceInfo(Q);
size_t MaxWGSize =
Q.get_device().get_info<info::device::max_work_group_size>();

// Fast-reduce and Fast-atomics. Try various range types/sizes.
tests<class A1, int>(Q, 0, 99, std::plus<>{}, 1);
tests<class A2, int>(Q, 0, 99, std::plus<>{}, 2);
tests<class A3, int64_t>(Q, 0, 99, std::plus<>{}, 7);
tests<class A4, int64_t>(Q, 0, 99, std::plus<>{}, 64);
tests<class A5, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2);
tests<class A6, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5);

// Try various types & ranges.
tests<class B1, int>(Q, ~0, 99, std::bit_and<>{}, 7);
tests<class B2, int>(Q, 0, 0xff99, std::bit_xor<>{}, MaxWGSize);
tests<class B3, int>(Q, 0, 0xff99, std::bit_or<>{}, 3);
tests<class B4, int>(Q, 1, 3, std::multiplies<>{}, 32);
tests<class B5, int>(Q, 1, 3, std::multiplies<>{}, MaxWGSize * 4);
tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
ext::oneapi::minimum<>{}, MaxWGSize * 2);
tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
ext::oneapi::maximum<>{}, 8);
tests<class B8, float>(Q, 1, 99, std::multiplies<>{}, MaxWGSize);

// Check with CUSTOM type.
tests<class C1>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
CustomVecPlus<long long>{}, 256);
tests<class C2>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
CustomVecPlus<long long>{}, MaxWGSize * 3);

std::cout << "Test passed\n";
return 0;
}
57 changes: 57 additions & 0 deletions SYCL/Reduction/reduction_range_1d_s1_rw.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// TODO: accelerator may not support atomics required by the current
// implementation. Enable testing when implementation is fixed.
// RUNx: %ACC_RUN_PLACEHOLDER %t.out

// This test performs basic checks of parallel_for(range<1>, reduction, func)
// with reductions initialized with 1-dimensional read_write accessor
// accessing 1 element buffer.

#include "reduction_range_scalar.hpp"

using namespace cl::sycl;

// Runs a single reduction case by delegating to testBoth<>
// (reduction_range_scalar.hpp) with read_write access mode over a 1-D range
// of NWItems work-items. Q, Identity, Init and BOp are passed through
// unchanged.
template <typename Name, typename T, class BinaryOperation>
void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) {
  testBoth<Name, access::mode::read_write, T>(Q, Identity, Init, BOp,
                                              range<1>(NWItems));
}

int main() {
queue Q;
printDeviceInfo(Q);
size_t MaxWGSize =
Q.get_device().get_info<info::device::max_work_group_size>();

// Fast-reduce and Fast-atomics. Try various range types/sizes.
tests<class A1, int>(Q, 0, 99, std::plus<int>{}, 1);
tests<class A2, int>(Q, 0, 99, std::plus<int>{}, 2);
tests<class A3, int64_t>(Q, 0, 99, std::plus<>{}, 7);
tests<class A4, int64_t>(Q, 0, 99, std::plus<>{}, 64);
tests<class A5, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2);
tests<class A6, int>(Q, 0, 99, std::plus<>{}, MaxWGSize * 2 + 5);

// Try various types & ranges.
tests<class B1, int>(Q, ~0, ~0, std::bit_and<>{}, 8);
tests<class B2, int>(Q, 0, 0x12340000, std::bit_xor<>{}, 16);
tests<class B3, int>(Q, 0, 0x3400, std::bit_or<>{}, MaxWGSize * 4);
tests<class B4, int>(Q, 1, 2, std::multiplies<>{}, 256);
tests<class B5, int>(Q, 1, 3, std::multiplies<>{}, MaxWGSize * 4);
tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
ext::oneapi::minimum<>{}, MaxWGSize * 2);
tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
ext::oneapi::maximum<>{}, 8);
tests<class B8, float>(Q, 1, 99, std::multiplies<>{}, MaxWGSize);

// Check with CUSTOM type.
tests<class C1>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
CustomVecPlus<long long>{}, 256);
tests<class C2>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
CustomVecPlus<long long>{}, MaxWGSize * 3);

std::cout << "Test passed\n";
return 0;
}
55 changes: 55 additions & 0 deletions SYCL/Reduction/reduction_range_2d_s1_dw.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// TODO: accelerator may not support atomics required by the current
// implementation. Enable testing when implementation is fixed.
// RUNx: %ACC_RUN_PLACEHOLDER %t.out

// This test performs basic checks of parallel_for(range<2>, reduction, func)
// with reductions initialized with 1-dimensional discard_write accessor
// accessing 1 element buffer.

#include "reduction_range_scalar.hpp"

using namespace cl::sycl;

// Runs a single reduction case by delegating to testBoth<>
// (reduction_range_scalar.hpp) with discard_write access mode over the given
// 2-D range. All arguments are passed through unchanged.
template <typename Name, typename T, class BinaryOperation>
void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<2> Range) {
  testBoth<Name, access::mode::discard_write, T>(Q, Identity, Init, BOp,
                                                 Range);
}

int main() {
queue Q;
printDeviceInfo(Q);
size_t MaxWGSize =
Q.get_device().get_info<info::device::max_work_group_size>();

tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<2>{1, 1});
tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 2});
tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 3});
tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize, 1});
tests<class A5, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{1, MaxWGSize});
tests<class A6, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{2, MaxWGSize * 2});
tests<class A7, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize * 3, 7});
tests<class A8, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{3, MaxWGSize * 3});

tests<class B1, int>(Q, 0, 0x2021ff99, std::bit_xor<>{}, range<2>{3, 3});
tests<class B2, int>(Q, ~0, 99, std::bit_and<>{}, range<2>{4, 3});
tests<class B3, int>(Q, 0, 99, std::bit_or<>{}, range<2>{2, 2});
tests<class B4, uint64_t>(Q, 1, 3, std::multiplies<>{}, range<2>{16, 3});
tests<class B5, uint64_t>(Q, 1, 3, std::multiplies<>{},
range<2>{3, MaxWGSize});
tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
ext::oneapi::minimum<>{}, range<2>{8, 3});
tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
ext::oneapi::maximum<>{}, range<2>{3, 3});
tests<class B8, float>(Q, 1, 99, std::multiplies<>{}, range<2>{3, 3});

tests<class C1>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
CustomVecPlus<long long>{}, range<2>{33, MaxWGSize});

std::cout << "Test passed\n";
return 0;
}
55 changes: 55 additions & 0 deletions SYCL/Reduction/reduction_range_2d_s1_rw.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
// RUN: %CPU_RUN_PLACEHOLDER %t.out
// RUN: %GPU_RUN_PLACEHOLDER %t.out

// TODO: accelerator may not support atomics required by the current
// implementation. Enable testing when implementation is fixed.
// RUNx: %ACC_RUN_PLACEHOLDER %t.out

// This test performs basic checks of parallel_for(range<2>, reduction, func)
// with reductions initialized with 1-dimensional read_write accessor
// accessing 1 element buffer.

#include "reduction_range_scalar.hpp"

using namespace cl::sycl;

// Runs a single reduction case by delegating to testBoth<>
// (reduction_range_scalar.hpp) with read_write access mode over the given
// 2-D range. All arguments are passed through unchanged.
template <typename Name, typename T, class BinaryOperation>
void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<2> Range) {
  testBoth<Name, access::mode::read_write, T>(Q, Identity, Init, BOp, Range);
}

int main() {
queue Q;
printDeviceInfo(Q);
size_t MaxWGSize =
Q.get_device().get_info<info::device::max_work_group_size>();

tests<class A1, int>(Q, 0, 99, std::plus<>{}, range<2>{1, 1});
tests<class A2, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 2});
tests<class A3, int>(Q, 0, 99, std::plus<>{}, range<2>{2, 3});
tests<class A4, int>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize, 1});
tests<class A5, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{1, MaxWGSize});
tests<class A6, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{2, MaxWGSize * 2});
tests<class A7, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{MaxWGSize * 3, 7});
tests<class A8, int64_t>(Q, 0, 99, std::plus<>{}, range<2>{3, MaxWGSize * 3});

tests<class B1, int>(Q, 0, 0x2021ff99, std::bit_xor<>{}, range<2>{3, 3});
tests<class B2, int>(Q, ~0, 99, std::bit_and<>{}, range<2>{4, 3});
tests<class B3, int>(Q, 0, 99, std::bit_or<>{}, range<2>{2, 2});
tests<class B4, uint64_t>(Q, 1, 3, std::multiplies<>{}, range<2>{16, 3});
tests<class B5, uint64_t>(Q, 1, 3, std::multiplies<>{},
range<2>{3, MaxWGSize});
tests<class B6, int>(Q, (std::numeric_limits<int>::max)(), -99,
ext::oneapi::minimum<>{}, range<2>{8, 3});
tests<class B7, int>(Q, (std::numeric_limits<int>::min)(), 99,
ext::oneapi::maximum<>{}, range<2>{3, 3});
tests<class B8, float>(Q, 1, 99, std::multiplies<>{}, range<2>{3, 3});

tests<class C1>(Q, CustomVec<long long>(0), CustomVec<long long>(99),
CustomVecPlus<long long>{}, range<2>{33, MaxWGSize});

std::cout << "Test passed\n";
return 0;
}
Loading