intel
diff --git a/‎SYCL/Reduction/reduction_range_1d_s0_dw.cpp
Lines changed: 56 additions & 0 deletions b/‎SYCL/Reduction/reduction_range_1d_s0_dw.cpp
Lines changed: 56 additions & 0 deletions
diff --git a/‎SYCL/Reduction/reduction_range_1d_s0_rw.cpp
Lines changed: 57 additions & 0 deletions b/‎SYCL/Reduction/reduction_range_1d_s0_rw.cpp
Lines changed: 57 additions & 0 deletions
diff --git a/‎SYCL/Reduction/reduction_range_1d_s1_dw.cpp
Lines changed: 57 additions & 0 deletions b/‎SYCL/Reduction/reduction_range_1d_s1_dw.cpp
Lines changed: 57 additions & 0 deletions
diff --git a/‎SYCL/Reduction/reduction_range_1d_s1_rw.cpp
Lines changed: 57 additions & 0 deletions b/‎SYCL/Reduction/reduction_range_1d_s1_rw.cpp
Lines changed: 57 additions & 0 deletions
diff --git a/‎SYCL/Reduction/reduction_range_2d_s1_dw.cpp
Lines changed: 55 additions & 0 deletions b/‎SYCL/Reduction/reduction_range_2d_s1_dw.cpp
Lines changed: 55 additions & 0 deletions
diff --git a/‎SYCL/Reduction/reduction_range_2d_s1_rw.cpp
Lines changed: 55 additions & 0 deletions b/‎SYCL/Reduction/reduction_range_2d_s1_rw.cpp
Lines changed: 55 additions & 0 deletions
@@ -0,0 +1,56 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not support atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<1>, reduction, func)
+// with reductions initialized with 0-dimensional discard_write accessor
+// accessing 1 element buffer.
+
+#include "reduction_range_scalar.hpp"
+
+using namespace cl::sycl;
+
+// Forwards a single reduction case to the test<>() helper, requesting a
+// discard_write accessor of dimension 0 and a range<1> holding NWItems.
+template <typename Name, typename T, class BinaryOperation>
+void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) {
+  test<Name, false, access::mode::discard_write, T, 0>(
+      Q, Identity, Init, BOp, range<1>{NWItems});
+}
+
+// Runs the 0-dimensional discard_write reduction cases on a default-constructed
+// queue. The device's max_work_group_size is used to size the larger ranges.
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  const size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  // Values reused by several of the cases below.
+  const std::plus<> Plus{};
+  const std::multiplies<> Mul{};
+  const int IntMax = (std::numeric_limits<int>::max)();
+  const int IntMin = (std::numeric_limits<int>::min)();
+
+  // Fast-reduce and Fast-atomics. Try various range types/sizes.
+  tests<class A1, int>(Q, 0, 99, Plus, 1);
+  tests<class A2, int>(Q, 0, 99, Plus, 2);
+  tests<class A3, int64_t>(Q, 0, 99, Plus, 7);
+  tests<class A4, int64_t>(Q, 0, 99, Plus, 64);
+  tests<class A5, int>(Q, 0, 99, Plus, 2 * MaxWGSize);
+  tests<class A6, int>(Q, 0, 99, Plus, 2 * MaxWGSize + 5);
+
+  // Try various types & ranges.
+  tests<class B1, int>(Q, ~0, ~0, std::bit_and<>{}, 8);
+  tests<class B2, int>(Q, 0, 0x12340000, std::bit_xor<>{}, 16);
+  tests<class B3, int>(Q, 0, 0x3400, std::bit_or<>{}, 4 * MaxWGSize);
+  tests<class B4, int>(Q, 1, 2, Mul, 256);
+  tests<class B5, int>(Q, 1, 3, Mul, MaxWGSize + 1);
+  tests<class B6, int>(Q, IntMax, -99, ext::oneapi::minimum<>{},
+                       2 * MaxWGSize);
+  tests<class B7, int>(Q, IntMin, 99, ext::oneapi::maximum<>{}, 8);
+  tests<class B8, uint64_t>(Q, 1, 99, Mul, 37);
+
+  // Check with CUSTOM type.
+  using LLVec = CustomVec<long long>;
+  using LLVecPlus = CustomVecPlus<long long>;
+  tests<class C1>(Q, LLVec(0), LLVec(99), LLVecPlus{}, 64);
+  tests<class C2>(Q, LLVec(0), LLVec(99), LLVecPlus{}, 3 * MaxWGSize);
+
+  std::cout << "Test passed\n";
+  return 0;
+}
@@ -0,0 +1,57 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not support atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<1>, reduction, func)
+// with reductions initialized with 0-dimensional read_write accessor
+// accessing 1 element buffer.
+
+#include "reduction_range_scalar.hpp"
+
+using namespace cl::sycl;
+
+// Forwards a single reduction case to the test<>() helper, requesting a
+// read_write accessor of dimension 0 and a range<1> holding NWItems.
+template <typename Name, typename T, class BinaryOperation>
+void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) {
+  test<Name, false, access::mode::read_write, T, 0>(
+      Q, Identity, Init, BOp, range<1>{NWItems});
+}
+
+// Runs the 0-dimensional read_write reduction cases on a default-constructed
+// queue. The device's max_work_group_size is used to size the larger ranges.
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  const size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  // Values reused by several of the cases below.
+  const std::plus<> Plus{};
+  const std::multiplies<> Mul{};
+  const int IntMax = (std::numeric_limits<int>::max)();
+  const int IntMin = (std::numeric_limits<int>::min)();
+
+  // Fast-reduce and Fast-atomics. Try various range types/sizes.
+  tests<class A1, int>(Q, 0, 99, Plus, 1);
+  tests<class A2, int>(Q, 0, 99, Plus, 2);
+  tests<class A3, int64_t>(Q, 0, 99, Plus, 7);
+  tests<class A4, int64_t>(Q, 0, 99, Plus, 64);
+  tests<class A5, int>(Q, 0, 99, Plus, 2 * MaxWGSize);
+  tests<class A6, int>(Q, 0, 99, Plus, 2 * MaxWGSize + 5);
+
+  // Try various types & ranges.
+  tests<class B1, int>(Q, ~0, ~0, std::bit_and<>{}, 8);
+  tests<class B2, int>(Q, 0, 0x12340000, std::bit_xor<>{}, 16);
+  tests<class B3, int>(Q, 0, 0x3400, std::bit_or<>{}, 4 * MaxWGSize);
+  tests<class B4, int>(Q, 1, 2, Mul, 256);
+  tests<class B5, int>(Q, 1, 3, Mul, MaxWGSize + 1);
+  tests<class B6, int>(Q, IntMax, -99, ext::oneapi::minimum<>{},
+                       2 * MaxWGSize);
+  tests<class B7, int>(Q, IntMin, 99, ext::oneapi::maximum<>{}, 8);
+  tests<class B8, float>(Q, 1, 99, Mul, MaxWGSize);
+
+  // Check with CUSTOM type.
+  using LLVec = CustomVec<long long>;
+  using LLVecPlus = CustomVecPlus<long long>;
+  tests<class C1>(Q, LLVec(0), LLVec(99), LLVecPlus{}, 64);
+  tests<class C2>(Q, LLVec(0), LLVec(99), LLVecPlus{}, 3 * MaxWGSize);
+
+  std::cout << "Test passed\n";
+  return 0;
+}
@@ -0,0 +1,57 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not support atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+#include "reduction_range_scalar.hpp"
+
+// This test performs basic checks of parallel_for(range<1>, reduction, func)
+// with reductions initialized with 1-dimensional discard_write accessor
+// accessing 1 element buffer.
+
+using namespace cl::sycl;
+
+// Forwards a single reduction case to the testBoth<>() helper, requesting a
+// discard_write accessor and a range<1> holding NWItems.
+template <typename Name, typename T, class BinaryOperation>
+void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) {
+  testBoth<Name, access::mode::discard_write, T>(Q, Identity, Init, BOp,
+                                                 range<1>{NWItems});
+}
+
+// Runs the 1-dimensional discard_write reduction cases on a default-constructed
+// queue. The device's max_work_group_size is used to size the larger ranges.
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  const size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  // Values reused by several of the cases below.
+  const std::plus<> Plus{};
+  const std::multiplies<> Mul{};
+  const int IntMax = (std::numeric_limits<int>::max)();
+  const int IntMin = (std::numeric_limits<int>::min)();
+
+  // Fast-reduce and Fast-atomics. Try various range types/sizes.
+  tests<class A1, int>(Q, 0, 99, Plus, 1);
+  tests<class A2, int>(Q, 0, 99, Plus, 2);
+  tests<class A3, int64_t>(Q, 0, 99, Plus, 7);
+  tests<class A4, int64_t>(Q, 0, 99, Plus, 64);
+  tests<class A5, int>(Q, 0, 99, Plus, 2 * MaxWGSize);
+  tests<class A6, int>(Q, 0, 99, Plus, 2 * MaxWGSize + 5);
+
+  // Try various types & ranges.
+  tests<class B1, int>(Q, ~0, 99, std::bit_and<>{}, 7);
+  tests<class B2, int>(Q, 0, 0xff99, std::bit_xor<>{}, MaxWGSize);
+  tests<class B3, int>(Q, 0, 0xff99, std::bit_or<>{}, 3);
+  tests<class B4, int>(Q, 1, 3, Mul, 32);
+  tests<class B5, int>(Q, 1, 3, Mul, 4 * MaxWGSize);
+  tests<class B6, int>(Q, IntMax, -99, ext::oneapi::minimum<>{},
+                       2 * MaxWGSize);
+  tests<class B7, int>(Q, IntMin, 99, ext::oneapi::maximum<>{}, 8);
+  tests<class B8, float>(Q, 1, 99, Mul, MaxWGSize);
+
+  // Check with CUSTOM type.
+  using LLVec = CustomVec<long long>;
+  using LLVecPlus = CustomVecPlus<long long>;
+  tests<class C1>(Q, LLVec(0), LLVec(99), LLVecPlus{}, 256);
+  tests<class C2>(Q, LLVec(0), LLVec(99), LLVecPlus{}, 3 * MaxWGSize);
+
+  std::cout << "Test passed\n";
+  return 0;
+}
@@ -0,0 +1,57 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not support atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<1>, reduction, func)
+// with reductions initialized with 1-dimensional read_write accessor
+// accessing 1 element buffer.
+
+#include "reduction_range_scalar.hpp"
+
+using namespace cl::sycl;
+
+// Forwards a single reduction case to the testBoth<>() helper, requesting a
+// read_write accessor and a range<1> holding NWItems.
+template <typename Name, typename T, class BinaryOperation>
+void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, size_t NWItems) {
+  testBoth<Name, access::mode::read_write, T>(Q, Identity, Init, BOp,
+                                              range<1>{NWItems});
+}
+
+// Runs the 1-dimensional read_write reduction cases on a default-constructed
+// queue. The device's max_work_group_size is used to size the larger ranges.
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  const size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  // Values reused by several of the cases below. A1/A2 use the int-typed
+  // std::plus<int>; the remaining sums use the transparent std::plus<>.
+  const std::plus<int> PlusInt{};
+  const std::plus<> Plus{};
+  const std::multiplies<> Mul{};
+  const int IntMax = (std::numeric_limits<int>::max)();
+  const int IntMin = (std::numeric_limits<int>::min)();
+
+  // Fast-reduce and Fast-atomics. Try various range types/sizes.
+  tests<class A1, int>(Q, 0, 99, PlusInt, 1);
+  tests<class A2, int>(Q, 0, 99, PlusInt, 2);
+  tests<class A3, int64_t>(Q, 0, 99, Plus, 7);
+  tests<class A4, int64_t>(Q, 0, 99, Plus, 64);
+  tests<class A5, int>(Q, 0, 99, Plus, 2 * MaxWGSize);
+  tests<class A6, int>(Q, 0, 99, Plus, 2 * MaxWGSize + 5);
+
+  // Try various types & ranges.
+  tests<class B1, int>(Q, ~0, ~0, std::bit_and<>{}, 8);
+  tests<class B2, int>(Q, 0, 0x12340000, std::bit_xor<>{}, 16);
+  tests<class B3, int>(Q, 0, 0x3400, std::bit_or<>{}, 4 * MaxWGSize);
+  tests<class B4, int>(Q, 1, 2, Mul, 256);
+  tests<class B5, int>(Q, 1, 3, Mul, 4 * MaxWGSize);
+  tests<class B6, int>(Q, IntMax, -99, ext::oneapi::minimum<>{},
+                       2 * MaxWGSize);
+  tests<class B7, int>(Q, IntMin, 99, ext::oneapi::maximum<>{}, 8);
+  tests<class B8, float>(Q, 1, 99, Mul, MaxWGSize);
+
+  // Check with CUSTOM type.
+  using LLVec = CustomVec<long long>;
+  using LLVecPlus = CustomVecPlus<long long>;
+  tests<class C1>(Q, LLVec(0), LLVec(99), LLVecPlus{}, 256);
+  tests<class C2>(Q, LLVec(0), LLVec(99), LLVecPlus{}, 3 * MaxWGSize);
+
+  std::cout << "Test passed\n";
+  return 0;
+}
@@ -0,0 +1,55 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not support atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<2>, reduction, func)
+// with reductions initialized with 1-dimensional discard_write accessor
+// accessing 1 element buffer.
+
+#include "reduction_range_scalar.hpp"
+
+using namespace cl::sycl;
+
+// Forwards a single reduction case over a 2-D Range to the testBoth<>()
+// helper, requesting a discard_write accessor.
+template <typename Name, typename T, class BinaryOperation>
+void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<2> Range) {
+  testBoth<Name, access::mode::discard_write, T>(Q, Identity, Init, BOp,
+                                                 Range);
+}
+
+// Runs the range<2> discard_write reduction cases on a default-constructed
+// queue. The device's max_work_group_size is used to size the larger ranges.
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  const size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  // Values and aliases reused by several of the cases below.
+  using R2 = range<2>;
+  const std::plus<> Plus{};
+  const std::multiplies<> Mul{};
+  const int IntMax = (std::numeric_limits<int>::max)();
+  const int IntMin = (std::numeric_limits<int>::min)();
+
+  // Sums over a variety of 2-D range shapes.
+  tests<class A1, int>(Q, 0, 99, Plus, R2{1, 1});
+  tests<class A2, int>(Q, 0, 99, Plus, R2{2, 2});
+  tests<class A3, int>(Q, 0, 99, Plus, R2{2, 3});
+  tests<class A4, int>(Q, 0, 99, Plus, R2{MaxWGSize, 1});
+  tests<class A5, int64_t>(Q, 0, 99, Plus, R2{1, MaxWGSize});
+  tests<class A6, int64_t>(Q, 0, 99, Plus, R2{2, MaxWGSize * 2});
+  tests<class A7, int64_t>(Q, 0, 99, Plus, R2{MaxWGSize * 3, 7});
+  tests<class A8, int64_t>(Q, 0, 99, Plus, R2{3, MaxWGSize * 3});
+
+  // Other operations and element types.
+  tests<class B1, int>(Q, 0, 0x2021ff99, std::bit_xor<>{}, R2{3, 3});
+  tests<class B2, int>(Q, ~0, 99, std::bit_and<>{}, R2{4, 3});
+  tests<class B3, int>(Q, 0, 99, std::bit_or<>{}, R2{2, 2});
+  tests<class B4, uint64_t>(Q, 1, 3, Mul, R2{16, 3});
+  tests<class B5, uint64_t>(Q, 1, 3, Mul, R2{3, MaxWGSize});
+  tests<class B6, int>(Q, IntMax, -99, ext::oneapi::minimum<>{}, R2{8, 3});
+  tests<class B7, int>(Q, IntMin, 99, ext::oneapi::maximum<>{}, R2{3, 3});
+  tests<class B8, float>(Q, 1, 99, Mul, R2{3, 3});
+
+  // Custom element type.
+  using LLVec = CustomVec<long long>;
+  tests<class C1>(Q, LLVec(0), LLVec(99), CustomVecPlus<long long>{},
+                  R2{33, MaxWGSize});
+
+  std::cout << "Test passed\n";
+  return 0;
+}
@@ -0,0 +1,55 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+// TODO: accelerator may not support atomics required by the current
+// implementation. Enable testing when implementation is fixed.
+// RUNx: %ACC_RUN_PLACEHOLDER %t.out
+
+// This test performs basic checks of parallel_for(range<2>, reduction, func)
+// with reductions initialized with 1-dimensional read_write accessor
+// accessing 1 element buffer.
+
+#include "reduction_range_scalar.hpp"
+
+using namespace cl::sycl;
+
+// Forwards a single reduction case over a 2-D Range to the testBoth<>()
+// helper, requesting a read_write accessor.
+template <typename Name, typename T, class BinaryOperation>
+void tests(queue &Q, T Identity, T Init, BinaryOperation BOp, range<2> Range) {
+  testBoth<Name, access::mode::read_write, T>(Q, Identity, Init, BOp, Range);
+}
+
+// Runs the range<2> read_write reduction cases on a default-constructed
+// queue. The device's max_work_group_size is used to size the larger ranges.
+int main() {
+  queue Q;
+  printDeviceInfo(Q);
+  const size_t MaxWGSize =
+      Q.get_device().get_info<info::device::max_work_group_size>();
+
+  // Values and aliases reused by several of the cases below.
+  using R2 = range<2>;
+  const std::plus<> Plus{};
+  const std::multiplies<> Mul{};
+  const int IntMax = (std::numeric_limits<int>::max)();
+  const int IntMin = (std::numeric_limits<int>::min)();
+
+  // Sums over a variety of 2-D range shapes.
+  tests<class A1, int>(Q, 0, 99, Plus, R2{1, 1});
+  tests<class A2, int>(Q, 0, 99, Plus, R2{2, 2});
+  tests<class A3, int>(Q, 0, 99, Plus, R2{2, 3});
+  tests<class A4, int>(Q, 0, 99, Plus, R2{MaxWGSize, 1});
+  tests<class A5, int64_t>(Q, 0, 99, Plus, R2{1, MaxWGSize});
+  tests<class A6, int64_t>(Q, 0, 99, Plus, R2{2, MaxWGSize * 2});
+  tests<class A7, int64_t>(Q, 0, 99, Plus, R2{MaxWGSize * 3, 7});
+  tests<class A8, int64_t>(Q, 0, 99, Plus, R2{3, MaxWGSize * 3});
+
+  // Other operations and element types.
+  tests<class B1, int>(Q, 0, 0x2021ff99, std::bit_xor<>{}, R2{3, 3});
+  tests<class B2, int>(Q, ~0, 99, std::bit_and<>{}, R2{4, 3});
+  tests<class B3, int>(Q, 0, 99, std::bit_or<>{}, R2{2, 2});
+  tests<class B4, uint64_t>(Q, 1, 3, Mul, R2{16, 3});
+  tests<class B5, uint64_t>(Q, 1, 3, Mul, R2{3, MaxWGSize});
+  tests<class B6, int>(Q, IntMax, -99, ext::oneapi::minimum<>{}, R2{8, 3});
+  tests<class B7, int>(Q, IntMin, 99, ext::oneapi::maximum<>{}, R2{3, 3});
+  tests<class B8, float>(Q, 1, 99, Mul, R2{3, 3});
+
+  // Custom element type.
+  using LLVec = CustomVec<long long>;
+  tests<class C1>(Q, LLVec(0), LLVec(99), CustomVecPlus<long long>{},
+                  R2{33, MaxWGSize});
+
+  std::cout << "Test passed\n";
+  return 0;
+}