[SYCL] Add tests for span reductions

Pennycook · Pennycook · commit 8ca8249bd30a · 2022-04-26T16:44:08.000-04:00
Tests the following:
- Only spans with static extents are available
- Kernels with a single reduction span work
- Kernels with a reduction pack containing a span work

Signed-off-by: John Pennycook <john.pennycook@intel.com>
diff --git a/SYCL/Reduction/reduction_dynamic_span.cpp b/SYCL/Reduction/reduction_dynamic_span.cpp
@@ -0,0 +1,12 @@
+// RUN: not %clangxx -fsycl -fsyntax-only -fsycl-targets=%sycl_triple %s -o %t.out
+
+#include <CL/sycl.hpp>
+
+using namespace sycl;
+
+int main(int argc, char *argv[]) {
+  // Negative test: the RUN line uses `not`, so compilation must fail here.
+  // SYCL 2020 reductions cannot be created from spans with dynamic extents
+  auto Span = span<int, dynamic_extent>(nullptr, 1);
+  auto Redu = reduction(Span, plus<>()); // expected to be rejected
+}
diff --git a/SYCL/Reduction/reduction_span.cpp b/SYCL/Reduction/reduction_span.cpp
@@ -0,0 +1,113 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+// `Group algorithms are not supported on host device.` on Nvidia.
+// XFAIL: hip_nvidia
+
+// TODO: test disabled due to sporadic fails in level_zero:gpu RT.
+// UNSUPPORTED: linux && level_zero
+
+// This test performs basic checks of reductions initialized with a sycl::span
+
+#include <CL/sycl.hpp>
+using namespace sycl;
+
+int NumErrors = 0;
+
+// Number of work-items described by a plain range.
+template <int Dims> size_t getLinearSize(range<Dims> R) { return R.size(); }
+
+// Number of work-items described by the global range of an nd_range.
+template <int Dims> size_t getLinearSize(nd_range<Dims> NDR) {
+  return getLinearSize(NDR.get_global_range());
+}
+
+// nd_range kernels: the item already knows its global linear id.
+template <int Dims> size_t getLinearId(nd_range<Dims>, nd_item<Dims> Item) {
+  return Item.get_global_linear_id();
+}
+
+// range kernels: linearize the id row-major (last dimension fastest).
+size_t getLinearId(range<1>, id<1> Id) { return Id[0]; }
+
+size_t getLinearId(range<2> Rng, id<2> Id) { return Id[0] * Rng[1] + Id[1]; }
+
+size_t getLinearId(range<3> Rng, id<3> Id) {
+  const size_t Row = Id[0] * Rng[1] + Id[1]; // Horner form of the 3-D formula
+  return Row * Rng[2] + Id[2];
+}
+
+// Runs a "histogram" reduction over an N-element span and checks every bin.
+// Work-item i combines Value into bin (i % N); Output starts at Identity.
+// N is the span extent (bin count); Rng is a range or an nd_range.
+// A failed check increments the global NumErrors by one.
+template <size_t N, typename T, typename BinaryOperation, typename Range>
+void test(queue Q, Range Rng, T Identity, T Value) {
+
+  // Initialize output to identity value
+  T *Output = malloc_shared<T>(N, Q);
+  Q.parallel_for(range<1>{N}, [=](id<1> I) { Output[I] = Identity; }).wait();
+
+  // Perform generalized "histogram" with N bins
+  // TODO: Test Q.parallel_for when code_location is fixed
+  Q.submit([&](handler &CGH) {
+     CGH.parallel_for(
+         Rng, reduction(span<T, N>(Output, N), Identity, BinaryOperation()),
+         [=](auto It, auto &Reducer) {
+           size_t Index = getLinearId(Rng, It) % N;
+           Reducer[Index].combine(Value);
+         });
+   }).wait();
+
+  // Every bin receives Size / N values; the first Size % N bins receive one
+  // more. Counting whole rounds keeps this right when N divides Size (no
+  // bin gets an extra value) and when Size is 0 (bins stay at Identity).
+  const size_t Size = getLinearSize(Rng);
+  T ExpectedRemainder = Identity;
+  for (size_t Round = 0; Round < Size / N; ++Round)
+    ExpectedRemainder = BinaryOperation()(ExpectedRemainder, Value);
+  T Expected = BinaryOperation()(ExpectedRemainder, Value);
+
+  bool Passed = true;
+  for (size_t I = 0; I < N; ++I) {
+    const T &Want = (I < Size % N) ? Expected : ExpectedRemainder;
+    Passed &= (Output[I] == Want);
+  }
+
+  free(Output, Q);
+  NumErrors += (Passed) ? 0 : 1;
+}
+
+struct CustomType { // Minimal user-defined value type for reductions
+  int x;
+  bool operator==(const CustomType &Other) const { return Other.x == x; }
+};
+
+struct CustomBinaryOperation { // Combines operands by adding their x members
+  CustomType operator()(const CustomType &A, const CustomType &B) const {
+    return {A.x + B.x};
+  }
+};
+
+int main() {
+  queue Q;
+  using Rng = sycl::range<1>;
+  using NDRng = sycl::nd_range<1>;
+  const CustomType Zero{0}, One{1};
+
+  // Tests for small spans that can be privatized efficiently; every
+  // combination below tests a different sycl::reduction implementation.
+  test<16, int, std::plus<int>, Rng>(Q, 24, 0, 1);
+  test<16, float, std::plus<float>, Rng>(Q, 24, 0, 1);
+  test<16, int, std::multiplies<int>, Rng>(Q, 24, 1, 2);
+  test<16, CustomType, CustomBinaryOperation, Rng>(Q, 24, Zero, One);
+
+  test<16, int, std::plus<int>, NDRng>(Q, {24, 8}, 0, 1);
+  test<16, float, std::plus<float>, NDRng>(Q, {24, 8}, 0, 1);
+  test<16, int, std::multiplies<int>, NDRng>(Q, {24, 8}, 1, 2);
+  test<16, int, std::bit_or<int>, NDRng>(Q, {24, 8}, 0, 1);
+  test<16, CustomType, CustomBinaryOperation, NDRng>(Q, {24, 8}, Zero, One);
+  return NumErrors;
+}
diff --git a/SYCL/Reduction/reduction_span_pack.cpp b/SYCL/Reduction/reduction_span_pack.cpp
@@ -0,0 +1,194 @@
+// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.out
+// RUN: %CPU_RUN_PLACEHOLDER %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUN: %ACC_RUN_PLACEHOLDER %t.out
+//
+// `Group algorithms are not supported on host device.` on Nvidia.
+// XFAIL: hip_nvidia
+
+// TODO: test disabled due to sporadic fails in level_zero:gpu RT.
+// UNSUPPORTED: linux && level_zero
+
+// This test performs basic checks of reductions initialized with a pack
+// containing at least one sycl::span
+
+#include <CL/sycl.hpp>
+using namespace sycl;
+
+int NumErrors = 0;
+
+// Number of work-items described by a plain range.
+template <int Dims> size_t getLinearSize(range<Dims> R) { return R.size(); }
+
+// Number of work-items described by the global range of an nd_range.
+template <int Dims> size_t getLinearSize(nd_range<Dims> NDR) {
+  return getLinearSize(NDR.get_global_range());
+}
+
+// nd_range kernels: the item already knows its global linear id.
+template <int Dims> size_t getLinearId(nd_range<Dims>, nd_item<Dims> Item) {
+  return Item.get_global_linear_id();
+}
+
+// range kernels: linearize the id row-major (last dimension fastest).
+size_t getLinearId(range<1>, id<1> Id) { return Id[0]; }
+
+size_t getLinearId(range<2> Rng, id<2> Id) { return Id[0] * Rng[1] + Id[1]; }
+
+size_t getLinearId(range<3> Rng, id<3> Id) {
+  const size_t Row = Id[0] * Rng[1] + Id[1]; // Horner form of the 3-D formula
+  return Row * Rng[2] + Id[2];
+}
+
+// Test a span and a regular sum
+// The scalar reduction counts work-items into *Sum while the span reduction
+// combines Value into bin (i % N). N is the span extent (bin count).
+// A failed check increments the global NumErrors by one.
+template <size_t N, typename T, typename BinaryOperation, typename Range>
+void test1(queue Q, Range Rng, T Identity, T Value) {
+
+  // Initialize output to identity value
+  int *Sum = malloc_shared<int>(1, Q);
+  Q.single_task([=]() { *Sum = 0; }).wait();
+  T *Output = malloc_shared<T>(N, Q);
+  Q.parallel_for(range<1>{N}, [=](id<1> I) { Output[I] = Identity; }).wait();
+
+  // Perform generalized "histogram" with N bins
+  // TODO: Test Q.parallel_for when code_location is fixed
+  Q.submit([&](handler &CGH) {
+     CGH.parallel_for(
+         Rng, reduction(Sum, plus<>()),
+         reduction(span<T, N>(Output, N), Identity, BinaryOperation()),
+         [=](auto It, auto &ScalarReducer, auto &SpanReducer) {
+           ScalarReducer++;
+           size_t Index = getLinearId(Rng, It) % N;
+           SpanReducer[Index].combine(Value);
+         });
+   }).wait();
+
+  // Every bin receives Size / N values; the first Size % N bins receive one
+  // more. Counting whole rounds keeps this right when N divides Size (no
+  // bin gets an extra value) and when Size is 0 (bins stay at Identity).
+  const size_t Size = getLinearSize(Rng);
+  T ExpectedRemainder = Identity;
+  for (size_t Round = 0; Round < Size / N; ++Round)
+    ExpectedRemainder = BinaryOperation()(ExpectedRemainder, Value);
+  T Expected = BinaryOperation()(ExpectedRemainder, Value);
+
+  bool Passed = true;
+  for (size_t I = 0; I < N; ++I) {
+    const T &Want = (I < Size % N) ? Expected : ExpectedRemainder;
+    Passed &= (Output[I] == Want);
+  }
+  // One increment per work-item; cast avoids a signed/unsigned comparison.
+  Passed &= (static_cast<size_t>(*Sum) == Size);
+
+  free(Output, Q);
+  free(Sum, Q);
+  NumErrors += (Passed) ? 0 : 1;
+}
+
+// Test two spans
+// Reducer1 counts work-items per bin with plus<>; Reducer2 combines Value
+// into the same bin (i % N). N is the extent (bin count) of both spans.
+// A failed check increments the global NumErrors by one.
+template <size_t N, typename T, typename BinaryOperation, typename Range>
+void test2(queue Q, Range Rng, T Identity, T Value) {
+
+  // Initialize output to identity value
+  int *Output1 = malloc_shared<int>(N, Q);
+  Q.parallel_for(range<1>{N}, [=](id<1> I) { Output1[I] = 0; }).wait();
+  T *Output2 = malloc_shared<T>(N, Q);
+  Q.parallel_for(range<1>{N}, [=](id<1> I) { Output2[I] = Identity; }).wait();
+
+  // Perform generalized "histogram" with N bins
+  // TODO: Test Q.parallel_for when code_location is fixed
+  Q.submit([&](handler &CGH) {
+     CGH.parallel_for(
+         Rng, reduction(span<int, N>(Output1, N), plus<>()),
+         reduction(span<T, N>(Output2, N), Identity, BinaryOperation()),
+         [=](auto It, auto &Reducer1, auto &Reducer2) {
+           size_t Index = getLinearId(Rng, It) % N;
+           Reducer1[Index]++;
+           Reducer2[Index].combine(Value);
+         });
+   }).wait();
+
+  // Every bin receives Rounds = Size / N values; the first Extra = Size % N
+  // bins receive one more. Counting whole rounds keeps the expectations
+  // right when N divides Size (no bin gets an extra value) and when Size
+  // is 0 (bins keep their initial value).
+  const size_t Size = getLinearSize(Rng);
+  const size_t Rounds = Size / N;
+  const size_t Extra = Size % N;
+  bool Passed = true;
+
+  // Span1
+  {
+    const int ExpectedRemainder = static_cast<int>(Rounds);
+    const int Expected = ExpectedRemainder + 1;
+    for (size_t I = 0; I < N; ++I) {
+      if (I < Extra) {
+        Passed &= (Output1[I] == Expected);
+      } else {
+        Passed &= (Output1[I] == ExpectedRemainder);
+      }
+    }
+  }
+
+  // Span2
+  {
+    T ExpectedRemainder = Identity;
+    for (size_t Round = 0; Round < Rounds; ++Round)
+      ExpectedRemainder = BinaryOperation()(ExpectedRemainder, Value);
+    T Expected = BinaryOperation()(ExpectedRemainder, Value);
+
+    for (size_t I = 0; I < N; ++I) {
+      const T &Want = (I < Extra) ? Expected : ExpectedRemainder;
+      Passed &= (Output2[I] == Want);
+    }
+  }
+
+  free(Output2, Q);
+  free(Output1, Q);
+  NumErrors += (Passed) ? 0 : 1;
+}
+
+struct CustomType { // Minimal user-defined value type for reductions
+  int x;
+  bool operator==(const CustomType &Other) const { return Other.x == x; }
+};
+
+struct CustomBinaryOperation { // Combines operands by adding their x members
+  CustomType operator()(const CustomType &A, const CustomType &B) const {
+    return {A.x + B.x};
+  }
+};
+
+template <size_t N, typename T, typename BinaryOperation, typename Range>
+void test(queue Q, Range Rng, T Identity, T Value) { // runs both pack tests
+  test1<N, T, BinaryOperation>(Q, Rng, Identity, Value); // span + scalar
+  test2<N, T, BinaryOperation>(Q, Rng, Identity, Value); // span + span
+}
+
+int main() {
+  queue Q;
+  using NDRng = sycl::nd_range<1>;
+  const CustomType Zero{0}, One{1};
+
+  // Tests for small spans that can be privatized efficiently
+  // Each combination tests a different sycl::reduction implementation
+  // TODO: Enable range<> tests once parallel_for accepts pack
+  /*test<16, int, std::plus<int>, sycl::range<1>>(Q, 24, 0, 1);
+  test<16, float, std::plus<float>, sycl::range<1>>(Q, 24, 0, 1);
+  test<16, int, std::multiplies<int>, sycl::range<1>>(Q, 24, 1, 2);
+  test<16, CustomType, CustomBinaryOperation, sycl::range<1>>(Q, 24,
+  CustomType{0}, CustomType{1});*/
+
+  test<16, int, std::plus<int>, NDRng>(Q, {24, 8}, 0, 1);
+  test<16, float, std::plus<float>, NDRng>(Q, {24, 8}, 0, 1);
+  test<16, int, std::multiplies<int>, NDRng>(Q, {24, 8}, 1, 2);
+  test<16, int, std::bit_or<int>, NDRng>(Q, {24, 8}, 0, 1);
+  test<16, CustomType, CustomBinaryOperation, NDRng>(Q, {24, 8}, Zero, One);
+  return NumErrors;
+}