intel
diff --git a/‎sycl/test-e2e/KernelFusion/GroupAlgorithm/all_of.cpp
Lines changed: 75 additions & 0 deletions b/‎sycl/test-e2e/KernelFusion/GroupAlgorithm/all_of.cpp
Lines changed: 75 additions & 0 deletions
diff --git a/‎sycl/test-e2e/KernelFusion/GroupAlgorithm/exclusive_scan.cpp
Lines changed: 183 additions & 0 deletions b/‎sycl/test-e2e/KernelFusion/GroupAlgorithm/exclusive_scan.cpp
Lines changed: 183 additions & 0 deletions
diff --git a/‎sycl/test-e2e/KernelFusion/GroupAlgorithm/support.h
Lines changed: 45 additions & 0 deletions b/‎sycl/test-e2e/KernelFusion/GroupAlgorithm/support.h
Lines changed: 45 additions & 0 deletions
diff --git a/‎sycl/test-e2e/KernelFusion/GroupFunctions/group_barrier.cpp
Lines changed: 64 additions & 0 deletions b/‎sycl/test-e2e/KernelFusion/GroupFunctions/group_barrier.cpp
Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,75 @@
+// RUN: %{build} -fsycl-embed-ir -I . -o %t.out
+// RUN: %{run} %t.out
+
+#include "../helpers.hpp"
+#include "support.h"
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <numeric>
+#include <sycl/sycl.hpp>
+
+// COM: Check all_of works with kernel fusion.
+
+using namespace sycl;
+
+// Kernel name for the all_of test. It is templated on the predicate type so
+// that each instantiation of test() names a distinct kernel.
+template <class Predicate> class all_of_kernel;
+
+// Unary predicate: true exactly when its argument leaves no remainder when
+// divided by two.
+struct IsEven {
+  bool operator()(int i) const {
+    const bool divisible_by_two = i % 2 == 0;
+    return divisible_by_two;
+  }
+};
+
+// Runs the three all_of flavours (value, value + predicate, joint range) in a
+// single 64-item work-group inside a fusion region, then compares each result
+// with std::all_of evaluated on the host.
+//
+// q      - queue the kernels are submitted to.
+// input  - host container backing the input buffer (taken by value).
+// output - host container receiving the three results (taken by value).
+// pred   - unary predicate applied to the elements.
+template <typename InputContainer, typename OutputContainer, class Predicate>
+void test(queue q, InputContainer input, OutputContainer output,
+          Predicate pred) {
+  using kernel_name = all_of_kernel<Predicate>;
+  namespace fusion = ext::codeplay::experimental;
+  size_t num_elems = input.size();
+  size_t wg_size = 64;
+  // NOTE(review): the buffers live in their own scope, presumably so that
+  // their destruction makes the results visible in `input`/`output` before the
+  // host-side checks below -- confirm against the SYCL buffer semantics.
+  {
+    buffer<int> in_buf(input.data(), input.size());
+    buffer<bool> out_buf(output.data(), output.size());
+
+    fusion::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    // First kernel of the fusion region: fill the input buffer.
+    iota(q, in_buf, 0);
+
+    // Second kernel: evaluate all three all_of variants.
+    q.submit([&](handler &cgh) {
+      accessor src{in_buf, cgh, sycl::read_only};
+      accessor dst{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<kernel_name>(
+          nd_range<1>(wg_size, wg_size), [=](nd_item<1> item) {
+            group<1> grp = item.get_group();
+            int local_idx = item.get_local_id(0);
+            dst[0] = all_of_group(grp, pred(src[local_idx]));
+            dst[1] = all_of_group(grp, src[local_idx], pred);
+            auto first = src.template get_multi_ptr<access::decorated::no>();
+            dst[2] = joint_all_of(grp, first, first + num_elems, pred);
+          });
+    });
+
+    complete_fusion_with_check(fw, fusion::property::no_barriers{});
+  }
+  const bool expected = std::all_of(input.begin(), input.end(), pred);
+  assert(output[0] == expected);
+  assert(output[1] == expected);
+  assert(output[2] == expected);
+}
+
+// Entry point: runs the all_of fusion test with the IsEven predicate on a
+// fusion-enabled queue, or skips when isSupportedDevice() rejects the device.
+int main() {
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+  if (!isSupportedDevice(q.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  constexpr int N = 128;
+  // Value-initialize the input: test() takes the array by value, and copying
+  // an array of indeterminate ints would be undefined behavior.
+  std::array<int, N> input{};
+  std::array<bool, 3> output;
+  std::fill(output.begin(), output.end(), false);
+
+  test(q, input, output, IsEven());
+
+  std::cout << "Test passed." << std::endl;
+}
@@ -0,0 +1,183 @@
+// RUN: %{build} -fsycl-embed-ir -I . -o %t.out
+// RUN: %{run} %t.out
+
+#include "../../helpers.hpp"
+#include "../helpers.hpp"
+#include "support.h"
+#include <algorithm>
+#include <cassert>
+#include <iostream>
+#include <limits>
+#include <numeric>
+#include <sycl/sycl.hpp>
+#include <vector>
+using namespace sycl;
+
+// COM: Check exclusive_scan works with fusion
+
+// Kernel name for the exclusive_scan tests. SpecializationKernelName
+// distinguishes the instantiations of test(); TestNumber distinguishes the
+// kernels submitted within one instantiation.
+template <class SpecializationKernelName, int TestNumber>
+class exclusive_scan_kernel;
+
+// Exercises four exclusive-scan group algorithms under kernel fusion:
+// exclusive_scan_over_group without and with an initial value, then
+// joint_exclusive_scan without and with an initial value. Each variant runs as
+// a single 64-item work-group inside its own fusion region, and its output is
+// compared with emu::exclusive_scan computed on the host.
+//
+// q         - queue the kernels are submitted to.
+// input     - host container backing the input buffer (taken by value).
+// output    - host container receiving the scan results (taken by value).
+// binary_op - binary operation the scan combines elements with.
+// identity  - seed handed to the host reference scan for the variants that
+//             take no explicit initial value.
+template <typename SpecializationKernelName, typename InputContainer,
+          typename OutputContainer, class BinaryOperation>
+void test(queue q, InputContainer input, OutputContainer output,
+          BinaryOperation binary_op,
+          typename OutputContainer::value_type identity) {
+  using InputT = typename InputContainer::value_type;
+  using OutputT = typename OutputContainer::value_type;
+  using kernel_name0 = exclusive_scan_kernel<SpecializationKernelName, 0>;
+  using kernel_name1 = exclusive_scan_kernel<SpecializationKernelName, 1>;
+  using kernel_name2 = exclusive_scan_kernel<SpecializationKernelName, 2>;
+  using kernel_name3 = exclusive_scan_kernel<SpecializationKernelName, 3>;
+  namespace fusion = ext::codeplay::experimental;
+  OutputT init = 42;
+  size_t num_elems = input.size();
+  size_t wg_size = 64;
+  std::vector<OutputT> expected(num_elems);
+
+  // Variant 0: exclusive_scan_over_group without an initial value. Only the
+  // first wg_size elements are produced and checked.
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    fusion::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    iota(q, in_buf, 0);
+
+    q.submit([&](handler &cgh) {
+      accessor src{in_buf, cgh, sycl::read_only};
+      accessor dst{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<kernel_name0>(
+          nd_range<1>(wg_size, wg_size), [=](nd_item<1> item) {
+            group<1> grp = item.get_group();
+            int local_idx = item.get_local_id(0);
+            dst[local_idx] =
+                exclusive_scan_over_group(grp, src[local_idx], binary_op);
+          });
+    });
+
+    complete_fusion_with_check(fw, fusion::property::no_barriers{});
+  }
+  emu::exclusive_scan(input.begin(), input.begin() + wg_size,
+                      expected.begin(), identity, binary_op);
+  assert(
+      std::equal(output.begin(), output.begin() + wg_size, expected.begin()));
+
+  // Fill to test fusion again
+  std::fill(input.begin(), input.end(), 0);
+
+  // Variant 1: exclusive_scan_over_group seeded with `init`.
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+
+    fusion::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    iota(q, in_buf, 0);
+
+    q.submit([&](handler &cgh) {
+      accessor src{in_buf, cgh, sycl::read_only};
+      accessor dst{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<kernel_name1>(
+          nd_range<1>(wg_size, wg_size), [=](nd_item<1> item) {
+            group<1> grp = item.get_group();
+            int local_idx = item.get_local_id(0);
+            dst[local_idx] =
+                exclusive_scan_over_group(grp, src[local_idx], init, binary_op);
+          });
+    });
+
+    complete_fusion_with_check(fw, fusion::property::no_barriers{});
+  }
+  emu::exclusive_scan(input.begin(), input.begin() + wg_size,
+                      expected.begin(), init, binary_op);
+  assert(
+      std::equal(output.begin(), output.begin() + wg_size, expected.begin()));
+
+  // Fill to test fusion again
+  std::fill(input.begin(), input.end(), 0);
+
+  // Variant 2: joint_exclusive_scan over all num_elems elements, without an
+  // initial value.
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+
+    fusion::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    iota(q, in_buf, 0);
+
+    q.submit([&](handler &cgh) {
+      accessor src{in_buf, cgh, sycl::read_only};
+      accessor dst{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<kernel_name2>(
+          nd_range<1>(wg_size, wg_size), [=](nd_item<1> item) {
+            group<1> grp = item.get_group();
+            auto first = src.template get_multi_ptr<access::decorated::no>();
+            joint_exclusive_scan(
+                grp, first, first + num_elems,
+                dst.template get_multi_ptr<access::decorated::no>(),
+                binary_op);
+          });
+    });
+    complete_fusion_with_check(fw, fusion::property::no_barriers{});
+  }
+  emu::exclusive_scan(input.begin(), input.begin() + num_elems,
+                      expected.begin(), identity, binary_op);
+  assert(
+      std::equal(output.begin(), output.begin() + num_elems, expected.begin()));
+
+  // Fill to test fusion again
+  std::fill(input.begin(), input.end(), 0);
+
+  // Variant 3: joint_exclusive_scan over all num_elems elements, seeded with
+  // `init`.
+  {
+    buffer<InputT> in_buf(input.data(), input.size());
+    buffer<OutputT> out_buf(output.data(), output.size());
+    fusion::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    iota(q, in_buf, 0);
+
+    q.submit([&](handler &cgh) {
+      accessor src{in_buf, cgh, sycl::read_only};
+      accessor dst{out_buf, cgh, sycl::write_only, sycl::no_init};
+      cgh.parallel_for<kernel_name3>(
+          nd_range<1>(wg_size, wg_size), [=](nd_item<1> item) {
+            group<1> grp = item.get_group();
+            auto first = src.template get_multi_ptr<access::decorated::no>();
+            joint_exclusive_scan(
+                grp, first, first + num_elems,
+                dst.template get_multi_ptr<access::decorated::no>(), init,
+                binary_op);
+          });
+    });
+    complete_fusion_with_check(fw, fusion::property::no_barriers{});
+  }
+  emu::exclusive_scan(input.begin(), input.begin() + num_elems,
+                      expected.begin(), init, binary_op);
+  assert(
+      std::equal(output.begin(), output.begin() + num_elems, expected.begin()));
+}
+
+// Entry point: runs the exclusive_scan fusion test once per binary operation
+// (each with that call's identity argument) on a fusion-enabled queue, or
+// skips when isSupportedDevice() rejects the device.
+int main() {
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+  if (!isSupportedDevice(q.get_device())) {
+    std::cout << "Skipping test\n";
+    return 0;
+  }
+
+  constexpr int N = 128;
+  // Value-initialize the input: test() takes the array by value, and copying
+  // an array of indeterminate ints would be undefined behavior.
+  std::array<int, N> input{};
+  std::array<int, N> output;
+  std::fill(output.begin(), output.end(), 0);
+
+  // Transparent (void-specialized) function objects.
+  test<class KernelNamePlusV>(q, input, output, sycl::plus<>(), 0);
+  test<class KernelNameMinimumV>(q, input, output, sycl::minimum<>(),
+                                 std::numeric_limits<int>::max());
+  test<class KernelNameMaximumV>(q, input, output, sycl::maximum<>(),
+                                 std::numeric_limits<int>::lowest());
+
+  // Function objects specialized for int.
+  test<class KernelNamePlusI>(q, input, output, sycl::plus<int>(), 0);
+  test<class KernelNameMinimumI>(q, input, output, sycl::minimum<int>(),
+                                 std::numeric_limits<int>::max());
+  test<class KernelNameMaximumI>(q, input, output, sycl::maximum<int>(),
+                                 std::numeric_limits<int>::lowest());
+  test<class KernelName_VzAPutpBRRJrQPB>(q, input, output,
+                                         sycl::multiplies<int>(), 1);
+  test<class KernelName_UXdGbr>(q, input, output, sycl::bit_or<int>(), 0);
+  test<class KernelName_saYaodNyJknrPW>(q, input, output, sycl::bit_xor<int>(),
+                                        0);
+  test<class KernelName_GPcuAlvAOjrDyP>(q, input, output, sycl::bit_and<int>(),
+                                        ~0);
+
+  std::cout << "Test passed." << std::endl;
+}
@@ -0,0 +1,45 @@
+#include <iostream>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi;
+
+// Reports whether device D is accepted by the group-algorithm tests:
+//  - platforms whose name contains "CUDA" or "Level-Zero" always qualify;
+//  - platforms whose name contains "OpenCL" qualify only when the version
+//    number read from the device version string lies in ["2.0", "3.0").
+// Any other platform, or an OpenCL version string too short to contain the
+// number, yields false.
+//
+// Declared inline because the definition lives in a header.
+inline bool isSupportedDevice(device D) {
+  const std::string PlatformName =
+      D.get_platform().get_info<sycl::info::platform::name>();
+  if (PlatformName.find("CUDA") != std::string::npos)
+    return true;
+
+  if (PlatformName.find("Level-Zero") != std::string::npos)
+    return true;
+
+  if (PlatformName.find("OpenCL") != std::string::npos) {
+    const std::string Version = D.get_info<sycl::info::device::version>();
+
+    // The version number is read from a fixed position: three characters
+    // starting at offset 7.
+    // NOTE(review): this presumably relies on an "OpenCL <major>.<minor> ..."
+    // layout of the version string -- confirm for the targeted runtimes.
+    constexpr std::string::size_type NumberPos = 7;
+    constexpr std::string::size_type NumberLen = 3;
+
+    // Reject strings that cannot hold the full number: substr() would throw
+    // std::out_of_range for pos > size(), and a truncated number such as "3"
+    // would wrongly compare as less than "3.0".
+    if (Version.size() < NumberPos + NumberLen)
+      return false;
+
+    // Group collectives are mandatory in OpenCL 2.0 but optional in 3.0.
+    const std::string Number = Version.substr(NumberPos, NumberLen);
+    if (Number >= "2.0" && Number < "3.0")
+      return true;
+  }
+
+  return false;
+}
+
+// Compares x and y and yields a single bool. For sycl::vec operands the
+// components are compared one at a time, because == on vecs returns a vector
+// describing which components matched rather than a bool.
+template <typename T, typename S> bool equal(const T &x, const S &y) {
+  if constexpr (!sycl::detail::is_vec<T>::value) {
+    return x == y;
+  } else {
+    // Stop at the first component that differs.
+    for (int idx = 0; idx < x.size(); ++idx) {
+      if (x[idx] != y[idx])
+        return false;
+    }
+    return true;
+  }
+}
+
+// Returns true when every element of [begin1, end1) compares equal, via
+// equal(), to the element at the same offset in the range starting at begin2.
+template <typename T1, typename T2>
+bool ranges_equal(T1 begin1, T1 end1, T2 begin2) {
+  while (begin1 != end1) {
+    if (!equal(*begin1, *begin2))
+      return false;
+    ++begin1;
+    ++begin2;
+  }
+  return true;
+}
@@ -0,0 +1,64 @@
+// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{run} %t.out
+
+// Test complete_fusion preserves barriers by launching a kernel that requires a
+// barrier for correctness.
+
+#include <sycl/sycl.hpp>
+
+#include "../helpers.hpp"
+
+using namespace sycl;
+
+// Name of the kernel that main() submits through parallel_for.
+class Kernel;
+
+// Fuses an iota fill with a kernel that stages each work-group's slice of the
+// input in local memory, synchronizes the group, and then copies the staged
+// values to the output. The output only matches 0..dataSize-1 if the group
+// barrier inside the kernel is still present after fusion.
+int main() {
+  constexpr size_t dataSize = 512;
+  constexpr size_t localSize = 64;
+  std::array<int, dataSize> hostIn;
+  std::array<int, dataSize> hostOut;
+  hostOut.fill(0);
+
+  queue q{ext::codeplay::experimental::property::queue::enable_fusion{}};
+  {
+    buffer<int> buff_in{hostIn};
+    buffer<int> buff_out{hostOut};
+
+    ext::codeplay::experimental::fusion_wrapper fw{q};
+    fw.start_fusion();
+
+    assert(fw.is_in_fusion_mode() && "Queue should be in fusion mode");
+
+    // First kernel of the fusion region: fill the input buffer.
+    iota(q, buff_in, 0);
+
+    // Needed implicit group barrier
+    // NOTE(review): presumably this refers to a barrier fusion must place
+    // between the iota fill above and the kernel below -- confirm.
+
+    q.submit([&](handler &cgh) {
+      accessor inAcc(buff_in, cgh, read_only);
+      accessor outAcc(buff_out, cgh, write_only, no_init);
+      local_accessor<int> staging(localSize, cgh);
+      cgh.parallel_for<Kernel>(
+          nd_range<1>{{dataSize}, {localSize}}, [=](nd_item<1> item) {
+            auto wg = item.get_group();
+            // Only the first work-item of each group stages the group's
+            // localSize-element slice of the input.
+            if (item.get_local_id() == 0) {
+              auto sliceBegin =
+                  inAcc.begin() +
+                  static_cast<int64_t>(localSize * wg.get_group_id(0));
+              auto sliceEnd = sliceBegin + localSize;
+              std::copy(sliceBegin, sliceEnd, staging.begin());
+            }
+            // Test following barrier is preserved
+            group_barrier(wg);
+            // Every work-item reads the value staged by the first work-item.
+            outAcc[item.get_global_id()] = staging[item.get_local_id()];
+          });
+    });
+
+    complete_fusion_with_check(fw);
+  }
+
+  // Check the results
+  for (size_t idx = 0; idx < dataSize; ++idx) {
+    assert(hostOut[idx] == static_cast<int>(idx) && "Computation error");
+  }
+
+  return 0;
+}