intel · steffenlarsen · May 16, 2023 · May 10, 2023 · May 15, 2023 · May 15, 2023
@@ -0,0 +1,140 @@
+//==--- root_group.hpp --- SYCL extension for root groups ------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <sycl/builtins.hpp>
+#include <sycl/ext/oneapi/properties/properties.hpp>
+#include <sycl/memory_enums.hpp>
+#include <sycl/queue.hpp>
+
+#define SYCL_EXT_ONEAPI_ROOT_GROUP 1
+
+namespace sycl {
+__SYCL_INLINE_VER_NAMESPACE(_V1) {
+namespace ext::oneapi::experimental {
+
+namespace info::kernel_queue_specific {
+// TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension once
+// #7598 is merged.
+struct max_num_work_group_sync {
+  using return_type = size_t;
+};
+} // namespace info::kernel_queue_specific
+
+struct use_root_sync_key {
+  using value_t = property_value<use_root_sync_key>;
+};
+
+inline constexpr use_root_sync_key::value_t use_root_sync;
+
+template <> struct is_property_key<use_root_sync_key> : std::true_type {};
+
+template <> struct detail::PropertyToKind<use_root_sync_key> {
+  static constexpr PropKind Kind = PropKind::UseRootSync;
+};
+
+template <>
+struct detail::IsCompileTimeProperty<use_root_sync_key> : std::true_type {};
+
+template <int Dimensions> class root_group {
+public:
+  using id_type = id<Dimensions>;
+  using range_type = range<Dimensions>;
+  using linear_id_type = size_t;
+  static constexpr int dimensions = Dimensions;
+  static constexpr memory_scope fence_scope = memory_scope::device;
+
+  id<Dimensions> get_group_id() const { return id<Dimensions>{}; };
+
+  id<Dimensions> get_local_id() const { return it.get_global_id(); }
+
+  range<Dimensions> get_group_range() const {
+    if constexpr (Dimensions == 3) {
+      return range<3>{1, 1, 1};
+    } else if constexpr (Dimensions == 2) {
+      return range<2>{1, 1};
+    } else {
+      return range<1>{1};
+    }
+  }
+
+  range<Dimensions> get_local_range() const { return it.get_global_range(); };
+
+  range<Dimensions> get_max_local_range() const { return get_local_range(); };
+
+  size_t get_group_linear_id() const { return 0; };
+
+  size_t get_local_linear_id() const { return it.get_global_linear_id(); }
+
+  size_t get_group_linear_range() const { return get_group_range().size(); };
+
+  size_t get_local_linear_range() const { return get_local_range().size(); };
+
+  bool leader() const { return get_local_id() == 0; };
+
+private:
+  friend root_group<Dimensions>
+  nd_item<Dimensions>::ext_oneapi_get_root_group() const;
+
+  root_group(nd_item<Dimensions> it) : it{it} {}
+
+  sycl::nd_item<Dimensions> it;
+};
+
+template <int Dimensions>
+group<Dimensions> get_child_group(root_group<Dimensions> g) {
+  (void)g;
+  return this_group<Dimensions>();
+}
+
+template <int Dimensions> sub_group get_child_group(group<Dimensions> g) {
+  (void)g;
+  return this_sub_group();
+}
+
+namespace this_kernel {
+template <int Dimensions> root_group<Dimensions> get_root_group() {
+  return this_nd_item<Dimensions>().ext_oneapi_get_root_group();
+}
+} // namespace this_kernel
+
+} // namespace ext::oneapi::experimental
+
+template <>
+typename ext::oneapi::experimental::info::kernel_queue_specific::
+    max_num_work_group_sync::return_type
+    kernel::ext_oneapi_get_info<
+        ext::oneapi::experimental::info::kernel_queue_specific::
+            max_num_work_group_sync>(const queue &q) const {
+  // TODO: query the backend to return a value >= 1.
+  return 1;
+}
+
+template <int dimensions>
+void group_barrier(ext::oneapi::experimental::root_group<dimensions> G,
+                   memory_scope FenceScope = decltype(G)::fence_scope) {
+  (void)G;
+  (void)FenceScope;
+#ifdef __SYCL_DEVICE_ONLY__
+  // TODO: Change __spv::Scope::Workgroup to __spv::Scope::Device once backends
+  // support device scope. __spv::Scope::Workgroup is only valid when
+  // max_num_work_group_sync is 1, so that all work items in a root group will
+  // also be in the same work group.
+  __spirv_ControlBarrier(__spv::Scope::Workgroup, __spv::Scope::Workgroup,
+                         __spv::MemorySemanticsMask::SubgroupMemory |
+                             __spv::MemorySemanticsMask::WorkgroupMemory |
+                             __spv::MemorySemanticsMask::CrossWorkgroupMemory);
+#else
+  throw sycl::runtime_error("Barriers are not supported on host device",
+                            PI_ERROR_INVALID_DEVICE);
+#endif
+}
+
+} // __SYCL_INLINE_VER_NAMESPACE(_V1)
+} // namespace sycl
@@ -193,8 +193,9 @@ enum PropKind : uint32_t {
   PipeProtocol = 27,
   ReadyLatency = 28,
   UsesValid = 29,
+  UseRootSync = 30,
   // PropKindSize must always be the last value.
-  PropKindSize = 30,
+  PropKindSize = 31,
 };
 
 // This trait must be specialized for all properties and must have a unique

@@ -25,6 +25,7 @@ namespace sycl {
 __SYCL_INLINE_VER_NAMESPACE(_V1) {
 // Forward declaration
 class context;
+class queue;
 template <backend Backend> class backend_traits;
 template <bundle_state State> class kernel_bundle;
 template <backend BackendName, class SyclObjectT>
@@ -157,6 +158,11 @@ class __SYCL_EXPORT kernel : public detail::OwnerLessBase<kernel> {
   typename detail::is_kernel_device_specific_info_desc<Param>::return_type
       get_info(const device &Device, const range<3> &WGSize) const;
 
+  // TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension
+  // once #7598 is merged.
+  template <typename Param>
+  typename Param::return_type ext_oneapi_get_info(const queue &q) const;
+
 private:
   /// Constructs a SYCL kernel object from a valid kernel_impl instance.
   kernel(std::shared_ptr<detail::kernel_impl> Impl);

@@ -29,6 +29,10 @@ namespace detail {
 class Builder;
 }
 
+namespace ext::oneapi::experimental {
+template <int dimensions> class root_group;
+}
+
 /// Identifies an instance of the function object executing at each point in an
 /// nd_range.
 ///
@@ -198,6 +202,11 @@ template <int dimensions = 1> class nd_item {
     Group.wait_for(events...);
   }
 
+  sycl::ext::oneapi::experimental::root_group<dimensions>
+  ext_oneapi_get_root_group() const {
+    return sycl::ext::oneapi::experimental::root_group<dimensions>{*this};
+  }
+
   nd_item(const nd_item &rhs) = default;
 
   nd_item(nd_item &&rhs) = default;

@@ -0,0 +1,123 @@
+// RUN: %{build} -I . -o %t.out
+// RUN: %{run} %t.out
+
+#include <cassert>
+#include <cstdlib>
+#include <type_traits>
+
+#include <sycl/ext/oneapi/experimental/root_group.hpp>
+#include <sycl/sycl.hpp>
+
+static constexpr int WorkGroupSize = 32;
+
+void testFeatureMacro() {
+  static_assert(SYCL_EXT_ONEAPI_ROOT_GROUP == 1,
+                "SYCL_EXT_ONEAPI_ROOT_GROUP must have a value of 1");
+}
+
+void testQueriesAndProperties() {
+  sycl::queue q;
+  const auto bundle =
+      sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
+  const auto kernel = bundle.get_kernel<class QueryKernel>();
+  const auto maxWGs = kernel.ext_oneapi_get_info<
+      sycl::ext::oneapi::experimental::info::kernel_queue_specific::
+          max_num_work_group_sync>(q);
+  const auto props = sycl::ext::oneapi::experimental::properties{
+      sycl::ext::oneapi::experimental::use_root_sync};
+  q.single_task<class QueryKernel>(props, []() {});
+  static_assert(std::is_same_v<std::remove_cv<decltype(maxWGs)>::type, size_t>,
+                "max_num_work_group_sync query must return size_t");
+  assert(maxWGs >= 1 && "max_num_work_group_sync query failed");
+}
+
+void testRootGroup() {
+  sycl::queue q;
+  const auto bundle =
+      sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
+  const auto kernel = bundle.get_kernel<class RootGroupKernel>();
+  const auto maxWGs = kernel.ext_oneapi_get_info<
+      sycl::ext::oneapi::experimental::info::kernel_queue_specific::
+          max_num_work_group_sync>(q);
+  const auto props = sycl::ext::oneapi::experimental::properties{
+      sycl::ext::oneapi::experimental::use_root_sync};
+
+  int *data = sycl::malloc_shared<int>(maxWGs * WorkGroupSize, q);
+  const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
+  q.parallel_for<class RootGroupKernel>(range, props, [=](sycl::nd_item<1> it) {
+    auto root = it.ext_oneapi_get_root_group();
+    data[root.get_local_id()] = root.get_local_id();
+    sycl::group_barrier(root);
+
+    root = sycl::ext::oneapi::experimental::this_kernel::get_root_group<1>();
+    int sum = data[root.get_local_id()] +
+              data[root.get_local_range() - root.get_local_id() - 1];
+    sycl::group_barrier(root);
+    data[root.get_local_id()] = sum;
+  });
+  q.wait();
+
+  const int workItemCount = static_cast<int>(range.get_global_range().size());
+  for (int i = 0; i < workItemCount; i++) {
+    assert(data[i] == (workItemCount - 1));
+  }
+}
+
+void testRootGroupFunctions() {
+  sycl::queue q;
+  const auto bundle =
+      sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
+  const auto kernel = bundle.get_kernel<class RootGroupFunctionsKernel>();
+  const auto maxWGs = kernel.ext_oneapi_get_info<
+      sycl::ext::oneapi::experimental::info::kernel_queue_specific::
+          max_num_work_group_sync>(q);
+  const auto props = sycl::ext::oneapi::experimental::properties{
+      sycl::ext::oneapi::experimental::use_root_sync};
+
+  constexpr int testCount = 10;
+  bool *testResults = sycl::malloc_shared<bool>(testCount, q);
+  const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
+  q.parallel_for<class RootGroupFunctionsKernel>(
+      range, props, [=](sycl::nd_item<1> it) {
+        const auto root = it.ext_oneapi_get_root_group();
+        if (root.leader() || root.get_local_id() == 3) {
+          testResults[0] = root.get_group_id() == sycl::id<1>(0);
+          testResults[1] = root.leader()
+                               ? root.get_local_id() == sycl::id<1>(0)
+                               : root.get_local_id() == sycl::id<1>(3);
+          testResults[2] = root.get_group_range() == sycl::range<1>(1);
+          testResults[3] =
+              root.get_local_range() == sycl::range<1>(WorkGroupSize);
+          testResults[4] =
+              root.get_max_local_range() == sycl::range<1>(WorkGroupSize);
+          testResults[5] = root.get_group_linear_id() == 0;
+          testResults[6] =
+              root.get_local_linear_id() == root.get_local_id().get(0);
+          testResults[7] = root.get_group_linear_range() == 1;
+          testResults[8] = root.get_local_linear_range() == WorkGroupSize;
+
+          const auto child =
+              sycl::ext::oneapi::experimental::get_child_group(root);
+          const auto grandchild =
+              sycl::ext::oneapi::experimental::get_child_group(child);
+          testResults[9] = child == it.get_group();
+          static_assert(
+              std::is_same_v<std::remove_cv<decltype(grandchild)>::type,
+                             sycl::sub_group>,
+              "get_child_group(sycl::group) must return a sycl::sub_group");
+        }
+      });
+  q.wait();
+
+  for (int i = 0; i < testCount; i++) {
+    assert(testResults[i]);
+  }
+}
+
+int main() {
+  testFeatureMacro();
+  testQueriesAndProperties();
+  testRootGroup();
+  testRootGroupFunctions();
+  return EXIT_SUCCESS;
+}