Skip to content

[SYCL] Implement sycl_ext_oneapi_root_group #9396

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
May 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 140 additions & 0 deletions sycl/include/sycl/ext/oneapi/experimental/root_group.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
//==--- root_group.hpp --- SYCL extension for root groups ------------------==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#pragma once

#include <sycl/builtins.hpp>
#include <sycl/ext/oneapi/properties/properties.hpp>
#include <sycl/memory_enums.hpp>
#include <sycl/queue.hpp>

#define SYCL_EXT_ONEAPI_ROOT_GROUP 1

namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {
namespace ext::oneapi::experimental {

namespace info::kernel_queue_specific {
// TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension once
// #7598 is merged.
struct max_num_work_group_sync {
using return_type = size_t;
};
} // namespace info::kernel_queue_specific

struct use_root_sync_key {
using value_t = property_value<use_root_sync_key>;
};

inline constexpr use_root_sync_key::value_t use_root_sync;

template <> struct is_property_key<use_root_sync_key> : std::true_type {};

template <> struct detail::PropertyToKind<use_root_sync_key> {
static constexpr PropKind Kind = PropKind::UseRootSync;
};

template <>
struct detail::IsCompileTimeProperty<use_root_sync_key> : std::true_type {};

template <int Dimensions> class root_group {
public:
using id_type = id<Dimensions>;
using range_type = range<Dimensions>;
using linear_id_type = size_t;
static constexpr int dimensions = Dimensions;
static constexpr memory_scope fence_scope = memory_scope::device;

id<Dimensions> get_group_id() const { return id<Dimensions>{}; };

id<Dimensions> get_local_id() const { return it.get_global_id(); }

range<Dimensions> get_group_range() const {
if constexpr (Dimensions == 3) {
return range<3>{1, 1, 1};
} else if constexpr (Dimensions == 2) {
return range<2>{1, 1};
} else {
return range<1>{1};
}
}

range<Dimensions> get_local_range() const { return it.get_global_range(); };

range<Dimensions> get_max_local_range() const { return get_local_range(); };

size_t get_group_linear_id() const { return 0; };

size_t get_local_linear_id() const { return it.get_global_linear_id(); }

size_t get_group_linear_range() const { return get_group_range().size(); };

size_t get_local_linear_range() const { return get_local_range().size(); };

bool leader() const { return get_local_id() == 0; };

private:
friend root_group<Dimensions>
nd_item<Dimensions>::ext_oneapi_get_root_group() const;

root_group(nd_item<Dimensions> it) : it{it} {}

sycl::nd_item<Dimensions> it;
};

template <int Dimensions>
group<Dimensions> get_child_group(root_group<Dimensions> g) {
(void)g;
return this_group<Dimensions>();
}

template <int Dimensions> sub_group get_child_group(group<Dimensions> g) {
(void)g;
return this_sub_group();
}

namespace this_kernel {
template <int Dimensions> root_group<Dimensions> get_root_group() {
return this_nd_item<Dimensions>().ext_oneapi_get_root_group();
}
} // namespace this_kernel

} // namespace ext::oneapi::experimental

template <>
typename ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync::return_type
kernel::ext_oneapi_get_info<
ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync>(const queue &q) const {
// TODO: query the backend to return a value >= 1.
return 1;
}

template <int dimensions>
void group_barrier(ext::oneapi::experimental::root_group<dimensions> G,
memory_scope FenceScope = decltype(G)::fence_scope) {
(void)G;
(void)FenceScope;
#ifdef __SYCL_DEVICE_ONLY__
// TODO: Change __spv::Scope::Workgroup to __spv::Scope::Device once backends
// support device scope. __spv::Scope::Workgroup is only valid when
// max_num_work_group_sync is 1, so that all work items in a root group will
// also be in the same work group.
__spirv_ControlBarrier(__spv::Scope::Workgroup, __spv::Scope::Workgroup,
__spv::MemorySemanticsMask::SubgroupMemory |
__spv::MemorySemanticsMask::WorkgroupMemory |
__spv::MemorySemanticsMask::CrossWorkgroupMemory);
#else
throw sycl::runtime_error("Barriers are not supported on host device",
PI_ERROR_INVALID_DEVICE);
#endif
}

} // __SYCL_INLINE_VER_NAMESPACE(_V1)
} // namespace sycl
3 changes: 2 additions & 1 deletion sycl/include/sycl/ext/oneapi/properties/property.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,8 +193,9 @@ enum PropKind : uint32_t {
PipeProtocol = 27,
ReadyLatency = 28,
UsesValid = 29,
UseRootSync = 30,
// PropKindSize must always be the last value.
PropKindSize = 30,
PropKindSize = 31,
};

// This trait must be specialized for all properties and must have a unique
Expand Down
6 changes: 6 additions & 0 deletions sycl/include/sycl/kernel.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ namespace sycl {
__SYCL_INLINE_VER_NAMESPACE(_V1) {
// Forward declaration
class context;
class queue;
template <backend Backend> class backend_traits;
template <bundle_state State> class kernel_bundle;
template <backend BackendName, class SyclObjectT>
Expand Down Expand Up @@ -157,6 +158,11 @@ class __SYCL_EXPORT kernel : public detail::OwnerLessBase<kernel> {
typename detail::is_kernel_device_specific_info_desc<Param>::return_type
get_info(const device &Device, const range<3> &WGSize) const;

// TODO: Revisit and align with sycl_ext_oneapi_forward_progress extension
// once #7598 is merged.
template <typename Param>
typename Param::return_type ext_oneapi_get_info(const queue &q) const;

private:
/// Constructs a SYCL kernel object from a valid kernel_impl instance.
kernel(std::shared_ptr<detail::kernel_impl> Impl);
Expand Down
9 changes: 9 additions & 0 deletions sycl/include/sycl/nd_item.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ namespace detail {
class Builder;
}

namespace ext::oneapi::experimental {
template <int dimensions> class root_group;
}

/// Identifies an instance of the function object executing at each point in an
/// nd_range.
///
Expand Down Expand Up @@ -198,6 +202,11 @@ template <int dimensions = 1> class nd_item {
Group.wait_for(events...);
}

sycl::ext::oneapi::experimental::root_group<dimensions>
ext_oneapi_get_root_group() const {
return sycl::ext::oneapi::experimental::root_group<dimensions>{*this};
}

nd_item(const nd_item &rhs) = default;

nd_item(nd_item &&rhs) = default;
Expand Down
123 changes: 123 additions & 0 deletions sycl/test-e2e/GroupAlgorithm/root_group.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
// RUN: %{build} -I . -o %t.out
// RUN: %{run} %t.out

#include <cassert>
#include <cstdlib>
#include <type_traits>

#include <sycl/ext/oneapi/experimental/root_group.hpp>
#include <sycl/sycl.hpp>

static constexpr int WorkGroupSize = 32;

void testFeatureMacro() {
static_assert(SYCL_EXT_ONEAPI_ROOT_GROUP == 1,
"SYCL_EXT_ONEAPI_ROOT_GROUP must have a value of 1");
}

void testQueriesAndProperties() {
sycl::queue q;
const auto bundle =
sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
const auto kernel = bundle.get_kernel<class QueryKernel>();
const auto maxWGs = kernel.ext_oneapi_get_info<
sycl::ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync>(q);
const auto props = sycl::ext::oneapi::experimental::properties{
sycl::ext::oneapi::experimental::use_root_sync};
q.single_task<class QueryKernel>(props, []() {});
static_assert(std::is_same_v<std::remove_cv<decltype(maxWGs)>::type, size_t>,
"max_num_work_group_sync query must return size_t");
assert(maxWGs >= 1 && "max_num_work_group_sync query failed");
}

void testRootGroup() {
sycl::queue q;
const auto bundle =
sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
const auto kernel = bundle.get_kernel<class RootGroupKernel>();
const auto maxWGs = kernel.ext_oneapi_get_info<
sycl::ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync>(q);
const auto props = sycl::ext::oneapi::experimental::properties{
sycl::ext::oneapi::experimental::use_root_sync};

int *data = sycl::malloc_shared<int>(maxWGs * WorkGroupSize, q);
const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
q.parallel_for<class RootGroupKernel>(range, props, [=](sycl::nd_item<1> it) {
auto root = it.ext_oneapi_get_root_group();
data[root.get_local_id()] = root.get_local_id();
sycl::group_barrier(root);

root = sycl::ext::oneapi::experimental::this_kernel::get_root_group<1>();
int sum = data[root.get_local_id()] +
data[root.get_local_range() - root.get_local_id() - 1];
sycl::group_barrier(root);
data[root.get_local_id()] = sum;
});
q.wait();

const int workItemCount = static_cast<int>(range.get_global_range().size());
for (int i = 0; i < workItemCount; i++) {
assert(data[i] == (workItemCount - 1));
}
}

void testRootGroupFunctions() {
sycl::queue q;
const auto bundle =
sycl::get_kernel_bundle<sycl::bundle_state::executable>(q.get_context());
const auto kernel = bundle.get_kernel<class RootGroupFunctionsKernel>();
const auto maxWGs = kernel.ext_oneapi_get_info<
sycl::ext::oneapi::experimental::info::kernel_queue_specific::
max_num_work_group_sync>(q);
const auto props = sycl::ext::oneapi::experimental::properties{
sycl::ext::oneapi::experimental::use_root_sync};

constexpr int testCount = 10;
bool *testResults = sycl::malloc_shared<bool>(testCount, q);
const auto range = sycl::nd_range<1>{maxWGs * WorkGroupSize, WorkGroupSize};
q.parallel_for<class RootGroupFunctionsKernel>(
range, props, [=](sycl::nd_item<1> it) {
const auto root = it.ext_oneapi_get_root_group();
if (root.leader() || root.get_local_id() == 3) {
testResults[0] = root.get_group_id() == sycl::id<1>(0);
testResults[1] = root.leader()
? root.get_local_id() == sycl::id<1>(0)
: root.get_local_id() == sycl::id<1>(3);
testResults[2] = root.get_group_range() == sycl::range<1>(1);
testResults[3] =
root.get_local_range() == sycl::range<1>(WorkGroupSize);
testResults[4] =
root.get_max_local_range() == sycl::range<1>(WorkGroupSize);
testResults[5] = root.get_group_linear_id() == 0;
testResults[6] =
root.get_local_linear_id() == root.get_local_id().get(0);
testResults[7] = root.get_group_linear_range() == 1;
testResults[8] = root.get_local_linear_range() == WorkGroupSize;

const auto child =
sycl::ext::oneapi::experimental::get_child_group(root);
const auto grandchild =
sycl::ext::oneapi::experimental::get_child_group(child);
testResults[9] = child == it.get_group();
static_assert(
std::is_same_v<std::remove_cv<decltype(grandchild)>::type,
sycl::sub_group>,
"get_child_group(sycl::group) must return a sycl::sub_group");
}
});
q.wait();

for (int i = 0; i < testCount; i++) {
assert(testResults[i]);
}
}

int main() {
testFeatureMacro();
testQueriesAndProperties();
testRootGroup();
testRootGroupFunctions();
return EXIT_SUCCESS;
}