[SYCL][ABI Break] Remove ext::oneapi::reduction #6634

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits on Aug 24, 2022
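
The changes below stay inside internal `detail` namespaces and delete the `ext::oneapi` reduction extension header; the portable way to express a reduction remains the standard SYCL 2020 `sycl::reduction` interface. A minimal usage sketch of that interface, with illustrative names and assuming only the standard API:

```cpp
#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;
  int Result = 0;
  {
    sycl::buffer<int> ResultBuf{&Result, 1};
    Q.submit([&](sycl::handler &CGH) {
      // Standard SYCL 2020 spelling; no ext::oneapi reduction header needed.
      auto Sum = sycl::reduction(ResultBuf, CGH, sycl::plus<int>());
      CGH.parallel_for(sycl::range<1>{1024}, Sum,
                       [=](sycl::id<1> I, auto &Acc) { Acc += int(I[0]); });
    });
  } // Buffer destruction writes the reduced value back into Result.
  return Result == 1024 * 1023 / 2 ? 0 : 1;
}
```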
2 changes: 1 addition & 1 deletion sycl/CMakeLists.txt
@@ -30,7 +30,7 @@ set(SYCL_MINOR_VERSION 7)
 set(SYCL_PATCH_VERSION 0)
 # Don't forget to re-enable sycl_symbols_windows.dump once we leave ABI-breaking
 # window!
-set(SYCL_DEV_ABI_VERSION 10)
+set(SYCL_DEV_ABI_VERSION 11)
 if (SYCL_ADD_DEV_VERSION_POSTFIX)
   set(SYCL_VERSION_POSTFIX "-${SYCL_DEV_ABI_VERSION}")
 endif()
2,552 changes: 0 additions & 2,552 deletions sycl/include/sycl/ext/oneapi/reduction.hpp

This file was deleted.

68 changes: 25 additions & 43 deletions sycl/include/sycl/handler.hpp
@@ -234,11 +234,6 @@ class RoundedRangeKernelWithKH {
   KernelType KernelFunc;
 };
 
-} // namespace detail
-
-namespace ext {
-namespace oneapi {
-namespace detail {
 template <typename T, class BinaryOperation, int Dims, size_t Extent,
           typename RedOutVar>
 class reduction_impl_algo;
@@ -320,8 +315,6 @@ template <class FunctorTy>
 event withAuxHandler(std::shared_ptr<detail::queue_impl> Queue, bool IsHost,
                      FunctorTy Func);
 } // namespace detail
-} // namespace oneapi
-} // namespace ext
 
 /// Command group handler class.
 ///
Expand Down Expand Up @@ -468,8 +461,7 @@ class __SYCL_EXPORT handler {
}

template <class FunctorTy>
friend event
ext::oneapi::detail::withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,
friend event detail::withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,
bool IsHost, FunctorTy Func);
/// }@

@@ -1616,20 +1608,18 @@ class __SYCL_EXPORT handler {
 #ifdef __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS
         __SYCL_REDUCTION_NUM_CONCURRENT_WORKGROUPS;
 #else
-        ext::oneapi::detail::reduGetMaxNumConcurrentWorkGroups(MQueue);
+        detail::reduGetMaxNumConcurrentWorkGroups(MQueue);
 #endif
     // TODO: currently the preferred work group size is determined for the given
     // queue/device, while it is safer to use queries to the kernel pre-compiled
     // for the device.
-    size_t PrefWGSize =
-        ext::oneapi::detail::reduGetPreferredWGSize(MQueue, OneElemSize);
-    if (ext::oneapi::detail::reduCGFuncForRange<KernelName>(
-            *this, KernelFunc, Range, PrefWGSize, NumConcurrentWorkGroups,
-            Redu)) {
+    size_t PrefWGSize = detail::reduGetPreferredWGSize(MQueue, OneElemSize);
+    if (detail::reduCGFuncForRange<KernelName>(*this, KernelFunc, Range,
+                                               PrefWGSize,
+                                               NumConcurrentWorkGroups, Redu)) {
       this->finalize();
       MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
-        ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
-            CopyHandler, Redu);
+        detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
       });
     }
   }
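
The TODO above points out that the preferred work-group size is derived from queue/device queries rather than from the kernel pre-compiled for the device. As a rough sketch of the kind of device query involved (a hypothetical helper for illustration, not the actual reduGetPreferredWGSize):

```cpp
#include <algorithm>
#include <cstddef>
#include <sycl/sycl.hpp>

// Hypothetical helper, illustration only: cap a requested work-group size by
// the device-wide maximum. Kernel-specific limits (the point of the TODO) can
// be tighter and would require querying the compiled kernel instead.
std::size_t pickWorkGroupSize(const sycl::queue &Q, std::size_t Preferred) {
  std::size_t DevMax =
      Q.get_device().get_info<sycl::info::device::max_work_group_size>();
  return std::min(Preferred, DevMax);
}
```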
@@ -1655,8 +1645,8 @@ class __SYCL_EXPORT handler {
 
     if (D.has(aspect::atomic64)) {
 
-      ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc,
-                                                          Range, Redu);
+      detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc, Range,
+                                             Redu);
     } else {
       // Resort to basic implementation as well.
       parallel_for_impl<KernelName>(Range, Redu, KernelFunc);
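
The branch above selects the atomic finalization path only when the device reports 64-bit atomic support. The same capability check is available from user code through the public aspect API; a small illustrative sketch:

```cpp
#include <iostream>
#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;
  // Same query the runtime bases its choice on (illustration only).
  std::cout << "atomic64 supported: " << std::boolalpha
            << Q.get_device().has(sycl::aspect::atomic64) << "\n";
}
```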
@@ -1665,8 +1655,7 @@ class __SYCL_EXPORT handler {
   } else {
     // Use fast sycl::atomic operations to update reduction variable at the
     // end of each work-group work.
-    ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range,
-                                                Redu);
+    detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
   }
   // If the reduction variable must be initialized with the identity value
   // before the kernel run, then an additional working accessor is created,
@@ -1680,8 +1669,7 @@ class __SYCL_EXPORT handler {
     if (Reduction::is_usm || Redu.initializeToIdentity()) {
       this->finalize();
       MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
-        ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
-            CopyHandler, Redu);
+        detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
       });
     }
   }
@@ -1717,8 +1705,7 @@ class __SYCL_EXPORT handler {
     // TODO: currently the maximal work group size is determined for the given
     // queue/device, while it may be safer to use queries to the kernel compiled
     // for the device.
-    size_t MaxWGSize =
-        ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize);
+    size_t MaxWGSize = detail::reduGetMaxWGSize(MQueue, OneElemSize);
     if (Range.get_local_range().size() > MaxWGSize)
       throw sycl::runtime_error("The implementation handling parallel_for with"
                                 " reduction requires work group size not bigger"
@@ -1727,7 +1714,7 @@ class __SYCL_EXPORT handler {
                                 PI_ERROR_INVALID_WORK_GROUP_SIZE);
 
     // 1. Call the kernel that includes user's lambda function.
-    ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
+    detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
     std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
     this->finalize();
 
@@ -1747,15 +1734,14 @@ class __SYCL_EXPORT handler {
     size_t NWorkItems = Range.get_group_range().size();
     while (NWorkItems > 1) {
       MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
-        NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
+        NWorkItems = detail::reduAuxCGFunc<KernelName, KernelType>(
             AuxHandler, NWorkItems, MaxWGSize, Redu);
       });
     } // end while (NWorkItems > 1)
 
     if (Reduction::is_usm || Reduction::is_dw_acc) {
       MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
-        ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
-            CopyHandler, Redu);
+        detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
       });
     }
   }
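
The loop above keeps launching an auxiliary kernel until a single value is left; each pass folds every group of up to MaxWGSize partial results into one. A host-side sketch of that convergence under this assumed model (not the runtime code itself):

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  // Assumed model: one auxiliary pass reduces NWorkItems partial results to
  // ceil(NWorkItems / MaxWGSize) partial results.
  std::size_t NWorkItems = 1 << 20, MaxWGSize = 256, Passes = 0;
  while (NWorkItems > 1) {
    NWorkItems = (NWorkItems + MaxWGSize - 1) / MaxWGSize;
    ++Passes;
  }
  std::printf("passes: %zu\n", Passes); // 3 passes for 2^20 items at WG=256
}
```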
@@ -1796,46 +1782,42 @@ class __SYCL_EXPORT handler {
   // c) Repeat the steps (a) and (b) to get one final sum.
   template <typename KernelName = detail::auto_name, int Dims,
             typename... RestT>
-  std::enable_if_t<
-      (sizeof...(RestT) >= 3 &&
-       ext::oneapi::detail::AreAllButLastReductions<RestT...>::value)>
+  std::enable_if_t<(sizeof...(RestT) >= 3 &&
+                    detail::AreAllButLastReductions<RestT...>::value)>
   parallel_for(nd_range<Dims> Range, RestT... Rest) {
     std::tuple<RestT...> ArgsTuple(Rest...);
     constexpr size_t NumArgs = sizeof...(RestT);
     auto KernelFunc = std::get<NumArgs - 1>(ArgsTuple);
     auto ReduIndices = std::make_index_sequence<NumArgs - 1>();
-    auto ReduTuple =
-        ext::oneapi::detail::tuple_select_elements(ArgsTuple, ReduIndices);
+    auto ReduTuple = detail::tuple_select_elements(ArgsTuple, ReduIndices);
 
     size_t LocalMemPerWorkItem =
-        ext::oneapi::detail::reduGetMemPerWorkItem(ReduTuple, ReduIndices);
+        detail::reduGetMemPerWorkItem(ReduTuple, ReduIndices);
     // TODO: currently the maximal work group size is determined for the given
     // queue/device, while it is safer to use queries to the kernel compiled
     // for the device.
-    size_t MaxWGSize =
-        ext::oneapi::detail::reduGetMaxWGSize(MQueue, LocalMemPerWorkItem);
+    size_t MaxWGSize = detail::reduGetMaxWGSize(MQueue, LocalMemPerWorkItem);
     if (Range.get_local_range().size() > MaxWGSize)
       throw sycl::runtime_error("The implementation handling parallel_for with"
                                 " reduction requires work group size not bigger"
                                 " than " +
                                     std::to_string(MaxWGSize),
                                 PI_ERROR_INVALID_WORK_GROUP_SIZE);
 
-    ext::oneapi::detail::reduCGFuncMulti<KernelName>(*this, KernelFunc, Range,
-                                                     ReduTuple, ReduIndices);
+    detail::reduCGFuncMulti<KernelName>(*this, KernelFunc, Range, ReduTuple,
+                                        ReduIndices);
     std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
     this->finalize();
 
     size_t NWorkItems = Range.get_group_range().size();
     while (NWorkItems > 1) {
       MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
-        NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName,
-                                                        decltype(KernelFunc)>(
+        NWorkItems = detail::reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
            AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
       });
     } // end while (NWorkItems > 1)
 
-    auto CopyEvent = ext::oneapi::detail::reduSaveFinalResultToUserMem(
+    auto CopyEvent = detail::reduSaveFinalResultToUserMem(
         QueueCopy, MIsHost, ReduTuple, ReduIndices);
     if (CopyEvent)
       MLastEvent = *CopyEvent;
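
The overload above takes a parameter pack in which everything but the last argument must be a reduction (AreAllButLastReductions) and the last argument is the kernel functor. From user code that corresponds to passing several reductions to one parallel_for; a hedged sketch using the standard API with illustrative names:

```cpp
#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;
  int SumVal = 0, MaxVal = 0;
  {
    sycl::buffer<int> SumBuf{&SumVal, 1}, MaxBuf{&MaxVal, 1};
    Q.submit([&](sycl::handler &CGH) {
      auto SumRed = sycl::reduction(SumBuf, CGH, sycl::plus<int>());
      auto MaxRed = sycl::reduction(MaxBuf, CGH, sycl::maximum<int>());
      // Reductions first, kernel last; one reducer argument per reduction.
      CGH.parallel_for(sycl::nd_range<1>{{1024}, {128}}, SumRed, MaxRed,
                       [=](sycl::nd_item<1> It, auto &Sum, auto &Max) {
                         int V = int(It.get_global_id(0));
                         Sum += V;
                         Max.combine(V);
                       });
    });
  }
  return (SumVal == 1024 * 1023 / 2 && MaxVal == 1023) ? 0 : 1;
}
```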
@@ -2639,7 +2621,7 @@ class __SYCL_EXPORT handler {
   // in handler from reduction methods.
   template <typename T, class BinaryOperation, int Dims, size_t Extent,
             typename RedOutVar>
-  friend class ext::oneapi::detail::reduction_impl_algo;
+  friend class detail::reduction_impl_algo;
 
 #ifndef __SYCL_DEVICE_ONLY__
   friend void detail::associateWithHandler(handler &,
6 changes: 2 additions & 4 deletions sycl/include/sycl/queue.hpp
@@ -1003,8 +1003,7 @@ class __SYCL_EXPORT queue {
   /// const KernelType &KernelFunc".
   template <typename KernelName = detail::auto_name, int Dims,
             typename... RestT>
-  std::enable_if_t<
-      ext::oneapi::detail::AreAllButLastReductions<RestT...>::value, event>
+  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value, event>
   parallel_for(nd_range<Dims> Range, RestT &&...Rest) {
     // Actual code location needs to be captured from KernelInfo object.
     const detail::code_location CodeLoc = {};
@@ -1134,8 +1133,7 @@ class __SYCL_EXPORT queue {
   /// \param Range specifies the global work space of the kernel
   /// \param KernelFunc is the Kernel functor or lambda
   template <typename KernelName, int Dims, typename... RestT>
-  std::enable_if_t<
-      ext::oneapi::detail::AreAllButLastReductions<RestT...>::value, event>
+  std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value, event>
   parallel_for_impl(range<Dims> Range, RestT &&...Rest) {
     // Actual code location needs to be captured from KernelInfo object.
     const detail::code_location CodeLoc = {};
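
queue.hpp applies the same AreAllButLastReductions constraint to the queue shortcuts, so reductions can also be passed directly to queue::parallel_for. A hedged sketch using a USM reduction variable, which does not need a handler argument in the reduction constructor (illustration only, assumes the device supports shared USM):

```cpp
#include <sycl/sycl.hpp>

int main() {
  sycl::queue Q;
  int *Sum = sycl::malloc_shared<int>(1, Q);
  *Sum = 0;
  // Queue shortcut: the reduction goes straight into queue::parallel_for.
  Q.parallel_for(sycl::nd_range<1>{{4096}, {256}},
                 sycl::reduction(Sum, sycl::plus<int>()),
                 [=](sycl::nd_item<1>, auto &S) { S += 1; })
      .wait();
  int Result = *Sum; // 4096: one contribution per work-item
  sycl::free(Sum, Q);
  return Result == 4096 ? 0 : 1;
}
```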