
[SYCL][Reduction] Use "if constexpr" over SFINAE #6343

Merged: 11 commits (Jul 7, 2022)
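
For context, the change can be reduced to a small standalone sketch. With SFINAE, every compile-time variant of an operation needs its own enable_if_t-constrained overload; with C++17 "if constexpr", one function body selects the variant inline. The FastReduction/TreeReduction traits and the dispatch() names below are illustrative stand-ins, not the actual handler code:

#include <iostream>
#include <type_traits>

struct FastReduction { static constexpr bool has_fast_atomics = true; };
struct TreeReduction { static constexpr bool has_fast_atomics = false; };

// SFINAE style: one constrained overload per variant.
template <class Reduction>
std::enable_if_t<Reduction::has_fast_atomics> dispatch(Reduction) {
  std::cout << "atomic path\n";
}

template <class Reduction>
std::enable_if_t<!Reduction::has_fast_atomics> dispatch(Reduction) {
  std::cout << "multi-pass path\n";
}

// "if constexpr" style: a single function; the unselected branch is never
// instantiated, so both paths can coexist in one body.
template <class Reduction>
void dispatch17(Reduction) {
  if constexpr (Reduction::has_fast_atomics)
    std::cout << "atomic path\n";
  else
    std::cout << "multi-pass path\n";
}

int main() {
  dispatch(FastReduction{});   // atomic path
  dispatch17(TreeReduction{}); // multi-pass path
}
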
266 changes: 112 additions & 154 deletions sycl/include/CL/sycl/handler.hpp
@@ -248,37 +248,36 @@ class reduction_impl_algo;
using cl::sycl::detail::enable_if_t;
using cl::sycl::detail::queue_impl;

template <typename KernelName, typename KernelType, int Dims, class Reduction>
void reduCGFunc(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
size_t MaxWGSize, uint32_t NumConcurrentWorkGroups,
Reduction &Redu);
// Kernels with single reduction

/// If we are given sycl::range and not sycl::nd_range, we have more freedom
/// in how to split the iteration space.
template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_atomic_add_float64>
reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
const nd_range<Dims> &Range, Reduction &Redu);
void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
const range<Dims> &Range, size_t MaxWGSize,
uint32_t NumConcurrentWorkGroups, Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
Reduction &Redu);
void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
const nd_range<Dims> &Range, Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<!Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
Reduction &Redu);
void reduCGFunc(handler &CGH, KernelType KernelFunc,
const nd_range<Dims> &Range, Reduction &Redu);

template <typename KernelName, typename KernelType, class Reduction>
enable_if_t<!Reduction::has_fast_atomics, size_t>
reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
Reduction &Redu);
// Kernels with multiple reductions

// sycl::nd_range version
template <typename KernelName, typename KernelType, int Dims,
typename... Reductions, size_t... Is>
void reduCGFunc(handler &CGH, KernelType KernelFunc,
const nd_range<Dims> &Range,
std::tuple<Reductions...> &ReduTuple,
std::index_sequence<Is...>);
void reduCGFuncMulti(handler &CGH, KernelType KernelFunc,
const nd_range<Dims> &Range,
std::tuple<Reductions...> &ReduTuple,
std::index_sequence<Is...>);

template <typename KernelName, typename KernelType, class Reduction>
size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
Reduction &Redu);

template <typename KernelName, typename KernelType, typename... Reductions,
size_t... Is>
@@ -300,12 +299,6 @@ reduSaveFinalResultToUserMem(std::shared_ptr<detail::queue_impl> Queue,
bool IsHost, std::tuple<Reduction...> &ReduTuple,
std::index_sequence<Is...>);

template <typename Reduction, typename... RestT>
std::enable_if_t<!Reduction::is_usm>
reduSaveFinalResultToUserMemHelper(std::vector<event> &Events,
std::shared_ptr<detail::queue_impl> Queue,
bool IsHost, Reduction &Redu, RestT... Rest);

__SYCL_EXPORT uint32_t
reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);

@@ -470,6 +463,27 @@ class __SYCL_EXPORT handler {
MStreamStorage.push_back(Stream);
}

/// Helper utility for an operation widely used across different reduction
/// implementations.
/// @{
template <class FunctorTy>
event withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,
FunctorTy Func) {
handler AuxHandler(Queue, MIsHost);
AuxHandler.saveCodeLoc(MCodeLoc);
Func(AuxHandler);
return AuxHandler.finalize();
}

template <class FunctorTy>
static event withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,

[Inline review comment, Contributor]: This version is not used, if I am not missing anything.

bool IsHost, FunctorTy Func) {
handler AuxHandler(Queue, IsHost);
Func(AuxHandler);
return AuxHandler.finalize();
}
/// @}

/// Saves buffers created by handling reduction feature in handler.
/// They are then forwarded to command group and destroyed only after
/// the command group finishes the work on device/host.
@@ -1587,6 +1601,9 @@ class __SYCL_EXPORT handler {
#endif
}

// "if constexpr" simplifies implementation/increases readability in comparison
// with SFINAE-based approach.
#if __cplusplus >= 201703L
/// Defines and invokes a SYCL kernel function for the specified nd_range.
///
/// The SYCL kernel function is defined as a lambda function or a named
@@ -1618,123 +1635,76 @@ class __SYCL_EXPORT handler {
// for the device.
size_t MaxWGSize =
ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize);
ext::oneapi::detail::reduCGFunc<KernelName>(
ext::oneapi::detail::reduCGFuncForRange<KernelName>(
*this, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups, Redu);
if (Reduction::is_usm ||
(Reduction::has_fast_atomics && Redu.initializeToIdentity()) ||
(!Reduction::has_fast_atomics && Redu.hasUserDiscardWriteAccessor())) {
this->finalize();
handler CopyHandler(QueueCopy, MIsHost);
CopyHandler.saveCodeLoc(MCodeLoc);
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
Redu);
MLastEvent = CopyHandler.finalize();
}
}

/// Implements parallel_for() accepting nd_range \p Range and one reduction
/// object. This version uses fast sycl::atomic operations to update reduction
/// variable at the end of each work-group work.
//
// If the reduction variable must be initialized with the identity value
// before the kernel run, then an additional working accessor is created,
// initialized with the identity value and used in the kernel. That working
// accessor is then copied to user's accessor or USM pointer after
// the kernel run.
// For USM pointers without initialize_to_identity properties the same scheme
// with working accessor is used as re-using user's USM pointer in the kernel
// would require creation of another variant of user's kernel, which does not
// seem efficient.
template <typename KernelName = detail::auto_name, typename KernelType,
int Dims, typename Reduction>
detail::enable_if_t<Reduction::has_fast_atomics>
parallel_for(nd_range<Dims> Range, Reduction Redu,
_KERNELFUNCPARAM(KernelFunc)) {
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);

if (Reduction::is_usm || Redu.initializeToIdentity()) {
this->finalize();
handler CopyHandler(QueueCopy, MIsHost);
CopyHandler.saveCodeLoc(MCodeLoc);
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
Redu);
MLastEvent = CopyHandler.finalize();
MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
CopyHandler, Redu);
});
}
}

/// Implements parallel_for() accepting nd_range \p Range and one reduction
/// object. This version is a specialization for the add operator.
/// It performs runtime checks for device aspect "atomic64"; if found, fast
/// sycl::atomic_ref operations are used to update the reduction at the
/// end of each work-group work. Otherwise the default implementation is
/// used.
//
// If the reduction variable must be initialized with the identity value
// before the kernel run, then an additional working accessor is created,
// initialized with the identity value and used in the kernel. That working
// accessor is then copied to user's accessor or USM pointer after
// the kernel run.
// For USM pointers without initialize_to_identity properties the same scheme
// with working accessor is used as re-using user's USM pointer in the kernel
// would require creation of another variant of user's kernel, which does not
// seem efficient.
template <typename KernelName = detail::auto_name, typename KernelType,
int Dims, typename Reduction>
detail::enable_if_t<Reduction::has_atomic_add_float64>
parallel_for(nd_range<Dims> Range, Reduction Redu,
_KERNELFUNCPARAM(KernelFunc)) {

std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
device D = detail::getDeviceFromHandler(*this);

if (D.has(aspect::atomic64)) {

ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc,
Range, Redu);

void parallel_for(nd_range<Dims> Range, Reduction Redu,
_KERNELFUNCPARAM(KernelFunc)) {
if constexpr (!Reduction::has_fast_atomics &&
!Reduction::has_atomic_add_float64) {
// The most basic implementation.
parallel_for_impl<KernelName>(Range, Redu, KernelFunc);
return;
} else { // Can't "early" return out of "if constexpr": code placed after the
         // branch would still be instantiated, so the rest lives in the else.
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
if constexpr (Reduction::has_atomic_add_float64) {
/// This version is a specialization for the add
/// operator. It performs runtime checks for device aspect "atomic64";
/// if found, fast sycl::atomic_ref operations are used to update the
/// reduction at the end of each work-group work. Otherwise the
/// default implementation is used.
device D = detail::getDeviceFromHandler(*this);

if (D.has(aspect::atomic64)) {

ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc,
Range, Redu);
} else {
// Resort to the basic implementation as well.
parallel_for_impl<KernelName>(Range, Redu, KernelFunc);
return;
}
} else {
// Use fast sycl::atomic operations to update reduction variable at the
// end of each work-group work.
ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range,
Redu);
}
// If the reduction variable must be initialized with the identity value
// before the kernel run, then an additional working accessor is created,
// initialized with the identity value and used in the kernel. That
// working accessor is then copied to the user's accessor or USM pointer
// after the kernel run.
// For USM pointers without the initialize_to_identity property, the same
// scheme with a working accessor is used, as re-using the user's USM
// pointer in the kernel would require creating another variant of the
// user's kernel, which does not seem efficient.
if (Reduction::is_usm || Redu.initializeToIdentity()) {
this->finalize();
handler CopyHandler(QueueCopy, MIsHost);
CopyHandler.saveCodeLoc(MCodeLoc);
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
CopyHandler, Redu);
MLastEvent = CopyHandler.finalize();
MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
CopyHandler, Redu);
});
}
} else {
parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
}
}

/// Defines and invokes a SYCL kernel function for the specified nd_range.
/// Performs reduction operation specified in \p Redu.
///
/// The SYCL kernel function is defined as a lambda function or a named
/// function object type and given an id or item for indexing in the indexing
/// space defined by \p Range.
/// If it is a named function object and the function object type is
/// globally visible, there is no need for the developer to provide
/// a kernel name for it.
///
/// TODO: Support HOST. The kernels called by this parallel_for() may use
/// some functionality that is not yet supported on HOST such as:
/// barrier(), and ext::oneapi::reduce() that also may be used in more
/// optimized implementations waiting for their turn of code-review.
template <typename KernelName = detail::auto_name, typename KernelType,
int Dims, typename Reduction>
detail::enable_if_t<!Reduction::has_fast_atomics &&
!Reduction::has_atomic_add_float64>
parallel_for(nd_range<Dims> Range, Reduction Redu,
_KERNELFUNCPARAM(KernelFunc)) {

parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
}

template <typename KernelName, typename KernelType, int Dims,
typename Reduction>
detail::enable_if_t<!Reduction::has_fast_atomics>
parallel_for_Impl(nd_range<Dims> Range, Reduction Redu,
KernelType KernelFunc) {
void parallel_for_impl(nd_range<Dims> Range, Reduction Redu,
KernelType KernelFunc) {
// This parallel_for() is lowered to the following sequence:
// 1) Call a kernel that a) calls the user's lambda function and b) performs
// one iteration of reduction, storing the partial reductions/sums
@@ -1790,20 +1760,17 @@ class __SYCL_EXPORT handler {
PI_ERROR_INVALID_WORK_GROUP_SIZE);
size_t NWorkItems = Range.get_group_range().size();
while (NWorkItems > 1) {
handler AuxHandler(QueueCopy, MIsHost);
AuxHandler.saveCodeLoc(MCodeLoc);

NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
AuxHandler, NWorkItems, MaxWGSize, Redu);
MLastEvent = AuxHandler.finalize();
MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
AuxHandler, NWorkItems, MaxWGSize, Redu);
});
} // end while (NWorkItems > 1)

if (Reduction::is_usm || Redu.hasUserDiscardWriteAccessor()) {
handler CopyHandler(QueueCopy, MIsHost);
CopyHandler.saveCodeLoc(MCodeLoc);
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
Redu);
MLastEvent = CopyHandler.finalize();
MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
CopyHandler, Redu);
});
}
}

@@ -1868,27 +1835,26 @@ class __SYCL_EXPORT handler {
std::to_string(MaxWGSize),
PI_ERROR_INVALID_WORK_GROUP_SIZE);

ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range,
ReduTuple, ReduIndices);
ext::oneapi::detail::reduCGFuncMulti<KernelName>(*this, KernelFunc, Range,
ReduTuple, ReduIndices);
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
this->finalize();

size_t NWorkItems = Range.get_group_range().size();
while (NWorkItems > 1) {
handler AuxHandler(QueueCopy, MIsHost);
AuxHandler.saveCodeLoc(MCodeLoc);

NWorkItems =
ext::oneapi::detail::reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
MLastEvent = AuxHandler.finalize();
MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName,
decltype(KernelFunc)>(
AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
});
} // end while (NWorkItems > 1)

auto CopyEvent = ext::oneapi::detail::reduSaveFinalResultToUserMem(
QueueCopy, MIsHost, ReduTuple, ReduIndices);
if (CopyEvent)
MLastEvent = *CopyEvent;
}
#endif // __cplusplus >= 201703L

/// Hierarchical kernel invocation method of a kernel defined as a lambda
/// encoding the body of each work-group to launch.
@@ -2689,14 +2655,6 @@ class __SYCL_EXPORT handler {
class Algorithm>
friend class ext::oneapi::detail::reduction_impl_algo;

// This method needs to call the method finalize() and also access to private
// ctor/dtor.
template <typename Reduction, typename... RestT>
std::enable_if_t<!Reduction::is_usm> friend ext::oneapi::detail::
reduSaveFinalResultToUserMemHelper(
std::vector<event> &Events, std::shared_ptr<detail::queue_impl> Queue,
bool IsHost, Reduction &, RestT...);

friend void detail::associateWithHandler(handler &,
detail::AccessorBaseHost *,
access::target);
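The withAuxHandler change in the diff above is an instance of the execute-around pattern: the repeated create-handler / saveCodeLoc / finalize boilerplate at each call site moves into a single helper that takes a callable for the part that varies. A minimal sketch of the idea, with Handler and Event as illustrative stand-ins rather than the real SYCL classes:

#include <iostream>
#include <string>
#include <vector>

struct Event { std::string Name; };

struct Handler {
  std::vector<std::string> Ops;
  void record(const std::string &Op) { Ops.push_back(Op); }
  Event finalize() {
    return Event{"event after " + std::to_string(Ops.size()) + " ops"};
  }
};

// Execute-around helper: shared setup and teardown live in one place; only
// the middle part is supplied by the caller.
template <class FunctorTy>
Event withAuxHandler(FunctorTy Func) {
  Handler Aux;
  Aux.record("saveCodeLoc"); // shared setup, done once instead of per call site
  Func(Aux);                 // caller-specific work
  return Aux.finalize();     // shared teardown
}

int main() {
  Event E = withAuxHandler([](Handler &H) { H.record("reduAuxCGFunc"); });
  std::cout << E.Name << "\n"; // prints: event after 2 ops
}
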
5 changes: 5 additions & 0 deletions sycl/include/CL/sycl/reduction.hpp
@@ -8,6 +8,9 @@

#pragma once

#if __cplusplus >= 201703L
// The entire feature depends on C++17.

#include <CL/sycl/known_identity.hpp>

#include "sycl/ext/oneapi/reduction.hpp"
@@ -171,3 +174,5 @@ reduction(span<T, Extent> Span, const T &Identity, BinaryOperation Combiner,

} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)

#endif // __cplusplus >= 201703L
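
For reference, a minimal reduction kernel of the kind these headers implement, written against the SYCL 2020 API (a sketch assuming a conformant compiler and runtime; header and API names follow the SYCL 2020 spec, not this exact DPC++ snapshot):

#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  sycl::queue Q;
  int *Sum = sycl::malloc_shared<int>(1, Q);
  *Sum = 0;
  // The handler machinery patched in this PR picks the implementation for
  // this reduction (fast atomics vs. multi-pass tree) at compile time.
  Q.submit([&](sycl::handler &CGH) {
     CGH.parallel_for(sycl::nd_range<1>{1024, 64},
                      sycl::reduction(Sum, sycl::plus<int>()),
                      [=](sycl::nd_item<1> It, auto &Acc) {
                        Acc += static_cast<int>(It.get_global_id(0));
                      });
   }).wait();
  std::cout << *Sum << "\n"; // 0 + 1 + ... + 1023 == 523776
  sycl::free(Sum, Q);
}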