
Commit 0cf7b45

[SYCL][Reduction] Use "if constexpr" over SFINAE (#6343)
Several reasons:

* The code is easier to read/understand.
* Ability to mix compile-time and run-time conditions in a single instance of the logic description, including conditions added in the future (e.g. discrete vs. integrated GPU).
* Eliminates duplication of similar parts of different algorithms so that the distinctions are better highlighted.
* Eases experimentation when switching the algorithm used, by decoupling hard requirements from the performance-tuning logic (i.e., calling slower code must not result in a compile-time error).
* Allows us to stop encoding implementation-selection logic into overloads and to use distinct names instead (e.g., see the forward declarations in handler.hpp).

Drawbacks:

* The feature is only enabled in C++17 mode for now. The customers who need C++14 mode aren't expected to require it. The mitigation plan would be to change the remaining SFINAE logic inside reduction.hpp into runtime asserts and to replace "constexpr" with a compile-time macro __SYCL_REDUCTION_CONSTEXPR, resorting to run-time-only code selection in C++14 mode.
1 parent 12ac4c3 commit 0cf7b45
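To make the contrast described in the commit message concrete, here is a minimal, self-contained sketch of the idea. It is illustrative only; the trait and function names below (FastAtomicsRedu, TreeRedu, runReductionSFINAE, runReductionIfConstexpr) are hypothetical and are not taken from the SYCL sources.

```cpp
// Illustrative sketch only -- not code from this commit. Hypothetical
// reduction-like traits are used to contrast the two dispatch styles.
#include <cstdio>
#include <type_traits>

struct FastAtomicsRedu { static constexpr bool has_fast_atomics = true; };
struct TreeRedu { static constexpr bool has_fast_atomics = false; };

// Before: SFINAE encodes the selection logic into same-named overloads, so the
// compile-time condition lives in the signatures themselves.
template <class Reduction>
std::enable_if_t<Reduction::has_fast_atomics> runReductionSFINAE(Reduction) {
  std::puts("atomic-based algorithm");
}
template <class Reduction>
std::enable_if_t<!Reduction::has_fast_atomics> runReductionSFINAE(Reduction) {
  std::puts("tree-based algorithm");
}

// After: one body with "if constexpr"; compile-time traits and run-time
// conditions (e.g. a device aspect query) can be mixed freely.
template <class Reduction>
void runReductionIfConstexpr(Reduction, bool DeviceHasAtomic64) {
  if constexpr (Reduction::has_fast_atomics) {
    std::puts("atomic-based algorithm");
  } else {
    if (DeviceHasAtomic64) // run-time condition inside a compile-time branch
      std::puts("atomic64 specialization");
    else
      std::puts("tree-based algorithm");
  }
}

int main() {
  runReductionSFINAE(FastAtomicsRedu{});
  runReductionSFINAE(TreeRedu{});
  runReductionIfConstexpr(FastAtomicsRedu{}, /*DeviceHasAtomic64=*/true);
  runReductionIfConstexpr(TreeRedu{}, /*DeviceHasAtomic64=*/false);
}
```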

File tree

3 files changed: +509 −552 lines changed

sycl/include/CL/sycl/handler.hpp

Lines changed: 112 additions & 154 deletions
@@ -248,37 +248,36 @@ class reduction_impl_algo;
 using cl::sycl::detail::enable_if_t;
 using cl::sycl::detail::queue_impl;

-template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFunc(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
-                size_t MaxWGSize, uint32_t NumConcurrentWorkGroups,
-                Reduction &Redu);
+// Kernels with single reduction

+/// If we are given sycl::range and not sycl::nd_range we have more freedom in
+/// how to split the iteration space.
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-enable_if_t<Reduction::has_atomic_add_float64>
-reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
-                   const nd_range<Dims> &Range, Reduction &Redu);
+void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
+                        const range<Dims> &Range, size_t MaxWGSize,
+                        uint32_t NumConcurrentWorkGroups, Reduction &Redu);

 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-enable_if_t<Reduction::has_fast_atomics>
-reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
-           Reduction &Redu);
+void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
+                        const nd_range<Dims> &Range, Reduction &Redu);

 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-enable_if_t<!Reduction::has_fast_atomics>
-reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
-           Reduction &Redu);
+void reduCGFunc(handler &CGH, KernelType KernelFunc,
+                const nd_range<Dims> &Range, Reduction &Redu);

-template <typename KernelName, typename KernelType, class Reduction>
-enable_if_t<!Reduction::has_fast_atomics, size_t>
-reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
-              Reduction &Redu);
+// Kernels with multiple reductions

+// sycl::nd_range version
 template <typename KernelName, typename KernelType, int Dims,
           typename... Reductions, size_t... Is>
-void reduCGFunc(handler &CGH, KernelType KernelFunc,
-                const nd_range<Dims> &Range,
-                std::tuple<Reductions...> &ReduTuple,
-                std::index_sequence<Is...>);
+void reduCGFuncMulti(handler &CGH, KernelType KernelFunc,
+                     const nd_range<Dims> &Range,
+                     std::tuple<Reductions...> &ReduTuple,
+                     std::index_sequence<Is...>);
+
+template <typename KernelName, typename KernelType, class Reduction>
+size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
+                     Reduction &Redu);

 template <typename KernelName, typename KernelType, typename... Reductions,
           size_t... Is>
@@ -300,12 +299,6 @@ reduSaveFinalResultToUserMem(std::shared_ptr<detail::queue_impl> Queue,
                              bool IsHost, std::tuple<Reduction...> &ReduTuple,
                              std::index_sequence<Is...>);

-template <typename Reduction, typename... RestT>
-std::enable_if_t<!Reduction::is_usm>
-reduSaveFinalResultToUserMemHelper(std::vector<event> &Events,
-                                   std::shared_ptr<detail::queue_impl> Queue,
-                                   bool IsHost, Reduction &Redu, RestT... Rest);
-
 __SYCL_EXPORT uint32_t
 reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);

@@ -470,6 +463,27 @@ class __SYCL_EXPORT handler {
     MStreamStorage.push_back(Stream);
   }

+  /// Helper utility for operation widely used through different reduction
+  /// implementations.
+  /// @{
+  template <class FunctorTy>
+  event withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,
+                       FunctorTy Func) {
+    handler AuxHandler(Queue, MIsHost);
+    AuxHandler.saveCodeLoc(MCodeLoc);
+    Func(AuxHandler);
+    return AuxHandler.finalize();
+  }
+
+  template <class FunctorTy>
+  static event withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,
+                              bool IsHost, FunctorTy Func) {
+    handler AuxHandler(Queue, IsHost);
+    Func(AuxHandler);
+    return AuxHandler.finalize();
+  }
+  /// }@
+
   /// Saves buffers created by handling reduction feature in handler.
   /// They are then forwarded to command group and destroyed only after
   /// the command group finishes the work on device/host.
@@ -1587,6 +1601,9 @@ class __SYCL_EXPORT handler {
 #endif
   }

+// "if constexpr" simplifies implementation/increases readability in comparison
+// with SFINAE-based approach.
+#if __cplusplus >= 201703L
   /// Defines and invokes a SYCL kernel function for the specified nd_range.
   ///
   /// The SYCL kernel function is defined as a lambda function or a named
@@ -1618,123 +1635,76 @@ class __SYCL_EXPORT handler {
     // for the device.
     size_t MaxWGSize =
         ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize);
-    ext::oneapi::detail::reduCGFunc<KernelName>(
+    ext::oneapi::detail::reduCGFuncForRange<KernelName>(
         *this, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups, Redu);
     if (Reduction::is_usm ||
         (Reduction::has_fast_atomics && Redu.initializeToIdentity()) ||
         (!Reduction::has_fast_atomics && Redu.hasUserDiscardWriteAccessor())) {
       this->finalize();
-      handler CopyHandler(QueueCopy, MIsHost);
-      CopyHandler.saveCodeLoc(MCodeLoc);
-      ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
-                                                                    Redu);
-      MLastEvent = CopyHandler.finalize();
-    }
-  }
-
-  /// Implements parallel_for() accepting nd_range \p Range and one reduction
-  /// object. This version uses fast sycl::atomic operations to update reduction
-  /// variable at the end of each work-group work.
-  //
-  // If the reduction variable must be initialized with the identity value
-  // before the kernel run, then an additional working accessor is created,
-  // initialized with the identity value and used in the kernel. That working
-  // accessor is then copied to user's accessor or USM pointer after
-  // the kernel run.
-  // For USM pointers without initialize_to_identity properties the same scheme
-  // with working accessor is used as re-using user's USM pointer in the kernel
-  // would require creation of another variant of user's kernel, which does not
-  // seem efficient.
-  template <typename KernelName = detail::auto_name, typename KernelType,
-            int Dims, typename Reduction>
-  detail::enable_if_t<Reduction::has_fast_atomics>
-  parallel_for(nd_range<Dims> Range, Reduction Redu,
-               _KERNELFUNCPARAM(KernelFunc)) {
-    std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
-    ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
-
-    if (Reduction::is_usm || Redu.initializeToIdentity()) {
-      this->finalize();
-      handler CopyHandler(QueueCopy, MIsHost);
-      CopyHandler.saveCodeLoc(MCodeLoc);
-      ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
-                                                                    Redu);
-      MLastEvent = CopyHandler.finalize();
+      MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
+        ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
+            CopyHandler, Redu);
+      });
     }
   }

-  /// Implements parallel_for() accepting nd_range \p Range and one reduction
-  /// object. This version is a specialization for the add operator.
-  /// It performs runtime checks for device aspect "atomic64"; if found, fast
-  /// sycl::atomic_ref operations are used to update the reduction at the
-  /// end of each work-group work. Otherwise the default implementation is
-  /// used.
-  //
-  // If the reduction variable must be initialized with the identity value
-  // before the kernel run, then an additional working accessor is created,
-  // initialized with the identity value and used in the kernel. That working
-  // accessor is then copied to user's accessor or USM pointer after
-  // the kernel run.
-  // For USM pointers without initialize_to_identity properties the same scheme
-  // with working accessor is used as re-using user's USM pointer in the kernel
-  // would require creation of another variant of user's kernel, which does not
-  // seem efficient.
   template <typename KernelName = detail::auto_name, typename KernelType,
             int Dims, typename Reduction>
-  detail::enable_if_t<Reduction::has_atomic_add_float64>
-  parallel_for(nd_range<Dims> Range, Reduction Redu,
-               _KERNELFUNCPARAM(KernelFunc)) {
-
-    std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
-    device D = detail::getDeviceFromHandler(*this);
-
-    if (D.has(aspect::atomic64)) {
-
-      ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc,
-                                                          Range, Redu);
-
+  void parallel_for(nd_range<Dims> Range, Reduction Redu,
+                    _KERNELFUNCPARAM(KernelFunc)) {
+    if constexpr (!Reduction::has_fast_atomics &&
+                  !Reduction::has_atomic_add_float64) {
+      // The most basic implementation.
+      parallel_for_impl<KernelName>(Range, Redu, KernelFunc);
+      return;
+    } else { // Can't "early" return for "if constexpr".
+      std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
+      if constexpr (Reduction::has_atomic_add_float64) {
+        /// This version is a specialization for the add
+        /// operator. It performs runtime checks for device aspect "atomic64";
+        /// if found, fast sycl::atomic_ref operations are used to update the
+        /// reduction at the end of each work-group work. Otherwise the
+        /// default implementation is used.
+        device D = detail::getDeviceFromHandler(*this);
+
+        if (D.has(aspect::atomic64)) {
+
+          ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc,
+                                                              Range, Redu);
+        } else {
+          // Resort to basic implementation as well.
+          parallel_for_impl<KernelName>(Range, Redu, KernelFunc);
+          return;
+        }
+      } else {
+        // Use fast sycl::atomic operations to update reduction variable at the
+        // end of each work-group work.
+        ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range,
+                                                    Redu);
+      }
+      // If the reduction variable must be initialized with the identity value
+      // before the kernel run, then an additional working accessor is created,
+      // initialized with the identity value and used in the kernel. That
+      // working accessor is then copied to user's accessor or USM pointer after
+      // the kernel run.
+      // For USM pointers without initialize_to_identity properties the same
+      // scheme with working accessor is used as re-using user's USM pointer in
+      // the kernel would require creation of another variant of user's kernel,
+      // which does not seem efficient.
       if (Reduction::is_usm || Redu.initializeToIdentity()) {
        this->finalize();
-        handler CopyHandler(QueueCopy, MIsHost);
-        CopyHandler.saveCodeLoc(MCodeLoc);
-        ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
-            CopyHandler, Redu);
-        MLastEvent = CopyHandler.finalize();
+        MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
+          ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
+              CopyHandler, Redu);
+        });
       }
-    } else {
-      parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
     }
   }

-  /// Defines and invokes a SYCL kernel function for the specified nd_range.
-  /// Performs reduction operation specified in \p Redu.
-  ///
-  /// The SYCL kernel function is defined as a lambda function or a named
-  /// function object type and given an id or item for indexing in the indexing
-  /// space defined by \p Range.
-  /// If it is a named function object and the function object type is
-  /// globally visible, there is no need for the developer to provide
-  /// a kernel name for it.
-  ///
-  /// TODO: Support HOST. The kernels called by this parallel_for() may use
-  /// some functionality that is not yet supported on HOST such as:
-  /// barrier(), and ext::oneapi::reduce() that also may be used in more
-  /// optimized implementations waiting for their turn of code-review.
-  template <typename KernelName = detail::auto_name, typename KernelType,
-            int Dims, typename Reduction>
-  detail::enable_if_t<!Reduction::has_fast_atomics &&
-                      !Reduction::has_atomic_add_float64>
-  parallel_for(nd_range<Dims> Range, Reduction Redu,
-               _KERNELFUNCPARAM(KernelFunc)) {
-
-    parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
-  }
-
   template <typename KernelName, typename KernelType, int Dims,
             typename Reduction>
-  detail::enable_if_t<!Reduction::has_fast_atomics>
-  parallel_for_Impl(nd_range<Dims> Range, Reduction Redu,
-                    KernelType KernelFunc) {
+  void parallel_for_impl(nd_range<Dims> Range, Reduction Redu,
+                         KernelType KernelFunc) {
     // This parallel_for() is lowered to the following sequence:
     // 1) Call a kernel that a) call user's lambda function and b) performs
     //    one iteration of reduction, storing the partial reductions/sums
@@ -1790,20 +1760,17 @@ class __SYCL_EXPORT handler {
                               PI_ERROR_INVALID_WORK_GROUP_SIZE);
     size_t NWorkItems = Range.get_group_range().size();
     while (NWorkItems > 1) {
-      handler AuxHandler(QueueCopy, MIsHost);
-      AuxHandler.saveCodeLoc(MCodeLoc);
-
-      NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
-          AuxHandler, NWorkItems, MaxWGSize, Redu);
-      MLastEvent = AuxHandler.finalize();
+      MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
+        NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
+            AuxHandler, NWorkItems, MaxWGSize, Redu);
+      });
     } // end while (NWorkItems > 1)

     if (Reduction::is_usm || Redu.hasUserDiscardWriteAccessor()) {
-      handler CopyHandler(QueueCopy, MIsHost);
-      CopyHandler.saveCodeLoc(MCodeLoc);
-      ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
-                                                                    Redu);
-      MLastEvent = CopyHandler.finalize();
+      MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
+        ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
+            CopyHandler, Redu);
+      });
     }
   }

@@ -1868,27 +1835,26 @@ class __SYCL_EXPORT handler {
                                   std::to_string(MaxWGSize),
                               PI_ERROR_INVALID_WORK_GROUP_SIZE);

-    ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range,
-                                                ReduTuple, ReduIndices);
+    ext::oneapi::detail::reduCGFuncMulti<KernelName>(*this, KernelFunc, Range,
+                                                     ReduTuple, ReduIndices);
     std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
     this->finalize();

     size_t NWorkItems = Range.get_group_range().size();
     while (NWorkItems > 1) {
-      handler AuxHandler(QueueCopy, MIsHost);
-      AuxHandler.saveCodeLoc(MCodeLoc);
-
-      NWorkItems =
-          ext::oneapi::detail::reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
-              AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
-      MLastEvent = AuxHandler.finalize();
+      MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
+        NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName,
+                                                        decltype(KernelFunc)>(
+            AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
+      });
     } // end while (NWorkItems > 1)

     auto CopyEvent = ext::oneapi::detail::reduSaveFinalResultToUserMem(
         QueueCopy, MIsHost, ReduTuple, ReduIndices);
     if (CopyEvent)
       MLastEvent = *CopyEvent;
   }
+#endif // __cplusplus >= 201703L

   /// Hierarchical kernel invocation method of a kernel defined as a lambda
   /// encoding the body of each work-group to launch.
@@ -2689,14 +2655,6 @@ class __SYCL_EXPORT handler {
             class Algorithm>
   friend class ext::oneapi::detail::reduction_impl_algo;

-  // This method needs to call the method finalize() and also access to private
-  // ctor/dtor.
-  template <typename Reduction, typename... RestT>
-  std::enable_if_t<!Reduction::is_usm> friend ext::oneapi::detail::
-      reduSaveFinalResultToUserMemHelper(
-          std::vector<event> &Events, std::shared_ptr<detail::queue_impl> Queue,
-          bool IsHost, Reduction &, RestT...);
-
   friend void detail::associateWithHandler(handler &,
                                            detail::AccessorBaseHost *,
                                            access::target);
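The withAuxHandler helper added in the handler.hpp diff above folds the repeated create-handler / saveCodeLoc / finalize sequence into a single callback-taking utility. A minimal stand-alone sketch of the same pattern, with placeholder Session/Event types instead of the real SYCL classes:

```cpp
// Illustrative sketch only: the "setup, run callback, teardown" shape of
// withAuxHandler, using placeholder types rather than SYCL's handler/event.
#include <iostream>
#include <string>

struct Event { std::string Name; };

struct Session {
  void saveCodeLoc() { std::cout << "code location saved\n"; } // setup step
  Event finalize() { return Event{"aux-cg-submitted"}; }       // teardown step
};

// The helper owns the boilerplate; call sites only provide the varying work.
template <class FunctorTy> Event withAuxSession(FunctorTy Func) {
  Session Aux;
  Aux.saveCodeLoc();
  Func(Aux);             // caller-specific part (e.g. one reduction step)
  return Aux.finalize(); // exactly one place performs the teardown
}

int main() {
  Event E = withAuxSession([](Session &S) {
    (void)S; // auxiliary work would be enqueued here
    std::cout << "auxiliary step enqueued\n";
  });
  std::cout << "last event: " << E.Name << "\n";
}
```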

sycl/include/CL/sycl/reduction.hpp

Lines changed: 5 additions & 0 deletions
@@ -8,6 +8,9 @@

 #pragma once

+#if __cplusplus >= 201703L
+// Entire feature is dependent on C++17.
+
 #include <CL/sycl/known_identity.hpp>

 #include "sycl/ext/oneapi/reduction.hpp"
@@ -171,3 +174,5 @@ reduction(span<T, Extent> Span, const T &Identity, BinaryOperation Combiner,

 } // namespace sycl
 } // __SYCL_INLINE_NAMESPACE(cl)
+
+#endif // __cplusplus >= 201703L
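The reduction.hpp guard above is what limits the feature to C++17 for now. The mitigation plan mentioned in the commit message is not implemented in this commit; a hypothetical sketch of the macro it describes (names and structure assumed, not taken from the repository) could look like:

```cpp
// Hypothetical sketch only: one possible shape of the C++14 fallback outlined
// in the commit message; not code from this commit.
#include <cassert>
#include <cstdio>

#if __cplusplus >= 201703L
// C++17: the untaken branch is discarded at compile time.
#define __SYCL_REDUCTION_CONSTEXPR constexpr
#else
// C++14: plain run-time selection; both branches must compile.
#define __SYCL_REDUCTION_CONSTEXPR
#endif

template <class Reduction> void selectAlgorithm(Reduction) {
  if __SYCL_REDUCTION_CONSTEXPR (Reduction::has_fast_atomics) {
    std::puts("fast-atomics algorithm");
  } else {
    // Hard requirements that SFINAE used to enforce at compile time would
    // become run-time asserts in C++14 mode.
    assert(!Reduction::has_fast_atomics && "unexpected algorithm selected");
    std::puts("tree-reduction algorithm");
  }
}

struct DemoRedu { static constexpr bool has_fast_atomics = false; };

int main() { selectAlgorithm(DemoRedu{}); }
```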
