Skip to content

Commit ac3fd7c

Browse files
[SYCL][Reduction] Use "if constexpr" over SFINAE
Several reasons: * The code is easier to read/understand. * Ability to mix compiler/run-time conditions in a single instance of logic description, including those in future (e.g. discrete vs integrated GPU). * Eliminate duplication of similar parts of different algorithms so that distinctions are better highlighted. * Ease of experiments when trying to switch algorithm used by decoupling hard requirements from the performance tuning logic (i.e., calling slower code must not result in a compile-time error). * Allows us to stop encoding implementation selection logic into overloads, and use distinct names instead (e.g., see forward declarations in handler.hpp). Drawbacks: * The feature is only enabled in C++17 mode now. The customers who need C++14 mode aren't expected to require it. Mitigation plan would be to change remaining SFINAE logic inside reduction.hpp into runtime asserts and have "constexpr" changed to a compile-time macro __SYCL_REDUCTION_CONSTEXPR to resort to run-time only code selection in C++14 mode.
1 parent 49f72fa commit ac3fd7c

File tree

2 files changed

+427
-497
lines changed

2 files changed

+427
-497
lines changed

sycl/include/CL/sycl/handler.hpp

Lines changed: 112 additions & 154 deletions
Original file line numberDiff line numberDiff line change
@@ -247,37 +247,36 @@ class reduction_impl_algo;
247247
using cl::sycl::detail::enable_if_t;
248248
using cl::sycl::detail::queue_impl;
249249

250-
template <typename KernelName, typename KernelType, int Dims, class Reduction>
251-
void reduCGFunc(handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
252-
size_t MaxWGSize, uint32_t NumConcurrentWorkGroups,
253-
Reduction &Redu);
250+
// Kernels with single reduction
254251

252+
/// If we are given sycl::range and not sycl::nd_range we have more freedom in
253+
/// how to split the iteration space.
255254
template <typename KernelName, typename KernelType, int Dims, class Reduction>
256-
enable_if_t<Reduction::has_atomic_add_float64>
257-
reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
258-
const nd_range<Dims> &Range, Reduction &Redu);
255+
void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
256+
const range<Dims> &Range, size_t MaxWGSize,
257+
uint32_t NumConcurrentWorkGroups, Reduction &Redu);
259258

260259
template <typename KernelName, typename KernelType, int Dims, class Reduction>
261-
enable_if_t<Reduction::has_fast_atomics>
262-
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
263-
Reduction &Redu);
260+
void reduCGFuncAtomic64(handler &CGH, KernelType KernelFunc,
261+
const nd_range<Dims> &Range, Reduction &Redu);
264262

265263
template <typename KernelName, typename KernelType, int Dims, class Reduction>
266-
enable_if_t<!Reduction::has_fast_atomics>
267-
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
268-
Reduction &Redu);
264+
void reduCGFunc(handler &CGH, KernelType KernelFunc,
265+
const nd_range<Dims> &Range, Reduction &Redu);
269266

270-
template <typename KernelName, typename KernelType, class Reduction>
271-
enable_if_t<!Reduction::has_fast_atomics, size_t>
272-
reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
273-
Reduction &Redu);
267+
// Kernels with multiple reductions
274268

269+
// sycl::nd_range version
275270
template <typename KernelName, typename KernelType, int Dims,
276271
typename... Reductions, size_t... Is>
277-
void reduCGFunc(handler &CGH, KernelType KernelFunc,
278-
const nd_range<Dims> &Range,
279-
std::tuple<Reductions...> &ReduTuple,
280-
std::index_sequence<Is...>);
272+
void reduCGFuncMulti(handler &CGH, KernelType KernelFunc,
273+
const nd_range<Dims> &Range,
274+
std::tuple<Reductions...> &ReduTuple,
275+
std::index_sequence<Is...>);
276+
277+
template <typename KernelName, typename KernelType, class Reduction>
278+
size_t reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
279+
Reduction &Redu);
281280

282281
template <typename KernelName, typename KernelType, typename... Reductions,
283282
size_t... Is>
@@ -299,12 +298,6 @@ reduSaveFinalResultToUserMem(std::shared_ptr<detail::queue_impl> Queue,
299298
bool IsHost, std::tuple<Reduction...> &ReduTuple,
300299
std::index_sequence<Is...>);
301300

302-
template <typename Reduction, typename... RestT>
303-
std::enable_if_t<!Reduction::is_usm>
304-
reduSaveFinalResultToUserMemHelper(std::vector<event> &Events,
305-
std::shared_ptr<detail::queue_impl> Queue,
306-
bool IsHost, Reduction &Redu, RestT... Rest);
307-
308301
__SYCL_EXPORT uint32_t
309302
reduGetMaxNumConcurrentWorkGroups(std::shared_ptr<queue_impl> Queue);
310303

@@ -469,6 +462,27 @@ class __SYCL_EXPORT handler {
469462
MStreamStorage.push_back(Stream);
470463
}
471464

465+
/// Helper utility for an operation widely used throughout different reduction
466+
/// implementations.
467+
/// @{
468+
template <class FunctorTy>
469+
event withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,
470+
FunctorTy Func) {
471+
handler AuxHandler(Queue, MIsHost);
472+
AuxHandler.saveCodeLoc(MCodeLoc);
473+
Func(AuxHandler);
474+
return AuxHandler.finalize();
475+
}
476+
477+
template <class FunctorTy>
478+
static event withAuxHandler(std::shared_ptr<detail::queue_impl> Queue,
479+
bool IsHost, FunctorTy Func) {
480+
handler AuxHandler(Queue, IsHost);
481+
Func(AuxHandler);
482+
return AuxHandler.finalize();
483+
}
484+
/// @}
485+
472486
/// Saves buffers created by handling reduction feature in handler.
473487
/// They are then forwarded to command group and destroyed only after
474488
/// the command group finishes the work on device/host.
@@ -1586,6 +1600,9 @@ class __SYCL_EXPORT handler {
15861600
#endif
15871601
}
15881602

1603+
// "if constexpr" simplifies implementation/increases readability in comparison
1604+
// with SFINAE-based approach.
1605+
#if __cplusplus >= 201703L
15891606
/// Defines and invokes a SYCL kernel function for the specified nd_range.
15901607
///
15911608
/// The SYCL kernel function is defined as a lambda function or a named
@@ -1617,123 +1634,76 @@ class __SYCL_EXPORT handler {
16171634
// for the device.
16181635
size_t MaxWGSize =
16191636
ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize);
1620-
ext::oneapi::detail::reduCGFunc<KernelName>(
1637+
ext::oneapi::detail::reduCGFuncForRange<KernelName>(
16211638
*this, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups, Redu);
16221639
if (Reduction::is_usm ||
16231640
(Reduction::has_fast_atomics && Redu.initializeToIdentity()) ||
16241641
(!Reduction::has_fast_atomics && Redu.hasUserDiscardWriteAccessor())) {
16251642
this->finalize();
1626-
handler CopyHandler(QueueCopy, MIsHost);
1627-
CopyHandler.saveCodeLoc(MCodeLoc);
1628-
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1629-
Redu);
1630-
MLastEvent = CopyHandler.finalize();
1631-
}
1632-
}
1633-
1634-
/// Implements parallel_for() accepting nd_range \p Range and one reduction
1635-
/// object. This version uses fast sycl::atomic operations to update reduction
1636-
/// variable at the end of each work-group work.
1637-
//
1638-
// If the reduction variable must be initialized with the identity value
1639-
// before the kernel run, then an additional working accessor is created,
1640-
// initialized with the identity value and used in the kernel. That working
1641-
// accessor is then copied to user's accessor or USM pointer after
1642-
// the kernel run.
1643-
// For USM pointers without initialize_to_identity properties the same scheme
1644-
// with working accessor is used as re-using user's USM pointer in the kernel
1645-
// would require creation of another variant of user's kernel, which does not
1646-
// seem efficient.
1647-
template <typename KernelName = detail::auto_name, typename KernelType,
1648-
int Dims, typename Reduction>
1649-
detail::enable_if_t<Reduction::has_fast_atomics>
1650-
parallel_for(nd_range<Dims> Range, Reduction Redu,
1651-
_KERNELFUNCPARAM(KernelFunc)) {
1652-
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1653-
ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
1654-
1655-
if (Reduction::is_usm || Redu.initializeToIdentity()) {
1656-
this->finalize();
1657-
handler CopyHandler(QueueCopy, MIsHost);
1658-
CopyHandler.saveCodeLoc(MCodeLoc);
1659-
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1660-
Redu);
1661-
MLastEvent = CopyHandler.finalize();
1643+
MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
1644+
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1645+
CopyHandler, Redu);
1646+
});
16621647
}
16631648
}
16641649

1665-
/// Implements parallel_for() accepting nd_range \p Range and one reduction
1666-
/// object. This version is a specialization for the add operator.
1667-
/// It performs runtime checks for device aspect "atomic64"; if found, fast
1668-
/// sycl::atomic_ref operations are used to update the reduction at the
1669-
/// end of each work-group work. Otherwise the default implementation is
1670-
/// used.
1671-
//
1672-
// If the reduction variable must be initialized with the identity value
1673-
// before the kernel run, then an additional working accessor is created,
1674-
// initialized with the identity value and used in the kernel. That working
1675-
// accessor is then copied to user's accessor or USM pointer after
1676-
// the kernel run.
1677-
// For USM pointers without initialize_to_identity properties the same scheme
1678-
// with working accessor is used as re-using user's USM pointer in the kernel
1679-
// would require creation of another variant of user's kernel, which does not
1680-
// seem efficient.
16811650
template <typename KernelName = detail::auto_name, typename KernelType,
16821651
int Dims, typename Reduction>
1683-
detail::enable_if_t<Reduction::has_atomic_add_float64>
1684-
parallel_for(nd_range<Dims> Range, Reduction Redu,
1685-
_KERNELFUNCPARAM(KernelFunc)) {
1686-
1687-
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1688-
device D = detail::getDeviceFromHandler(*this);
1689-
1690-
if (D.has(aspect::atomic64)) {
1691-
1692-
ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc,
1693-
Range, Redu);
1694-
1652+
void parallel_for(nd_range<Dims> Range, Reduction Redu,
1653+
_KERNELFUNCPARAM(KernelFunc)) {
1654+
if constexpr (!Reduction::has_fast_atomics &&
1655+
!Reduction::has_atomic_add_float64) {
1656+
// The most basic implementation.
1657+
parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
1658+
return;
1659+
} else { // Can't "early" return for "if constexpr".
1660+
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1661+
if constexpr (Reduction::has_atomic_add_float64) {
1662+
/// This version is a specialization for the add
1663+
/// operator. It performs runtime checks for device aspect "atomic64";
1664+
/// if found, fast sycl::atomic_ref operations are used to update the
1665+
/// reduction at the end of each work-group work. Otherwise the
1666+
/// default implementation is used.
1667+
device D = detail::getDeviceFromHandler(*this);
1668+
1669+
if (D.has(aspect::atomic64)) {
1670+
1671+
ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this, KernelFunc,
1672+
Range, Redu);
1673+
} else {
1674+
// Resort to basic implementation as well.
1675+
parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
1676+
return;
1677+
}
1678+
} else {
1679+
// Use fast sycl::atomic operations to update reduction variable at the
1680+
// end of each work-group work.
1681+
ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range,
1682+
Redu);
1683+
}
1684+
// If the reduction variable must be initialized with the identity value
1685+
// before the kernel run, then an additional working accessor is created,
1686+
// initialized with the identity value and used in the kernel. That
1687+
// working accessor is then copied to user's accessor or USM pointer after
1688+
// the kernel run.
1689+
// For USM pointers without initialize_to_identity properties the same
1690+
// scheme with working accessor is used as re-using user's USM pointer in
1691+
// the kernel would require creation of another variant of user's kernel,
1692+
// which does not seem efficient.
16951693
if (Reduction::is_usm || Redu.initializeToIdentity()) {
16961694
this->finalize();
1697-
handler CopyHandler(QueueCopy, MIsHost);
1698-
CopyHandler.saveCodeLoc(MCodeLoc);
1699-
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1700-
CopyHandler, Redu);
1701-
MLastEvent = CopyHandler.finalize();
1695+
MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
1696+
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1697+
CopyHandler, Redu);
1698+
});
17021699
}
1703-
} else {
1704-
parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
17051700
}
17061701
}
17071702

1708-
/// Defines and invokes a SYCL kernel function for the specified nd_range.
1709-
/// Performs reduction operation specified in \p Redu.
1710-
///
1711-
/// The SYCL kernel function is defined as a lambda function or a named
1712-
/// function object type and given an id or item for indexing in the indexing
1713-
/// space defined by \p Range.
1714-
/// If it is a named function object and the function object type is
1715-
/// globally visible, there is no need for the developer to provide
1716-
/// a kernel name for it.
1717-
///
1718-
/// TODO: Support HOST. The kernels called by this parallel_for() may use
1719-
/// some functionality that is not yet supported on HOST such as:
1720-
/// barrier(), and ext::oneapi::reduce() that also may be used in more
1721-
/// optimized implementations waiting for their turn of code-review.
1722-
template <typename KernelName = detail::auto_name, typename KernelType,
1723-
int Dims, typename Reduction>
1724-
detail::enable_if_t<!Reduction::has_fast_atomics &&
1725-
!Reduction::has_atomic_add_float64>
1726-
parallel_for(nd_range<Dims> Range, Reduction Redu,
1727-
_KERNELFUNCPARAM(KernelFunc)) {
1728-
1729-
parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
1730-
}
1731-
17321703
template <typename KernelName, typename KernelType, int Dims,
17331704
typename Reduction>
1734-
detail::enable_if_t<!Reduction::has_fast_atomics>
1735-
parallel_for_Impl(nd_range<Dims> Range, Reduction Redu,
1736-
KernelType KernelFunc) {
1705+
void parallel_for_Impl(nd_range<Dims> Range, Reduction Redu,
1706+
KernelType KernelFunc) {
17371707
// This parallel_for() is lowered to the following sequence:
17381708
// 1) Call a kernel that a) call user's lambda function and b) performs
17391709
// one iteration of reduction, storing the partial reductions/sums
@@ -1789,20 +1759,17 @@ class __SYCL_EXPORT handler {
17891759
PI_ERROR_INVALID_WORK_GROUP_SIZE);
17901760
size_t NWorkItems = Range.get_group_range().size();
17911761
while (NWorkItems > 1) {
1792-
handler AuxHandler(QueueCopy, MIsHost);
1793-
AuxHandler.saveCodeLoc(MCodeLoc);
1794-
1795-
NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
1796-
AuxHandler, NWorkItems, MaxWGSize, Redu);
1797-
MLastEvent = AuxHandler.finalize();
1762+
MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
1763+
NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
1764+
AuxHandler, NWorkItems, MaxWGSize, Redu);
1765+
});
17981766
} // end while (NWorkItems > 1)
17991767

18001768
if (Reduction::is_usm || Redu.hasUserDiscardWriteAccessor()) {
1801-
handler CopyHandler(QueueCopy, MIsHost);
1802-
CopyHandler.saveCodeLoc(MCodeLoc);
1803-
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1804-
Redu);
1805-
MLastEvent = CopyHandler.finalize();
1769+
MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
1770+
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1771+
CopyHandler, Redu);
1772+
});
18061773
}
18071774
}
18081775

@@ -1867,27 +1834,26 @@ class __SYCL_EXPORT handler {
18671834
std::to_string(MaxWGSize),
18681835
PI_ERROR_INVALID_WORK_GROUP_SIZE);
18691836

1870-
ext::oneapi::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range,
1871-
ReduTuple, ReduIndices);
1837+
ext::oneapi::detail::reduCGFuncMulti<KernelName>(*this, KernelFunc, Range,
1838+
ReduTuple, ReduIndices);
18721839
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
18731840
this->finalize();
18741841

18751842
size_t NWorkItems = Range.get_group_range().size();
18761843
while (NWorkItems > 1) {
1877-
handler AuxHandler(QueueCopy, MIsHost);
1878-
AuxHandler.saveCodeLoc(MCodeLoc);
1879-
1880-
NWorkItems =
1881-
ext::oneapi::detail::reduAuxCGFunc<KernelName, decltype(KernelFunc)>(
1882-
AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
1883-
MLastEvent = AuxHandler.finalize();
1844+
MLastEvent = withAuxHandler(QueueCopy, [&](handler &AuxHandler) {
1845+
NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName,
1846+
decltype(KernelFunc)>(
1847+
AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
1848+
});
18841849
} // end while (NWorkItems > 1)
18851850

18861851
auto CopyEvent = ext::oneapi::detail::reduSaveFinalResultToUserMem(
18871852
QueueCopy, MIsHost, ReduTuple, ReduIndices);
18881853
if (CopyEvent)
18891854
MLastEvent = *CopyEvent;
18901855
}
1856+
#endif
18911857

18921858
/// Hierarchical kernel invocation method of a kernel defined as a lambda
18931859
/// encoding the body of each work-group to launch.
@@ -2688,14 +2654,6 @@ class __SYCL_EXPORT handler {
26882654
class Algorithm>
26892655
friend class ext::oneapi::detail::reduction_impl_algo;
26902656

2691-
// This method needs to call the method finalize() and also access to private
2692-
// ctor/dtor.
2693-
template <typename Reduction, typename... RestT>
2694-
std::enable_if_t<!Reduction::is_usm> friend ext::oneapi::detail::
2695-
reduSaveFinalResultToUserMemHelper(
2696-
std::vector<event> &Events, std::shared_ptr<detail::queue_impl> Queue,
2697-
bool IsHost, Reduction &, RestT...);
2698-
26992657
friend void detail::associateWithHandler(handler &,
27002658
detail::AccessorBaseHost *,
27012659
access::target);

0 commit comments

Comments
 (0)