@@ -248,37 +248,36 @@ class reduction_impl_algo;
248
248
using cl::sycl::detail::enable_if_t ;
249
249
using cl::sycl::detail::queue_impl;
250
250
251
- template <typename KernelName, typename KernelType, int Dims, class Reduction >
252
- void reduCGFunc (handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
253
- size_t MaxWGSize, uint32_t NumConcurrentWorkGroups,
254
- Reduction &Redu);
251
+ // Kernels with single reduction
255
252
253
+ // / If we are given sycl::range and not sycl::nd_range we have more freedom in
254
+ // / how to split the iteration space.
256
255
template <typename KernelName, typename KernelType, int Dims, class Reduction >
257
- enable_if_t <Reduction::has_atomic_add_float64>
258
- reduCGFuncAtomic64 (handler &CGH, KernelType KernelFunc ,
259
- const nd_range<Dims> &Range , Reduction &Redu);
256
+ void reduCGFuncForRange (handler &CGH, KernelType KernelFunc,
257
+ const range<Dims> &Range, size_t MaxWGSize ,
258
+ uint32_t NumConcurrentWorkGroups , Reduction &Redu);
260
259
261
260
template <typename KernelName, typename KernelType, int Dims, class Reduction >
262
- enable_if_t <Reduction::has_fast_atomics>
263
- reduCGFunc (handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
264
- Reduction &Redu);
261
+ void reduCGFuncAtomic64 (handler &CGH, KernelType KernelFunc,
262
+ const nd_range<Dims> &Range, Reduction &Redu);
265
263
266
264
template <typename KernelName, typename KernelType, int Dims, class Reduction >
267
- enable_if_t <!Reduction::has_fast_atomics>
268
- reduCGFunc (handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
269
- Reduction &Redu);
265
+ void reduCGFunc (handler &CGH, KernelType KernelFunc,
266
+ const nd_range<Dims> &Range, Reduction &Redu);
270
267
271
- template <typename KernelName, typename KernelType, class Reduction >
272
- enable_if_t <!Reduction::has_fast_atomics, size_t >
273
- reduAuxCGFunc (handler &CGH, size_t NWorkItems, size_t MaxWGSize,
274
- Reduction &Redu);
268
+ // Kernels with multiple reductions
275
269
270
+ // sycl::nd_range version
276
271
template <typename KernelName, typename KernelType, int Dims,
277
272
typename ... Reductions, size_t ... Is>
278
- void reduCGFunc (handler &CGH, KernelType KernelFunc,
279
- const nd_range<Dims> &Range,
280
- std::tuple<Reductions...> &ReduTuple,
281
- std::index_sequence<Is...>);
273
+ void reduCGFuncMulti (handler &CGH, KernelType KernelFunc,
274
+ const nd_range<Dims> &Range,
275
+ std::tuple<Reductions...> &ReduTuple,
276
+ std::index_sequence<Is...>);
277
+
278
+ template <typename KernelName, typename KernelType, class Reduction >
279
+ size_t reduAuxCGFunc (handler &CGH, size_t NWorkItems, size_t MaxWGSize,
280
+ Reduction &Redu);
282
281
283
282
template <typename KernelName, typename KernelType, typename ... Reductions,
284
283
size_t ... Is>
@@ -300,12 +299,6 @@ reduSaveFinalResultToUserMem(std::shared_ptr<detail::queue_impl> Queue,
300
299
bool IsHost, std::tuple<Reduction...> &ReduTuple,
301
300
std::index_sequence<Is...>);
302
301
303
- template <typename Reduction, typename ... RestT>
304
- std::enable_if_t <!Reduction::is_usm>
305
- reduSaveFinalResultToUserMemHelper (std::vector<event> &Events,
306
- std::shared_ptr<detail::queue_impl> Queue,
307
- bool IsHost, Reduction &Redu, RestT... Rest);
308
-
309
302
__SYCL_EXPORT uint32_t
310
303
reduGetMaxNumConcurrentWorkGroups (std::shared_ptr<queue_impl> Queue);
311
304
@@ -470,6 +463,27 @@ class __SYCL_EXPORT handler {
470
463
MStreamStorage.push_back (Stream);
471
464
}
472
465
466
+ // / Helper utilities for an operation widely used throughout the different
467
+ // / reduction implementations.
468
+ // / @{
469
// Non-static overload: constructs an auxiliary handler from Queue and this
// handler's MIsHost, copies this handler's MCodeLoc into it via saveCodeLoc(),
// invokes Func with the auxiliary handler, and returns the event produced by
// the auxiliary handler's finalize().
+ template <class FunctorTy >
470
+ event withAuxHandler (std::shared_ptr<detail::queue_impl> Queue,
471
+ FunctorTy Func) {
472
+ handler AuxHandler (Queue, MIsHost);
473
+ AuxHandler.saveCodeLoc (MCodeLoc);
474
+ Func (AuxHandler);
475
+ return AuxHandler.finalize ();
476
+ }
477
+
478
// Static overload: constructs an auxiliary handler from Queue and the
// explicitly passed IsHost flag, invokes Func with it, and returns the event
// produced by the auxiliary handler's finalize(). Unlike the non-static
// overload, no code location is saved into the auxiliary handler here.
+ template <class FunctorTy >
479
+ static event withAuxHandler (std::shared_ptr<detail::queue_impl> Queue,
480
+ bool IsHost, FunctorTy Func) {
481
+ handler AuxHandler (Queue, IsHost);
482
+ Func (AuxHandler);
483
+ return AuxHandler.finalize ();
484
+ }
485
+ // / @}
486
+
473
487
// / Saves buffers created by handling reduction feature in handler.
474
488
// / They are then forwarded to command group and destroyed only after
475
489
// / the command group finishes the work on device/host.
@@ -1587,6 +1601,9 @@ class __SYCL_EXPORT handler {
1587
1601
#endif
1588
1602
}
1589
1603
1604
+ // "if constexpr" simplifies implementation/increases readability in comparison
1605
+ // with SFINAE-based approach.
1606
+ #if __cplusplus >= 201703L
1590
1607
// / Defines and invokes a SYCL kernel function for the specified nd_range.
1591
1608
// /
1592
1609
// / The SYCL kernel function is defined as a lambda function or a named
@@ -1618,123 +1635,76 @@ class __SYCL_EXPORT handler {
1618
1635
// for the device.
1619
1636
size_t MaxWGSize =
1620
1637
ext::oneapi::detail::reduGetMaxWGSize (MQueue, OneElemSize);
1621
- ext::oneapi::detail::reduCGFunc <KernelName>(
1638
+ ext::oneapi::detail::reduCGFuncForRange <KernelName>(
1622
1639
*this , KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups, Redu);
1623
1640
if (Reduction::is_usm ||
1624
1641
(Reduction::has_fast_atomics && Redu.initializeToIdentity ()) ||
1625
1642
(!Reduction::has_fast_atomics && Redu.hasUserDiscardWriteAccessor ())) {
1626
1643
this ->finalize ();
1627
- handler CopyHandler (QueueCopy, MIsHost);
1628
- CopyHandler.saveCodeLoc (MCodeLoc);
1629
- ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1630
- Redu);
1631
- MLastEvent = CopyHandler.finalize ();
1632
- }
1633
- }
1634
-
1635
- // / Implements parallel_for() accepting nd_range \p Range and one reduction
1636
- // / object. This version uses fast sycl::atomic operations to update reduction
1637
- // / variable at the end of each work-group work.
1638
- //
1639
- // If the reduction variable must be initialized with the identity value
1640
- // before the kernel run, then an additional working accessor is created,
1641
- // initialized with the identity value and used in the kernel. That working
1642
- // accessor is then copied to user's accessor or USM pointer after
1643
- // the kernel run.
1644
- // For USM pointers without initialize_to_identity properties the same scheme
1645
- // with working accessor is used as re-using user's USM pointer in the kernel
1646
- // would require creation of another variant of user's kernel, which does not
1647
- // seem efficient.
1648
- template <typename KernelName = detail::auto_name, typename KernelType,
1649
- int Dims, typename Reduction>
1650
- detail::enable_if_t <Reduction::has_fast_atomics>
1651
- parallel_for (nd_range<Dims> Range, Reduction Redu,
1652
- _KERNELFUNCPARAM (KernelFunc)) {
1653
- std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1654
- ext::oneapi::detail::reduCGFunc<KernelName>(*this , KernelFunc, Range, Redu);
1655
-
1656
- if (Reduction::is_usm || Redu.initializeToIdentity ()) {
1657
- this ->finalize ();
1658
- handler CopyHandler (QueueCopy, MIsHost);
1659
- CopyHandler.saveCodeLoc (MCodeLoc);
1660
- ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1661
- Redu);
1662
- MLastEvent = CopyHandler.finalize ();
1644
+ MLastEvent = withAuxHandler (QueueCopy, [&](handler &CopyHandler) {
1645
+ ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1646
+ CopyHandler, Redu);
1647
+ });
1663
1648
}
1664
1649
}
1665
1650
1666
- // / Implements parallel_for() accepting nd_range \p Range and one reduction
1667
- // / object. This version is a specialization for the add operator.
1668
- // / It performs runtime checks for device aspect "atomic64"; if found, fast
1669
- // / sycl::atomic_ref operations are used to update the reduction at the
1670
- // / end of each work-group work. Otherwise the default implementation is
1671
- // / used.
1672
- //
1673
- // If the reduction variable must be initialized with the identity value
1674
- // before the kernel run, then an additional working accessor is created,
1675
- // initialized with the identity value and used in the kernel. That working
1676
- // accessor is then copied to user's accessor or USM pointer after
1677
- // the kernel run.
1678
- // For USM pointers without initialize_to_identity properties the same scheme
1679
- // with working accessor is used as re-using user's USM pointer in the kernel
1680
- // would require creation of another variant of user's kernel, which does not
1681
- // seem efficient.
1682
1651
template <typename KernelName = detail::auto_name, typename KernelType,
1683
1652
int Dims, typename Reduction>
1684
- detail::enable_if_t <Reduction::has_atomic_add_float64>
1685
- parallel_for (nd_range<Dims> Range, Reduction Redu,
1686
- _KERNELFUNCPARAM (KernelFunc)) {
1687
-
1688
- std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1689
- device D = detail::getDeviceFromHandler (*this );
1690
-
1691
- if (D.has (aspect::atomic64)) {
1692
-
1693
- ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this , KernelFunc,
1694
- Range, Redu);
1695
-
1653
+ void parallel_for (nd_range<Dims> Range, Reduction Redu,
1654
+ _KERNELFUNCPARAM (KernelFunc)) {
1655
+ if constexpr (!Reduction::has_fast_atomics &&
1656
+ !Reduction::has_atomic_add_float64) {
1657
+ // The most basic implementation.
1658
+ parallel_for_impl<KernelName>(Range, Redu, KernelFunc);
1659
+ return ;
1660
+ } else { // Can't "early" return for "if constexpr".
1661
+ std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1662
+ if constexpr (Reduction::has_atomic_add_float64) {
1663
+ // / This version is a specialization for the add
1664
+ // / operator. It performs runtime checks for device aspect "atomic64";
1665
+ // / if found, fast sycl::atomic_ref operations are used to update the
1666
+ // / reduction at the end of each work-group work. Otherwise the
1667
+ // / default implementation is used.
1668
+ device D = detail::getDeviceFromHandler (*this );
1669
+
1670
+ if (D.has (aspect::atomic64)) {
1671
+
1672
+ ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this , KernelFunc,
1673
+ Range, Redu);
1674
+ } else {
1675
+ // Resort to basic implementation as well.
1676
+ parallel_for_impl<KernelName>(Range, Redu, KernelFunc);
1677
+ return ;
1678
+ }
1679
+ } else {
1680
+ // Use fast sycl::atomic operations to update reduction variable at the
1681
+ // end of each work-group work.
1682
+ ext::oneapi::detail::reduCGFunc<KernelName>(*this , KernelFunc, Range,
1683
+ Redu);
1684
+ }
1685
+ // If the reduction variable must be initialized with the identity value
1686
+ // before the kernel run, then an additional working accessor is created,
1687
+ // initialized with the identity value and used in the kernel. That
1688
+ // working accessor is then copied to user's accessor or USM pointer after
1689
+ // the kernel run.
1690
+ // For USM pointers without initialize_to_identity properties the same
1691
+ // scheme with working accessor is used as re-using user's USM pointer in
1692
+ // the kernel would require creation of another variant of user's kernel,
1693
+ // which does not seem efficient.
1696
1694
if (Reduction::is_usm || Redu.initializeToIdentity ()) {
1697
1695
this ->finalize ();
1698
- handler CopyHandler (QueueCopy, MIsHost);
1699
- CopyHandler.saveCodeLoc (MCodeLoc);
1700
- ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1701
- CopyHandler, Redu);
1702
- MLastEvent = CopyHandler.finalize ();
1696
+ MLastEvent = withAuxHandler (QueueCopy, [&](handler &CopyHandler) {
1697
+ ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1698
+ CopyHandler, Redu);
1699
+ });
1703
1700
}
1704
- } else {
1705
- parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
1706
1701
}
1707
1702
}
1708
1703
1709
- // / Defines and invokes a SYCL kernel function for the specified nd_range.
1710
- // / Performs reduction operation specified in \p Redu.
1711
- // /
1712
- // / The SYCL kernel function is defined as a lambda function or a named
1713
- // / function object type and given an id or item for indexing in the indexing
1714
- // / space defined by \p Range.
1715
- // / If it is a named function object and the function object type is
1716
- // / globally visible, there is no need for the developer to provide
1717
- // / a kernel name for it.
1718
- // /
1719
- // / TODO: Support HOST. The kernels called by this parallel_for() may use
1720
- // / some functionality that is not yet supported on HOST such as:
1721
- // / barrier(), and ext::oneapi::reduce() that also may be used in more
1722
- // / optimized implementations waiting for their turn of code-review.
1723
- template <typename KernelName = detail::auto_name, typename KernelType,
1724
- int Dims, typename Reduction>
1725
- detail::enable_if_t <!Reduction::has_fast_atomics &&
1726
- !Reduction::has_atomic_add_float64>
1727
- parallel_for (nd_range<Dims> Range, Reduction Redu,
1728
- _KERNELFUNCPARAM (KernelFunc)) {
1729
-
1730
- parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
1731
- }
1732
-
1733
1704
template <typename KernelName, typename KernelType, int Dims,
1734
1705
typename Reduction>
1735
- detail::enable_if_t <!Reduction::has_fast_atomics>
1736
- parallel_for_Impl (nd_range<Dims> Range, Reduction Redu,
1737
- KernelType KernelFunc) {
1706
+ void parallel_for_impl (nd_range<Dims> Range, Reduction Redu,
1707
+ KernelType KernelFunc) {
1738
1708
// This parallel_for() is lowered to the following sequence:
1739
1709
// 1) Call a kernel that a) call user's lambda function and b) performs
1740
1710
// one iteration of reduction, storing the partial reductions/sums
@@ -1790,20 +1760,17 @@ class __SYCL_EXPORT handler {
1790
1760
PI_ERROR_INVALID_WORK_GROUP_SIZE);
1791
1761
size_t NWorkItems = Range.get_group_range ().size ();
1792
1762
while (NWorkItems > 1 ) {
1793
- handler AuxHandler (QueueCopy, MIsHost);
1794
- AuxHandler.saveCodeLoc (MCodeLoc);
1795
-
1796
- NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
1797
- AuxHandler, NWorkItems, MaxWGSize, Redu);
1798
- MLastEvent = AuxHandler.finalize ();
1763
+ MLastEvent = withAuxHandler (QueueCopy, [&](handler &AuxHandler) {
1764
+ NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
1765
+ AuxHandler, NWorkItems, MaxWGSize, Redu);
1766
+ });
1799
1767
} // end while (NWorkItems > 1)
1800
1768
1801
1769
if (Reduction::is_usm || Redu.hasUserDiscardWriteAccessor ()) {
1802
- handler CopyHandler (QueueCopy, MIsHost);
1803
- CopyHandler.saveCodeLoc (MCodeLoc);
1804
- ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1805
- Redu);
1806
- MLastEvent = CopyHandler.finalize ();
1770
+ MLastEvent = withAuxHandler (QueueCopy, [&](handler &CopyHandler) {
1771
+ ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1772
+ CopyHandler, Redu);
1773
+ });
1807
1774
}
1808
1775
}
1809
1776
@@ -1868,27 +1835,26 @@ class __SYCL_EXPORT handler {
1868
1835
std::to_string (MaxWGSize),
1869
1836
PI_ERROR_INVALID_WORK_GROUP_SIZE);
1870
1837
1871
- ext::oneapi::detail::reduCGFunc <KernelName>(*this , KernelFunc, Range,
1872
- ReduTuple, ReduIndices);
1838
+ ext::oneapi::detail::reduCGFuncMulti <KernelName>(*this , KernelFunc, Range,
1839
+ ReduTuple, ReduIndices);
1873
1840
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1874
1841
this ->finalize ();
1875
1842
1876
1843
size_t NWorkItems = Range.get_group_range ().size ();
1877
1844
while (NWorkItems > 1 ) {
1878
- handler AuxHandler (QueueCopy, MIsHost);
1879
- AuxHandler.saveCodeLoc (MCodeLoc);
1880
-
1881
- NWorkItems =
1882
- ext::oneapi::detail::reduAuxCGFunc<KernelName, decltype (KernelFunc)>(
1883
- AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
1884
- MLastEvent = AuxHandler.finalize ();
1845
+ MLastEvent = withAuxHandler (QueueCopy, [&](handler &AuxHandler) {
1846
+ NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName,
1847
+ decltype (KernelFunc)>(
1848
+ AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
1849
+ });
1885
1850
} // end while (NWorkItems > 1)
1886
1851
1887
1852
auto CopyEvent = ext::oneapi::detail::reduSaveFinalResultToUserMem (
1888
1853
QueueCopy, MIsHost, ReduTuple, ReduIndices);
1889
1854
if (CopyEvent)
1890
1855
MLastEvent = *CopyEvent;
1891
1856
}
1857
+ #endif // __cplusplus >= 201703L
1892
1858
1893
1859
// / Hierarchical kernel invocation method of a kernel defined as a lambda
1894
1860
// / encoding the body of each work-group to launch.
@@ -2689,14 +2655,6 @@ class __SYCL_EXPORT handler {
2689
2655
class Algorithm >
2690
2656
friend class ext ::oneapi::detail::reduction_impl_algo;
2691
2657
2692
- // This method needs to call the method finalize() and also access to private
2693
- // ctor/dtor.
2694
- template <typename Reduction, typename ... RestT>
2695
- std::enable_if_t <!Reduction::is_usm> friend ext::oneapi::detail::
2696
- reduSaveFinalResultToUserMemHelper (
2697
- std::vector<event> &Events, std::shared_ptr<detail::queue_impl> Queue,
2698
- bool IsHost, Reduction &, RestT...);
2699
-
2700
2658
friend void detail::associateWithHandler (handler &,
2701
2659
detail::AccessorBaseHost *,
2702
2660
access::target);
0 commit comments