@@ -247,37 +247,36 @@ class reduction_impl_algo;
247
247
using cl::sycl::detail::enable_if_t ;
248
248
using cl::sycl::detail::queue_impl;
249
249
250
- template <typename KernelName, typename KernelType, int Dims, class Reduction >
251
- void reduCGFunc (handler &CGH, KernelType KernelFunc, const range<Dims> &Range,
252
- size_t MaxWGSize, uint32_t NumConcurrentWorkGroups,
253
- Reduction &Redu);
250
+ // Kernels with single reduction
254
251
252
+ // / If we are given sycl::range and not sycl::nd_range we have more freedom in
253
+ // / how to split the iteration space.
255
254
template <typename KernelName, typename KernelType, int Dims, class Reduction >
256
- enable_if_t <Reduction::has_atomic_add_float64>
257
- reduCGFuncAtomic64 (handler &CGH, KernelType KernelFunc ,
258
- const nd_range<Dims> &Range , Reduction &Redu);
255
+ void reduCGFuncForRange (handler &CGH, KernelType KernelFunc,
256
+ const range<Dims> &Range, size_t MaxWGSize ,
257
+ uint32_t NumConcurrentWorkGroups , Reduction &Redu);
259
258
260
259
template <typename KernelName, typename KernelType, int Dims, class Reduction >
261
- enable_if_t <Reduction::has_fast_atomics>
262
- reduCGFunc (handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
263
- Reduction &Redu);
260
+ void reduCGFuncAtomic64 (handler &CGH, KernelType KernelFunc,
261
+ const nd_range<Dims> &Range, Reduction &Redu);
264
262
265
263
template <typename KernelName, typename KernelType, int Dims, class Reduction >
266
- enable_if_t <!Reduction::has_fast_atomics>
267
- reduCGFunc (handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
268
- Reduction &Redu);
264
+ void reduCGFunc (handler &CGH, KernelType KernelFunc,
265
+ const nd_range<Dims> &Range, Reduction &Redu);
269
266
270
- template <typename KernelName, typename KernelType, class Reduction >
271
- enable_if_t <!Reduction::has_fast_atomics, size_t >
272
- reduAuxCGFunc (handler &CGH, size_t NWorkItems, size_t MaxWGSize,
273
- Reduction &Redu);
267
+ // Kernels with multiple reductions
274
268
269
+ // sycl::nd_range version
275
270
template <typename KernelName, typename KernelType, int Dims,
276
271
typename ... Reductions, size_t ... Is>
277
- void reduCGFunc (handler &CGH, KernelType KernelFunc,
278
- const nd_range<Dims> &Range,
279
- std::tuple<Reductions...> &ReduTuple,
280
- std::index_sequence<Is...>);
272
+ void reduCGFuncMulti (handler &CGH, KernelType KernelFunc,
273
+ const nd_range<Dims> &Range,
274
+ std::tuple<Reductions...> &ReduTuple,
275
+ std::index_sequence<Is...>);
276
+
277
+ template <typename KernelName, typename KernelType, class Reduction >
278
+ size_t reduAuxCGFunc (handler &CGH, size_t NWorkItems, size_t MaxWGSize,
279
+ Reduction &Redu);
281
280
282
281
template <typename KernelName, typename KernelType, typename ... Reductions,
283
282
size_t ... Is>
@@ -299,12 +298,6 @@ reduSaveFinalResultToUserMem(std::shared_ptr<detail::queue_impl> Queue,
299
298
bool IsHost, std::tuple<Reduction...> &ReduTuple,
300
299
std::index_sequence<Is...>);
301
300
302
- template <typename Reduction, typename ... RestT>
303
- std::enable_if_t <!Reduction::is_usm>
304
- reduSaveFinalResultToUserMemHelper (std::vector<event> &Events,
305
- std::shared_ptr<detail::queue_impl> Queue,
306
- bool IsHost, Reduction &Redu, RestT... Rest);
307
-
308
301
__SYCL_EXPORT uint32_t
309
302
reduGetMaxNumConcurrentWorkGroups (std::shared_ptr<queue_impl> Queue);
310
303
@@ -469,6 +462,27 @@ class __SYCL_EXPORT handler {
469
462
MStreamStorage.push_back (Stream);
470
463
}
471
464
465
+ // / Helper utility for operation widely used throught different reduction
466
+ // / implementations.
467
+ // / @{
468
+ template <class FunctorTy >
469
+ event withAuxHandler (std::shared_ptr<detail::queue_impl> Queue,
470
+ FunctorTy Func) {
471
+ handler AuxHandler (Queue, MIsHost);
472
+ AuxHandler.saveCodeLoc (MCodeLoc);
473
+ Func (AuxHandler);
474
+ return AuxHandler.finalize ();
475
+ }
476
+
477
+ template <class FunctorTy >
478
+ static event withAuxHandler (std::shared_ptr<detail::queue_impl> Queue,
479
+ bool IsHost, FunctorTy Func) {
480
+ handler AuxHandler (Queue, IsHost);
481
+ Func (AuxHandler);
482
+ return AuxHandler.finalize ();
483
+ }
484
+ // / }@
485
+
472
486
// / Saves buffers created by handling reduction feature in handler.
473
487
// / They are then forwarded to command group and destroyed only after
474
488
// / the command group finishes the work on device/host.
@@ -1586,6 +1600,9 @@ class __SYCL_EXPORT handler {
1586
1600
#endif
1587
1601
}
1588
1602
1603
+ // "if constexpr" simplifies implementation/increases readability in comparison
1604
+ // with SFINAE-based approach.
1605
+ #if __cplusplus >= 201703L
1589
1606
// / Defines and invokes a SYCL kernel function for the specified nd_range.
1590
1607
// /
1591
1608
// / The SYCL kernel function is defined as a lambda function or a named
@@ -1617,123 +1634,76 @@ class __SYCL_EXPORT handler {
1617
1634
// for the device.
1618
1635
size_t MaxWGSize =
1619
1636
ext::oneapi::detail::reduGetMaxWGSize (MQueue, OneElemSize);
1620
- ext::oneapi::detail::reduCGFunc <KernelName>(
1637
+ ext::oneapi::detail::reduCGFuncForRange <KernelName>(
1621
1638
*this , KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups, Redu);
1622
1639
if (Reduction::is_usm ||
1623
1640
(Reduction::has_fast_atomics && Redu.initializeToIdentity ()) ||
1624
1641
(!Reduction::has_fast_atomics && Redu.hasUserDiscardWriteAccessor ())) {
1625
1642
this ->finalize ();
1626
- handler CopyHandler (QueueCopy, MIsHost);
1627
- CopyHandler.saveCodeLoc (MCodeLoc);
1628
- ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1629
- Redu);
1630
- MLastEvent = CopyHandler.finalize ();
1631
- }
1632
- }
1633
-
1634
- // / Implements parallel_for() accepting nd_range \p Range and one reduction
1635
- // / object. This version uses fast sycl::atomic operations to update reduction
1636
- // / variable at the end of each work-group work.
1637
- //
1638
- // If the reduction variable must be initialized with the identity value
1639
- // before the kernel run, then an additional working accessor is created,
1640
- // initialized with the identity value and used in the kernel. That working
1641
- // accessor is then copied to user's accessor or USM pointer after
1642
- // the kernel run.
1643
- // For USM pointers without initialize_to_identity properties the same scheme
1644
- // with working accessor is used as re-using user's USM pointer in the kernel
1645
- // would require creation of another variant of user's kernel, which does not
1646
- // seem efficient.
1647
- template <typename KernelName = detail::auto_name, typename KernelType,
1648
- int Dims, typename Reduction>
1649
- detail::enable_if_t <Reduction::has_fast_atomics>
1650
- parallel_for (nd_range<Dims> Range, Reduction Redu,
1651
- _KERNELFUNCPARAM (KernelFunc)) {
1652
- std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1653
- ext::oneapi::detail::reduCGFunc<KernelName>(*this , KernelFunc, Range, Redu);
1654
-
1655
- if (Reduction::is_usm || Redu.initializeToIdentity ()) {
1656
- this ->finalize ();
1657
- handler CopyHandler (QueueCopy, MIsHost);
1658
- CopyHandler.saveCodeLoc (MCodeLoc);
1659
- ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1660
- Redu);
1661
- MLastEvent = CopyHandler.finalize ();
1643
+ MLastEvent = withAuxHandler (QueueCopy, [&](handler &CopyHandler) {
1644
+ ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1645
+ CopyHandler, Redu);
1646
+ });
1662
1647
}
1663
1648
}
1664
1649
1665
- // / Implements parallel_for() accepting nd_range \p Range and one reduction
1666
- // / object. This version is a specialization for the add operator.
1667
- // / It performs runtime checks for device aspect "atomic64"; if found, fast
1668
- // / sycl::atomic_ref operations are used to update the reduction at the
1669
- // / end of each work-group work. Otherwise the default implementation is
1670
- // / used.
1671
- //
1672
- // If the reduction variable must be initialized with the identity value
1673
- // before the kernel run, then an additional working accessor is created,
1674
- // initialized with the identity value and used in the kernel. That working
1675
- // accessor is then copied to user's accessor or USM pointer after
1676
- // the kernel run.
1677
- // For USM pointers without initialize_to_identity properties the same scheme
1678
- // with working accessor is used as re-using user's USM pointer in the kernel
1679
- // would require creation of another variant of user's kernel, which does not
1680
- // seem efficient.
1681
1650
template <typename KernelName = detail::auto_name, typename KernelType,
1682
1651
int Dims, typename Reduction>
1683
- detail::enable_if_t <Reduction::has_atomic_add_float64>
1684
- parallel_for (nd_range<Dims> Range, Reduction Redu,
1685
- _KERNELFUNCPARAM (KernelFunc)) {
1686
-
1687
- std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1688
- device D = detail::getDeviceFromHandler (*this );
1689
-
1690
- if (D.has (aspect::atomic64)) {
1691
-
1692
- ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this , KernelFunc,
1693
- Range, Redu);
1694
-
1652
+ void parallel_for (nd_range<Dims> Range, Reduction Redu,
1653
+ _KERNELFUNCPARAM (KernelFunc)) {
1654
+ if constexpr (!Reduction::has_fast_atomics &&
1655
+ !Reduction::has_atomic_add_float64) {
1656
+ // The most basic implementation.
1657
+ parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
1658
+ return ;
1659
+ } else { // Can't "early" return for "if constexpr".
1660
+ std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1661
+ if constexpr (Reduction::has_atomic_add_float64) {
1662
+ // / This version is a specialization for the add
1663
+ // / operator. It performs runtime checks for device aspect "atomic64";
1664
+ // / if found, fast sycl::atomic_ref operations are used to update the
1665
+ // / reduction at the end of each work-group work. Otherwise the
1666
+ // / default implementation is used.
1667
+ device D = detail::getDeviceFromHandler (*this );
1668
+
1669
+ if (D.has (aspect::atomic64)) {
1670
+
1671
+ ext::oneapi::detail::reduCGFuncAtomic64<KernelName>(*this , KernelFunc,
1672
+ Range, Redu);
1673
+ } else {
1674
+ // Resort to basic implementation as well.
1675
+ parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
1676
+ return ;
1677
+ }
1678
+ } else {
1679
+ // Use fast sycl::atomic operations to update reduction variable at the
1680
+ // end of each work-group work.
1681
+ ext::oneapi::detail::reduCGFunc<KernelName>(*this , KernelFunc, Range,
1682
+ Redu);
1683
+ }
1684
+ // If the reduction variable must be initialized with the identity value
1685
+ // before the kernel run, then an additional working accessor is created,
1686
+ // initialized with the identity value and used in the kernel. That
1687
+ // working accessor is then copied to user's accessor or USM pointer after
1688
+ // the kernel run.
1689
+ // For USM pointers without initialize_to_identity properties the same
1690
+ // scheme with working accessor is used as re-using user's USM pointer in
1691
+ // the kernel would require creation of another variant of user's kernel,
1692
+ // which does not seem efficient.
1695
1693
if (Reduction::is_usm || Redu.initializeToIdentity ()) {
1696
1694
this ->finalize ();
1697
- handler CopyHandler (QueueCopy, MIsHost);
1698
- CopyHandler.saveCodeLoc (MCodeLoc);
1699
- ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1700
- CopyHandler, Redu);
1701
- MLastEvent = CopyHandler.finalize ();
1695
+ MLastEvent = withAuxHandler (QueueCopy, [&](handler &CopyHandler) {
1696
+ ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1697
+ CopyHandler, Redu);
1698
+ });
1702
1699
}
1703
- } else {
1704
- parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
1705
1700
}
1706
1701
}
1707
1702
1708
- // / Defines and invokes a SYCL kernel function for the specified nd_range.
1709
- // / Performs reduction operation specified in \p Redu.
1710
- // /
1711
- // / The SYCL kernel function is defined as a lambda function or a named
1712
- // / function object type and given an id or item for indexing in the indexing
1713
- // / space defined by \p Range.
1714
- // / If it is a named function object and the function object type is
1715
- // / globally visible, there is no need for the developer to provide
1716
- // / a kernel name for it.
1717
- // /
1718
- // / TODO: Support HOST. The kernels called by this parallel_for() may use
1719
- // / some functionality that is not yet supported on HOST such as:
1720
- // / barrier(), and ext::oneapi::reduce() that also may be used in more
1721
- // / optimized implementations waiting for their turn of code-review.
1722
- template <typename KernelName = detail::auto_name, typename KernelType,
1723
- int Dims, typename Reduction>
1724
- detail::enable_if_t <!Reduction::has_fast_atomics &&
1725
- !Reduction::has_atomic_add_float64>
1726
- parallel_for (nd_range<Dims> Range, Reduction Redu,
1727
- _KERNELFUNCPARAM (KernelFunc)) {
1728
-
1729
- parallel_for_Impl<KernelName>(Range, Redu, KernelFunc);
1730
- }
1731
-
1732
1703
template <typename KernelName, typename KernelType, int Dims,
1733
1704
typename Reduction>
1734
- detail::enable_if_t <!Reduction::has_fast_atomics>
1735
- parallel_for_Impl (nd_range<Dims> Range, Reduction Redu,
1736
- KernelType KernelFunc) {
1705
+ void parallel_for_Impl (nd_range<Dims> Range, Reduction Redu,
1706
+ KernelType KernelFunc) {
1737
1707
// This parallel_for() is lowered to the following sequence:
1738
1708
// 1) Call a kernel that a) call user's lambda function and b) performs
1739
1709
// one iteration of reduction, storing the partial reductions/sums
@@ -1789,20 +1759,17 @@ class __SYCL_EXPORT handler {
1789
1759
PI_ERROR_INVALID_WORK_GROUP_SIZE);
1790
1760
size_t NWorkItems = Range.get_group_range ().size ();
1791
1761
while (NWorkItems > 1 ) {
1792
- handler AuxHandler (QueueCopy, MIsHost);
1793
- AuxHandler.saveCodeLoc (MCodeLoc);
1794
-
1795
- NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
1796
- AuxHandler, NWorkItems, MaxWGSize, Redu);
1797
- MLastEvent = AuxHandler.finalize ();
1762
+ MLastEvent = withAuxHandler (QueueCopy, [&](handler &AuxHandler) {
1763
+ NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName, KernelType>(
1764
+ AuxHandler, NWorkItems, MaxWGSize, Redu);
1765
+ });
1798
1766
} // end while (NWorkItems > 1)
1799
1767
1800
1768
if (Reduction::is_usm || Redu.hasUserDiscardWriteAccessor ()) {
1801
- handler CopyHandler (QueueCopy, MIsHost);
1802
- CopyHandler.saveCodeLoc (MCodeLoc);
1803
- ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler,
1804
- Redu);
1805
- MLastEvent = CopyHandler.finalize ();
1769
+ MLastEvent = withAuxHandler (QueueCopy, [&](handler &CopyHandler) {
1770
+ ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(
1771
+ CopyHandler, Redu);
1772
+ });
1806
1773
}
1807
1774
}
1808
1775
@@ -1867,27 +1834,26 @@ class __SYCL_EXPORT handler {
1867
1834
std::to_string (MaxWGSize),
1868
1835
PI_ERROR_INVALID_WORK_GROUP_SIZE);
1869
1836
1870
- ext::oneapi::detail::reduCGFunc <KernelName>(*this , KernelFunc, Range,
1871
- ReduTuple, ReduIndices);
1837
+ ext::oneapi::detail::reduCGFuncMulti <KernelName>(*this , KernelFunc, Range,
1838
+ ReduTuple, ReduIndices);
1872
1839
std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1873
1840
this ->finalize ();
1874
1841
1875
1842
size_t NWorkItems = Range.get_group_range ().size ();
1876
1843
while (NWorkItems > 1 ) {
1877
- handler AuxHandler (QueueCopy, MIsHost);
1878
- AuxHandler.saveCodeLoc (MCodeLoc);
1879
-
1880
- NWorkItems =
1881
- ext::oneapi::detail::reduAuxCGFunc<KernelName, decltype (KernelFunc)>(
1882
- AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
1883
- MLastEvent = AuxHandler.finalize ();
1844
+ MLastEvent = withAuxHandler (QueueCopy, [&](handler &AuxHandler) {
1845
+ NWorkItems = ext::oneapi::detail::reduAuxCGFunc<KernelName,
1846
+ decltype (KernelFunc)>(
1847
+ AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
1848
+ });
1884
1849
} // end while (NWorkItems > 1)
1885
1850
1886
1851
auto CopyEvent = ext::oneapi::detail::reduSaveFinalResultToUserMem (
1887
1852
QueueCopy, MIsHost, ReduTuple, ReduIndices);
1888
1853
if (CopyEvent)
1889
1854
MLastEvent = *CopyEvent;
1890
1855
}
1856
+ #endif
1891
1857
1892
1858
// / Hierarchical kernel invocation method of a kernel defined as a lambda
1893
1859
// / encoding the body of each work-group to launch.
@@ -2688,14 +2654,6 @@ class __SYCL_EXPORT handler {
2688
2654
class Algorithm >
2689
2655
friend class ext ::oneapi::detail::reduction_impl_algo;
2690
2656
2691
- // This method needs to call the method finalize() and also access to private
2692
- // ctor/dtor.
2693
- template <typename Reduction, typename ... RestT>
2694
- std::enable_if_t <!Reduction::is_usm> friend ext::oneapi::detail::
2695
- reduSaveFinalResultToUserMemHelper (
2696
- std::vector<event> &Events, std::shared_ptr<detail::queue_impl> Queue,
2697
- bool IsHost, Reduction &, RestT...);
2698
-
2699
2657
friend void detail::associateWithHandler (handler &,
2700
2658
detail::AccessorBaseHost *,
2701
2659
access::target);
0 commit comments