@@ -481,13 +481,13 @@ class __SYCL_EXPORT handler {
481
481
482
482
/// Helper utility for an operation widely used across different reduction
483
483
/// implementations.
484
- template <class FunctorTy >
485
- event withAuxHandler (std::shared_ptr<detail::queue_impl> Queue,
486
- FunctorTy Func) {
487
- handler AuxHandler (Queue, MIsHost);
484
/// Finalizes this handler, then runs \p Func on a newly constructed auxiliary
/// handler and stores the event produced by finalizing that auxiliary handler
/// in MLastEvent.
///
/// \param Func callable invoked as Func(AuxHandler); it receives the auxiliary
///        handler as its only argument.
+ template <class FunctorTy > void withAuxHandler (FunctorTy Func) {
485
// NOTE(review): some reduction paths in this class still call
// this->finalize() themselves before looping over withAuxHandler, so this
// call is presumably harmless when the handler is already finalized - confirm.
+ this ->finalize ();
486
// The auxiliary handler is built from this handler's own MQueue and MIsHost.
+ handler AuxHandler (MQueue, MIsHost);
488
487
// Pass this handler's MCodeLoc on to the auxiliary handler.
AuxHandler.saveCodeLoc (MCodeLoc);
489
488
Func (AuxHandler);
490
- return AuxHandler.finalize ();
489
// The auxiliary handler's event is recorded in MLastEvent instead of being
// returned to the caller.
+ MLastEvent = AuxHandler.finalize ();
490
+ return ;
491
491
}
492
492
493
493
/// Saves buffers created while handling the reduction feature in the handler.
@@ -1761,8 +1761,6 @@ class __SYCL_EXPORT handler {
1761
1761
ext::oneapi::experimental::is_property_list<PropertiesT>::value>
1762
1762
parallel_for_impl (range<Dims> Range, PropertiesT Properties, Reduction Redu,
1763
1763
_KERNELFUNCPARAM (KernelFunc)) {
1764
- std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1765
-
1766
1764
// Before running the kernels, check that device has enough local memory
1767
1765
// to hold local arrays required for the tree-reduction algorithm.
1768
1766
constexpr bool IsTreeReduction =
@@ -1782,8 +1780,7 @@ class __SYCL_EXPORT handler {
1782
1780
if (detail::reduCGFuncForRange<KernelName>(
1783
1781
*this , KernelFunc, Range, PrefWGSize, NumConcurrentWorkGroups,
1784
1782
Properties, Redu)) {
1785
- this ->finalize ();
1786
- MLastEvent = withAuxHandler (QueueCopy, [&](handler &CopyHandler) {
1783
+ withAuxHandler ([&](handler &CopyHandler) {
1787
1784
detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
1788
1785
});
1789
1786
}
@@ -1802,7 +1799,6 @@ class __SYCL_EXPORT handler {
1802
1799
parallel_for_basic_impl<KernelName>(Range, Properties, Redu, KernelFunc);
1803
1800
return ;
1804
1801
} else { // Can't "early" return for "if constexpr".
1805
- std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1806
1802
if constexpr (Reduction::has_float64_atomics) {
1807
1803
/// This version is a specialization for the add
1808
1804
/// operator. It performs runtime checks for device aspect "atomic64";
@@ -1837,8 +1833,7 @@ class __SYCL_EXPORT handler {
1837
1833
// the kernel would require creation of another variant of user's kernel,
1838
1834
// which does not seem efficient.
1839
1835
if (Reduction::is_usm || Redu.initializeToIdentity ()) {
1840
- this ->finalize ();
1841
- MLastEvent = withAuxHandler (QueueCopy, [&](handler &CopyHandler) {
1836
+ withAuxHandler ([&](handler &CopyHandler) {
1842
1837
detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
1843
1838
});
1844
1839
}
@@ -1888,7 +1883,6 @@ class __SYCL_EXPORT handler {
1888
1883
1889
1884
// 1. Call the kernel that includes user's lambda function.
1890
1885
detail::reduCGFunc<KernelName>(*this , KernelFunc, Range, Properties, Redu);
1891
- std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1892
1886
this ->finalize ();
1893
1887
1894
1888
// 2. Run the additional kernel as many times as needed to reduce
@@ -1906,14 +1900,14 @@ class __SYCL_EXPORT handler {
1906
1900
PI_ERROR_INVALID_WORK_GROUP_SIZE);
1907
1901
size_t NWorkItems = Range.get_group_range ().size ();
1908
1902
while (NWorkItems > 1 ) {
1909
- MLastEvent = withAuxHandler (QueueCopy, [&](handler &AuxHandler) {
1903
+ withAuxHandler ([&](handler &AuxHandler) {
1910
1904
NWorkItems = detail::reduAuxCGFunc<KernelName, KernelType>(
1911
1905
AuxHandler, NWorkItems, MaxWGSize, Redu);
1912
1906
});
1913
1907
} // end while (NWorkItems > 1)
1914
1908
1915
1909
if (Reduction::is_usm) {
1916
- MLastEvent = withAuxHandler (QueueCopy, [&](handler &CopyHandler) {
1910
+ withAuxHandler ([&](handler &CopyHandler) {
1917
1911
detail::reduSaveFinalResultToUserMem<KernelName>(CopyHandler, Redu);
1918
1912
});
1919
1913
}
@@ -1957,12 +1951,11 @@ class __SYCL_EXPORT handler {
1957
1951
1958
1952
detail::reduCGFuncMulti<KernelName>(*this , KernelFunc, Range, Properties,
1959
1953
ReduTuple, ReduIndices);
1960
- std::shared_ptr<detail::queue_impl> QueueCopy = MQueue;
1961
1954
this ->finalize ();
1962
1955
1963
1956
size_t NWorkItems = Range.get_group_range ().size ();
1964
1957
while (NWorkItems > 1 ) {
1965
- MLastEvent = withAuxHandler (QueueCopy, [&](handler &AuxHandler) {
1958
+ withAuxHandler ([&](handler &AuxHandler) {
1966
1959
NWorkItems = detail::reduAuxCGFunc<KernelName, decltype (KernelFunc)>(
1967
1960
AuxHandler, NWorkItems, MaxWGSize, ReduTuple, ReduIndices);
1968
1961
});
0 commit comments