 #include <sycl/kernel.hpp>
 #include <sycl/known_identity.hpp>
 #include <sycl/properties/reduction_properties.hpp>
+#include <sycl/usm.hpp>
 
 #include <tuple>
 
@@ -666,15 +667,18 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
 
   // On discrete (vs. integrated) GPUs it's faster to initialize memory with an
   // extra kernel than copy it from the host.
-  template <typename Name> auto getGroupsCounterAccDiscrete(handler &CGH) {
-    auto &Buf = getTempBuffer<int>(1, CGH);
-    std::shared_ptr<detail::queue_impl> QueueCopy = CGH.MQueue;
-    auto Event = CGH.withAuxHandler(QueueCopy, [&](handler &InitHandler) {
-      auto Acc = accessor{Buf, InitHandler, sycl::write_only, sycl::no_init};
-      InitHandler.single_task<Name>([=]() { Acc[0] = 0; });
-    });
+  auto getGroupsCounterAccDiscrete(handler &CGH) {
+    queue q = createSyclObjFromImpl<queue>(CGH.MQueue);
+    device Dev = q.get_device();
+    auto Deleter = [=](auto *Ptr) { free(Ptr, q); };
+
+    std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
+    CGH.addReduction(Counter);
+
+    auto Event = q.memset(Counter.get(), 0, sizeof(int));
     CGH.depends_on(Event);
-    return accessor{Buf, CGH};
+
+    return Counter.get();
   }
 
   RedOutVar &getUserRedVar() { return MRedOut; }
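
The hunk above swaps the buffer-backed group counter (which needed a dedicated `single_task` init kernel) for a USM device allocation: the counter's lifetime is tied to the command group through `addReduction`, and it is zeroed with a plain `queue::memset` that the main kernel then depends on. Below is a hedged, standalone sketch of the same ownership pattern; the `makeDeviceCounter` helper is an illustrative name, not part of the patch, and a standard SYCL 2020 queue is assumed.

```cpp
#include <sycl/sycl.hpp>

#include <memory>
#include <utility>

// Illustrative helper (not from the patch): allocate one int in USM device
// memory, own it through a shared_ptr whose deleter frees the allocation, and
// return the event of the memset that zero-initializes it.
std::pair<std::shared_ptr<int>, sycl::event> makeDeviceCounter(sycl::queue &q) {
  auto Deleter = [q](int *Ptr) { sycl::free(Ptr, q); };
  std::shared_ptr<int> Counter(sycl::malloc_device<int>(1, q), Deleter);
  // memset writes bytes; all-zero bytes give an int value of 0.
  sycl::event Init = q.memset(Counter.get(), 0, sizeof(int));
  return {Counter, Init};
}

int main() {
  sycl::queue q;
  auto [Counter, Init] = makeDeviceCounter(q);
  int *Ptr = Counter.get();

  // Any kernel reading the counter must wait for the memset, mirroring the
  // CGH.depends_on(Event) call in the patch.
  q.submit([&](sycl::handler &cgh) {
     cgh.depends_on(Init);
     cgh.single_task([=] {
       sycl::atomic_ref<int, sycl::memory_order::relaxed,
                        sycl::memory_scope::device,
                        sycl::access::address_space::global_space>
           C(*Ptr);
       ++C; // e.g. count finished work-groups
     });
   }).wait();
}
```

Owning the allocation through a `std::shared_ptr` whose deleter calls `sycl::free` matches how `addReduction` is used here: the command group holds a copy of the `shared_ptr`, so the device memory stays alive until the kernel that uses it has run.
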
@@ -895,11 +899,8 @@ bool reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
 
 namespace reduction {
 namespace main_krn {
-template <class KernelName> struct RangeFastReduce;
+template <class KernelName, class NWorkGroupsFinished> struct RangeFastReduce;
 } // namespace main_krn
-namespace init_krn {
-template <class KernelName> struct GroupCounter;
-}
 } // namespace reduction
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
 bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
@@ -917,81 +918,81 @@ bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
   accessor PartialSums(PartialSumsBuf, CGH, sycl::read_write, sycl::no_init);
 
   bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity();
-  using InitName =
-      __sycl_reduction_kernel<reduction::init_krn::GroupCounter, KernelName>;
-
-  // Integrated/discrete GPUs have different faster path.
-  auto NWorkGroupsFinished =
-      getDeviceFromHandler(CGH).get_info<info::device::host_unified_memory>()
-          ? Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH)
-          : Redu.template getGroupsCounterAccDiscrete<InitName>(CGH);
-
-  auto DoReducePartialSumsInLastWG =
-      Reduction::template getReadWriteLocalAcc<int>(1, CGH);
-
-  using Name =
-      __sycl_reduction_kernel<reduction::main_krn::RangeFastReduce, KernelName>;
-  size_t PerGroup = Range.size() / NWorkGroups;
-  CGH.parallel_for<Name>(NDRange, [=](nd_item<1> NDId) {
-    // Call user's functions. Reducer.MValue gets initialized there.
-    typename Reduction::reducer_type Reducer;
-    reductionLoop(Range, PerGroup, Reducer, NDId, KernelFunc);
+  auto Rest = [&](auto NWorkGroupsFinished) {
+    auto DoReducePartialSumsInLastWG =
+        Reduction::template getReadWriteLocalAcc<int>(1, CGH);
+
+    using Name = __sycl_reduction_kernel<reduction::main_krn::RangeFastReduce,
+                                         KernelName, decltype(NWorkGroupsFinished)>;
+    size_t PerGroup = Range.size() / NWorkGroups;
+    CGH.parallel_for<Name>(NDRange, [=](nd_item<1> NDId) {
+      // Call user's functions. Reducer.MValue gets initialized there.
+      typename Reduction::reducer_type Reducer;
+      reductionLoop(Range, PerGroup, Reducer, NDId, KernelFunc);
 
-    typename Reduction::binary_operation BOp;
-    auto Group = NDId.get_group();
+      typename Reduction::binary_operation BOp;
+      auto Group = NDId.get_group();
 
-    // If there are multiple values, reduce each separately
-    // reduce_over_group is only defined for each T, not for span<T, ...>
-    size_t LID = NDId.get_local_id(0);
-    for (int E = 0; E < NElements; ++E) {
-      auto &RedElem = Reducer.getElement(E);
-      RedElem = reduce_over_group(Group, RedElem, BOp);
-      if (LID == 0) {
-        if (NWorkGroups == 1) {
-          auto &OutElem = Reduction::getOutPointer(Out)[E];
-          // Can avoid using partial sum and write the final result immediately.
-          if (IsUpdateOfUserVar)
-            RedElem = BOp(RedElem, OutElem);
-          OutElem = RedElem;
-        } else {
-          PartialSums[NDId.get_group_linear_id() * NElements + E] =
-              Reducer.getElement(E);
+      // If there are multiple values, reduce each separately
+      // reduce_over_group is only defined for each T, not for span<T, ...>
+      size_t LID = NDId.get_local_id(0);
+      for (int E = 0; E < NElements; ++E) {
+        auto &RedElem = Reducer.getElement(E);
+        RedElem = reduce_over_group(Group, RedElem, BOp);
+        if (LID == 0) {
+          if (NWorkGroups == 1) {
+            auto &OutElem = Reduction::getOutPointer(Out)[E];
+            // Can avoid using partial sum and write the final result
+            // immediately.
+            if (IsUpdateOfUserVar)
+              RedElem = BOp(RedElem, OutElem);
+            OutElem = RedElem;
+          } else {
+            PartialSums[NDId.get_group_linear_id() * NElements + E] =
+                Reducer.getElement(E);
+          }
         }
       }
-    }
-
-    if (NWorkGroups == 1)
-      // We're done.
-      return;
 
-    // Signal this work-group has finished after all values are reduced
-    if (LID == 0) {
-      auto NFinished =
-          sycl::atomic_ref<int, memory_order::relaxed, memory_scope::device,
-                           access::address_space::global_space>(
-              NWorkGroupsFinished[0]);
-      DoReducePartialSumsInLastWG[0] = ++NFinished == NWorkGroups;
-    }
+      if (NWorkGroups == 1)
+        // We're done.
+        return;
 
-    workGroupBarrier();
-    if (DoReducePartialSumsInLastWG[0]) {
-      // Reduce each result separately
-      // TODO: Opportunity to parallelize across elements.
-      for (int E = 0; E < NElements; ++E) {
-        auto &OutElem = Reduction::getOutPointer(Out)[E];
-        auto LocalSum = Reducer.getIdentity();
-        for (size_t I = LID; I < NWorkGroups; I += WGSize)
-          LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
-        auto Result = reduce_over_group(Group, LocalSum, BOp);
+      // Signal this work-group has finished after all values are reduced
+      if (LID == 0) {
+        auto NFinished =
+            sycl::atomic_ref<int, memory_order::relaxed, memory_scope::device,
+                             access::address_space::global_space>(
+                NWorkGroupsFinished[0]);
+        DoReducePartialSumsInLastWG[0] = ++NFinished == NWorkGroups;
+      }
 
-    if (LID == 0) {
-      if (IsUpdateOfUserVar)
-        Result = BOp(Result, OutElem);
-      OutElem = Result;
+      workGroupBarrier();
+      if (DoReducePartialSumsInLastWG[0]) {
+        // Reduce each result separately
+        // TODO: Opportunity to parallelize across elements.
+        for (int E = 0; E < NElements; ++E) {
+          auto &OutElem = Reduction::getOutPointer(Out)[E];
+          auto LocalSum = Reducer.getIdentity();
+          for (size_t I = LID; I < NWorkGroups; I += WGSize)
+            LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
+          auto Result = reduce_over_group(Group, LocalSum, BOp);
+
+          if (LID == 0) {
+            if (IsUpdateOfUserVar)
+              Result = BOp(Result, OutElem);
+            OutElem = Result;
+          }
 
         }
       }
-    }
-  });
+    });
+  };
+
+  // Integrated/discrete GPUs have different faster path.
+  if (getDeviceFromHandler(CGH).get_info<info::device::host_unified_memory>())
+    Rest(Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH));
+  else
+    Rest(Redu.getGroupsCounterAccDiscrete(CGH));
 
   // We've updated user's variable, no extra work needed.
   return false;
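
The restructuring above is why `RangeFastReduce` gained a second template parameter: the kernel body now lives in the generic `Rest` lambda, which is instantiated once with a buffer accessor (integrated GPUs with host-unified memory) and once with a raw USM pointer (discrete GPUs), and embedding `decltype(NWorkGroupsFinished)` in the kernel name keeps those two instantiations distinct. Below is a hedged, standalone sketch of that dispatch pattern; the `CounterKernel` name, the `Submit` helper, and the toy `nd_range` are illustrative assumptions, not code from the patch.

```cpp
#include <sycl/sycl.hpp>

// Illustrative kernel-name template (not from the patch): like RangeFastReduce,
// it takes the counter's type as an extra parameter, so each instantiation of
// the generic lambda below compiles to a distinctly named kernel.
template <class CounterT> struct CounterKernel;

int main() {
  sycl::queue q;

  int Zero = 0;
  sycl::buffer<int, 1> CounterBuf{&Zero, sycl::range<1>{1}};
  int *CounterUSM = sycl::malloc_device<int>(1, q);
  q.memset(CounterUSM, 0, sizeof(int)).wait();

  // The kernel body is written once; decltype(Counter) differs between the
  // accessor-backed and the USM-pointer-backed call, giving two kernels.
  auto Submit = [&](auto GetCounter) {
    q.submit([&](sycl::handler &cgh) {
      auto Counter = GetCounter(cgh);
      cgh.parallel_for<CounterKernel<decltype(Counter)>>(
          sycl::nd_range<1>{sycl::range<1>{64}, sycl::range<1>{32}},
          [=](sycl::nd_item<1> It) {
            if (It.get_local_linear_id() == 0) {
              sycl::atomic_ref<int, sycl::memory_order::relaxed,
                               sycl::memory_scope::device,
                               sycl::access::address_space::global_space>
                  C(Counter[0]);
              ++C; // count finished work-groups
            }
          });
    });
  };

  // Integrated-GPU-style path: the counter lives in a buffer and is reached
  // through an accessor created against this command group's handler.
  Submit([&](sycl::handler &cgh) {
    return sycl::accessor{CounterBuf, cgh, sycl::read_write};
  });
  // Discrete-GPU-style path: the counter is a raw USM device pointer.
  Submit([&](sycl::handler &) { return CounterUSM; });

  q.wait();
  sycl::free(CounterUSM, q);
}
```

The alternative would be two hand-written copies of the kernel body; routing the counter's type through the kernel name lets a single generic lambda serve both memory paths.
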