@@ -649,6 +649,13 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
     }
   }
 
+  template <class _T = T, int D = buffer_dim>
+  auto &getTempBuffer(size_t Size, handler &CGH) {
+    auto Buffer = std::make_shared<buffer<_T, D>>(range<1>(Size));
+    CGH.addReduction(Buffer);
+    return *Buffer;
+  }
+
   /// Returns an accessor accessing the memory that will hold the reduction
   /// partial sums.
   /// If \p Size is equal to one, then the reduction result is the final and
@@ -708,15 +715,28 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
     return {*CounterBuf, CGH};
   }
 
-  RedOutVar &getUserRedVar() { return MRedOut; }
-
-  static inline result_type *getOutPointer(const rw_accessor_type &OutAcc) {
-    return OutAcc.get_pointer().get();
+  // On discrete (vs. integrated) GPUs it's faster to initialize memory with an
+  // extra kernel than copy it from the host.
+  template <typename Name> auto getGroupsCounterAccDiscrete(handler &CGH) {
+    auto &Buf = getTempBuffer<int, 1>(1, CGH);
+    std::shared_ptr<detail::queue_impl> QueueCopy = CGH.MQueue;
+    auto Event = CGH.withAuxHandler(QueueCopy, [&](handler &InitHandler) {
+      auto Acc = accessor{Buf, InitHandler, sycl::write_only, sycl::no_init};
+      InitHandler.single_task<Name>([=]() { Acc[0] = 0; });
+    });
+    CGH.depends_on(Event);
+    return accessor{Buf, CGH};
   }
 
+  RedOutVar &getUserRedVar() { return MRedOut; }
+
   static inline result_type *getOutPointer(result_type *OutPtr) {
     return OutPtr;
   }
+  template <class AccessorType>
+  static inline result_type *getOutPointer(const AccessorType &OutAcc) {
+    return OutAcc.get_pointer().get();
+  }
 
 private:
   template <typename BufferT>
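
Reviewer note: the new getGroupsCounterAccDiscrete relies on internal handler plumbing (withAuxHandler, CGH.MQueue, addReduction), but the underlying trick is simply to zero a device-side counter with a tiny auxiliary kernel instead of copying the initial value from the host. A minimal sketch of that idea using only public SYCL 2020 API; the kernel name and helper below are illustrative and not part of this patch:

#include <sycl/sycl.hpp>

class InitCounterKernel; // illustrative kernel name, not from the patch

// Create a one-element counter buffer and zero it on the device, so no
// host-to-device copy of the initial value is needed.
inline sycl::buffer<int, 1> makeZeroedCounter(sycl::queue &Q) {
  sycl::buffer<int, 1> Counter{sycl::range<1>{1}};
  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor Acc{Counter, CGH, sycl::write_only, sycl::no_init};
    CGH.single_task<InitCounterKernel>([=] { Acc[0] = 0; });
  });
  // Later command groups that use Counter pick up an implicit dependency on
  // this init task through the buffer's requirements, mirroring the explicit
  // depends_on(Event) in the patch.
  return Counter;
}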
@@ -892,7 +912,7 @@ template <class KernelName> struct RangeFastAtomics;
 } // namespace main_krn
 } // namespace reduction
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
                                    const range<Dims> &Range,
                                    const nd_range<1> &NDRange,
                                    Reduction &Redu) {
@@ -927,29 +947,43 @@ void reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
       Reducer.template atomic_combine(Reduction::getOutPointer(Out));
     }
   });
+  return Reduction::is_usm || Redu.initializeToIdentity();
 }
 
 namespace reduction {
 namespace main_krn {
 template <class KernelName> struct RangeFastReduce;
 } // namespace main_krn
+namespace init_krn {
+template <class KernelName> struct GroupCounter;
+}
 } // namespace reduction
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
                                   const range<Dims> &Range,
                                   const nd_range<1> &NDRange, Reduction &Redu) {
   constexpr size_t NElements = Reduction::num_elements;
   size_t WGSize = NDRange.get_local_range().size();
   size_t NWorkGroups = NDRange.get_group_range().size();
 
+  auto &Out = Redu.getUserRedVar();
+  if constexpr (Reduction::is_acc)
+    associateWithHandler(CGH, &Out, access::target::device);
+
+  auto &PartialSumsBuf = Redu.getTempBuffer(NWorkGroups * NElements, CGH);
+  accessor PartialSums(PartialSumsBuf, CGH, sycl::read_write, sycl::no_init);
+
   bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity();
-  auto PartialSums =
-      Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
-  auto Out = (NWorkGroups == 1)
-                 ? PartialSums
-                 : Redu.getWriteAccForPartialReds(NElements, CGH);
+  using InitName =
+      __sycl_reduction_kernel<reduction::init_krn::GroupCounter, KernelName>;
+
+  // Integrated and discrete GPUs have different fast paths.
   auto NWorkGroupsFinished =
-      Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
+      sycl::detail::getDeviceFromHandler(CGH)
+              .get_info<info::device::host_unified_memory>()
+          ? Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH)
+          : Redu.template getGroupsCounterAccDiscrete<InitName>(CGH);
+
   auto DoReducePartialSumsInLastWG =
       Reduction::template getReadWriteLocalAcc<int>(1, CGH);
 
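
The selection between the two counter-initialization paths above hinges on a single device query. A hedged sketch of that query in isolation, assuming Q is an arbitrary sycl::queue (the integrated-vs-discrete heuristic itself is the patch's, not mine):

// Integrated GPUs report host-unified memory, so writing the counter from the
// host is cheap and the existing host-initialized path is kept. Discrete GPUs
// take the new path that zeroes the counter with an extra kernel
// (getGroupsCounterAccDiscrete above).
sycl::device Dev = Q.get_device();
bool UseHostInitializedCounter =
    Dev.get_info<sycl::info::device::host_unified_memory>();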
@@ -967,50 +1001,57 @@ void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
     // reduce_over_group is only defined for each T, not for span<T, ...>
     size_t LID = NDId.get_local_id(0);
     for (int E = 0; E < NElements; ++E) {
-      Reducer.getElement(E) =
-          reduce_over_group(Group, Reducer.getElement(E), BOp);
-
+      auto &RedElem = Reducer.getElement(E);
+      RedElem = reduce_over_group(Group, RedElem, BOp);
       if (LID == 0) {
-        if (NWorkGroups == 1 && IsUpdateOfUserVar)
-          Reducer.getElement(E) =
-              BOp(Reducer.getElement(E), Reduction::getOutPointer(Out)[E]);
-
-        // if NWorkGroups == 1, then PartialsSum and Out point to same memory.
-        Reduction::getOutPointer(
-            PartialSums)[NDId.get_group_linear_id() * NElements + E] =
-            Reducer.getElement(E);
+        if (NWorkGroups == 1) {
+          auto &OutElem = Reduction::getOutPointer(Out)[E];
+          // Can skip the partial sums and write the final result immediately.
+          if (IsUpdateOfUserVar)
+            RedElem = BOp(RedElem, OutElem);
+          OutElem = RedElem;
+        } else {
+          PartialSums[NDId.get_group_linear_id() * NElements + E] =
+              Reducer.getElement(E);
+        }
       }
     }
 
+    if (NWorkGroups == 1)
+      // We're done.
+      return;
+
    // Signal this work-group has finished after all values are reduced
    if (LID == 0) {
      auto NFinished =
          sycl::atomic_ref<int, memory_order::relaxed, memory_scope::device,
                           access::address_space::global_space>(
              NWorkGroupsFinished[0]);
-      DoReducePartialSumsInLastWG[0] =
-          ++NFinished == NWorkGroups && NWorkGroups > 1;
+      DoReducePartialSumsInLastWG[0] = ++NFinished == NWorkGroups;
    }
 
    sycl::detail::workGroupBarrier();
    if (DoReducePartialSumsInLastWG[0]) {
      // Reduce each result separately
-      // TODO: Opportunity to parallelize across elements
+      // TODO: Opportunity to parallelize across elements.
      for (int E = 0; E < NElements; ++E) {
+        auto &OutElem = Reduction::getOutPointer(Out)[E];
        auto LocalSum = Reducer.getIdentity();
        for (size_t I = LID; I < NWorkGroups; I += WGSize)
          LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
-        Reducer.getElement(E) = reduce_over_group(Group, LocalSum, BOp);
+        auto Result = reduce_over_group(Group, LocalSum, BOp);
 
        if (LID == 0) {
          if (IsUpdateOfUserVar)
-            Reducer.getElement(E) =
-                BOp(Reducer.getElement(E), Reduction::getOutPointer(Out)[E]);
-          Reduction::getOutPointer(Out)[E] = Reducer.getElement(E);
+            Result = BOp(Result, OutElem);
+          OutElem = Result;
        }
      }
    }
  });
+
+  // We've updated the user's variable; no extra work is needed.
+  return false;
 }
 
 namespace reduction {
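
The kernel above combines three ingredients: a per-group reduce_over_group, a global atomic counter of finished groups, and a final pass done by whichever group increments the counter last. Below is a self-contained sketch of the same pattern for a plain int sum, written against public SYCL 2020 API only; the kernel names, the acq_rel ordering choice, and all helper names are mine, not the library's, and the real implementation handles memory ordering and dependencies differently.

#include <sycl/sycl.hpp>

class InitFinishedCount;  // illustrative kernel names, not from the patch
class PartialSumsKernel;

int sumOnDevice(sycl::queue &Q, sycl::buffer<int, 1> &In, size_t WGSize) {
  const size_t N = In.size();
  const size_t NWorkGroups = (N + WGSize - 1) / WGSize;
  sycl::buffer<int, 1> Partial{sycl::range<1>{NWorkGroups}};
  sycl::buffer<int, 1> Finished{sycl::range<1>{1}};
  sycl::buffer<int, 1> Result{sycl::range<1>{1}};

  // Zero the "finished groups" counter on the device (same idea as
  // getGroupsCounterAccDiscrete above).
  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor F{Finished, CGH, sycl::write_only, sycl::no_init};
    CGH.single_task<InitFinishedCount>([=] { F[0] = 0; });
  });

  Q.submit([&](sycl::handler &CGH) {
    sycl::accessor InAcc{In, CGH, sycl::read_only};
    sycl::accessor PartialAcc{Partial, CGH, sycl::read_write, sycl::no_init};
    sycl::accessor FinishedAcc{Finished, CGH, sycl::read_write};
    sycl::accessor OutAcc{Result, CGH, sycl::write_only, sycl::no_init};
    sycl::local_accessor<int, 1> IsLastGroup{sycl::range<1>{1}, CGH};
    CGH.parallel_for<PartialSumsKernel>(
        sycl::nd_range<1>{sycl::range<1>{NWorkGroups * WGSize},
                          sycl::range<1>{WGSize}},
        [=](sycl::nd_item<1> It) {
          size_t GID = It.get_global_id(0);
          size_t LID = It.get_local_id(0);
          // Each group reduces its chunk to one partial sum.
          int V = GID < N ? InAcc[GID] : 0;
          int GroupSum =
              sycl::reduce_over_group(It.get_group(), V, sycl::plus<int>{});
          if (LID == 0) {
            PartialAcc[It.get_group_linear_id()] = GroupSum;
            // Count finished groups; acq_rel makes earlier groups' partial
            // sums visible to the group that observes the final count.
            sycl::atomic_ref<int, sycl::memory_order::acq_rel,
                             sycl::memory_scope::device,
                             sycl::access::address_space::global_space>
                NFinished(FinishedAcc[0]);
            IsLastGroup[0] =
                (NFinished.fetch_add(1) + 1 == static_cast<int>(NWorkGroups));
          }
          sycl::group_barrier(It.get_group());
          if (IsLastGroup[0]) {
            // Last group: strided sweep over all partial sums, then one more
            // group reduction produces the final value.
            int Local = 0;
            for (size_t I = LID; I < NWorkGroups; I += WGSize)
              Local += PartialAcc[I];
            int Total = sycl::reduce_over_group(It.get_group(), Local,
                                                sycl::plus<int>{});
            if (LID == 0)
              OutAcc[0] = Total;
          }
        });
  });
  return sycl::host_accessor{Result}[0];
}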
@@ -1019,7 +1060,7 @@ template <class KernelName> struct RangeBasic;
 } // namespace main_krn
 } // namespace reduction
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
                              const range<Dims> &Range,
                              const nd_range<1> &NDRange, Reduction &Redu) {
   constexpr size_t NElements = Reduction::num_elements;
@@ -1125,10 +1166,13 @@ void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
       }
     }
   });
+  return Reduction::is_usm || Reduction::is_dw_acc;
 }
 
+/// Returns "true" if the result has to be saved to the user's variable by
+/// reduSaveFinalResultToUserMem.
 template <typename KernelName, typename KernelType, int Dims, class Reduction>
-void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
+bool reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
                         const range<Dims> &Range, size_t MaxWGSize,
                         uint32_t NumConcurrentWorkGroups, Reduction &Redu) {
   size_t NWorkItems = Range.size();
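
With the switch from void to bool, the reduCGFuncForRange* helpers now report whether a separate copy-out step is still required. A hypothetical caller-side sketch of how that result might be consumed; submitSaveToUserMem stands in for the library's reduSaveFinalResultToUserMem helper, whose exact signature is not shown in this diff:

// Hypothetical sketch, not taken from the patch.
bool NeedsCopyOut = reduCGFuncForRange<KernelName>(
    CGH, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups, Redu);
if (NeedsCopyOut)
  submitSaveToUserMem(Queue, Redu); // stand-in for reduSaveFinalResultToUserMem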
@@ -1141,16 +1185,15 @@ void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
   size_t NDRItems = NWorkGroups * WGSize;
   nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
 
-  if constexpr (Reduction::has_fast_atomics) {
-    reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range, NDRange,
-                                              Redu);
-
-  } else if constexpr (Reduction::has_fast_reduce) {
-    reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range, NDRange,
-                                             Redu);
-  } else {
-    reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange, Redu);
-  }
+  if constexpr (Reduction::has_fast_atomics)
+    return reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range,
+                                                     NDRange, Redu);
+  else if constexpr (Reduction::has_fast_reduce)
+    return reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range,
+                                                    NDRange, Redu);
+  else
+    return reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange,
+                                               Redu);
 }
 
 namespace reduction {