@@ -111,8 +111,8 @@ __SYCL_EXPORT device getDeviceFromHandler(handler &);
111
111
112
112
namespace intel {
113
113
namespace detail {
114
- template <typename T, class BinaryOperation , int Dims, access::mode AccMode ,
115
- access::placeholder IsPlaceholder>
114
+ template <typename T, class BinaryOperation , int Dims, bool IsUSM ,
115
+ access::mode AccMode, access:: placeholder IsPlaceholder>
116
116
class reduction_impl ;
117
117
118
118
using cl::sycl::detail::enable_if_t ;
@@ -140,12 +140,12 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
140
140
template <typename KernelName, typename KernelType, int Dims, class Reduction >
141
141
enable_if_t <Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
142
142
reduAuxCGFunc (handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
143
- size_t KernelRun, Reduction &Redu);
143
+ Reduction &Redu);
144
144
145
145
template <typename KernelName, typename KernelType, int Dims, class Reduction >
146
146
enable_if_t <!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
147
147
reduAuxCGFunc (handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
148
- size_t KernelRun, Reduction &Redu);
148
+ Reduction &Redu);
149
149
} // namespace detail
150
150
} // namespace intel
151
151
@@ -266,11 +266,9 @@ class __SYCL_EXPORT handler {
266
266
267
267
/// Returns the value of this handler's MIsHost member.
// NOTE(review): presumably true when the command group targets the host
// device - confirm against the code that initializes MIsHost.
bool is_host () { return MIsHost; }
268
268
269
- template <typename DataT, int Dims, access::mode AccessMode,
270
- access::target AccessTarget>
271
- void associateWithHandler (accessor<DataT, Dims, AccessMode, AccessTarget,
272
- access::placeholder::false_t >
273
- Acc) {
269
+ template <typename T, int Dims, access::mode AccMode,
270
+ access::target AccTarget, access::placeholder IsPH>
271
+ void associateWithHandler (accessor<T, Dims, AccMode, AccTarget, IsPH> Acc) {
274
272
detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
275
273
detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl (*AccBase);
276
274
detail::Requirement *Req = AccImpl.get ();
@@ -281,7 +279,7 @@ class __SYCL_EXPORT handler {
281
279
// Add an accessor to the handler list of associated accessors.
282
280
// For associated accessors the index does not mean anything.
283
281
MAssociatedAccesors.emplace_back (detail::kernel_param_kind_t ::kind_accessor,
284
- Req, static_cast <int >(AccessTarget ),
282
+ Req, static_cast <int >(AccTarget ),
285
283
/* index*/ 0 );
286
284
}
287
285
@@ -692,18 +690,7 @@ class __SYCL_EXPORT handler {
692
690
void
693
691
require (accessor<DataT, Dims, AccMode, AccTarget, access::placeholder::true_t >
694
692
Acc) {
695
- detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
696
- detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl (*AccBase);
697
- detail::Requirement *Req = AccImpl.get ();
698
- // Add accessor to the list of requirements.
699
- MRequirements.push_back (Req);
700
- // Store copy of the accessor.
701
- MAccStorage.push_back (std::move (AccImpl));
702
- // Add an accessor to the handler list of associated accessors.
703
- // For associated accessors index does not means nothing.
704
- MAssociatedAccesors.emplace_back (detail::kernel_param_kind_t ::kind_accessor,
705
- Req, static_cast <int >(AccTarget),
706
- /* index*/ 0 );
693
+ associateWithHandler (Acc);
707
694
}
708
695
709
696
/// Registers event dependencies on this command group.
@@ -867,8 +854,22 @@ class __SYCL_EXPORT handler {
867
854
detail::enable_if_t <Reduction::accessor_mode == access::mode::read_write &&
868
855
Reduction::has_fast_atomics>
869
856
parallel_for (nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
870
- intel::detail::reduCGFunc<KernelName>(*this , KernelFunc, Range, Redu,
871
- Redu.MAcc );
857
+ if (Reduction::is_usm)
858
+ Redu.associateWithHandler (*this );
859
+ shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
860
+ auto Acc = Redu.getUserAccessor ();
861
+ intel::detail::reduCGFunc<KernelName>(*this , KernelFunc, Range, Redu, Acc);
862
+
863
+ // Submit non-blocking copy from reduction accessor to user's reduction
864
+ // variable.
865
+ if (Reduction::is_usm) {
866
+ this ->finalize ();
867
+ handler CopyHandler (QueueCopy, MIsHost);
868
+ CopyHandler.saveCodeLoc (MCodeLoc);
869
+ Redu.associateWithHandler (CopyHandler);
870
+ CopyHandler.copy (Acc, Redu.getUSMPointer ());
871
+ MLastEvent = CopyHandler.finalize ();
872
+ }
872
873
}
873
874
874
875
/// Implements parallel_for() accepting nd_range and 1 reduction variable
@@ -886,7 +887,7 @@ class __SYCL_EXPORT handler {
886
887
detail::enable_if_t <Reduction::accessor_mode == access::mode::discard_write &&
887
888
Reduction::has_fast_atomics>
888
889
parallel_for (nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
889
- auto QueueCopy = MQueue;
890
+ shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
890
891
auto RWAcc = Redu.getReadWriteScalarAcc (*this );
891
892
intel::detail::reduCGFunc<KernelName>(*this , KernelFunc, Range, Redu,
892
893
RWAcc);
@@ -896,7 +897,8 @@ class __SYCL_EXPORT handler {
896
897
handler CopyHandler (QueueCopy, MIsHost);
897
898
CopyHandler.saveCodeLoc (MCodeLoc);
898
899
CopyHandler.associateWithHandler (RWAcc);
899
- CopyHandler.copy (RWAcc, Redu.MAcc );
900
+ Redu.associateWithHandler (CopyHandler);
901
+ CopyHandler.copy (RWAcc, Redu.getUserAccessor ());
900
902
MLastEvent = CopyHandler.finalize ();
901
903
}
902
904
@@ -935,8 +937,10 @@ class __SYCL_EXPORT handler {
935
937
// necessary to reduce all partial sums into one final sum.
936
938
937
939
// 1. Call the kernel that includes user's lambda function.
940
+ if (Reduction::is_usm && NWorkGroups == 1 )
941
+ Redu.associateWithHandler (*this );
938
942
intel::detail::reduCGFunc<KernelName>(*this , KernelFunc, Range, Redu);
939
- auto QueueCopy = MQueue;
943
+ shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
940
944
this ->finalize ();
941
945
942
946
// 2. Run the additional aux kernel as many times as needed to reduce
@@ -950,7 +954,6 @@ class __SYCL_EXPORT handler {
950
954
// sum faster.
951
955
size_t WGSize = Range.get_local_range ().size ();
952
956
size_t NWorkItems = NWorkGroups;
953
- size_t KernelRun = 1 ;
954
957
while (NWorkItems > 1 ) {
955
958
WGSize = std::min (WGSize, NWorkItems);
956
959
NWorkGroups = NWorkItems / WGSize;
@@ -965,14 +968,23 @@ class __SYCL_EXPORT handler {
965
968
// The last kernel DOES write to reduction's accessor.
966
969
// Associate it with handler manually.
967
970
if (NWorkGroups == 1 )
968
- AuxHandler .associateWithHandler (Redu. MAcc );
969
- intel::detail::reduAuxCGFunc<KernelName, KernelType>(
970
- AuxHandler, Range, NWorkItems, KernelRun , Redu);
971
+ Redu .associateWithHandler (AuxHandler );
972
+ intel::detail::reduAuxCGFunc<KernelName, KernelType>(AuxHandler, Range,
973
+ NWorkItems , Redu);
971
974
MLastEvent = AuxHandler.finalize ();
972
975
973
976
NWorkItems = NWorkGroups;
974
- ++KernelRun;
975
977
} // end while (NWorkItems > 1)
978
+
979
+ // Submit non-blocking copy from reduction accessor to user's reduction
980
+ // variable.
981
+ if (Reduction::is_usm) {
982
+ handler CopyHandler (QueueCopy, MIsHost);
983
+ CopyHandler.saveCodeLoc (MCodeLoc);
984
+ Redu.associateWithHandler (CopyHandler);
985
+ CopyHandler.copy (Redu.getUserAccessor (), Redu.getUSMPointer ());
986
+ MLastEvent = CopyHandler.finalize ();
987
+ }
976
988
}
977
989
978
990
/// Hierarchical kernel invocation method of a kernel defined as a lambda
@@ -1614,8 +1626,8 @@ class __SYCL_EXPORT handler {
1614
1626
friend class detail ::stream_impl;
1615
1627
// Make reduction_impl friend to store buffers and arrays created for it
1616
1628
// in handler from reduction_impl methods.
1617
- template <typename T, class BinaryOperation , int Dims, access::mode AccMode ,
1618
- access::placeholder IsPlaceholder>
1629
+ template <typename T, class BinaryOperation , int Dims, bool IsUSM ,
1630
+ access::mode AccMode, access:: placeholder IsPlaceholder>
1619
1631
friend class intel ::detail::reduction_impl;
1620
1632
};
1621
1633
} // namespace sycl
0 commit comments