Skip to content

Commit 94cb022

Browse files
authored
[SYCL] Implement USM vars and placeholder accessors passed to reduction (#1657)
In order to support USM vars/pointers passed to reduction the placeholder accessors to global buffers were used. Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent 2de30ed commit 94cb022

File tree

4 files changed

+410
-58
lines changed

4 files changed

+410
-58
lines changed

sycl/include/CL/sycl/handler.hpp

Lines changed: 46 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,8 @@ __SYCL_EXPORT device getDeviceFromHandler(handler &);
111111

112112
namespace intel {
113113
namespace detail {
114-
template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
115-
access::placeholder IsPlaceholder>
114+
template <typename T, class BinaryOperation, int Dims, bool IsUSM,
115+
access::mode AccMode, access::placeholder IsPlaceholder>
116116
class reduction_impl;
117117

118118
using cl::sycl::detail::enable_if_t;
@@ -140,12 +140,12 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
140140
template <typename KernelName, typename KernelType, int Dims, class Reduction>
141141
enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
142142
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
143-
size_t KernelRun, Reduction &Redu);
143+
Reduction &Redu);
144144

145145
template <typename KernelName, typename KernelType, int Dims, class Reduction>
146146
enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
147147
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
148-
size_t KernelRun, Reduction &Redu);
148+
Reduction &Redu);
149149
} // namespace detail
150150
} // namespace intel
151151

@@ -266,11 +266,9 @@ class __SYCL_EXPORT handler {
266266

267267
bool is_host() { return MIsHost; }
268268

269-
template <typename DataT, int Dims, access::mode AccessMode,
270-
access::target AccessTarget>
271-
void associateWithHandler(accessor<DataT, Dims, AccessMode, AccessTarget,
272-
access::placeholder::false_t>
273-
Acc) {
269+
template <typename T, int Dims, access::mode AccMode,
270+
access::target AccTarget, access::placeholder IsPH>
271+
void associateWithHandler(accessor<T, Dims, AccMode, AccTarget, IsPH> Acc) {
274272
detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
275273
detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
276274
detail::Requirement *Req = AccImpl.get();
@@ -281,7 +279,7 @@ class __SYCL_EXPORT handler {
281279
// Add an accessor to the handler list of associated accessors.
282280
// For associated accessors index does not means nothing.
283281
MAssociatedAccesors.emplace_back(detail::kernel_param_kind_t::kind_accessor,
284-
Req, static_cast<int>(AccessTarget),
282+
Req, static_cast<int>(AccTarget),
285283
/*index*/ 0);
286284
}
287285

@@ -692,18 +690,7 @@ class __SYCL_EXPORT handler {
692690
void
693691
require(accessor<DataT, Dims, AccMode, AccTarget, access::placeholder::true_t>
694692
Acc) {
695-
detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
696-
detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
697-
detail::Requirement *Req = AccImpl.get();
698-
// Add accessor to the list of requirements.
699-
MRequirements.push_back(Req);
700-
// Store copy of the accessor.
701-
MAccStorage.push_back(std::move(AccImpl));
702-
// Add an accessor to the handler list of associated accessors.
703-
// For associated accessors index does not means nothing.
704-
MAssociatedAccesors.emplace_back(detail::kernel_param_kind_t::kind_accessor,
705-
Req, static_cast<int>(AccTarget),
706-
/*index*/ 0);
693+
associateWithHandler(Acc);
707694
}
708695

709696
/// Registers event dependencies on this command group.
@@ -867,8 +854,22 @@ class __SYCL_EXPORT handler {
867854
detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
868855
Reduction::has_fast_atomics>
869856
parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
870-
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
871-
Redu.MAcc);
857+
if (Reduction::is_usm)
858+
Redu.associateWithHandler(*this);
859+
shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
860+
auto Acc = Redu.getUserAccessor();
861+
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu, Acc);
862+
863+
// Submit non-blocking copy from reduction accessor to user's reduction
864+
// variable.
865+
if (Reduction::is_usm) {
866+
this->finalize();
867+
handler CopyHandler(QueueCopy, MIsHost);
868+
CopyHandler.saveCodeLoc(MCodeLoc);
869+
Redu.associateWithHandler(CopyHandler);
870+
CopyHandler.copy(Acc, Redu.getUSMPointer());
871+
MLastEvent = CopyHandler.finalize();
872+
}
872873
}
873874

874875
/// Implements parallel_for() accepting nd_range and 1 reduction variable
@@ -886,7 +887,7 @@ class __SYCL_EXPORT handler {
886887
detail::enable_if_t<Reduction::accessor_mode == access::mode::discard_write &&
887888
Reduction::has_fast_atomics>
888889
parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
889-
auto QueueCopy = MQueue;
890+
shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
890891
auto RWAcc = Redu.getReadWriteScalarAcc(*this);
891892
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
892893
RWAcc);
@@ -896,7 +897,8 @@ class __SYCL_EXPORT handler {
896897
handler CopyHandler(QueueCopy, MIsHost);
897898
CopyHandler.saveCodeLoc(MCodeLoc);
898899
CopyHandler.associateWithHandler(RWAcc);
899-
CopyHandler.copy(RWAcc, Redu.MAcc);
900+
Redu.associateWithHandler(CopyHandler);
901+
CopyHandler.copy(RWAcc, Redu.getUserAccessor());
900902
MLastEvent = CopyHandler.finalize();
901903
}
902904

@@ -935,8 +937,10 @@ class __SYCL_EXPORT handler {
935937
// necessary to reduce all partial sums into one final sum.
936938

937939
// 1. Call the kernel that includes user's lambda function.
940+
if (Reduction::is_usm && NWorkGroups == 1)
941+
Redu.associateWithHandler(*this);
938942
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
939-
auto QueueCopy = MQueue;
943+
shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
940944
this->finalize();
941945

942946
// 2. Run the additional aux kernel as many times as needed to reduce
@@ -950,7 +954,6 @@ class __SYCL_EXPORT handler {
950954
// sum faster.
951955
size_t WGSize = Range.get_local_range().size();
952956
size_t NWorkItems = NWorkGroups;
953-
size_t KernelRun = 1;
954957
while (NWorkItems > 1) {
955958
WGSize = std::min(WGSize, NWorkItems);
956959
NWorkGroups = NWorkItems / WGSize;
@@ -965,14 +968,23 @@ class __SYCL_EXPORT handler {
965968
// The last kernel DOES write to reduction's accessor.
966969
// Associate it with handler manually.
967970
if (NWorkGroups == 1)
968-
AuxHandler.associateWithHandler(Redu.MAcc);
969-
intel::detail::reduAuxCGFunc<KernelName, KernelType>(
970-
AuxHandler, Range, NWorkItems, KernelRun, Redu);
971+
Redu.associateWithHandler(AuxHandler);
972+
intel::detail::reduAuxCGFunc<KernelName, KernelType>(AuxHandler, Range,
973+
NWorkItems, Redu);
971974
MLastEvent = AuxHandler.finalize();
972975

973976
NWorkItems = NWorkGroups;
974-
++KernelRun;
975977
} // end while (NWorkItems > 1)
978+
979+
// Submit non-blocking copy from reduction accessor to user's reduction
980+
// variable.
981+
if (Reduction::is_usm) {
982+
handler CopyHandler(QueueCopy, MIsHost);
983+
CopyHandler.saveCodeLoc(MCodeLoc);
984+
Redu.associateWithHandler(CopyHandler);
985+
CopyHandler.copy(Redu.getUserAccessor(), Redu.getUSMPointer());
986+
MLastEvent = CopyHandler.finalize();
987+
}
976988
}
977989

978990
/// Hierarchical kernel invocation method of a kernel defined as a lambda
@@ -1614,8 +1626,8 @@ class __SYCL_EXPORT handler {
16141626
friend class detail::stream_impl;
16151627
// Make reduction_impl friend to store buffers and arrays created for it
16161628
// in handler from reduction_impl methods.
1617-
template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
1618-
access::placeholder IsPlaceholder>
1629+
template <typename T, class BinaryOperation, int Dims, bool IsUSM,
1630+
access::mode AccMode, access::placeholder IsPlaceholder>
16191631
friend class intel::detail::reduction_impl;
16201632
};
16211633
} // namespace sycl

0 commit comments

Comments
 (0)