Skip to content

Commit 05625f1

Browse files
authored
[SYCL] Add faster reduction implementations using atomic or/and intel::reduce() (#1615)
Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent cc0c33b commit 05625f1

File tree

2 files changed

+386
-19
lines changed

2 files changed

+386
-19
lines changed

sycl/include/CL/sycl/handler.hpp

Lines changed: 74 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -115,13 +115,37 @@ template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
115115
access::placeholder IsPlaceholder>
116116
class reduction_impl;
117117

118+
using cl::sycl::detail::enable_if_t;
119+
120+
template <typename KernelName, typename KernelType, int Dims, class Reduction>
121+
enable_if_t<Reduction::has_fast_reduce && Reduction::has_fast_atomics>
122+
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
123+
Reduction &Redu, typename Reduction::rw_accessor_type &Out);
124+
125+
template <typename KernelName, typename KernelType, int Dims, class Reduction>
126+
enable_if_t<!Reduction::has_fast_reduce && Reduction::has_fast_atomics>
127+
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
128+
Reduction &Redu, typename Reduction::rw_accessor_type &Out);
129+
130+
template <typename KernelName, typename KernelType, int Dims, class Reduction>
131+
enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
132+
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
133+
Reduction &Redu);
134+
135+
template <typename KernelName, typename KernelType, int Dims, class Reduction>
136+
enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
137+
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
138+
Reduction &Redu);
139+
118140
template <typename KernelName, typename KernelType, int Dims, class Reduction>
119-
void reduCGFunc(handler &CGH, KernelType KernelFunc,
120-
const nd_range<Dims> &Range, Reduction &Redu);
141+
enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
142+
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
143+
size_t KernelRun, Reduction &Redu);
121144

122145
template <typename KernelName, typename KernelType, int Dims, class Reduction>
123-
void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
124-
size_t KernelRun, Reduction &Redu);
146+
enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
147+
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
148+
size_t KernelRun, Reduction &Redu);
125149
} // namespace detail
126150
} // namespace intel
127151

@@ -759,6 +783,48 @@ class __SYCL_EXPORT handler {
759783
#endif
760784
}
761785

786+
/// Implements parallel_for() accepting nd_range and 1 reduction variable
787+
/// having 'read_write' access mode.
788+
/// This version uses fast sycl::atomic operations to update user's reduction
789+
/// variable at the end of each work-group work.
790+
template <typename KernelName = detail::auto_name, typename KernelType,
791+
int Dims, typename Reduction>
792+
detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
793+
Reduction::has_fast_atomics>
794+
parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
795+
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
796+
Redu.MAcc);
797+
}
798+
799+
/// Implements parallel_for() accepting nd_range and 1 reduction variable
800+
/// having 'discard_write' access mode.
801+
/// This version uses fast sycl::atomic operations to update user's reduction
802+
/// variable at the end of each work-group work.
803+
///
804+
/// The reduction variable must be initialized before the kernel is started
805+
/// because atomic operations only update the value, but never initialize it.
806+
/// Thus, an additional 'read_write' accessor is created/initialized with
807+
/// identity value and then passed to the kernel. After running the kernel it
808+
/// is copied to user's 'discard_write' accessor.
809+
template <typename KernelName = detail::auto_name, typename KernelType,
810+
int Dims, typename Reduction>
811+
detail::enable_if_t<Reduction::accessor_mode == access::mode::discard_write &&
812+
Reduction::has_fast_atomics>
813+
parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
814+
auto QueueCopy = MQueue;
815+
auto RWAcc = Redu.getReadWriteScalarAcc(*this);
816+
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
817+
RWAcc);
818+
this->finalize();
819+
820+
// Copy from RWAcc to some temp memory.
821+
handler CopyHandler(QueueCopy, MIsHost);
822+
CopyHandler.saveCodeLoc(MCodeLoc);
823+
CopyHandler.associateWithHandler(RWAcc);
824+
CopyHandler.copy(RWAcc, Redu.MAcc);
825+
MLastEvent = CopyHandler.finalize();
826+
}
827+
762828
/// Defines and invokes a SYCL kernel function for the specified nd_range.
763829
/// Performs reduction operation specified in \param Redu.
764830
///
@@ -769,20 +835,15 @@ class __SYCL_EXPORT handler {
769835
/// globally visible, there is no need for the developer to provide
770836
/// a kernel name for it.
771837
///
772-
/// TODO: currently it calls only those versions of kernels that can handle
773-
/// custom types and operations. Some of types and operations may use faster
774-
/// implementations that use intel::reduce() and/or sycl::atomic.fetch_<op>()
775-
/// functions and thus provide much better performance. Those variants exist,
776-
/// are fully functional. They just wait for their time for code-review.
777838
/// TODO: Need to handle more than 1 reduction in parallel_for().
778839
/// TODO: Support HOST. The kernels called by this parallel_for() may use
779840
/// some functionality that is not yet supported on HOST such as:
780841
/// barrier(), and intel::reduce() that also may be used in more
781842
/// optimized implementations waiting for their turn of code-review.
782843
template <typename KernelName = detail::auto_name, typename KernelType,
783844
int Dims, typename Reduction>
784-
void parallel_for(nd_range<Dims> Range, Reduction &Redu,
785-
KernelType KernelFunc) {
845+
detail::enable_if_t<!Reduction::has_fast_atomics>
846+
parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
786847
size_t NWorkGroups = Range.get_group_range().size();
787848

788849
// This parallel_for() is lowered to the following sequence:
@@ -801,7 +862,7 @@ class __SYCL_EXPORT handler {
801862
// 1. Call the kernel that includes user's lambda function.
802863
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
803864
auto QueueCopy = MQueue;
804-
MLastEvent = this->finalize();
865+
this->finalize();
805866

806867
// 2. Run the additional aux kernel as many times as needed to reduce
807868
// all partial sums into one scalar.
@@ -821,8 +882,7 @@ class __SYCL_EXPORT handler {
821882
// The last group may be not fully loaded. Still register it as a group.
822883
if ((NWorkItems % WGSize) != 0)
823884
++NWorkGroups;
824-
auto Range =
825-
nd_range<1>(range<1>(WGSize * NWorkGroups), range<1>(WGSize));
885+
nd_range<1> Range(range<1>(WGSize * NWorkGroups), range<1>(WGSize));
826886

827887
handler AuxHandler(QueueCopy, MIsHost);
828888
AuxHandler.saveCodeLoc(MCodeLoc);

0 commit comments

Comments
 (0)