@@ -115,13 +115,37 @@ template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
115
115
access::placeholder IsPlaceholder>
116
116
class reduction_impl ;
117
117
118
using cl::sycl::detail::enable_if_t;

// Forward declarations of the reduction kernel implementations.
// The four reduCGFunc overloads are selected via SFINAE on two traits of the
// Reduction type:
//   - Reduction::has_fast_reduce  — presumably true when intel::reduce() can
//     be used for the work-group reduction (TODO confirm against the trait's
//     definition in reduction_impl);
//   - Reduction::has_fast_atomics — presumably true when the final update can
//     be done with sycl::atomic fetch-ops (TODO confirm likewise).
// Note the signature difference: the has_fast_atomics overloads take an extra
// 'Out' read-write accessor (the atomically-updated result location), while
// the !has_fast_atomics overloads do not — those produce partial sums that
// are later combined by reduAuxCGFunc.

// has_fast_reduce && has_fast_atomics: fastest path, extra Out accessor.
template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_fast_reduce && Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
           Reduction &Redu, typename Reduction::rw_accessor_type &Out);

// !has_fast_reduce && has_fast_atomics: atomic final update, generic
// work-group reduction.
template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<!Reduction::has_fast_reduce && Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
           Reduction &Redu, typename Reduction::rw_accessor_type &Out);

// has_fast_reduce && !has_fast_atomics: no Out accessor; produces partial
// sums consumed by the auxiliary kernel below.
template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
           Reduction &Redu);

// Fully generic path: neither fast reduce nor fast atomics available.
template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
           Reduction &Redu);

// Auxiliary kernels that reduce NWorkItems partial sums into fewer values;
// invoked repeatedly (KernelRun is the iteration index) until one scalar
// remains. Only needed on the !has_fast_atomics paths.
template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
              size_t KernelRun, Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
              size_t KernelRun, Reduction &Redu);
125
149
} // namespace detail
126
150
} // namespace intel
127
151
@@ -759,6 +783,48 @@ class __SYCL_EXPORT handler {
759
783
#endif
760
784
}
761
785
786
/// Implements parallel_for() accepting nd_range and 1 reduction variable
/// having 'read_write' access mode.
/// This version uses fast sycl::atomic operations to update user's reduction
/// variable at the end of each work-group work.
///
/// Selected by SFINAE only when the reduction accessor is read_write AND the
/// reduction type supports fast atomics; the user's accessor (Redu.MAcc) can
/// then be updated atomically in place, so no auxiliary copy/reduce kernels
/// are needed and no event is recorded here beyond the normal finalize path.
template <typename KernelName = detail::auto_name, typename KernelType,
          int Dims, typename Reduction>
detail::enable_if_t<Reduction::accessor_mode == access::mode::read_write &&
                    Reduction::has_fast_atomics>
parallel_for(nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
  // Single kernel: user's lambda + atomic update directly into Redu.MAcc.
  intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu,
                                        Redu.MAcc);
}
798
+
799
+ // / Implements parallel_for() accepting nd_range and 1 reduction variable
800
+ // / having 'discard_write' access mode.
801
+ // / This version uses fast sycl::atomic operations to update user's reduction
802
+ // / variable at the end of each work-group work.
803
+ // /
804
+ // / The reduction variable must be initialized before the kernel is started
805
+ // / because atomic operations only update the value, but never initialize it.
806
+ // / Thus, an additional 'read_write' accessor is created/initialized with
807
+ // / identity value and then passed to the kernel. After running the kernel it
808
+ // / is copied to user's 'discard_write' accessor.
809
+ template <typename KernelName = detail::auto_name, typename KernelType,
810
+ int Dims, typename Reduction>
811
+ detail::enable_if_t <Reduction::accessor_mode == access::mode::discard_write &&
812
+ Reduction::has_fast_atomics>
813
+ parallel_for (nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
814
+ auto QueueCopy = MQueue;
815
+ auto RWAcc = Redu.getReadWriteScalarAcc (*this );
816
+ intel::detail::reduCGFunc<KernelName>(*this , KernelFunc, Range, Redu,
817
+ RWAcc);
818
+ this ->finalize ();
819
+
820
+ // Copy from RWAcc to some temp memory.
821
+ handler CopyHandler (QueueCopy, MIsHost);
822
+ CopyHandler.saveCodeLoc (MCodeLoc);
823
+ CopyHandler.associateWithHandler (RWAcc);
824
+ CopyHandler.copy (RWAcc, Redu.MAcc );
825
+ MLastEvent = CopyHandler.finalize ();
826
+ }
827
+
762
828
// / Defines and invokes a SYCL kernel function for the specified nd_range.
763
829
// / Performs reduction operation specified in \param Redu.
764
830
// /
@@ -769,20 +835,15 @@ class __SYCL_EXPORT handler {
769
835
// / globally visible, there is no need for the developer to provide
770
836
// / a kernel name for it.
771
837
// /
772
- // / TODO: currently it calls only those versions of kernels that can handle
773
- // / custom types and operations. Some of types and operations may use faster
774
- // / implementations that use intel::reduce() and/or sycl::atomic.fetch_<op>()
775
- // / functions and thus provide much better performance. Those variants exist,
776
- // / are fully functional. They just wait for their time for code-review.
777
838
// / TODO: Need to handle more than 1 reduction in parallel_for().
778
839
// / TODO: Support HOST. The kernels called by this parallel_for() may use
779
840
// / some functionality that is not yet supported on HOST such as:
780
841
// / barrier(), and intel::reduce() that also may be used in more
781
842
// / optimized implementations waiting for their turn of code-review.
782
843
template <typename KernelName = detail::auto_name, typename KernelType,
783
844
int Dims, typename Reduction>
784
- void parallel_for (nd_range<Dims> Range, Reduction &Redu,
785
- KernelType KernelFunc) {
845
+ detail:: enable_if_t <! Reduction::has_fast_atomics>
846
+ parallel_for (nd_range<Dims> Range, Reduction &Redu, KernelType KernelFunc) {
786
847
size_t NWorkGroups = Range.get_group_range ().size ();
787
848
788
849
// This parallel_for() is lowered to the following sequence:
@@ -801,7 +862,7 @@ class __SYCL_EXPORT handler {
801
862
// 1. Call the kernel that includes user's lambda function.
802
863
intel::detail::reduCGFunc<KernelName>(*this , KernelFunc, Range, Redu);
803
864
auto QueueCopy = MQueue;
804
- MLastEvent = this ->finalize ();
865
+ this ->finalize ();
805
866
806
867
// 2. Run the additional aux kernel as many times as needed to reduce
807
868
// all partial sums into one scalar.
@@ -821,8 +882,7 @@ class __SYCL_EXPORT handler {
821
882
// The last group may be not fully loaded. Still register it as a group.
822
883
if ((NWorkItems % WGSize) != 0 )
823
884
++NWorkGroups;
824
- auto Range =
825
- nd_range<1 >(range<1 >(WGSize * NWorkGroups), range<1 >(WGSize));
885
+ nd_range<1 > Range (range<1 >(WGSize * NWorkGroups), range<1 >(WGSize));
826
886
827
887
handler AuxHandler (QueueCopy, MIsHost);
828
888
AuxHandler.saveCodeLoc (MCodeLoc);
0 commit comments