Skip to content

Commit 47db644

Browse files
committed
[SYCL][NFC] Move the reduction command group funcs out of handler.hpp
This patch only moves the reduction command group functions from handler.hpp to reduction.hpp. No other changes are done. Signed-off-by: Vyacheslav N Klochkov <[email protected]>
1 parent bb73d92 commit 47db644

File tree

2 files changed

+250
-242
lines changed

2 files changed

+250
-242
lines changed

sycl/include/CL/sycl/handler.hpp

Lines changed: 11 additions & 242 deletions
Original file line numberDiff line numberDiff line change
@@ -107,32 +107,6 @@ template <typename Type> struct get_kernel_name_t<detail::auto_name, Type> {
107107

108108
__SYCL_EXPORT device getDeviceFromHandler(handler &);
109109

110-
/// These are the forward declaration for the classes that help to create
111-
/// names for additional kernels. It is used only when there are
112-
/// more than 1 kernel in one parallel_for() implementing SYCL reduction.
113-
template <typename Type> class __sycl_reduction_main_2nd_kernel;
114-
template <typename Type> class __sycl_reduction_aux_1st_kernel;
115-
template <typename Type> class __sycl_reduction_aux_2nd_kernel;
116-
117-
/// Helper structs to get additional kernel name types based on given
118-
/// \c Name and \c Type types: if \c Name is undefined (is a \c auto_name) then
119-
/// \c Type becomes the \c Name.
120-
template <typename Name, typename Type>
121-
struct get_reduction_main_2nd_kernel_name_t {
122-
using name = __sycl_reduction_main_2nd_kernel<
123-
typename get_kernel_name_t<Name, Type>::name>;
124-
};
125-
template <typename Name, typename Type>
126-
struct get_reduction_aux_1st_kernel_name_t {
127-
using name = __sycl_reduction_aux_1st_kernel<
128-
typename get_kernel_name_t<Name, Type>::name>;
129-
};
130-
template <typename Name, typename Type>
131-
struct get_reduction_aux_2nd_kernel_name_t {
132-
using name = __sycl_reduction_aux_2nd_kernel<
133-
typename get_kernel_name_t<Name, Type>::name>;
134-
};
135-
136110
device getDeviceFromHandler(handler &);
137111

138112
} // namespace detail
@@ -142,6 +116,14 @@ namespace detail {
142116
template <typename T, class BinaryOperation, int Dims, access::mode AccMode,
143117
access::placeholder IsPlaceholder>
144118
class reduction_impl;
119+
120+
template <typename KernelName, typename KernelType, int Dims, class Reduction>
121+
void reduCGFunc(handler &CGH, KernelType KernelFunc,
122+
const nd_range<Dims> &Range, Reduction &Redu);
123+
124+
template <typename KernelName, typename KernelType, int Dims, class Reduction>
125+
void reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
126+
size_t KernelRun, Reduction &Redu);
145127
} // namespace detail
146128
} // namespace intel
147129

@@ -810,219 +792,6 @@ class __SYCL_EXPORT handler {
810792
#endif
811793
}
812794

813-
/// Implements a command group function that enqueues a kernel that calls
814-
/// user's lambda function \param KernelFunc and does one iteration of
815-
/// reduction of elements in each of work-groups.
816-
/// This version uses tree-reduction algorithm to reduce elements in each
817-
/// of work-groups. At the end of each work-group the partial sum is written
818-
/// to a global buffer.
819-
///
820-
/// Briefly: user's lambda, tree-reduction, CUSTOM types/ops.
821-
template <typename KernelName, typename KernelType, int Dims, class Reduction>
822-
void reduCGFunc(KernelType KernelFunc, const nd_range<Dims> &Range,
823-
Reduction &Redu) {
824-
825-
size_t NWorkItems = Range.get_global_range().size();
826-
size_t WGSize = Range.get_local_range().size();
827-
size_t NWorkGroups = Range.get_group_range().size();
828-
829-
bool IsUnderLoaded = (NWorkGroups * WGSize - NWorkItems) != 0;
830-
bool IsEfficientCase = !IsUnderLoaded && ((WGSize & (WGSize - 1)) == 0);
831-
832-
bool IsUpdateOfUserAcc =
833-
Reduction::accessor_mode == access::mode::read_write &&
834-
NWorkGroups == 1;
835-
836-
// Use local memory to reduce elements in work-groups into 0-th element.
837-
// If WGSize is not power of two, then WGSize+1 elements are allocated.
838-
// The additional last element is used to catch elements that could
839-
// otherwise be lost in the tree-reduction algorithm.
840-
size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
841-
auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this);
842-
843-
auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, 0, *this);
844-
auto ReduIdentity = Redu.getIdentity();
845-
if (IsEfficientCase) {
846-
// Efficient case: work-groups are fully loaded and work-group size
847-
// is power of two.
848-
parallel_for<KernelName>(Range, [=](nd_item<Dims> NDIt) {
849-
// Call user's functions. Reducer.MValue gets initialized there.
850-
typename Reduction::reducer_type Reducer(ReduIdentity);
851-
KernelFunc(NDIt, Reducer);
852-
853-
// Copy the element to local memory to prepare it for tree-reduction.
854-
size_t LID = NDIt.get_local_linear_id();
855-
LocalReds[LID] = Reducer.MValue;
856-
NDIt.barrier();
857-
858-
// Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0].
859-
typename Reduction::binary_operation BOp;
860-
size_t WGSize = NDIt.get_local_range().size();
861-
for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
862-
if (LID < CurStep)
863-
LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
864-
NDIt.barrier();
865-
}
866-
867-
// Compute the partial sum/reduction for the work-group.
868-
if (LID == 0)
869-
Out.get_pointer().get()[NDIt.get_group_linear_id()] =
870-
IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
871-
: LocalReds[0];
872-
});
873-
} else {
874-
// Inefficient case: work-groups are not fully loaded
875-
// or WGSize is not power of two.
876-
// These two inefficient cases are handled by one kernel, which
877-
// can be split later into two separate kernels, if there are users who
878-
// really need more efficient code for them.
879-
using AuxName = typename detail::get_reduction_main_2nd_kernel_name_t<
880-
KernelName, KernelType>::name;
881-
parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
882-
// Call user's functions. Reducer.MValue gets initialized there.
883-
typename Reduction::reducer_type Reducer(ReduIdentity);
884-
KernelFunc(NDIt, Reducer);
885-
886-
size_t WGSize = NDIt.get_local_range().size();
887-
size_t LID = NDIt.get_local_linear_id();
888-
size_t GID = NDIt.get_global_linear_id();
889-
// Copy the element to local memory to prepare it for tree-reduction.
890-
LocalReds[LID] = (GID < NWorkItems) ? Reducer.MValue : ReduIdentity;
891-
LocalReds[WGSize] = ReduIdentity;
892-
NDIt.barrier();
893-
894-
// Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
895-
// LocalReds[WGSize] accumulates last/odd elements when the step
896-
// of tree-reduction loop is not even.
897-
typename Reduction::binary_operation BOp;
898-
size_t PrevStep = WGSize;
899-
for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
900-
if (LID < CurStep)
901-
LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
902-
else if (LID == CurStep && (PrevStep & 0x1))
903-
LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
904-
NDIt.barrier();
905-
PrevStep = CurStep;
906-
}
907-
908-
// Compute the partial sum/reduction for the work-group.
909-
if (LID == 0) {
910-
auto GrID = NDIt.get_group_linear_id();
911-
auto V = BOp(LocalReds[0], LocalReds[WGSize]);
912-
Out.get_pointer().get()[GrID] =
913-
IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
914-
}
915-
});
916-
}
917-
}
918-
919-
/// Implements a command group function that enqueues a kernel that does one
920-
/// iteration of reduction of elements in each of work-groups.
921-
/// This version uses tree-reduction algorithm to reduce elements in each
922-
/// of work-groups. At the end of each work-group the partial sum is written
923-
/// to a global buffer.
924-
///
925-
/// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
926-
template <typename KernelName, typename KernelType, int Dims, class Reduction>
927-
void reduAuxCGFunc(const nd_range<Dims> &Range, size_t NWorkItems,
928-
size_t KernelRun, Reduction &Redu) {
929-
size_t WGSize = Range.get_local_range().size();
930-
size_t NWorkGroups = Range.get_group_range().size();
931-
932-
// The last work-group may be not fully loaded with work, or the work group
933-
// size may be not a power of two. Those two cases are considered inefficient
934-
// as they require additional code and checks in the kernel.
935-
bool IsUnderLoaded = NWorkGroups * WGSize != NWorkItems;
936-
bool IsEfficientCase = !IsUnderLoaded && (WGSize & (WGSize - 1)) == 0;
937-
938-
bool IsUpdateOfUserAcc =
939-
Reduction::accessor_mode == access::mode::read_write &&
940-
NWorkGroups == 1;
941-
942-
// Use local memory to reduce elements in work-groups into 0-th element.
943-
// If WGSize is not power of two, then WGSize+1 elements are allocated.
944-
// The additional last element is used to catch elements that could
945-
// otherwise be lost in the tree-reduction algorithm.
946-
size_t NumLocalElements = WGSize + (IsEfficientCase ? 0 : 1);
947-
auto LocalReds = Redu.getReadWriteLocalAcc(NumLocalElements, *this);
948-
949-
// Get read accessor to the buffer that was used as output
950-
// in the previous kernel. After that create new output buffer if needed
951-
// and get accessor to it (or use reduction's accessor if the kernel
952-
// is the last one).
953-
auto In = Redu.getReadAccToPreviousPartialReds(*this);
954-
auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, KernelRun, *this);
955-
956-
if (IsEfficientCase) {
957-
// Efficient case: work-groups are fully loaded and work-group size
958-
// is power of two.
959-
using AuxName = typename detail::get_reduction_aux_1st_kernel_name_t<
960-
KernelName, KernelType>::name;
961-
parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
962-
// Copy the element to local memory to prepare it for tree-reduction.
963-
size_t LID = NDIt.get_local_linear_id();
964-
size_t GID = NDIt.get_global_linear_id();
965-
LocalReds[LID] = In[GID];
966-
NDIt.barrier();
967-
968-
// Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
969-
typename Reduction::binary_operation BOp;
970-
size_t WGSize = NDIt.get_local_range().size();
971-
for (size_t CurStep = WGSize >> 1; CurStep > 0; CurStep >>= 1) {
972-
if (LID < CurStep)
973-
LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
974-
NDIt.barrier();
975-
}
976-
977-
// Compute the partial sum/reduction for the work-group.
978-
if (LID == 0)
979-
Out.get_pointer().get()[NDIt.get_group_linear_id()] =
980-
IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), LocalReds[0])
981-
: LocalReds[0];
982-
});
983-
} else {
984-
// Inefficient case: work-groups are not fully loaded
985-
// or WGSize is not power of two.
986-
// These two inefficient cases are handled by one kernel, which
987-
// can be split later into two separate kernels, if there are users
988-
// who really need more efficient code for them.
989-
using AuxName = typename detail::get_reduction_aux_2nd_kernel_name_t<
990-
KernelName, KernelType>::name;
991-
auto ReduIdentity = Redu.getIdentity();
992-
parallel_for<AuxName>(Range, [=](nd_item<Dims> NDIt) {
993-
size_t WGSize = NDIt.get_local_range().size();
994-
size_t LID = NDIt.get_local_linear_id();
995-
size_t GID = NDIt.get_global_linear_id();
996-
// Copy the element to local memory to prepare it for tree-reduction
997-
LocalReds[LID] = (GID < NWorkItems) ? In[GID] : ReduIdentity;
998-
LocalReds[WGSize] = ReduIdentity;
999-
NDIt.barrier();
1000-
1001-
// Tree-reduction: reduce the local array LocalReds[:] to LocalReds[0]
1002-
// LocalReds[WGSize] accumulates last/odd elements when the step
1003-
// of tree-reduction loop is not even.
1004-
typename Reduction::binary_operation BOp;
1005-
size_t PrevStep = WGSize;
1006-
for (size_t CurStep = PrevStep >> 1; CurStep > 0; CurStep >>= 1) {
1007-
if (LID < CurStep)
1008-
LocalReds[LID] = BOp(LocalReds[LID], LocalReds[LID + CurStep]);
1009-
else if (LID == CurStep && (PrevStep & 0x1))
1010-
LocalReds[WGSize] = BOp(LocalReds[WGSize], LocalReds[PrevStep - 1]);
1011-
NDIt.barrier();
1012-
PrevStep = CurStep;
1013-
}
1014-
1015-
// Compute the partial sum/reduction for the work-group.
1016-
if (LID == 0) {
1017-
auto GrID = NDIt.get_group_linear_id();
1018-
auto V = BOp(LocalReds[0], LocalReds[WGSize]);
1019-
Out.get_pointer().get()[GrID] =
1020-
IsUpdateOfUserAcc ? BOp(*(Out.get_pointer()), V) : V;
1021-
}
1022-
});
1023-
}
1024-
}
1025-
1026795
/// Defines and invokes a SYCL kernel function for the specified nd_range.
1027796
/// Performs reduction operation specified in \param Redu.
1028797
///
@@ -1070,7 +839,7 @@ class __SYCL_EXPORT handler {
1070839
if (NWorkGroups > 1)
1071840
dissociateWithHandler(Redu.MAcc);
1072841

1073-
reduCGFunc<KernelName>(KernelFunc, Range, Redu);
842+
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
1074843
auto QueueCopy = MQueue;
1075844
MLastEvent = this->finalize();
1076845

@@ -1102,8 +871,8 @@ class __SYCL_EXPORT handler {
1102871
// Associate it with handler manually.
1103872
if (NWorkGroups == 1)
1104873
AuxHandler.associateWithHandler(Redu.MAcc);
1105-
AuxHandler.reduAuxCGFunc<KernelName, KernelType>(Range, NWorkItems,
1106-
KernelRun, Redu);
874+
intel::detail::reduAuxCGFunc<KernelName, KernelType>(
875+
AuxHandler, Range, NWorkItems, KernelRun, Redu);
1107876
MLastEvent = AuxHandler.finalize();
1108877

1109878
NWorkItems = NWorkGroups;

0 commit comments

Comments
 (0)