Skip to content

Commit 6177f2f

Browse files
committed
update to sycl branch, resolve merge conflicts
2 parents 9622317 + c22a5d3 commit 6177f2f

File tree

10 files changed

+519
-768
lines changed

10 files changed

+519
-768
lines changed

sycl/include/sycl/detail/common.hpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,30 @@ struct code_location {
8383
unsigned long MLineNo;
8484
unsigned long MColumnNo;
8585
};
86+
87+
// The C++ FE may instrument user calls with code location metadata.
88+
// If it does then that will appear as an extra last argument.
89+
// Having _TWO_ mid-param #ifdefs makes the functions very difficult to read.
90+
// Here we simplify the &CodeLoc declaration to be _CODELOCPARAM(&CodeLoc) and
91+
// _CODELOCARG(&CodeLoc).
92+
93+
#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
94+
#define _CODELOCONLYPARAM(a) \
95+
const detail::code_location a = detail::code_location::current()
96+
#define _CODELOCPARAM(a) \
97+
, const detail::code_location a = detail::code_location::current()
98+
#define _CODELOCPARAMDEF(a) , const detail::code_location a
99+
100+
#define _CODELOCARG(a)
101+
#define _CODELOCFW(a) , a
102+
#else
103+
#define _CODELOCONLYPARAM(a)
104+
#define _CODELOCPARAM(a)
105+
106+
#define _CODELOCARG(a) const detail::code_location a = {}
107+
#define _CODELOCFW(a)
108+
#endif
109+
86110
} // namespace detail
87111
} // namespace sycl
88112
} // __SYCL_INLINE_NAMESPACE(cl)

sycl/include/sycl/ext/oneapi/reduction.hpp

Lines changed: 85 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -649,6 +649,13 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
649649
}
650650
}
651651

652+
template <class _T = T, int D = buffer_dim>
653+
auto &getTempBuffer(size_t Size, handler &CGH) {
654+
auto Buffer = std::make_shared<buffer<_T, D>>(range<1>(Size));
655+
CGH.addReduction(Buffer);
656+
return *Buffer;
657+
}
658+
652659
/// Returns an accessor accessing the memory that will hold the reduction
653660
/// partial sums.
654661
/// If \p Size is equal to one, then the reduction result is the final and
@@ -708,15 +715,28 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {
708715
return {*CounterBuf, CGH};
709716
}
710717

711-
RedOutVar &getUserRedVar() { return MRedOut; }
712-
713-
static inline result_type *getOutPointer(const rw_accessor_type &OutAcc) {
714-
return OutAcc.get_pointer().get();
718+
// On discrete (vs. integrated) GPUs it's faster to initialize memory with an
719+
// extra kernel than to copy it from the host.
720+
template <typename Name> auto getGroupsCounterAccDiscrete(handler &CGH) {
721+
auto &Buf = getTempBuffer<int, 1>(1, CGH);
722+
std::shared_ptr<detail::queue_impl> QueueCopy = CGH.MQueue;
723+
auto Event = CGH.withAuxHandler(QueueCopy, [&](handler &InitHandler) {
724+
auto Acc = accessor{Buf, InitHandler, sycl::write_only, sycl::no_init};
725+
InitHandler.single_task<Name>([=]() { Acc[0] = 0; });
726+
});
727+
CGH.depends_on(Event);
728+
return accessor{Buf, CGH};
715729
}
716730

731+
RedOutVar &getUserRedVar() { return MRedOut; }
732+
717733
static inline result_type *getOutPointer(result_type *OutPtr) {
718734
return OutPtr;
719735
}
736+
template <class AccessorType>
737+
static inline result_type *getOutPointer(const AccessorType &OutAcc) {
738+
return OutAcc.get_pointer().get();
739+
}
720740

721741
private:
722742
template <typename BufferT>
@@ -892,7 +912,7 @@ template <class KernelName> struct RangeFastAtomics;
892912
} // namespace main_krn
893913
} // namespace reduction
894914
template <typename KernelName, typename KernelType, int Dims, class Reduction>
895-
void reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
915+
bool reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
896916
const range<Dims> &Range,
897917
const nd_range<1> &NDRange,
898918
Reduction &Redu) {
@@ -927,29 +947,43 @@ void reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,
927947
Reducer.template atomic_combine(Reduction::getOutPointer(Out));
928948
}
929949
});
950+
return Reduction::is_usm || Redu.initializeToIdentity();
930951
}
931952

932953
namespace reduction {
933954
namespace main_krn {
934955
template <class KernelName> struct RangeFastReduce;
935956
} // namespace main_krn
957+
namespace init_krn {
958+
template <class KernelName> struct GroupCounter;
959+
}
936960
} // namespace reduction
937961
template <typename KernelName, typename KernelType, int Dims, class Reduction>
938-
void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
962+
bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
939963
const range<Dims> &Range,
940964
const nd_range<1> &NDRange, Reduction &Redu) {
941965
constexpr size_t NElements = Reduction::num_elements;
942966
size_t WGSize = NDRange.get_local_range().size();
943967
size_t NWorkGroups = NDRange.get_group_range().size();
944968

969+
auto &Out = Redu.getUserRedVar();
970+
if constexpr (Reduction::is_acc)
971+
associateWithHandler(CGH, &Out, access::target::device);
972+
973+
auto &PartialSumsBuf = Redu.getTempBuffer(NWorkGroups * NElements, CGH);
974+
accessor PartialSums(PartialSumsBuf, CGH, sycl::read_write, sycl::no_init);
975+
945976
bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity();
946-
auto PartialSums =
947-
Redu.getWriteAccForPartialReds(NWorkGroups * NElements, CGH);
948-
auto Out = (NWorkGroups == 1)
949-
? PartialSums
950-
: Redu.getWriteAccForPartialReds(NElements, CGH);
977+
using InitName =
978+
__sycl_reduction_kernel<reduction::init_krn::GroupCounter, KernelName>;
979+
980+
// Integrated and discrete GPUs have different fast paths.
951981
auto NWorkGroupsFinished =
952-
Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH);
982+
sycl::detail::getDeviceFromHandler(CGH)
983+
.get_info<info::device::host_unified_memory>()
984+
? Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH)
985+
: Redu.template getGroupsCounterAccDiscrete<InitName>(CGH);
986+
953987
auto DoReducePartialSumsInLastWG =
954988
Reduction::template getReadWriteLocalAcc<int>(1, CGH);
955989

@@ -967,50 +1001,57 @@ void reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
9671001
// reduce_over_group is only defined for each T, not for span<T, ...>
9681002
size_t LID = NDId.get_local_id(0);
9691003
for (int E = 0; E < NElements; ++E) {
970-
Reducer.getElement(E) =
971-
reduce_over_group(Group, Reducer.getElement(E), BOp);
972-
1004+
auto &RedElem = Reducer.getElement(E);
1005+
RedElem = reduce_over_group(Group, RedElem, BOp);
9731006
if (LID == 0) {
974-
if (NWorkGroups == 1 && IsUpdateOfUserVar)
975-
Reducer.getElement(E) =
976-
BOp(Reducer.getElement(E), Reduction::getOutPointer(Out)[E]);
977-
978-
// if NWorkGroups == 1, then PartialsSum and Out point to same memory.
979-
Reduction::getOutPointer(
980-
PartialSums)[NDId.get_group_linear_id() * NElements + E] =
981-
Reducer.getElement(E);
1007+
if (NWorkGroups == 1) {
1008+
auto &OutElem = Reduction::getOutPointer(Out)[E];
1009+
// Can avoid using partial sum and write the final result immediately.
1010+
if (IsUpdateOfUserVar)
1011+
RedElem = BOp(RedElem, OutElem);
1012+
OutElem = RedElem;
1013+
} else {
1014+
PartialSums[NDId.get_group_linear_id() * NElements + E] =
1015+
Reducer.getElement(E);
1016+
}
9821017
}
9831018
}
9841019

1020+
if (NWorkGroups == 1)
1021+
// We're done.
1022+
return;
1023+
9851024
// Signal this work-group has finished after all values are reduced
9861025
if (LID == 0) {
9871026
auto NFinished =
9881027
sycl::atomic_ref<int, memory_order::relaxed, memory_scope::device,
9891028
access::address_space::global_space>(
9901029
NWorkGroupsFinished[0]);
991-
DoReducePartialSumsInLastWG[0] =
992-
++NFinished == NWorkGroups && NWorkGroups > 1;
1030+
DoReducePartialSumsInLastWG[0] = ++NFinished == NWorkGroups;
9931031
}
9941032

9951033
sycl::detail::workGroupBarrier();
9961034
if (DoReducePartialSumsInLastWG[0]) {
9971035
// Reduce each result separately
998-
// TODO: Opportunity to parallelize across elements
1036+
// TODO: Opportunity to parallelize across elements.
9991037
for (int E = 0; E < NElements; ++E) {
1038+
auto &OutElem = Reduction::getOutPointer(Out)[E];
10001039
auto LocalSum = Reducer.getIdentity();
10011040
for (size_t I = LID; I < NWorkGroups; I += WGSize)
10021041
LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
1003-
Reducer.getElement(E) = reduce_over_group(Group, LocalSum, BOp);
1042+
auto Result = reduce_over_group(Group, LocalSum, BOp);
10041043

10051044
if (LID == 0) {
10061045
if (IsUpdateOfUserVar)
1007-
Reducer.getElement(E) =
1008-
BOp(Reducer.getElement(E), Reduction::getOutPointer(Out)[E]);
1009-
Reduction::getOutPointer(Out)[E] = Reducer.getElement(E);
1046+
Result = BOp(Result, OutElem);
1047+
OutElem = Result;
10101048
}
10111049
}
10121050
}
10131051
});
1052+
1053+
// We've updated user's variable, no extra work needed.
1054+
return false;
10141055
}
10151056

10161057
namespace reduction {
@@ -1019,7 +1060,7 @@ template <class KernelName> struct RangeBasic;
10191060
} // namespace main_krn
10201061
} // namespace reduction
10211062
template <typename KernelName, typename KernelType, int Dims, class Reduction>
1022-
void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
1063+
bool reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
10231064
const range<Dims> &Range,
10241065
const nd_range<1> &NDRange, Reduction &Redu) {
10251066
constexpr size_t NElements = Reduction::num_elements;
@@ -1125,10 +1166,13 @@ void reduCGFuncForRangeBasic(handler &CGH, KernelType KernelFunc,
11251166
}
11261167
}
11271168
});
1169+
return Reduction::is_usm || Reduction::is_dw_acc;
11281170
}
11291171

1172+
/// Returns "true" if the result has to be saved to user's variable by
1173+
/// reduSaveFinalResultToUserMem.
11301174
template <typename KernelName, typename KernelType, int Dims, class Reduction>
1131-
void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
1175+
bool reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
11321176
const range<Dims> &Range, size_t MaxWGSize,
11331177
uint32_t NumConcurrentWorkGroups, Reduction &Redu) {
11341178
size_t NWorkItems = Range.size();
@@ -1141,16 +1185,15 @@ void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
11411185
size_t NDRItems = NWorkGroups * WGSize;
11421186
nd_range<1> NDRange{range<1>{NDRItems}, range<1>{WGSize}};
11431187

1144-
if constexpr (Reduction::has_fast_atomics) {
1145-
reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range, NDRange,
1146-
Redu);
1147-
1148-
} else if constexpr (Reduction::has_fast_reduce) {
1149-
reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range, NDRange,
1150-
Redu);
1151-
} else {
1152-
reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange, Redu);
1153-
}
1188+
if constexpr (Reduction::has_fast_atomics)
1189+
return reduCGFuncForRangeFastAtomics<KernelName>(CGH, KernelFunc, Range,
1190+
NDRange, Redu);
1191+
else if constexpr (Reduction::has_fast_reduce)
1192+
return reduCGFuncForRangeFastReduce<KernelName>(CGH, KernelFunc, Range,
1193+
NDRange, Redu);
1194+
else
1195+
return reduCGFuncForRangeBasic<KernelName>(CGH, KernelFunc, Range, NDRange,
1196+
Redu);
11541197
}
11551198

11561199
namespace reduction {

sycl/include/sycl/handler.hpp

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -251,7 +251,7 @@ using sycl::detail::queue_impl;
251251
/// If we are given sycl::range and not sycl::nd_range we have more freedom in
252252
/// how to split the iteration space.
253253
template <typename KernelName, typename KernelType, int Dims, class Reduction>
254-
void reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
254+
bool reduCGFuncForRange(handler &CGH, KernelType KernelFunc,
255255
const range<Dims> &Range, size_t MaxWGSize,
256256
uint32_t NumConcurrentWorkGroups, Reduction &Redu);
257257

@@ -1649,11 +1649,9 @@ class __SYCL_EXPORT handler {
16491649
// for the device.
16501650
size_t MaxWGSize =
16511651
ext::oneapi::detail::reduGetMaxWGSize(MQueue, OneElemSize);
1652-
ext::oneapi::detail::reduCGFuncForRange<KernelName>(
1653-
*this, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups, Redu);
1654-
if (Reduction::is_usm ||
1655-
(Reduction::has_fast_atomics && Redu.initializeToIdentity()) ||
1656-
(!Reduction::has_fast_atomics && Reduction::is_dw_acc)) {
1652+
if (ext::oneapi::detail::reduCGFuncForRange<KernelName>(
1653+
*this, KernelFunc, Range, MaxWGSize, NumConcurrentWorkGroups,
1654+
Redu)) {
16571655
this->finalize();
16581656
MLastEvent = withAuxHandler(QueueCopy, [&](handler &CopyHandler) {
16591657
ext::oneapi::detail::reduSaveFinalResultToUserMem<KernelName>(

sycl/include/sycl/queue.hpp

Lines changed: 2 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -32,33 +32,9 @@
3232
#include <utility>
3333

3434
// having _TWO_ mid-param #ifdefs makes the functions very difficult to read.
35-
// Here we simplify the &CodeLoc declaration to be _CODELOCPARAM(&CodeLoc) and
36-
// _CODELOCARG(&CodeLoc) Similarly, the KernelFunc param is simplified to be
35+
// Here the KernelFunc param is simplified to be
3736
// _KERNELFUNCPARAM(KernelFunc) Once the queue kernel functions are defined,
3837
// these macros are #undef immediately.
39-
40-
// replace _CODELOCPARAM(&CodeLoc) with nothing
41-
// or : , const detail::code_location &CodeLoc =
42-
// detail::code_location::current()
43-
// replace _CODELOCARG(&CodeLoc) with nothing
44-
// or : const detail::code_location &CodeLoc = {}
45-
46-
#ifndef DISABLE_SYCL_INSTRUMENTATION_METADATA
47-
#define _CODELOCONLYPARAM(a) \
48-
const detail::code_location a = detail::code_location::current()
49-
#define _CODELOCPARAM(a) \
50-
, const detail::code_location a = detail::code_location::current()
51-
52-
#define _CODELOCARG(a)
53-
#define _CODELOCFW(a) , a
54-
#else
55-
#define _CODELOCONLYPARAM(a)
56-
#define _CODELOCPARAM(a)
57-
58-
#define _CODELOCARG(a) const detail::code_location a = {}
59-
#define _CODELOCFW(a)
60-
#endif
61-
6238
// replace _KERNELFUNCPARAM(KernelFunc) with KernelType KernelFunc
6339
// or const KernelType &KernelFunc
6440
#ifdef __SYCL_NONCONST_FUNCTOR__
@@ -1081,11 +1057,7 @@ class __SYCL_EXPORT queue {
10811057
CodeLoc);
10821058
}
10831059

1084-
// Clean up CODELOC and KERNELFUNC macros.
1085-
#undef _CODELOCPARAM
1086-
#undef _CODELOCONLYPARAM
1087-
#undef _CODELOCARG
1088-
#undef _CODELOCFW
1060+
// Clean KERNELFUNC macros.
10891061
#undef _KERNELFUNCPARAM
10901062

10911063
/// Returns whether the queue is in order or OoO

0 commit comments

Comments
 (0)