
[SYCL][Reduction] Improve getGroupsCounterAccDiscrete performance #6858


Merged · 2 commits · Sep 23, 2022
157 changes: 79 additions & 78 deletions sycl/include/sycl/reduction.hpp
@@ -18,6 +18,7 @@
#include <sycl/kernel.hpp>
#include <sycl/known_identity.hpp>
#include <sycl/properties/reduction_properties.hpp>
#include <sycl/usm.hpp>

#include <tuple>

@@ -666,15 +667,18 @@ class reduction_impl_algo : public reduction_impl_common<T, BinaryOperation> {

// On discrete (vs. integrated) GPUs it's faster to initialize memory with an
// extra kernel than copy it from the host.
template <typename Name> auto getGroupsCounterAccDiscrete(handler &CGH) {
auto &Buf = getTempBuffer<int>(1, CGH);
std::shared_ptr<detail::queue_impl> QueueCopy = CGH.MQueue;
auto Event = CGH.withAuxHandler(QueueCopy, [&](handler &InitHandler) {
auto Acc = accessor{Buf, InitHandler, sycl::write_only, sycl::no_init};
InitHandler.single_task<Name>([=]() { Acc[0] = 0; });
});
auto getGroupsCounterAccDiscrete(handler &CGH) {
queue q = createSyclObjFromImpl<queue>(CGH.MQueue);
device Dev = q.get_device();
auto Deleter = [=](auto *Ptr) { free(Ptr, q); };

std::shared_ptr<int> Counter(malloc_device<int>(1, q), Deleter);
Review comment (Contributor):

Here it is assumed that 'device' allocation is always supported.
If there is any real chance that it is not supported somewhere, then we may need to keep a copy of the older code as a fallback.
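
A minimal sketch of how that concern could be guarded, assuming the SYCL 2020 aspect query; the helper name and fallback behaviour are hypothetical and not part of this change:

#include <sycl/sycl.hpp>

// Hypothetical guard: take the device-USM path only when the device reports
// support for device allocations; otherwise signal the caller to keep using
// the older buffer + init-kernel implementation.
int *tryAllocDeviceCounter(sycl::queue &q) {
  if (!q.get_device().has(sycl::aspect::usm_device_allocations))
    return nullptr; // caller falls back to the previous code path
  int *Counter = sycl::malloc_device<int>(1, q);
  q.memset(Counter, 0, sizeof(int)).wait();
  return Counter;
}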

CGH.addReduction(Counter);

auto Event = q.memset(Counter.get(), 0, sizeof(int));
CGH.depends_on(Event);
return accessor{Buf, CGH};

return Counter.get();
}

RedOutVar &getUserRedVar() { return MRedOut; }
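
The new getGroupsCounterAccDiscrete boils down to one reusable pattern: allocate a device-USM counter, zero it with a queue memset, and tie its lifetime to a shared_ptr whose deleter frees it on the same queue; CGH.addReduction(Counter) then keeps that shared_ptr alive for the duration of the command group. A standalone sketch of the pattern, with illustrative names only:

#include <memory>
#include <sycl/sycl.hpp>

// Returns a device-USM int that is zeroed asynchronously; callers should make
// dependent work wait on InitEvent. The allocation is freed on the same queue
// when the last shared_ptr copy is destroyed.
std::shared_ptr<int> makeZeroedDeviceCounter(sycl::queue q, sycl::event &InitEvent) {
  auto Deleter = [q](int *Ptr) { sycl::free(Ptr, q); };
  std::shared_ptr<int> Counter(sycl::malloc_device<int>(1, q), Deleter);
  InitEvent = q.memset(Counter.get(), 0, sizeof(int));
  return Counter;
}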
@@ -895,11 +899,8 @@ bool reduCGFuncForRangeFastAtomics(handler &CGH, KernelType KernelFunc,

namespace reduction {
namespace main_krn {
template <class KernelName> struct RangeFastReduce;
template <class KernelName, class NWorkGroupsFinished> struct RangeFastReduce;
} // namespace main_krn
namespace init_krn {
template <class KernelName> struct GroupCounter;
}
} // namespace reduction
template <typename KernelName, typename KernelType, int Dims, class Reduction>
bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
@@ -917,81 +918,81 @@ bool reduCGFuncForRangeFastReduce(handler &CGH, KernelType KernelFunc,
accessor PartialSums(PartialSumsBuf, CGH, sycl::read_write, sycl::no_init);

bool IsUpdateOfUserVar = !Reduction::is_usm && !Redu.initializeToIdentity();
using InitName =
__sycl_reduction_kernel<reduction::init_krn::GroupCounter, KernelName>;

// Integrated/discrete GPUs have different faster path.
auto NWorkGroupsFinished =
getDeviceFromHandler(CGH).get_info<info::device::host_unified_memory>()
? Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH)
: Redu.template getGroupsCounterAccDiscrete<InitName>(CGH);

auto DoReducePartialSumsInLastWG =
Reduction::template getReadWriteLocalAcc<int>(1, CGH);

using Name =
__sycl_reduction_kernel<reduction::main_krn::RangeFastReduce, KernelName>;
size_t PerGroup = Range.size() / NWorkGroups;
CGH.parallel_for<Name>(NDRange, [=](nd_item<1> NDId) {
// Call user's functions. Reducer.MValue gets initialized there.
typename Reduction::reducer_type Reducer;
reductionLoop(Range, PerGroup, Reducer, NDId, KernelFunc);
auto Rest = [&](auto NWorkGroupsFinished) {
auto DoReducePartialSumsInLastWG =
Reduction::template getReadWriteLocalAcc<int>(1, CGH);

using Name = __sycl_reduction_kernel<reduction::main_krn::RangeFastReduce,
KernelName, decltype(NWorkGroupsFinished)>;
size_t PerGroup = Range.size() / NWorkGroups;
CGH.parallel_for<Name>(NDRange, [=](nd_item<1> NDId) {
// Call user's functions. Reducer.MValue gets initialized there.
typename Reduction::reducer_type Reducer;
reductionLoop(Range, PerGroup, Reducer, NDId, KernelFunc);

typename Reduction::binary_operation BOp;
auto Group = NDId.get_group();
typename Reduction::binary_operation BOp;
auto Group = NDId.get_group();

// If there are multiple values, reduce each separately
// reduce_over_group is only defined for each T, not for span<T, ...>
size_t LID = NDId.get_local_id(0);
for (int E = 0; E < NElements; ++E) {
auto &RedElem = Reducer.getElement(E);
RedElem = reduce_over_group(Group, RedElem, BOp);
if (LID == 0) {
if (NWorkGroups == 1) {
auto &OutElem = Reduction::getOutPointer(Out)[E];
// Can avoid using partial sum and write the final result immediately.
if (IsUpdateOfUserVar)
RedElem = BOp(RedElem, OutElem);
OutElem = RedElem;
} else {
PartialSums[NDId.get_group_linear_id() * NElements + E] =
Reducer.getElement(E);
// If there are multiple values, reduce each separately
// reduce_over_group is only defined for each T, not for span<T, ...>
size_t LID = NDId.get_local_id(0);
for (int E = 0; E < NElements; ++E) {
auto &RedElem = Reducer.getElement(E);
RedElem = reduce_over_group(Group, RedElem, BOp);
if (LID == 0) {
if (NWorkGroups == 1) {
auto &OutElem = Reduction::getOutPointer(Out)[E];
// Can avoid using partial sum and write the final result
// immediately.
if (IsUpdateOfUserVar)
RedElem = BOp(RedElem, OutElem);
OutElem = RedElem;
} else {
PartialSums[NDId.get_group_linear_id() * NElements + E] =
Reducer.getElement(E);
}
}
}
}

if (NWorkGroups == 1)
// We're done.
return;

// Signal this work-group has finished after all values are reduced
if (LID == 0) {
auto NFinished =
sycl::atomic_ref<int, memory_order::relaxed, memory_scope::device,
access::address_space::global_space>(
NWorkGroupsFinished[0]);
DoReducePartialSumsInLastWG[0] = ++NFinished == NWorkGroups;
}
if (NWorkGroups == 1)
// We're done.
return;

workGroupBarrier();
if (DoReducePartialSumsInLastWG[0]) {
// Reduce each result separately
// TODO: Opportunity to parallelize across elements.
for (int E = 0; E < NElements; ++E) {
auto &OutElem = Reduction::getOutPointer(Out)[E];
auto LocalSum = Reducer.getIdentity();
for (size_t I = LID; I < NWorkGroups; I += WGSize)
LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
auto Result = reduce_over_group(Group, LocalSum, BOp);
// Signal this work-group has finished after all values are reduced
if (LID == 0) {
auto NFinished =
sycl::atomic_ref<int, memory_order::relaxed, memory_scope::device,
access::address_space::global_space>(
NWorkGroupsFinished[0]);
DoReducePartialSumsInLastWG[0] = ++NFinished == NWorkGroups;
}

if (LID == 0) {
if (IsUpdateOfUserVar)
Result = BOp(Result, OutElem);
OutElem = Result;
workGroupBarrier();
if (DoReducePartialSumsInLastWG[0]) {
// Reduce each result separately
// TODO: Opportunity to parallelize across elements.
for (int E = 0; E < NElements; ++E) {
auto &OutElem = Reduction::getOutPointer(Out)[E];
auto LocalSum = Reducer.getIdentity();
for (size_t I = LID; I < NWorkGroups; I += WGSize)
LocalSum = BOp(LocalSum, PartialSums[I * NElements + E]);
auto Result = reduce_over_group(Group, LocalSum, BOp);

if (LID == 0) {
if (IsUpdateOfUserVar)
Result = BOp(Result, OutElem);
OutElem = Result;
}
}
}
}
});
});
};

// Integrated/discrete GPUs have different faster path.
if (getDeviceFromHandler(CGH).get_info<info::device::host_unified_memory>())
Rest(Redu.getReadWriteAccessorToInitializedGroupsCounter(CGH));
else
Rest(Redu.getGroupsCounterAccDiscrete(CGH));

// We've updated user's variable, no extra work needed.
return false;
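
For context, a minimal sketch of the kind of user code that exercises these range-based reduction paths; which specialization (fast-atomics vs. fast-reduce) is picked depends on the type and operation, and the queue, range, and operation below are purely illustrative:

#include <sycl/sycl.hpp>

int main() {
  sycl::queue q;
  double *Sum = sycl::malloc_shared<double>(1, q);
  *Sum = 0.0;
  // A range (not nd_range) parallel_for with a single plus-reduction.
  q.parallel_for(sycl::range<1>{1024},
                 sycl::reduction(Sum, sycl::plus<double>()),
                 [=](sycl::id<1> I, auto &Red) { Red += static_cast<double>(I[0]); })
      .wait();
  // *Sum now holds 0 + 1 + ... + 1023 = 523776.
  sycl::free(Sum, q);
  return 0;
}

Whether the discrete-GPU branch (and thus getGroupsCounterAccDiscrete) is taken depends on info::device::host_unified_memory, as selected in the diff above.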