Skip to content

[SYCL] Fix endless-loop in reduction with nd_range having 1 element l… #2022

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Jul 21, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 41 additions & 29 deletions sycl/include/CL/sycl/handler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ template <typename T, class BinaryOperation, int Dims, bool IsUSM,
class reduction_impl;

using cl::sycl::detail::enable_if_t;
using cl::sycl::detail::queue_impl;

template <typename KernelName, typename KernelType, int Dims, class Reduction,
typename OutputT>
Expand All @@ -191,10 +192,14 @@ enable_if_t<!Reduction::has_fast_atomics>
reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
Reduction &Redu);

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<!Reduction::has_fast_atomics>
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
template <typename KernelName, typename KernelType, class Reduction>
enable_if_t<!Reduction::has_fast_atomics, size_t>
reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
Reduction &Redu);

__SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class<queue_impl> Queue,
size_t LocalMemBytesPerWorkItem);

} // namespace detail
} // namespace intel

Expand Down Expand Up @@ -1028,8 +1033,6 @@ class __SYCL_EXPORT handler {
int Dims, typename Reduction>
detail::enable_if_t<!Reduction::has_fast_atomics>
parallel_for(nd_range<Dims> Range, Reduction Redu, KernelType KernelFunc) {
size_t NWorkGroups = Range.get_group_range().size();

// This parallel_for() is lowered to the following sequence:
// 1) Call a kernel that a) call user's lambda function and b) performs
// one iteration of reduction, storing the partial reductions/sums
Expand All @@ -1043,42 +1046,51 @@ class __SYCL_EXPORT handler {
// 2) Call an aux kernel (if necessary, i.e. if N2 > 1) as many times as
// necessary to reduce all partial sums into one final sum.

// Before running the kernels, check that device has enough local memory
// to hold local arrays that may be required for the reduction algorithm.
// TODO: If the work-group-size is limited by the local memory, then
// a special version of the main kernel may be created. The one that would
// not use local accessors, which means it would not do the reduction in
// the main kernel, but simply generate Range.get_global_range.size() number
// of partial sums, leaving the reduction work to the additional/aux
// kernels.
constexpr bool HFR = Reduction::has_fast_reduce;
size_t OneElemSize = HFR ? 0 : sizeof(typename Reduction::result_type);
// TODO: currently the maximal work group size is determined for the given
// queue/device, while it may be safer to use queries to the kernel compiled
// for the device.
size_t MaxWGSize = intel::detail::reduGetMaxWGSize(MQueue, OneElemSize);
if (Range.get_local_range().size() > MaxWGSize)
throw sycl::runtime_error("The implementation handling parallel_for with"
" reduction requires smaller work group size.",
PI_INVALID_WORK_GROUP_SIZE);

// 1. Call the kernel that includes user's lambda function.
intel::detail::reduCGFunc<KernelName>(*this, KernelFunc, Range, Redu);
shared_ptr_class<detail::queue_impl> QueueCopy = MQueue;
this->finalize();

// 2. Run the additional aux kernel as many times as needed to reduce
// 2. Run the additional kernel as many times as needed to reduce
// all partial sums into one scalar.

// TODO: user's nd_range and the work-group size specified there must
// be honored only for the main kernel that calls user's lambda functions.
// There is no need in using the same work-group size in these additional
// kernels. Thus, the better strategy here is to make the work-group size
// as big as possible to converge/reduce the partial sums into the last
// sum faster.
size_t WGSize = Range.get_local_range().size();
size_t NWorkItems = NWorkGroups;
// TODO: Create a special slow/sequential version of the kernel that would
// handle the reduction instead of reporting an assert below.
if (MaxWGSize <= 1)
throw sycl::runtime_error("The implementation handling parallel_for with "
"reduction requires the maximal work group "
"size to be greater than 1 to converge. "
"The maximal work group size depends on the "
"device and the size of the objects passed to "
"the reduction.",
PI_INVALID_WORK_GROUP_SIZE);
size_t NWorkItems = Range.get_group_range().size();
while (NWorkItems > 1) {
WGSize = std::min(WGSize, NWorkItems);
NWorkGroups = NWorkItems / WGSize;
// The last group may be not fully loaded. Still register it as a group.
if ((NWorkItems % WGSize) != 0)
++NWorkGroups;
nd_range<1> Range(range<1>(WGSize * NWorkGroups), range<1>(WGSize));

handler AuxHandler(QueueCopy, MIsHost);
AuxHandler.saveCodeLoc(MCodeLoc);

// The last kernel DOES write to user's accessor passed to reduction.
// Associate it with handler manually.
if (NWorkGroups == 1 && !Reduction::is_usm)
Redu.associateWithHandler(AuxHandler);
intel::detail::reduAuxCGFunc<KernelName, KernelType>(AuxHandler, Range,
NWorkItems, Redu);
NWorkItems = intel::detail::reduAuxCGFunc<KernelName, KernelType>(
AuxHandler, NWorkItems, MaxWGSize, Redu);
MLastEvent = AuxHandler.finalize();

NWorkItems = NWorkGroups;
} // end while (NWorkItems > 1)
}

Expand Down
77 changes: 46 additions & 31 deletions sycl/include/CL/sycl/intel/reduction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#pragma once

#include <CL/sycl/accessor.hpp>
#include <CL/sycl/handler.hpp>
#include <CL/sycl/intel/group_algorithm.hpp>

__SYCL_INLINE_NAMESPACE(cl) {
Expand All @@ -17,6 +18,11 @@ namespace intel {

namespace detail {

__SYCL_EXPORT size_t reduGetMaxWGSize(shared_ptr_class<queue_impl> Queue,
size_t LocalMemBytesPerWorkItem);
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
size_t &NWorkGroups);

using cl::sycl::detail::bool_constant;
using cl::sycl::detail::enable_if_t;
using cl::sycl::detail::is_geninteger16bit;
Expand Down Expand Up @@ -867,19 +873,19 @@ reduCGFunc(handler &CGH, KernelType KernelFunc, const nd_range<Dims> &Range,
/// of work-groups. At the end of each work-groups the partial sum is written
/// to a global buffer.
///
/// Briefly: aux kernel, intel:reduce(), reproducible results,FP + ADD/MIN/MAX
template <typename KernelName, typename KernelType, int Dims, class Reduction,
bool UniformWG, typename InputT, typename OutputT>
/// Briefly: aux kernel, intel::reduce(), reproducible results, FP + ADD/MIN/MAX
template <typename KernelName, typename KernelType, bool UniformWG,
class Reduction, typename InputT, typename OutputT>
enable_if_t<Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
Reduction &, InputT In, OutputT Out) {
size_t NWorkGroups = Range.get_group_range().size();
bool IsUpdateOfUserVar =
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;

reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
size_t WGSize, Reduction &, InputT In, OutputT Out) {
using Name = typename get_reduction_aux_kernel_name_t<
KernelName, KernelType, Reduction::is_usm, UniformWG, OutputT>::name;
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {

bool IsUpdateOfUserVar =
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;
nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
typename Reduction::binary_operation BOp;
size_t WGID = NDIt.get_group_linear_id();
size_t GID = NDIt.get_global_linear_id();
Expand All @@ -903,14 +909,11 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
/// to a global buffer.
///
/// Briefly: aux kernel, tree-reduction, CUSTOM types/ops.
template <typename KernelName, typename KernelType, int Dims, class Reduction,
bool UniformPow2WG, typename InputT, typename OutputT>
template <typename KernelName, typename KernelType, bool UniformPow2WG,
class Reduction, typename InputT, typename OutputT>
enable_if_t<!Reduction::has_fast_reduce && !Reduction::has_fast_atomics>
reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
Reduction &Redu, InputT In, OutputT Out) {
size_t WGSize = Range.get_local_range().size();
size_t NWorkGroups = Range.get_group_range().size();

reduAuxCGFuncImpl(handler &CGH, size_t NWorkItems, size_t NWorkGroups,
size_t WGSize, Reduction &Redu, InputT In, OutputT Out) {
bool IsUpdateOfUserVar =
Reduction::accessor_mode == access::mode::read_write && NWorkGroups == 1;

Expand All @@ -924,7 +927,8 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
auto ReduIdentity = Redu.getIdentity();
using Name = typename get_reduction_aux_kernel_name_t<
KernelName, KernelType, Reduction::is_usm, UniformPow2WG, OutputT>::name;
CGH.parallel_for<Name>(Range, [=](nd_item<Dims> NDIt) {
nd_range<1> Range{range<1>(NWorkItems), range<1>(WGSize)};
CGH.parallel_for<Name>(Range, [=](nd_item<1> NDIt) {
size_t WGSize = NDIt.get_local_range().size();
size_t LID = NDIt.get_local_linear_id();
size_t GID = NDIt.get_global_linear_id();
Expand Down Expand Up @@ -962,12 +966,22 @@ reduAuxCGFuncImpl(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
});
}

template <typename KernelName, typename KernelType, int Dims, class Reduction>
enable_if_t<!Reduction::has_fast_atomics>
reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
/// Implements a command group function that enqueues a kernel that does one
/// iteration of reduction of elements in each of work-groups.
/// At the end of each work-group the partial sum is written to a global buffer.
/// The function returns the number of the newly generated partial sums.
template <typename KernelName, typename KernelType, class Reduction>
enable_if_t<!Reduction::has_fast_atomics, size_t>
reduAuxCGFunc(handler &CGH, size_t NWorkItems, size_t MaxWGSize,
Reduction &Redu) {
size_t WGSize = Range.get_local_range().size();
size_t NWorkGroups = Range.get_group_range().size();

size_t NWorkGroups;
size_t WGSize = reduComputeWGSize(NWorkItems, MaxWGSize, NWorkGroups);

// The last kernel DOES write to user's accessor passed to reduction.
// Associate it with handler manually.
if (NWorkGroups == 1 && !Reduction::is_usm)
Redu.associateWithHandler(CGH);

// The last work-group may be not fully loaded with work, or the work group
// size may be not power of two. Those two cases considered inefficient
Expand All @@ -981,20 +995,21 @@ reduAuxCGFunc(handler &CGH, const nd_range<Dims> &Range, size_t NWorkItems,
auto In = Redu.getReadAccToPreviousPartialReds(CGH);
if (Reduction::is_usm && NWorkGroups == 1) {
if (HasUniformWG)
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
CGH, Range, NWorkItems, Redu, In, Redu.getUSMPointer());
reduAuxCGFuncImpl<KernelName, KernelType, true>(
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Redu.getUSMPointer());
else
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
CGH, Range, NWorkItems, Redu, In, Redu.getUSMPointer());
reduAuxCGFuncImpl<KernelName, KernelType, false>(
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Redu.getUSMPointer());
} else {
auto Out = Redu.getWriteAccForPartialReds(NWorkGroups, CGH);
if (HasUniformWG)
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, true>(
CGH, Range, NWorkItems, Redu, In, Out);
reduAuxCGFuncImpl<KernelName, KernelType, true>(
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Out);
else
reduAuxCGFuncImpl<KernelName, KernelType, Dims, Reduction, false>(
CGH, Range, NWorkItems, Redu, In, Out);
reduAuxCGFuncImpl<KernelName, KernelType, false>(
CGH, NWorkItems, NWorkGroups, WGSize, Redu, In, Out);
}
return NWorkGroups;
}

} // namespace detail
Expand Down
1 change: 1 addition & 0 deletions sycl/source/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ set(SYCL_SOURCES
"detail/queue_impl.cpp"
"detail/os_util.cpp"
"detail/platform_util.cpp"
"detail/reduction.cpp"
"detail/sampler_impl.cpp"
"detail/stream_impl.cpp"
"detail/scheduler/commands.cpp"
Expand Down
67 changes: 67 additions & 0 deletions sycl/source/detail/reduction.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
//==---------------- reduction.cpp - SYCL reduction ------------*- C++ -*---==//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include <CL/sycl/intel/reduction.hpp>
#include <detail/queue_impl.hpp>

__SYCL_INLINE_NAMESPACE(cl) {
namespace sycl {
namespace intel {
namespace detail {

/// Computes the work-group size to be used for reducing \p NWorkItems partial
/// sums when at most \p MaxWGSize work-items fit into one work-group.
/// Returns the chosen work-group size and writes the number of work-groups
/// needed to cover all \p NWorkItems work-items to \p NWorkGroups.
// TODO: The algorithm of choosing the work-group size is definitely
// imperfect now and can be improved.
__SYCL_EXPORT size_t reduComputeWGSize(size_t NWorkItems, size_t MaxWGSize,
                                       size_t &NWorkGroups) {
  size_t WGSize = MaxWGSize;
  if (NWorkItems <= WGSize) {
    // All work-items fit into a single work-group.
    NWorkGroups = 1;
    WGSize = NWorkItems;
  } else {
    NWorkGroups = NWorkItems / WGSize;
    size_t Rem = NWorkItems % WGSize;
    if (Rem != 0) {
      // The work-items do not spread evenly across NWorkGroups groups.
      // Let's say MaxWGSize = 128 and NWorkItems is (128+32).
      // It seems better to have 5 groups 32 work-items each than 2 groups with
      // 128 work-items in the 1st group and 32 work-items in the 2nd group.
      size_t NWorkGroupsAlt = NWorkItems / Rem;
      size_t RemAlt = NWorkItems % Rem;
      if (RemAlt == 0 && NWorkGroupsAlt <= MaxWGSize) {
        // Using Rem as the work-group size spreads the work perfectly evenly.
        NWorkGroups = NWorkGroupsAlt;
        WGSize = Rem;
      } else {
        // Keep MaxWGSize as the work-group size and register one extra,
        // partially loaded work-group for the remaining Rem work-items.
        // NOTE: this increment must live on the Rem != 0 path; attaching it
        // to the Rem == 0 case would over-count groups for evenly divisible
        // NWorkItems and silently drop the partial group here.
        ++NWorkGroups;
      }
    }
  }
  return WGSize;
}

/// Returns the maximal work-group size usable by the reduction kernels on the
/// device behind \p Queue, accounting for the fact that each work-item may
/// require \p LocalMemBytesPerWorkItem bytes of local memory.
// TODO: the maximal work-group size is currently determined for the
// queue/device only; it may be safer to query the kernel compiled for the
// device, whose kernel-specific maximum may be smaller.
__SYCL_EXPORT size_t
reduGetMaxWGSize(shared_ptr_class<sycl::detail::queue_impl> Queue,
                 size_t LocalMemBytesPerWorkItem) {
  device Dev = Queue->get_device();
  size_t WGSize = Dev.get_info<info::device::max_work_group_size>();
  if (LocalMemBytesPerWorkItem != 0) {
    size_t MemSize = Dev.get_info<info::device::local_mem_size>();
    // Upper bound on the work-group size imposed by available local memory.
    size_t WGSizePerMem = MemSize / LocalMemBytesPerWorkItem;

    // If the work-group size is not a power of two, then an additional element
    // in local memory is needed for the reduction algorithm and thus the real
    // work-group size requirement per available memory is stricter.
    // NOTE(review): the previous condition ((WGSize & (WGSize - 1)) == 0)
    // decremented the budget exactly when the size WAS a power of two —
    // the opposite of the intent stated above — tested the device maximum
    // instead of the memory-limited size, and underflowed when WGSizePerMem
    // was 0. The check below matches the comment and is safe at 0.
    if ((WGSizePerMem & (WGSizePerMem - 1)) != 0)
      --WGSizePerMem;

    // (std::min) is parenthesized to stay immune to a min() function macro.
    WGSize = (std::min)(WGSizePerMem, WGSize);
  }
  return WGSize;
}

} // namespace detail
} // namespace intel
} // namespace sycl
} // __SYCL_INLINE_NAMESPACE(cl)
2 changes: 2 additions & 0 deletions sycl/test/abi/sycl_symbols_linux.dump
Original file line number Diff line number Diff line change
Expand Up @@ -3616,6 +3616,8 @@ _ZN2cl4sycl5eventC1Ev
_ZN2cl4sycl5eventC2EP9_cl_eventRKNS0_7contextE
_ZN2cl4sycl5eventC2ESt10shared_ptrINS0_6detail10event_implEE
_ZN2cl4sycl5eventC2Ev
_ZN2cl4sycl5intel6detail17reduComputeWGSizeEmmRm
_ZN2cl4sycl5intel6detail16reduGetMaxWGSizeESt10shared_ptrINS0_6detail10queue_implEEm
_ZN2cl4sycl5queue10mem_adviseEPKvm14_pi_mem_advice
_ZN2cl4sycl5queue10wait_proxyERKNS0_6detail13code_locationE
_ZN2cl4sycl5queue11submit_implESt8functionIFvRNS0_7handlerEEERKNS0_6detail13code_locationE
Expand Down
2 changes: 1 addition & 1 deletion sycl/test/reduction/reduction_nd_s1_rw.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ int main() {

// Check with various types.
test<float, 1, std::multiplies<float>>(1, 8, 256);
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 8, 256);
test<float, 1, intel::minimum<float>>(getMaximumFPValue<float>(), 1, 16);
test<float, 1, intel::maximum<float>>(getMinimumFPValue<float>(), 8, 256);

// Check with CUSTOM type.
Expand Down
1 change: 1 addition & 0 deletions sycl/test/reduction/reduction_transparent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ int main() {
test<float, 0, intel::maximum<>>(getMinimumFPValue<float>(), 7, 7 * 5);
test<signed char, 0, intel::plus<>>(0, 7, 49);
test<unsigned char, 1, std::multiplies<>>(1, 4, 16);
test<unsigned short, 0, intel::plus<>>(0, 1, 512 + 32);
#endif // __cplusplus >= 201402L

std::cout << "Test passed\n";
Expand Down