Skip to content

Commit 567c077

Browse files
[SYCL][NFCI] Don't go through variadic for parallel_for(range<N>, krn) (#18019)
The variadic `parallel_for(range<N>, RestT &&...)` is a "reduction" overload that, when called with only a kernel, just happens to dispatch immediately to the non-reduction range+properties version of `parallel_for`. Going through the simpler overload (unused before this PR) seems to be cheaper. E.g., for

```
template <typename...> struct Name;

template <typename Krn> struct Invoker {
  static void call(void *p, int i) { (*static_cast<Krn *>(p))(i); }
};

void invoke(void (*)(void *, int));

struct Kernel {
  using PointersVariant =
      std::variant<std::int8_t *, std::int16_t *, std::uint8_t *,
                   std::uint16_t *, float *, double *, sycl::half *>;
  PointersVariant lhs;
  PointersVariant rhs;
  std::size_t sz;
  PointersVariant out;

  template <typename T>
  Kernel(T *l, T *r, std::size_t size, T *o)
      : lhs(l), rhs(r), sz(size), out(o) {}

  void operator()(sycl::handler &h) {
    std::visit(
        [&](auto lhs_ptr, auto rhs_ptr, auto dst_ptr) {
          auto L = [=](auto i) { dst_ptr[i] = lhs_ptr[i] + rhs_ptr[i]; };
          using N = Name<decltype(lhs_ptr), decltype(rhs_ptr),
                         decltype(dst_ptr)>;
          h.parallel_for<N>(sz, L);
          invoke(&Invoker<decltype(L)>::call);
        },
        lhs, rhs, out);
  }
};

auto p = &Kernel::operator();
```

I see a compile-time improvement from 10.35s to 9.9s for

`$ time clang++ -fsycl -c a.cpp -D__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__`
1 parent 5ce93be commit 567c077

File tree

1 file changed

+6
-3
lines changed

1 file changed

+6
-3
lines changed

sycl/include/sycl/handler.hpp

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2346,23 +2346,26 @@ class __SYCL_EXPORT handler {
 old  new
2346 2346      }
2347 2347
2348 2348      template <typename KernelName = detail::auto_name, typename... RestT>
2349      -    std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
     2349 +    std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value &&
     2350 +                     (sizeof...(RestT) > 1)>
2350 2351      parallel_for(range<1> Range, RestT &&...Rest) {
2351 2352        parallel_for<KernelName>(Range,
2352 2353                                 ext::oneapi::experimental::empty_properties_t{},
2353 2354                                 std::forward<RestT>(Rest)...);
2354 2355      }
2355 2356
2356 2357      template <typename KernelName = detail::auto_name, typename... RestT>
2357      -    std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
     2358 +    std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value &&
     2359 +                     (sizeof...(RestT) > 1)>
2358 2360      parallel_for(range<2> Range, RestT &&...Rest) {
2359 2361        parallel_for<KernelName>(Range,
2360 2362                                 ext::oneapi::experimental::empty_properties_t{},
2361 2363                                 std::forward<RestT>(Rest)...);
2362 2364      }
2363 2365
2364 2366      template <typename KernelName = detail::auto_name, typename... RestT>
2365      -    std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value>
     2367 +    std::enable_if_t<detail::AreAllButLastReductions<RestT...>::value &&
     2368 +                     (sizeof...(RestT) > 1)>
2366 2369      parallel_for(range<3> Range, RestT &&...Rest) {
2367 2370        parallel_for<KernelName>(Range,
2368 2371                                 ext::oneapi::experimental::empty_properties_t{},

0 commit comments

Comments
 (0)