You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
[SYCL][NFCI] Don't go through variadic for parallel_for(range<N>, krn) (#18019)
The variadic `parallel_for` overload is a "reduction" overload that, for a
plain `parallel_for(range<N>, krn)` call, just happens to dispatch immediately
to the non-reduction range+properties version of `parallel_for`. Going
through the simpler overload (unused before this PR) instead seems to be
cheaper at compile time.
E.g., for
```
template <typename...> struct Name;
template <typename Krn> struct Invoker {
static void call(void *p, int i) { (*static_cast<Krn *>(p))(i); }
};
void invoke(void (*)(void *, int));
struct Kernel {
using PointersVariant =
std::variant<std::int8_t *, std::int16_t *, std::uint8_t *,
std::uint16_t *, float *, double *, sycl::half *>;
PointersVariant lhs;
PointersVariant rhs;
std::size_t sz;
PointersVariant out;
template <typename T>
Kernel(T *l, T *r, std::size_t size, T *o)
: lhs(l), rhs(r), sz(size), out(o) {}
void operator()(sycl::handler &h) {
std::visit(
[&](auto lhs_ptr, auto rhs_ptr, auto dst_ptr) {
auto L = [=](auto i) { dst_ptr[i] = lhs_ptr[i] + rhs_ptr[i]; };
using N =
Name<decltype(lhs_ptr), decltype(rhs_ptr), decltype(dst_ptr)>;
h.parallel_for<N>(sz, L);
invoke(&Invoker<decltype(L)>::call);
},
lhs, rhs, out);
}
};
auto p = &Kernel::operator();
```
I see compile time improve from 10.35s to 9.9s for
`$ time clang++ -fsycl -c a.cpp -D__SYCL_DISABLE_PARALLEL_FOR_RANGE_ROUNDING__`
0 commit comments