Skip to content

Boolean reduction performance improvements #1401

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Sep 18, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/generate-coverage.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ jobs:
- name: Install dpctl dependencies
shell: bash -l {0}
run: |
pip install numpy cython setuptools pytest pytest-cov scikit-build cmake coverage[toml]
pip install numpy"<1.26.0" cython setuptools pytest pytest-cov scikit-build cmake coverage[toml]

- name: Build dpctl with coverage
shell: bash -l {0}
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/os-llvm-sycl-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ jobs:
- name: Install dpctl dependencies
shell: bash -l {0}
run: |
pip install numpy cython setuptools pytest scikit-build cmake ninja
pip install numpy"<1.26.0" cython setuptools pytest scikit-build cmake ninja

- name: Checkout repo
uses: actions/checkout@v3
Expand Down
181 changes: 131 additions & 50 deletions dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "pybind11/pybind11.h"

#include "utils/offset_utils.hpp"
#include "utils/sycl_utils.hpp"
#include "utils/type_dispatch.hpp"
#include "utils/type_utils.hpp"

Expand Down Expand Up @@ -227,9 +228,8 @@ struct ContigBooleanReduction

void operator()(sycl::nd_item<1> it) const
{
const size_t red_gws_ = it.get_global_range(0) / iter_gws_;
const size_t reduction_id = it.get_global_id(0) / red_gws_;
const size_t reduction_batch_id = get_reduction_batch_id(it);
const size_t reduction_id = it.get_group(0) % iter_gws_;
const size_t reduction_batch_id = it.get_group(0) / iter_gws_;
const size_t wg_size = it.get_local_range(0);

const size_t base = reduction_id * reduction_max_gid_;
Expand All @@ -241,14 +241,6 @@ struct ContigBooleanReduction
// in group_op_
group_op_(it, out_, reduction_id, inp_ + start, inp_ + end);
}

private:
size_t get_reduction_batch_id(sycl::nd_item<1> const &it) const
{
const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_;
const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups;
return reduction_batch_id;
}
};

typedef sycl::event (*boolean_reduction_contig_impl_fn_ptr)(
Expand All @@ -268,17 +260,19 @@ class boolean_reduction_contig_krn;
template <typename T1, typename T2, typename T3, typename T4, typename T5>
class boolean_reduction_seq_contig_krn;

using dpctl::tensor::sycl_utils::choose_workgroup_size;

template <typename argTy, typename resTy, typename RedOpT, typename GroupOpT>
sycl::event
boolean_reduction_contig_impl(sycl::queue exec_q,
size_t iter_nelems,
size_t reduction_nelems,
const char *arg_cp,
char *res_cp,
py::ssize_t iter_arg_offset,
py::ssize_t iter_res_offset,
py::ssize_t red_arg_offset,
const std::vector<sycl::event> &depends)
boolean_reduction_axis1_contig_impl(sycl::queue exec_q,
size_t iter_nelems,
size_t reduction_nelems,
const char *arg_cp,
char *res_cp,
py::ssize_t iter_arg_offset,
py::ssize_t iter_res_offset,
py::ssize_t red_arg_offset,
const std::vector<sycl::event> &depends)
{
const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
iter_arg_offset + red_arg_offset;
Expand All @@ -288,8 +282,7 @@ boolean_reduction_contig_impl(sycl::queue exec_q,

const sycl::device &d = exec_q.get_device();
const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
size_t wg =
4 * (*std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));
size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);

sycl::event red_ev;
if (reduction_nelems < wg) {
Expand Down Expand Up @@ -322,18 +315,8 @@ boolean_reduction_contig_impl(sycl::queue exec_q,
});
}
else {
sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) {
using IndexerT = dpctl::tensor::offset_utils::NoOpIndexer;

IndexerT res_indexer{};

cgh.depends_on(depends);

cgh.parallel_for(sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
auto res_offset = res_indexer(id[0]);
res_tp[res_offset] = identity_val;
});
});
sycl::event init_ev = exec_q.fill<resTy>(res_tp, resTy(identity_val),
iter_nelems, depends);
red_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(init_ev);

Expand Down Expand Up @@ -363,7 +346,7 @@ boolean_reduction_contig_impl(sycl::queue exec_q,
return red_ev;
}

template <typename fnT, typename srcTy> struct AllContigFactory
template <typename fnT, typename srcTy> struct AllAxis1ContigFactory
{
fnT get() const
{
Expand All @@ -372,12 +355,12 @@ template <typename fnT, typename srcTy> struct AllContigFactory
using GroupOpT =
all_reduce_wg_contig<srcTy, resTy, boolean_predicate<srcTy>>;

return dpctl::tensor::kernels::boolean_reduction_contig_impl<
return dpctl::tensor::kernels::boolean_reduction_axis1_contig_impl<
srcTy, resTy, RedOpT, GroupOpT>;
}
};

template <typename fnT, typename srcTy> struct AnyContigFactory
template <typename fnT, typename srcTy> struct AnyAxis1ContigFactory
{
fnT get() const
{
Expand All @@ -386,7 +369,7 @@ template <typename fnT, typename srcTy> struct AnyContigFactory
using GroupOpT =
any_reduce_wg_contig<srcTy, resTy, boolean_predicate<srcTy>>;

return dpctl::tensor::kernels::boolean_reduction_contig_impl<
return dpctl::tensor::kernels::boolean_reduction_axis1_contig_impl<
srcTy, resTy, RedOpT, GroupOpT>;
}
};
Expand Down Expand Up @@ -433,9 +416,9 @@ struct StridedBooleanReduction

void operator()(sycl::nd_item<1> it) const
{
const size_t red_gws_ = it.get_global_range(0) / iter_gws_;
const size_t reduction_id = it.get_global_id(0) / red_gws_;
const size_t reduction_batch_id = get_reduction_batch_id(it);
const size_t reduction_id = it.get_group(0) % iter_gws_;
const size_t reduction_batch_id = it.get_group(0) / iter_gws_;

const size_t reduction_lid = it.get_local_id(0);
const size_t wg_size = it.get_local_range(0);

Expand Down Expand Up @@ -468,13 +451,112 @@ struct StridedBooleanReduction
// in group_op_
group_op_(it, out_, out_iter_offset, local_red_val);
}
};

template <typename T1,
typename T2,
typename T3,
typename T4,
typename T5,
typename T6>
class boolean_reduction_axis0_contig_krn;

template <typename argTy, typename resTy, typename RedOpT, typename GroupOpT>
sycl::event
boolean_reduction_axis0_contig_impl(sycl::queue exec_q,
size_t iter_nelems,
size_t reduction_nelems,
const char *arg_cp,
char *res_cp,
py::ssize_t iter_arg_offset,
py::ssize_t iter_res_offset,
py::ssize_t red_arg_offset,
const std::vector<sycl::event> &depends)
{
const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_cp) +
iter_arg_offset + red_arg_offset;
resTy *res_tp = reinterpret_cast<resTy *>(res_cp) + iter_res_offset;

constexpr resTy identity_val = sycl::known_identity<RedOpT, resTy>::value;

const sycl::device &d = exec_q.get_device();
const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);

private:
size_t get_reduction_batch_id(sycl::nd_item<1> const &it) const
{
const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_;
const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups;
return reduction_batch_id;
sycl::event init_ev = exec_q.fill<resTy>(res_tp, resTy(identity_val),
iter_nelems, depends);
sycl::event red_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(init_ev);

constexpr std::uint8_t dim = 1;

using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
using ColsIndexerT = dpctl::tensor::offset_utils::Strided1DIndexer;
using InputOutputIterIndexerT =
dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
NoOpIndexerT, NoOpIndexerT>;
using ReductionIndexerT = ColsIndexerT;

NoOpIndexerT columns_indexer{};
NoOpIndexerT result_indexer{};
InputOutputIterIndexerT in_out_iter_indexer{columns_indexer,
result_indexer};
ReductionIndexerT reduction_indexer{
0, static_cast<py::ssize_t>(reduction_nelems),
static_cast<py::ssize_t>(iter_nelems)};

constexpr size_t preferred_reductions_per_wi = 4;
size_t reductions_per_wi =
(reduction_nelems < preferred_reductions_per_wi * wg)
? ((reduction_nelems + wg - 1) / wg)
: preferred_reductions_per_wi;

size_t reduction_groups =
(reduction_nelems + reductions_per_wi * wg - 1) /
(reductions_per_wi * wg);

auto gws = sycl::range<dim>{iter_nelems * reduction_groups * wg};
auto lws = sycl::range<dim>{wg};

cgh.parallel_for<class boolean_reduction_axis0_contig_krn<
argTy, resTy, RedOpT, GroupOpT, InputOutputIterIndexerT,
ReductionIndexerT>>(
sycl::nd_range<dim>(gws, lws),
StridedBooleanReduction<argTy, resTy, RedOpT, GroupOpT,
InputOutputIterIndexerT,
ReductionIndexerT>(
arg_tp, res_tp, RedOpT(), GroupOpT(), identity_val,
in_out_iter_indexer, reduction_indexer, reduction_nelems,
iter_nelems, reductions_per_wi));
});
return red_ev;
}
}

/// Factory dispatching the axis-0 contiguous boolean "all" reduction
/// kernel for input element type `srcTy`.
template <typename fnT, typename srcTy> struct AllAxis0ContigFactory
{
    /// @return Function pointer implementing the "all" reduction over
    ///         axis 0 of a contiguous array of `srcTy`.
    fnT get() const
    {
        // Boolean reductions accumulate into 32-bit integer results,
        // combined with logical AND across work items / groups.
        using result_type = std::int32_t;
        using reduction_op_type = sycl::logical_and<result_type>;
        using group_op_type = all_reduce_wg_strided<result_type>;

        fnT fn = dpctl::tensor::kernels::boolean_reduction_axis0_contig_impl<
            srcTy, result_type, reduction_op_type, group_op_type>;
        return fn;
    }
};

/// Factory dispatching the axis-0 contiguous boolean "any" reduction
/// kernel for input element type `srcTy`.
template <typename fnT, typename srcTy> struct AnyAxis0ContigFactory
{
    /// @return Function pointer implementing the "any" reduction over
    ///         axis 0 of a contiguous array of `srcTy`.
    fnT get() const
    {
        // Boolean reductions accumulate into 32-bit integer results,
        // combined with logical OR across work items / groups.
        using result_type = std::int32_t;
        using reduction_op_type = sycl::logical_or<result_type>;
        using group_op_type = any_reduce_wg_strided<result_type>;

        fnT fn = dpctl::tensor::kernels::boolean_reduction_axis0_contig_impl<
            srcTy, result_type, reduction_op_type, group_op_type>;
        return fn;
    }
};

Expand Down Expand Up @@ -527,8 +609,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q,

const sycl::device &d = exec_q.get_device();
const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
size_t wg =
4 * (*std::max_element(std::begin(sg_sizes), std::end(sg_sizes)));
size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);

sycl::event red_ev;
if (reduction_nelems < wg) {
Expand Down Expand Up @@ -558,7 +639,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q,
});
}
else {
sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
sycl::event init_ev = exec_q.submit([&](sycl::handler &cgh) {
using IndexerT =
dpctl::tensor::offset_utils::UnpackedStridedIndexer;

Expand All @@ -576,7 +657,7 @@ boolean_reduction_strided_impl(sycl::queue exec_q,
});
});
red_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(res_init_ev);
cgh.depends_on(init_ev);

constexpr std::uint8_t dim = 1;

Expand Down
Loading