Skip to content

Commit 6a0b09c

Browse files
Apply SequentialReductionKernel to axis0 reduction
1 parent c742e79 commit 6a0b09c

File tree

1 file changed

+70
-1
lines changed

1 file changed

+70
-1
lines changed

dpctl/tensor/libtensor/include/kernels/reductions.hpp

Lines changed: 70 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -696,7 +696,41 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl(
696696
const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
697697
size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
698698

699-
{
699+
if (reduction_nelems < wg) {
700+
sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
701+
cgh.depends_on(depends);
702+
703+
using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
704+
using InputOutputIterIndexerT =
705+
dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
706+
NoOpIndexerT, NoOpIndexerT>;
707+
using ReductionIndexerT =
708+
dpctl::tensor::offset_utils::Strided1DIndexer;
709+
710+
InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
711+
NoOpIndexerT{}};
712+
ReductionIndexerT reduction_indexer{
713+
0, static_cast<py::ssize_t>(reduction_nelems),
714+
static_cast<py::ssize_t>(iter_nelems)};
715+
716+
using KernelName =
717+
class reduction_seq_contig_krn<argTy, resTy, ReductionOpT,
718+
InputOutputIterIndexerT,
719+
ReductionIndexerT>;
720+
721+
sycl::range<1> iter_range{iter_nelems};
722+
723+
cgh.parallel_for<KernelName>(
724+
iter_range,
725+
SequentialReduction<argTy, resTy, ReductionOpT,
726+
InputOutputIterIndexerT, ReductionIndexerT>(
727+
arg_tp, res_tp, ReductionOpT(), identity_val,
728+
in_out_iter_indexer, reduction_indexer, reduction_nelems));
729+
});
730+
731+
return comp_ev;
732+
}
733+
else {
700734
sycl::event res_init_ev = exec_q.fill<resTy>(
701735
res_tp, resTy(identity_val), iter_nelems, depends);
702736

@@ -1849,6 +1883,41 @@ sycl::event reduction_axis0_over_group_temps_contig_impl(
18491883
const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
18501884
size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
18511885

1886+
if (reduction_nelems < wg) {
1887+
sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
1888+
cgh.depends_on(depends);
1889+
1890+
using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
1891+
using InputOutputIterIndexerT =
1892+
dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
1893+
NoOpIndexerT, NoOpIndexerT>;
1894+
using ReductionIndexerT =
1895+
dpctl::tensor::offset_utils::Strided1DIndexer;
1896+
1897+
InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
1898+
NoOpIndexerT{}};
1899+
ReductionIndexerT reduction_indexer{
1900+
0, static_cast<py::ssize_t>(reduction_nelems),
1901+
static_cast<py::ssize_t>(iter_nelems)};
1902+
1903+
using KernelName =
1904+
class reduction_seq_contig_krn<argTy, resTy, ReductionOpT,
1905+
InputOutputIterIndexerT,
1906+
ReductionIndexerT>;
1907+
1908+
sycl::range<1> iter_range{iter_nelems};
1909+
1910+
cgh.parallel_for<KernelName>(
1911+
iter_range,
1912+
SequentialReduction<argTy, resTy, ReductionOpT,
1913+
InputOutputIterIndexerT, ReductionIndexerT>(
1914+
arg_tp, res_tp, ReductionOpT(), identity_val,
1915+
in_out_iter_indexer, reduction_indexer, reduction_nelems));
1916+
});
1917+
1918+
return comp_ev;
1919+
}
1920+
18521921
constexpr size_t preferred_reductions_per_wi = 8;
18531922
// max_max_wg prevents running out of resources on CPU
18541923
constexpr size_t max_max_wg = 2048;

0 commit comments

Comments
 (0)