@@ -696,7 +696,41 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl(
696
696
const auto &sg_sizes = d.get_info <sycl::info::device::sub_group_sizes>();
697
697
size_t wg = choose_workgroup_size<4 >(reduction_nelems, sg_sizes);
698
698
699
- {
699
+ if (reduction_nelems < wg) {
700
+ sycl::event comp_ev = exec_q.submit ([&](sycl::handler &cgh) {
701
+ cgh.depends_on (depends);
702
+
703
+ using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
704
+ using InputOutputIterIndexerT =
705
+ dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
706
+ NoOpIndexerT, NoOpIndexerT>;
707
+ using ReductionIndexerT =
708
+ dpctl::tensor::offset_utils::Strided1DIndexer;
709
+
710
+ InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
711
+ NoOpIndexerT{}};
712
+ ReductionIndexerT reduction_indexer{
713
+ 0 , static_cast <py::ssize_t >(reduction_nelems),
714
+ static_cast <py::ssize_t >(iter_nelems)};
715
+
716
+ using KernelName =
717
+ class reduction_seq_contig_krn <argTy, resTy, ReductionOpT,
718
+ InputOutputIterIndexerT,
719
+ ReductionIndexerT>;
720
+
721
+ sycl::range<1 > iter_range{iter_nelems};
722
+
723
+ cgh.parallel_for <KernelName>(
724
+ iter_range,
725
+ SequentialReduction<argTy, resTy, ReductionOpT,
726
+ InputOutputIterIndexerT, ReductionIndexerT>(
727
+ arg_tp, res_tp, ReductionOpT (), identity_val,
728
+ in_out_iter_indexer, reduction_indexer, reduction_nelems));
729
+ });
730
+
731
+ return comp_ev;
732
+ }
733
+ else {
700
734
sycl::event res_init_ev = exec_q.fill <resTy>(
701
735
res_tp, resTy (identity_val), iter_nelems, depends);
702
736
@@ -1849,6 +1883,41 @@ sycl::event reduction_axis0_over_group_temps_contig_impl(
1849
1883
const auto &sg_sizes = d.get_info <sycl::info::device::sub_group_sizes>();
1850
1884
size_t wg = choose_workgroup_size<4 >(reduction_nelems, sg_sizes);
1851
1885
1886
+ if (reduction_nelems < wg) {
1887
+ sycl::event comp_ev = exec_q.submit ([&](sycl::handler &cgh) {
1888
+ cgh.depends_on (depends);
1889
+
1890
+ using NoOpIndexerT = dpctl::tensor::offset_utils::NoOpIndexer;
1891
+ using InputOutputIterIndexerT =
1892
+ dpctl::tensor::offset_utils::TwoOffsets_CombinedIndexer<
1893
+ NoOpIndexerT, NoOpIndexerT>;
1894
+ using ReductionIndexerT =
1895
+ dpctl::tensor::offset_utils::Strided1DIndexer;
1896
+
1897
+ InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{},
1898
+ NoOpIndexerT{}};
1899
+ ReductionIndexerT reduction_indexer{
1900
+ 0 , static_cast <py::ssize_t >(reduction_nelems),
1901
+ static_cast <py::ssize_t >(iter_nelems)};
1902
+
1903
+ using KernelName =
1904
+ class reduction_seq_contig_krn <argTy, resTy, ReductionOpT,
1905
+ InputOutputIterIndexerT,
1906
+ ReductionIndexerT>;
1907
+
1908
+ sycl::range<1 > iter_range{iter_nelems};
1909
+
1910
+ cgh.parallel_for <KernelName>(
1911
+ iter_range,
1912
+ SequentialReduction<argTy, resTy, ReductionOpT,
1913
+ InputOutputIterIndexerT, ReductionIndexerT>(
1914
+ arg_tp, res_tp, ReductionOpT (), identity_val,
1915
+ in_out_iter_indexer, reduction_indexer, reduction_nelems));
1916
+ });
1917
+
1918
+ return comp_ev;
1919
+ }
1920
+
1852
1921
constexpr size_t preferred_reductions_per_wi = 8 ;
1853
1922
// max_max_wg prevents running out of resources on CPU
1854
1923
constexpr size_t max_max_wg = 2048 ;
0 commit comments