
Commit 26b6d6d

Improved performance of reduction kernel with atomics
1. The contiguous implementation kernel gets a dedicated name (easier to spot in the output of onetrace).
2. Increase the work-group size multiple.
3. Change the order in which work-groups tile the array: previously the position along the reduction axis moved fastest, now the position along the iteration axis moves fastest. This last change contributes to a significant performance improvement:

```
================= Before change
In [1]: import dpctl.tensor as dpt

In [2]: x = dpt.reshape(dpt.asarray(1, dtype="f4")/dpt.square(dpt.arange(1, 1282200*128 + 1, dtype="f4")), (1282200, 128))

In [3]: %time y = dpt.sum(x, axis=0)
CPU times: user 309 ms, sys: 128 ms, total: 437 ms
Wall time: 473 ms

In [4]: %time y = dpt.sum(x, axis=0)
CPU times: user 132 ms, sys: 160 ms, total: 292 ms
Wall time: 316 ms

In [5]: %time y = dpt.sum(x, axis=0)
CPU times: user 104 ms, sys: 185 ms, total: 289 ms
Wall time: 312 ms
```

```
===== After change
In [1]: import dpctl.tensor as dpt

In [2]: x = dpt.reshape(dpt.asarray(1, dtype="f4")/dpt.square(dpt.arange(1, 1282200*128 + 1, dtype="f4")), (1282200, 128))

In [3]: %time y = dpt.sum(x, axis=0)
CPU times: user 150 ms, sys: 32.9 ms, total: 183 ms
Wall time: 198 ms

In [4]: %time y = dpt.sum(x, axis=0)
CPU times: user 20 ms, sys: 22.7 ms, total: 42.7 ms
Wall time: 49.4 ms

In [5]: %time y = dpt.sum(x, axis=0)
CPU times: user 10.2 ms, sys: 28.9 ms, total: 39.1 ms
Wall time: 41.4 ms

In [6]: %time y = dpt.sum(x, axis=0)
CPU times: user 23 ms, sys: 18 ms, total: 41 ms
Wall time: 43.5 ms
```
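For illustration, here is a minimal host-side C++ sketch (not part of the commit) contrasting the two decompositions of the linear work-group index. The names `iter_gws` and `n_reduction_groups` stand in for the kernel's corresponding quantities; under the old mapping the reduction-batch index changes with every successive group, under the new mapping the iteration index does.

```
// Host-side sketch (not part of the commit) of the old and new
// decompositions of the linear work-group index.
#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t iter_gws = 4;           // positions along the iteration axis
    const std::size_t n_reduction_groups = 3; // work-groups along the reduction axis
    const std::size_t n_groups = iter_gws * n_reduction_groups;

    for (std::size_t group = 0; group < n_groups; ++group) {
        // Old mapping: the reduction-batch index moves fastest.
        const std::size_t old_iter_gid = group / n_reduction_groups;
        const std::size_t old_batch_id = group % n_reduction_groups;

        // New mapping: the iteration index moves fastest.
        const std::size_t new_iter_gid = group % iter_gws;
        const std::size_t new_batch_id = group / iter_gws;

        std::cout << "group " << group << ": old (iter " << old_iter_gid
                  << ", batch " << old_batch_id << "), new (iter "
                  << new_iter_gid << ", batch " << new_batch_id << ")\n";
    }
    return 0;
}
```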
1 parent 3cbb221 commit 26b6d6d

File tree

1 file changed: +11, -15 lines changed

dpctl/tensor/libtensor/include/kernels/reductions.hpp

Lines changed: 11 additions & 15 deletions
```
@@ -146,9 +146,9 @@ struct ReductionOverGroupWithAtomicFunctor

     void operator()(sycl::nd_item<1> it) const
     {
-        const size_t red_gws_ = it.get_global_range(0) / iter_gws_;
-        const size_t iter_gid = it.get_global_id(0) / red_gws_;
-        const size_t reduction_batch_id = get_reduction_batch_id(it);
+        const size_t iter_gid = it.get_group(0) % iter_gws_;
+        const size_t reduction_batch_id = it.get_group(0) / iter_gws_;
+
         const size_t reduction_lid = it.get_local_id(0);
         const size_t wg = it.get_local_range(0); // 0 <= reduction_lid < wg

```
```
@@ -204,14 +204,6 @@ struct ReductionOverGroupWithAtomicFunctor
             }
         }
     }
-
-private:
-    size_t get_reduction_batch_id(sycl::nd_item<1> const &it) const
-    {
-        const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_;
-        const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups;
-        return reduction_batch_id;
-    }
 };

 typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)(
```
```
@@ -241,6 +233,9 @@ class sum_reduction_seq_strided_krn;
 template <typename T1, typename T2, typename T3, typename T4, typename T5>
 class sum_reduction_seq_contig_krn;

+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class sum_reduction_over_group_with_atomics_contig_krn;
+
 using dpctl::tensor::sycl_utils::choose_workgroup_size;

 template <typename argTy, typename resTy>
```
```
@@ -417,7 +412,7 @@ sycl::event sum_reduction_over_group_with_atomics_contig_impl(

     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
-    size_t wg = choose_workgroup_size<2>(reduction_nelems, sg_sizes);
+    size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);

     if (reduction_nelems < wg) {
         sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
```
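The `<2>` to `<4>` change asks the helper for a larger multiple of the sub-group size. As a rough, hypothetical sketch of what such a chooser might do (the actual `choose_workgroup_size` lives in dpctl's sycl_utils header and may differ), assuming it returns the largest multiple, up to the given factor, of the biggest reported sub-group size that still fits the reduction:

```
// Hypothetical stand-in for a choose_workgroup_size<f>-style helper;
// illustrative only, not the dpctl implementation.
#include <algorithm>
#include <cstddef>
#include <vector>

template <std::size_t f>
std::size_t choose_wg_size_sketch(std::size_t reduction_nelems,
                                  const std::vector<std::size_t> &sg_sizes)
{
    // Largest sub-group size the device reports (e.g. 8, 16, 32).
    const std::size_t max_sg =
        sg_sizes.empty() ? 1
                         : *std::max_element(sg_sizes.begin(), sg_sizes.end());

    // Try f*max_sg, (f-1)*max_sg, ..., 2*max_sg and keep the largest one
    // that does not exceed the number of elements being reduced.
    for (std::size_t m = f; m > 1; --m) {
        if (m * max_sg <= reduction_nelems) {
            return m * max_sg;
        }
    }
    return max_sg;
}
```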
```
@@ -499,9 +494,10 @@ sycl::event sum_reduction_over_group_with_atomics_contig_impl(
             sycl::range<1>{iter_nelems * reduction_groups * wg};
         auto localRange = sycl::range<1>{wg};

-        using KernelName = class sum_reduction_over_group_with_atomics_krn<
-            argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-            ReductionIndexerT>;
+        using KernelName =
+            class sum_reduction_over_group_with_atomics_contig_krn<
+                argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                ReductionIndexerT>;

         cgh.parallel_for<KernelName>(
             sycl::nd_range<1>(globalRange, localRange),
```
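The dedicated `..._contig_krn` name uses the usual SYCL pattern of passing a forward-declared class (template) to `parallel_for`, so the contiguous kernel appears under its own symbol in profiler output such as onetrace. A self-contained sketch of that pattern, with hypothetical names `axpy_demo_krn` and `axpy` (not from the commit), assuming USM device-accessible pointers:

```
// Minimal sketch of the SYCL kernel-naming pattern: a forward-declared class
// template passed to parallel_for becomes the kernel's name, giving a distinct,
// recognizable entry in profiler traces.
#include <cstddef>
#include <sycl/sycl.hpp>

template <typename T> class axpy_demo_krn; // declaration alone is sufficient

template <typename T>
void axpy(sycl::queue &q, T a, const T *x, T *y, std::size_t n)
{
    // x and y are assumed to be USM device-accessible allocations.
    q.submit([&](sycl::handler &cgh) {
         cgh.parallel_for<axpy_demo_krn<T>>(sycl::range<1>{n},
                                            [=](sycl::id<1> i) {
                                                const std::size_t idx = i[0];
                                                y[idx] += a * x[idx];
                                            });
     }).wait();
}
```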
