Improved performance of reduction kernel with atomics

oleksandr-pavlyk · oleksandr-pavlyk · commit a81d9c809cc4 · 2023-08-28T20:33:53.000-05:00
1. The contiguous-implementation kernel gets a dedicated name
   (easier to spot in the output of onetrace)
2. Increase the work-group multiple passed to choose_workgroup_size (from 2 to 4)
3. Change the order in which work-groups tile the array
   from 'along reduction axis moves fastest' to
   'along iteration axis moves fastest'.

This last change contributes a significant performance improvement:

```
================= Before change

In [1]: import dpctl.tensor as dpt

In [2]: x = dpt.reshape(dpt.asarray(1, dtype="f4")/dpt.square(dpt.arange(1, 1282200*128 + 1, dtype="f4")), (1282200, 128))

In [3]: %time y = dpt.sum(x, axis=0)
CPU times: user 309 ms, sys: 128 ms, total: 437 ms
Wall time: 473 ms

In [4]: %time y = dpt.sum(x, axis=0)
CPU times: user 132 ms, sys: 160 ms, total: 292 ms
Wall time: 316 ms

In [5]: %time y = dpt.sum(x, axis=0)
CPU times: user 104 ms, sys: 185 ms, total: 289 ms
Wall time: 312 ms
```

```
===== After change

In [1]: import dpctl.tensor as dpt

In [2]: x = dpt.reshape(dpt.asarray(1, dtype="f4")/dpt.square(dpt.arange(1, 1282200*128 + 1, dtype="f4")), (1282200, 128))

In [3]: %time y = dpt.sum(x, axis=0)
CPU times: user 150 ms, sys: 32.9 ms, total: 183 ms
Wall time: 198 ms

In [4]: %time y = dpt.sum(x, axis=0)
CPU times: user 20 ms, sys: 22.7 ms, total: 42.7 ms
Wall time: 49.4 ms

In [5]: %time y = dpt.sum(x, axis=0)
CPU times: user 10.2 ms, sys: 28.9 ms, total: 39.1 ms
Wall time: 41.4 ms

In [6]: %time y = dpt.sum(x, axis=0)
CPU times: user 23 ms, sys: 18 ms, total: 41 ms
Wall time: 43.5 ms
```
diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp
@@ -146,9 +146,9 @@ struct ReductionOverGroupWithAtomicFunctor
 
     void operator()(sycl::nd_item<1> it) const
     {
-        const size_t red_gws_ = it.get_global_range(0) / iter_gws_;
-        const size_t iter_gid = it.get_global_id(0) / red_gws_;
-        const size_t reduction_batch_id = get_reduction_batch_id(it);
+        const size_t iter_gid = it.get_group(0) % iter_gws_;
+        const size_t reduction_batch_id = it.get_group(0) / iter_gws_;
+
         const size_t reduction_lid = it.get_local_id(0);
         const size_t wg = it.get_local_range(0); //   0 <= reduction_lid < wg
 
@@ -204,14 +204,6 @@ struct ReductionOverGroupWithAtomicFunctor
             }
         }
     }
-
-private:
-    size_t get_reduction_batch_id(sycl::nd_item<1> const &it) const
-    {
-        const size_t n_reduction_groups = it.get_group_range(0) / iter_gws_;
-        const size_t reduction_batch_id = it.get_group(0) % n_reduction_groups;
-        return reduction_batch_id;
-    }
 };
 
 typedef sycl::event (*sum_reduction_strided_impl_fn_ptr)(
@@ -241,6 +233,9 @@ class sum_reduction_seq_strided_krn;
 template <typename T1, typename T2, typename T3, typename T4, typename T5>
 class sum_reduction_seq_contig_krn;
 
+template <typename T1, typename T2, typename T3, typename T4, typename T5>
+class sum_reduction_over_group_with_atomics_contig_krn;
+
 using dpctl::tensor::sycl_utils::choose_workgroup_size;
 
 template <typename argTy, typename resTy>
@@ -417,7 +412,7 @@ sycl::event sum_reduction_over_group_with_atomics_contig_impl(
 
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
-    size_t wg = choose_workgroup_size<2>(reduction_nelems, sg_sizes);
+    size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
 
     if (reduction_nelems < wg) {
         sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) {
@@ -499,9 +494,10 @@ sycl::event sum_reduction_over_group_with_atomics_contig_impl(
                 sycl::range<1>{iter_nelems * reduction_groups * wg};
             auto localRange = sycl::range<1>{wg};
 
-            using KernelName = class sum_reduction_over_group_with_atomics_krn<
-                argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
-                ReductionIndexerT>;
+            using KernelName =
+                class sum_reduction_over_group_with_atomics_contig_krn<
+                    argTy, resTy, ReductionOpT, InputOutputIterIndexerT,
+                    ReductionIndexerT>;
 
             cgh.parallel_for<KernelName>(
                 sycl::nd_range<1>(globalRange, localRange),