Resolves gh-1456

ndgrigorian · ndgrigorian · commit c26456721b9c · 2023-11-08T01:36:23.000-08:00
Tree reductions now populate the destination with the identity value when
reducing over zero-size axes. As a result, the Python-side logic for handling
zero-size arrays (the ``_identity`` argument) was removed.

``argmax``, ``argmin``, ``max``, and ``min`` still raise an error for
zero-size axes.

Reductions now return a copy when provided an empty axis tuple.

Adds additional supported dtype combinations to ``prod`` and ``sum``, specifically for boolean and integer input types with inexact output types.
diff --git a/dpctl/tensor/_reduction.py b/dpctl/tensor/_reduction.py
@@ -83,7 +83,6 @@ def _reduction_over_axis(
     _reduction_fn,
     _dtype_supported,
     _default_reduction_type_fn,
-    _identity=None,
 ):
     if not isinstance(x, dpt.usm_ndarray):
         raise TypeError(f"Expected dpctl.tensor.usm_ndarray, got {type(x)}")
@@ -106,23 +105,8 @@ def _reduction_over_axis(
         res_dt = _to_device_supported_dtype(res_dt, q.sycl_device)
 
     res_usm_type = x.usm_type
-    if x.size == 0:
-        if _identity is None:
-            raise ValueError("reduction does not support zero-size arrays")
-        else:
-            if keepdims:
-                res_shape = res_shape + (1,) * red_nd
-                inv_perm = sorted(range(nd), key=lambda d: perm[d])
-                res_shape = tuple(res_shape[i] for i in inv_perm)
-            return dpt.full(
-                res_shape,
-                _identity,
-                dtype=res_dt,
-                usm_type=res_usm_type,
-                sycl_queue=q,
-            )
     if red_nd == 0:
-        return dpt.astype(x, res_dt, copy=False)
+        return dpt.astype(x, res_dt, copy=True)
 
     host_tasks_list = []
     if _dtype_supported(inp_dt, res_dt, res_usm_type, q):
@@ -251,7 +235,6 @@ def sum(x, axis=None, dtype=None, keepdims=False):
         tri._sum_over_axis,
         tri._sum_over_axis_dtype_supported,
         _default_reduction_dtype,
-        _identity=0,
     )
 
 
@@ -312,7 +295,6 @@ def prod(x, axis=None, dtype=None, keepdims=False):
         tri._prod_over_axis,
         tri._prod_over_axis_dtype_supported,
         _default_reduction_dtype,
-        _identity=1,
     )
 
 
@@ -368,7 +350,6 @@ def logsumexp(x, axis=None, dtype=None, keepdims=False):
             inp_dt, res_dt
         ),
         _default_reduction_dtype_fp_types,
-        _identity=-dpt.inf,
     )
 
 
@@ -424,7 +405,6 @@ def reduce_hypot(x, axis=None, dtype=None, keepdims=False):
             inp_dt, res_dt
         ),
         _default_reduction_dtype_fp_types,
-        _identity=0,
     )
 
 
@@ -446,9 +426,19 @@ def _comparison_over_axis(x, axis, keepdims, _reduction_fn):
     res_dt = x.dtype
     res_usm_type = x.usm_type
     if x.size == 0:
-        raise ValueError("reduction does not support zero-size arrays")
+        if any([x.shape[i] == 0 for i in axis]):
+            raise ValueError(
+                "reduction cannot be performed over zero-size axes"
+            )
+        else:
+            return dpt.empty(
+                res_shape,
+                dtype=res_dt,
+                usm_type=res_usm_type,
+                sycl_queue=exec_q,
+            )
     if red_nd == 0:
-        return x
+        return dpt.copy(x)
 
     res = dpt.empty(
         res_shape,
@@ -549,7 +539,17 @@ def _search_over_axis(x, axis, keepdims, _reduction_fn):
     res_dt = ti.default_device_index_type(exec_q.sycl_device)
     res_usm_type = x.usm_type
     if x.size == 0:
-        raise ValueError("reduction does not support zero-size arrays")
+        if any([x.shape[i] == 0 for i in axis]):
+            raise ValueError(
+                "reduction cannot be performed over zero-size axes"
+            )
+        else:
+            return dpt.empty(
+                res_shape,
+                dtype=res_dt,
+                usm_type=res_usm_type,
+                sycl_queue=exec_q,
+            )
     if red_nd == 0:
         return dpt.zeros(
             res_shape, dtype=res_dt, usm_type=res_usm_type, sycl_queue=exec_q
diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp
@@ -1009,6 +1009,9 @@ template <typename T1,
           typename T6>
 class custom_reduction_over_group_temps_strided_krn;
 
+template <typename T1, typename T2, typename T3>
+class reduction_over_group_temps_empty_krn;
+
 template <typename T1, typename T2, typename T3, typename T4, typename T5>
 class single_reduction_axis0_temps_contig_krn;
 
@@ -1120,6 +1123,31 @@ sycl::event reduction_over_group_temps_strided_impl(
 
     constexpr resTy identity_val = su_ns::Identity<ReductionOpT, resTy>::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+            using IndexerT =
+                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+
+            const py::ssize_t *const &res_shape = iter_shape_and_strides;
+            const py::ssize_t *const &res_strides =
+                iter_shape_and_strides + 2 * iter_nd;
+            IndexerT res_indexer(iter_nd, iter_res_offset, res_shape,
+                                 res_strides);
+            using InitKernelName =
+                class reduction_over_group_temps_empty_krn<resTy, argTy,
+                                                           ReductionOpT>;
+            cgh.depends_on(depends);
+
+            cgh.parallel_for<InitKernelName>(
+                sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
+                    auto res_offset = res_indexer(id[0]);
+                    res_tp[res_offset] = identity_val;
+                });
+        });
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
@@ -1244,7 +1272,7 @@ sycl::event reduction_over_group_temps_strided_impl(
         resTy *partially_reduced_tmp2 = nullptr;
 
         if (partially_reduced_tmp == nullptr) {
-            throw std::runtime_error("Unabled to allocate device_memory");
+            throw std::runtime_error("Unable to allocate device_memory");
         }
         else {
             partially_reduced_tmp2 =
@@ -1501,6 +1529,13 @@ sycl::event reduction_axis1_over_group_temps_contig_impl(
 
     constexpr resTy identity_val = su_ns::Identity<ReductionOpT, resTy>::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
@@ -1632,7 +1667,7 @@ sycl::event reduction_axis1_over_group_temps_contig_impl(
         resTy *partially_reduced_tmp2 = nullptr;
 
         if (partially_reduced_tmp == nullptr) {
-            throw std::runtime_error("Unabled to allocate device_memory");
+            throw std::runtime_error("Unable to allocate device_memory");
         }
         else {
             partially_reduced_tmp2 =
@@ -1879,6 +1914,13 @@ sycl::event reduction_axis0_over_group_temps_contig_impl(
 
     constexpr resTy identity_val = su_ns::Identity<ReductionOpT, resTy>::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
@@ -2015,7 +2057,7 @@ sycl::event reduction_axis0_over_group_temps_contig_impl(
         resTy *partially_reduced_tmp2 = nullptr;
 
         if (partially_reduced_tmp == nullptr) {
-            throw std::runtime_error("Unabled to allocate device_memory");
+            throw std::runtime_error("Unable to allocate device_memory");
         }
         else {
             partially_reduced_tmp2 =
@@ -2712,12 +2754,16 @@ struct TypePairSupportDataForSumReductionTemps
         td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint32_t>,
         td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
         td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, double>,
 
         // input int8_t
         td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
         td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int16_t>,
         td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int32_t>,
         td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, double>,
 
         // input uint8_t
         td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
@@ -2727,32 +2773,44 @@ struct TypePairSupportDataForSumReductionTemps
         td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint32_t>,
         td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
         td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, double>,
 
         // input int16_t
         td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
         td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int32_t>,
         td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, double>,
 
         // input uint16_t
         td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint16_t>,
         td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int32_t>,
         td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint32_t>,
         td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
         td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, double>,
 
         // input int32_t
         td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
         td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, double>,
 
         // input uint32_t
         td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
         td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, double>,
 
         // input int64_t
         td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, double>,
 
-        // input uint32_t
+        // input uint64_t
         td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, double>,
 
         // input half
         td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
@@ -2967,12 +3025,16 @@ struct TypePairSupportDataForProductReductionTemps
         td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint32_t>,
         td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::int64_t>,
         td_ns::TypePairDefinedEntry<argTy, bool, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, bool, outTy, double>,
 
         // input int8_t
         td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int8_t>,
         td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int16_t>,
         td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int32_t>,
         td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int8_t, outTy, double>,
 
         // input uint8_t
         td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint8_t>,
@@ -2982,32 +3044,44 @@ struct TypePairSupportDataForProductReductionTemps
         td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint32_t>,
         td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::int64_t>,
         td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint8_t, outTy, double>,
 
         // input int16_t
         td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int16_t>,
         td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int32_t>,
         td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int16_t, outTy, double>,
 
         // input uint16_t
         td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint16_t>,
         td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int32_t>,
         td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint32_t>,
         td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::int64_t>,
         td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint16_t, outTy, double>,
 
         // input int32_t
         td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int32_t>,
         td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::int32_t, outTy, double>,
 
         // input uint32_t
         td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint32_t>,
         td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, float>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint32_t, outTy, double>,
 
         // input int64_t
         td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, std::int64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::int64_t, outTy, double>,
 
-        // input uint32_t
+        // input uint64_t
         td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, std::uint64_t>,
+        td_ns::TypePairDefinedEntry<argTy, std::uint64_t, outTy, double>,
 
         // input half
         td_ns::TypePairDefinedEntry<argTy, sycl::half, outTy, sycl::half>,
@@ -3957,6 +4031,8 @@ template <typename T1,
           bool b2>
 class custom_search_over_group_temps_strided_krn;
 
+template <typename T1, typename T2, typename T3> class search_empty_krn;
+
 template <typename T1,
           typename T2,
           typename T3,
@@ -4160,6 +4236,30 @@ sycl::event search_over_group_temps_strided_impl(
     constexpr argTy identity_val = su_ns::Identity<ReductionOpT, argTy>::value;
     constexpr resTy idx_identity_val = su_ns::Identity<IndexOpT, resTy>::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.submit([&](sycl::handler &cgh) {
+            using IndexerT =
+                dpctl::tensor::offset_utils::UnpackedStridedIndexer;
+
+            const py::ssize_t *const &res_shape = iter_shape_and_strides;
+            const py::ssize_t *const &res_strides =
+                iter_shape_and_strides + 2 * iter_nd;
+            IndexerT res_indexer(iter_nd, iter_res_offset, res_shape,
+                                 res_strides);
+            using InitKernelName =
+                class search_empty_krn<resTy, argTy, ReductionOpT>;
+            cgh.depends_on(depends);
+
+            cgh.parallel_for<InitKernelName>(
+                sycl::range<1>(iter_nelems), [=](sycl::id<1> id) {
+                    auto res_offset = res_indexer(id[0]);
+                    res_tp[res_offset] = idx_identity_val;
+                });
+        });
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
@@ -4590,6 +4690,13 @@ sycl::event search_axis1_over_group_temps_contig_impl(
     constexpr argTy identity_val = su_ns::Identity<ReductionOpT, argTy>::value;
     constexpr resTy idx_identity_val = su_ns::Identity<IndexOpT, resTy>::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(idx_identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
@@ -5005,6 +5112,13 @@ sycl::event search_axis0_over_group_temps_contig_impl(
     constexpr argTy identity_val = su_ns::Identity<ReductionOpT, argTy>::value;
     constexpr resTy idx_identity_val = su_ns::Identity<IndexOpT, resTy>::value;
 
+    if (reduction_nelems == 0) {
+        sycl::event res_init_ev = exec_q.fill<resTy>(
+            res_tp, resTy(idx_identity_val), iter_nelems, depends);
+
+        return res_init_ev;
+    }
+
     const sycl::device &d = exec_q.get_device();
     const auto &sg_sizes = d.get_info<sycl::info::device::sub_group_sizes>();
     size_t wg = choose_workgroup_size<4>(reduction_nelems, sg_sizes);
diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp