Added support of dpnp.allclose() for a device without fp64 aspect

antonwolfy · antonwolfy · commit 2ac7f3921b78 · 2023-08-24T17:51:33.000+02:00
diff --git a/dpnp/backend/kernels/dpnp_krnl_logic.cpp b/dpnp/backend/kernels/dpnp_krnl_logic.cpp
@@ -74,7 +74,7 @@ DPCTLSyclEventRef dpnp_all_c(DPCTLSyclQueueRef q_ref,
     sycl::nd_range<1> gws(gws_range, lws_range);
 
     auto kernel_parallel_for_func = [=](sycl::nd_item<1> nd_it) {
-        auto gr = nd_it.get_group();
+        auto gr = nd_it.get_sub_group();
         const auto max_gr_size = gr.get_max_local_range()[0];
         const size_t start =
             vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) +
@@ -127,8 +127,72 @@ DPCTLSyclEventRef (*dpnp_all_ext_c)(DPCTLSyclQueueRef,
                                     const DPCTLEventVectorRef) =
     dpnp_all_c<_DataType, _ResultType>;
 
-template <typename _DataType1, typename _DataType2, typename _ResultType>
-class dpnp_allclose_c_kernel;
+template <typename _DataType1, typename _DataType2, typename _TolType>
+class dpnp_allclose_kernel;
+
+template <typename _DataType1, typename _DataType2, typename _TolType>
+static sycl::event dpnp_allclose(sycl::queue &q,
+                                 const _DataType1 *array1,
+                                 const _DataType2 *array2,
+                                 bool *result,
+                                 const size_t size,
+                                 const _TolType rtol_val,
+                                 const _TolType atol_val)
+{
+    sycl::event fill_event = q.fill(result, true, 1);
+    if (!size) {
+        return fill_event;
+    }
+
+    constexpr size_t lws = 64;
+    constexpr size_t vec_sz = 8;
+
+    auto gws_range =
+        sycl::range<1>(((size + lws * vec_sz - 1) / (lws * vec_sz)) * lws);
+    auto lws_range = sycl::range<1>(lws);
+    sycl::nd_range<1> gws(gws_range, lws_range);
+
+    auto kernel_parallel_for_func = [=](sycl::nd_item<1> nd_it) {
+        auto gr = nd_it.get_sub_group();
+        const auto max_gr_size = gr.get_max_local_range()[0];
+        const auto gr_size = gr.get_local_linear_range();
+        const size_t start =
+            vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) +
+                      gr.get_group_linear_id() * max_gr_size);
+        const size_t end = sycl::min(start + vec_sz * gr_size, size);
+
+        // each work-item iterates over "vec_sz" elements in the input arrays
+        bool partial = true;
+
+        for (size_t i = start + gr.get_local_linear_id(); i < end; i += gr_size)
+        {
+            if constexpr (std::is_floating_point_v<_DataType1> &&
+                          std::is_floating_point_v<_DataType2>)
+            {
+                if (std::isinf(array1[i]) || std::isinf(array2[i])) {
+                    partial &= (array1[i] == array2[i]);
+                    continue;
+                }
+            }
+            partial &= (std::abs(array1[i] - array2[i]) <=
+                        (atol_val + rtol_val * std::abs(array2[i])));
+        }
+        partial = sycl::all_of_group(gr, partial);
+
+        if (gr.leader() && (partial == false)) {
+            result[0] = false;
+        }
+    };
+
+    auto kernel_func = [&](sycl::handler &cgh) {
+        cgh.depends_on(fill_event);
+        cgh.parallel_for<
+            class dpnp_allclose_kernel<_DataType1, _DataType2, _TolType>>(
+            gws, kernel_parallel_for_func);
+    };
+
+    return q.submit(kernel_func);
+}
 
 template <typename _DataType1, typename _DataType2, typename _ResultType>
 DPCTLSyclEventRef dpnp_allclose_c(DPCTLSyclQueueRef q_ref,
@@ -140,6 +204,9 @@ DPCTLSyclEventRef dpnp_allclose_c(DPCTLSyclQueueRef q_ref,
                                   double atol_val,
                                   const DPCTLEventVectorRef dep_event_vec_ref)
 {
+    static_assert(std::is_same_v<_ResultType, bool>,
+                  "Boolean result type is required");
+
     // avoid warning unused variable
     (void)dep_event_vec_ref;
 
@@ -152,40 +219,21 @@ DPCTLSyclEventRef dpnp_allclose_c(DPCTLSyclQueueRef q_ref,
     sycl::queue q = *(reinterpret_cast<sycl::queue *>(q_ref));
     sycl::event event;
 
-    DPNPC_ptr_adapter<_DataType1> input1_ptr(q_ref, array1_in, size);
-    DPNPC_ptr_adapter<_DataType2> input2_ptr(q_ref, array2_in, size);
-    DPNPC_ptr_adapter<_ResultType> result1_ptr(q_ref, result1, 1, true, true);
-    const _DataType1 *array1 = input1_ptr.get_ptr();
-    const _DataType2 *array2 = input2_ptr.get_ptr();
-    _ResultType *result = result1_ptr.get_ptr();
-
-    result[0] = true;
+    const _DataType1 *array1 = static_cast<const _DataType1 *>(array1_in);
+    const _DataType2 *array2 = static_cast<const _DataType2 *>(array2_in);
+    bool *result = static_cast<bool *>(result1);
 
-    if (!size) {
-        return event_ref;
+    if (q.get_device().has(sycl::aspect::fp64)) {
+        event =
+            dpnp_allclose(q, array1, array2, result, size, rtol_val, atol_val);
+    }
+    else {
+        float rtol = static_cast<float>(rtol_val);
+        float atol = static_cast<float>(atol_val);
+        event = dpnp_allclose(q, array1, array2, result, size, rtol, atol);
     }
-
-    sycl::range<1> gws(size);
-    auto kernel_parallel_for_func = [=](sycl::id<1> global_id) {
-        size_t i = global_id[0];
-
-        if (std::abs(array1[i] - array2[i]) >
-            (atol_val + rtol_val * std::abs(array2[i])))
-        {
-            result[0] = false;
-        }
-    };
-
-    auto kernel_func = [&](sycl::handler &cgh) {
-        cgh.parallel_for<
-            class dpnp_allclose_c_kernel<_DataType1, _DataType2, _ResultType>>(
-            gws, kernel_parallel_for_func);
-    };
-
-    event = q.submit(kernel_func);
 
     event_ref = reinterpret_cast<DPCTLSyclEventRef>(&event);
-
     return DPCTLEvent_Copy(event_ref);
 }
 
@@ -269,7 +317,7 @@ DPCTLSyclEventRef dpnp_any_c(DPCTLSyclQueueRef q_ref,
     sycl::nd_range<1> gws(gws_range, lws_range);
 
     auto kernel_parallel_for_func = [=](sycl::nd_item<1> nd_it) {
-        auto gr = nd_it.get_group();
+        auto gr = nd_it.get_sub_group();
         const auto max_gr_size = gr.get_max_local_range()[0];
         const size_t start =
             vec_sz * (nd_it.get_group(0) * nd_it.get_local_range(0) +
diff --git a/dpnp/dpnp_iface_logic.py b/dpnp/dpnp_iface_logic.py
@@ -152,42 +152,74 @@ def all(x, /, axis=None, out=None, keepdims=False, *, where=True):
     )
 
 
-def allclose(x1, x2, rtol=1.0e-5, atol=1.0e-8, **kwargs):
+def allclose(a, b, rtol=1.0e-5, atol=1.0e-8, **kwargs):
     """
     Returns True if two arrays are element-wise equal within a tolerance.
 
     For full documentation refer to :obj:`numpy.allclose`.
 
+    Returns
+    -------
+    out : dpnp.ndarray
+        A boolean 0-dim array. If its value is ``True``,
+        two arrays are element-wise equal within a tolerance.
+
     Limitations
     -----------
-    Parameters `x1` and `x2` are supported as either :class:`dpnp.ndarray` or scalar.
+    Parameters `a` and `b` are supported either as :class:`dpnp.ndarray`,
+    :class:`dpctl.tensor.usm_ndarray` or scalars, but both `a` and `b`
+    cannot be scalars at the same time.
     Keyword argument `kwargs` is currently unsupported.
     Otherwise the functions will be executed sequentially on CPU.
-    Input array data types are limited by supported DPNP :ref:`Data types`.
+    Parameters `rtol` and `atol` are supported as scalars. Otherwise
+    ``TypeError`` exception will be raised.
+    Input array data types are limited by supported integer and
+    floating DPNP :ref:`Data types`.
+
+    See Also
+    --------
+    :obj:`dpnp.isclose` : Test whether two arrays are element-wise equal.
+    :obj:`dpnp.all` : Test whether all elements evaluate to True.
+    :obj:`dpnp.any` : Test whether any element evaluates to True.
+    :obj:`dpnp.equal` : Return (x1 == x2) element-wise.
 
     Examples
     --------
     >>> import dpnp as np
-    >>> np.allclose([1e10,1e-7], [1.00001e10,1e-8])
-    >>> False
+    >>> np.allclose(np.array([1e10, 1e-7]), np.array([1.00001e10, 1e-8]))
+    array([False])
+    >>> np.allclose(np.array([1.0, np.nan]), np.array([1.0, np.nan]))
+    array([False])
+    >>> np.allclose(np.array([1.0, np.inf]), np.array([1.0, np.inf]))
+    array([ True])
 
     """
 
-    rtol_is_scalar = dpnp.isscalar(rtol)
-    atol_is_scalar = dpnp.isscalar(atol)
-    x1_desc = dpnp.get_dpnp_descriptor(x1, copy_when_nondefault_queue=False)
-    x2_desc = dpnp.get_dpnp_descriptor(x2, copy_when_nondefault_queue=False)
-
-    if x1_desc and x2_desc and not kwargs:
-        if not rtol_is_scalar or not atol_is_scalar:
-            pass
-        else:
-            result_obj = dpnp_allclose(x1_desc, x2_desc, rtol, atol).get_pyobj()
-            result = dpnp.convert_single_elem_array_to_scalar(result_obj)
+    if dpnp.isscalar(a) and dpnp.isscalar(b):
+        # at least one of inputs has to be an array
+        pass
+    elif kwargs:
+        pass
+    else:
+        if not dpnp.isscalar(rtol):
+            raise TypeError(
+                "An argument `rtol` must be a scalar, but got {}".format(
+                    type(rtol)
+                )
+            )
+        elif not dpnp.isscalar(atol):
+            raise TypeError(
+                "An argument `atol` must be a scalar, but got {}".format(
+                    type(atol)
+                )
+            )
 
-            return result
+        a_desc = dpnp.get_dpnp_descriptor(a, copy_when_nondefault_queue=False)
+        b_desc = dpnp.get_dpnp_descriptor(b, copy_when_nondefault_queue=False)
+        if a_desc and b_desc:
+            return dpnp_allclose(a_desc, b_desc, rtol, atol).get_pyobj()
 
-    return call_origin(numpy.allclose, x1, x2, rtol=rtol, atol=atol, **kwargs)
+    return call_origin(numpy.allclose, a, b, rtol=rtol, atol=atol, **kwargs)
 
 
 def any(x, /, axis=None, out=None, keepdims=False, *, where=True):
diff --git a/tests/skipped_tests.tbl b/tests/skipped_tests.tbl
@@ -452,11 +452,7 @@ tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transpose
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot_with_int_axes
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_tensordot_with_list_axes
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_reversed_vdot
-tests/third_party/cupy/logic_tests/test_comparison.py::TestAllclose::test_allclose_array_scalar
-tests/third_party/cupy/logic_tests/test_comparison.py::TestAllclose::test_allclose_finite
-tests/third_party/cupy/logic_tests/test_comparison.py::TestAllclose::test_allclose_infinite
-tests/third_party/cupy/logic_tests/test_comparison.py::TestAllclose::test_allclose_infinite_equal_nan
-tests/third_party/cupy/logic_tests/test_comparison.py::TestAllclose::test_allclose_min_int
+
 tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_broadcast_not_allowed
 tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_diff_dtypes_is_equal
 tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_diff_dtypes_not_equal
diff --git a/tests/skipped_tests_gpu.tbl b/tests/skipped_tests_gpu.tbl
@@ -598,11 +598,7 @@ tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transpose
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_tensordot_zero_dim
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_dot_with_out_f_contiguous
 tests/third_party/cupy/linalg_tests/test_product.py::TestProduct::test_transposed_multidim_vdot
-tests/third_party/cupy/logic_tests/test_comparison.py::TestAllclose::test_allclose_array_scalar
-tests/third_party/cupy/logic_tests/test_comparison.py::TestAllclose::test_allclose_finite
-tests/third_party/cupy/logic_tests/test_comparison.py::TestAllclose::test_allclose_infinite
-tests/third_party/cupy/logic_tests/test_comparison.py::TestAllclose::test_allclose_infinite_equal_nan
-tests/third_party/cupy/logic_tests/test_comparison.py::TestAllclose::test_allclose_min_int
+
 tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_broadcast_not_allowed
 tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_diff_dtypes_is_equal
 tests/third_party/cupy/logic_tests/test_comparison.py::TestArrayEqual::test_array_equal_diff_dtypes_not_equal
diff --git a/tests/test_logic.py b/tests/test_logic.py
@@ -44,9 +44,6 @@ def test_all(type, shape):
         assert_allclose(dpnp_res, np_res)
 
 
-@pytest.mark.skipif(
-    not has_support_aspect64(), reason="Aborted on Iris Xe: SAT-5988"
-)
 @pytest.mark.parametrize("type", get_all_dtypes(no_bool=True, no_complex=True))
 def test_allclose(type):
     a = numpy.random.rand(10)
diff --git a/tests/third_party/cupy/logic_tests/test_comparison.py b/tests/third_party/cupy/logic_tests/test_comparison.py
@@ -121,15 +121,15 @@ class TestAllclose(unittest.TestCase):
     @testing.for_all_dtypes()
     @testing.numpy_cupy_equal()
     def test_allclose_finite(self, xp, dtype):
-        a = xp.array([0.9e-5, 1.1e-5, 1000 + 1e-4, 1000 - 1e-4], dtype=dtype)
-        b = xp.array([0, 0, 1000, 1000], dtype=dtype)
+        a = xp.array([0.9e-5, 1.1e-5, 1000 + 1e-4, 1000 - 1e-4]).astype(dtype)
+        b = xp.array([0, 0, 1000, 1000]).astype(dtype)
         return xp.allclose(a, b)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_equal()
     def test_allclose_min_int(self, xp, dtype):
-        a = xp.array([0], dtype=dtype)
-        b = xp.array([numpy.iinfo("i").min], dtype=dtype)
+        a = xp.array([0]).astype(dtype)
+        b = xp.array([numpy.iinfo("i").min]).astype(dtype)
         return xp.allclose(a, b)
 
     @testing.for_float_dtypes()
@@ -138,24 +138,25 @@ def test_allclose_infinite(self, xp, dtype):
         nan = float("nan")
         inf = float("inf")
         ninf = float("-inf")
-        a = xp.array([0, nan, nan, 0, inf, ninf], dtype=dtype)
-        b = xp.array([0, nan, 0, nan, inf, ninf], dtype=dtype)
+        a = xp.array([0, nan, nan, 0, inf, ninf]).astype(dtype)
+        b = xp.array([0, nan, 0, nan, inf, ninf]).astype(dtype)
         return xp.allclose(a, b)
 
     @testing.for_float_dtypes()
     @testing.numpy_cupy_equal()
+    @pytest.mark.usefixtures("allow_fall_back_on_numpy")
     def test_allclose_infinite_equal_nan(self, xp, dtype):
         nan = float("nan")
         inf = float("inf")
         ninf = float("-inf")
-        a = xp.array([0, nan, inf, ninf], dtype=dtype)
-        b = xp.array([0, nan, inf, ninf], dtype=dtype)
+        a = xp.array([0, nan, inf, ninf]).astype(dtype)
+        b = xp.array([0, nan, inf, ninf]).astype(dtype)
         return xp.allclose(a, b, equal_nan=True)
 
     @testing.for_all_dtypes()
     @testing.numpy_cupy_equal()
     def test_allclose_array_scalar(self, xp, dtype):
-        a = xp.array([0.9e-5, 1.1e-5], dtype=dtype)
+        a = xp.array([0.9e-5, 1.1e-5]).astype(dtype)
         b = xp.dtype(xp.dtype).type(0)
         return xp.allclose(a, b)