Merge pull request #1473 from IntelPython/dpctl_square

npolina4 · web-flow · commit 457fb55f8f6a · 2023-07-10T12:18:23.000-07:00
Improve dpnp.square() implementation
diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
@@ -23,6 +23,7 @@ env:
       test_umath.py
       test_usm_type.py
       third_party/cupy/math_tests/test_explog.py
+      third_party/cupy/math_tests/test_misc.py
       third_party/cupy/math_tests/test_trigonometric.py
       third_party/cupy/sorting_tests/test_sort.py
   VER_JSON_NAME: 'version.json'
diff --git a/dpnp/backend/extensions/vm/sqr.hpp b/dpnp/backend/extensions/vm/sqr.hpp
@@ -0,0 +1,78 @@
+//*****************************************************************************
+// Copyright (c) 2023, Intel Corporation
+// All rights reserved.
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are met:
+// - Redistributions of source code must retain the above copyright notice,
+//   this list of conditions and the following disclaimer.
+// - Redistributions in binary form must reproduce the above copyright notice,
+//   this list of conditions and the following disclaimer in the documentation
+//   and/or other materials provided with the distribution.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+// THE POSSIBILITY OF SUCH DAMAGE.
+//*****************************************************************************
+
+#pragma once
+
+#include <CL/sycl.hpp>
+
+#include "common.hpp"
+#include "types_matrix.hpp"
+
+namespace dpnp
+{
+namespace backend
+{
+namespace ext
+{
+namespace vm
+{
+template <typename T>
+sycl::event sqr_contig_impl(sycl::queue exec_q,
+                            const std::int64_t n,
+                            const char *in_a,
+                            char *out_y,
+                            const std::vector<sycl::event> &depends)
+{
+    type_utils::validate_type_for_device<T>(exec_q);
+
+    const T *a = reinterpret_cast<const T *>(in_a);
+    T *y = reinterpret_cast<T *>(out_y);
+
+    return mkl_vm::sqr(exec_q,
+                       n, // number of elements to be calculated
+                       a, // pointer `a` containing input vector of size n
+                       y, // pointer `y` to the output vector of size n
+                       depends);
+}
+
+template <typename fnT, typename T>
+struct SqrContigFactory
+{
+    fnT get()
+    {
+        if constexpr (std::is_same_v<
+                          typename types::SqrOutputType<T>::value_type, void>)
+        {
+            return nullptr;
+        }
+        else {
+            return sqr_contig_impl<T>;
+        }
+    }
+};
+} // namespace vm
+} // namespace ext
+} // namespace backend
+} // namespace dpnp
diff --git a/dpnp/backend/extensions/vm/types_matrix.hpp b/dpnp/backend/extensions/vm/types_matrix.hpp
@@ -125,6 +125,21 @@ struct SinOutputType
         dpctl_td_ns::DefaultResultEntry<void>>::result_type;
 };
 
+/**
+ * @brief A factory to define pairs of supported types for which
+ * MKL VM library provides support in oneapi::mkl::vm::sqr<T> function.
+ *
+ * @tparam T Type of input vector `a` and of result vector `y`.
+ */
+template <typename T>
+struct SqrOutputType
+{
+    using value_type = typename std::disjunction<
+        dpctl_td_ns::TypeMapResultEntry<T, double, double>,
+        dpctl_td_ns::TypeMapResultEntry<T, float, float>,
+        dpctl_td_ns::DefaultResultEntry<void>>::result_type;
+};
+
 /**
  * @brief A factory to define pairs of supported types for which
  * MKL VM library provides support in oneapi::mkl::vm::sqrt<T> function.
diff --git a/dpnp/backend/extensions/vm/vm_py.cpp b/dpnp/backend/extensions/vm/vm_py.cpp
@@ -35,6 +35,7 @@
 #include "div.hpp"
 #include "ln.hpp"
 #include "sin.hpp"
+#include "sqr.hpp"
 #include "sqrt.hpp"
 #include "types_matrix.hpp"
 
@@ -49,6 +50,7 @@ static binary_impl_fn_ptr_t div_dispatch_vector[dpctl_td_ns::num_types];
 static unary_impl_fn_ptr_t cos_dispatch_vector[dpctl_td_ns::num_types];
 static unary_impl_fn_ptr_t ln_dispatch_vector[dpctl_td_ns::num_types];
 static unary_impl_fn_ptr_t sin_dispatch_vector[dpctl_td_ns::num_types];
+static unary_impl_fn_ptr_t sqr_dispatch_vector[dpctl_td_ns::num_types];
 static unary_impl_fn_ptr_t sqrt_dispatch_vector[dpctl_td_ns::num_types];
 
 PYBIND11_MODULE(_vm_impl, m)
@@ -170,6 +172,35 @@ PYBIND11_MODULE(_vm_impl, m)
               py::arg("sycl_queue"), py::arg("src"), py::arg("dst"));
     }
 
+    // UnaryUfunc: ==== Sqr(x) ====
+    {
+        vm_ext::init_ufunc_dispatch_vector<unary_impl_fn_ptr_t,
+                                           vm_ext::SqrContigFactory>(
+            sqr_dispatch_vector);
+
+        auto sqr_pyapi = [&](sycl::queue exec_q, arrayT src, arrayT dst,
+                             const event_vecT &depends = {}) {
+            return vm_ext::unary_ufunc(exec_q, src, dst, depends,
+                                       sqr_dispatch_vector);
+        };
+        m.def(
+            "_sqr", sqr_pyapi,
+            "Call `sqr` from OneMKL VM library to performs element by element "
+            "operation of squaring of vector `src` to resulting vector `dst`",
+            py::arg("sycl_queue"), py::arg("src"), py::arg("dst"),
+            py::arg("depends") = py::list());
+
+        auto sqr_need_to_call_pyapi = [&](sycl::queue exec_q, arrayT src,
+                                          arrayT dst) {
+            return vm_ext::need_to_call_unary_ufunc(exec_q, src, dst,
+                                                    sqr_dispatch_vector);
+        };
+        m.def("_mkl_sqr_to_call", sqr_need_to_call_pyapi,
+              "Check input arguments to answer if `sqr` function from "
+              "OneMKL VM library can be used",
+              py::arg("sycl_queue"), py::arg("src"), py::arg("dst"));
+    }
+
     // UnaryUfunc: ==== Sqrt(x) ====
     {
         vm_ext::init_ufunc_dispatch_vector<unary_impl_fn_ptr_t,
diff --git a/dpnp/backend/include/dpnp_iface_fptr.hpp b/dpnp/backend/include/dpnp_iface_fptr.hpp
@@ -476,9 +476,7 @@ enum class DPNPFuncName : size_t
     DPNP_FN_SQRT_EXT, /**< Used in numpy.sqrt() impl, requires extra parameters
                        */
     DPNP_FN_SQUARE,   /**< Used in numpy.square() impl  */
-    DPNP_FN_SQUARE_EXT, /**< Used in numpy.square() impl, requires extra
-                           parameters */
-    DPNP_FN_STD,        /**< Used in numpy.std() impl  */
+    DPNP_FN_STD,      /**< Used in numpy.std() impl  */
     DPNP_FN_STD_EXT, /**< Used in numpy.std() impl, requires extra parameters */
     DPNP_FN_SUBTRACT,     /**< Used in numpy.subtract() impl  */
     DPNP_FN_SUBTRACT_EXT, /**< Used in numpy.subtract() impl, requires extra
diff --git a/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp b/dpnp/backend/kernels/dpnp_krnl_elemwise.cpp
@@ -1156,15 +1156,6 @@ static void func_map_init_elemwise_1arg_1type(func_map_t &fmap)
     fmap[DPNPFuncName::DPNP_FN_SQUARE][eft_DBL][eft_DBL] = {
         eft_DBL, (void *)dpnp_square_c_default<double>};
 
-    fmap[DPNPFuncName::DPNP_FN_SQUARE_EXT][eft_INT][eft_INT] = {
-        eft_INT, (void *)dpnp_square_c_ext<int32_t>};
-    fmap[DPNPFuncName::DPNP_FN_SQUARE_EXT][eft_LNG][eft_LNG] = {
-        eft_LNG, (void *)dpnp_square_c_ext<int64_t>};
-    fmap[DPNPFuncName::DPNP_FN_SQUARE_EXT][eft_FLT][eft_FLT] = {
-        eft_FLT, (void *)dpnp_square_c_ext<float>};
-    fmap[DPNPFuncName::DPNP_FN_SQUARE_EXT][eft_DBL][eft_DBL] = {
-        eft_DBL, (void *)dpnp_square_c_ext<double>};
-
     return;
 }
 
diff --git a/dpnp/dpnp_algo/dpnp_algo.pxd b/dpnp/dpnp_algo/dpnp_algo.pxd
@@ -291,8 +291,6 @@ cdef extern from "dpnp_iface_fptr.hpp" namespace "DPNPFuncName":  # need this na
         DPNP_FN_SINH_EXT
         DPNP_FN_SORT
         DPNP_FN_SORT_EXT
-        DPNP_FN_SQUARE
-        DPNP_FN_SQUARE_EXT
         DPNP_FN_STD
         DPNP_FN_STD_EXT
         DPNP_FN_SUM
@@ -543,6 +541,5 @@ cpdef dpnp_descriptor dpnp_log2(dpnp_descriptor array1)
 cpdef dpnp_descriptor dpnp_radians(dpnp_descriptor array1)
 cpdef dpnp_descriptor dpnp_recip(dpnp_descriptor array1)
 cpdef dpnp_descriptor dpnp_sinh(dpnp_descriptor array1)
-cpdef dpnp_descriptor dpnp_square(dpnp_descriptor array1)
 cpdef dpnp_descriptor dpnp_tan(dpnp_descriptor array1, dpnp_descriptor out)
 cpdef dpnp_descriptor dpnp_tanh(dpnp_descriptor array1)
diff --git a/dpnp/dpnp_algo/dpnp_algo_trigonometric.pxi b/dpnp/dpnp_algo/dpnp_algo_trigonometric.pxi
@@ -54,7 +54,6 @@ __all__ += [
     'dpnp_radians',
     'dpnp_recip',
     'dpnp_sinh',
-    'dpnp_square',
     'dpnp_tan',
     'dpnp_tanh',
     'dpnp_unwrap'
@@ -133,10 +132,6 @@ cpdef utils.dpnp_descriptor dpnp_sinh(utils.dpnp_descriptor x1):
     return call_fptr_1in_1out_strides(DPNP_FN_SINH_EXT, x1)
 
 
-cpdef utils.dpnp_descriptor dpnp_square(utils.dpnp_descriptor x1):
-    return call_fptr_1in_1out_strides(DPNP_FN_SQUARE_EXT, x1)
-
-
 cpdef utils.dpnp_descriptor dpnp_tan(utils.dpnp_descriptor x1, utils.dpnp_descriptor out):
     return call_fptr_1in_1out_strides(DPNP_FN_TAN_EXT, x1, dtype=None, out=out, where=True, func_name='tan')
 
diff --git a/dpnp/dpnp_algo/dpnp_elementwise_common.py b/dpnp/dpnp_algo/dpnp_elementwise_common.py
@@ -59,6 +59,7 @@
     "dpnp_not_equal",
     "dpnp_sin",
     "dpnp_sqrt",
+    "dpnp_square",
     "dpnp_subtract",
 ]
 
@@ -900,6 +901,57 @@ def _call_sqrt(src, dst, sycl_queue, depends=None):
     return dpnp_array._create_from_usm_ndarray(res_usm)
 
 
+_square_docstring_ = """
+square(x, out=None, order='K')
+Computes `x_i**2` (or `x_i*x_i`) for each element `x_i` of input array `x`.
+Args:
+    x (dpnp.ndarray):
+        Input array.
+    out ({None, dpnp.ndarray}, optional):
+        Output array to populate. Array must have the correct
+        shape and the expected data type.
+    order ("C","F","A","K", optional): memory layout of the new
+        output array, if parameter `out` is `None`.
+        Default: "K".
+Return:
+    dpnp.ndarray:
+        An array containing the element-wise square results.
+"""
+
+
+def dpnp_square(x, out=None, order="K"):
+    """
+    Invokes sqr() function from pybind11 extension of OneMKL VM if possible.
+
+    Otherwise fully relies on dpctl.tensor implementation for square() function.
+
+    """
+
+    def _call_square(src, dst, sycl_queue, depends=None):
+        """A callback to register in UnaryElementwiseFunc class of dpctl.tensor"""
+
+        if depends is None:
+            depends = []
+
+        if vmi._mkl_sqr_to_call(sycl_queue, src, dst):
+            # call pybind11 extension for sqr() function from OneMKL VM
+            return vmi._sqr(sycl_queue, src, dst, depends)
+        return ti._square(src, dst, sycl_queue, depends)
+
+    # dpctl.tensor only works with usm_ndarray or scalar
+    x_usm = dpnp.get_usm_ndarray(x)
+    out_usm = None if out is None else dpnp.get_usm_ndarray(out)
+
+    func = UnaryElementwiseFunc(
+        "square",
+        ti._square_result_type,
+        _call_square,
+        _square_docstring_,
+    )
+    res_usm = func(x_usm, out=out_usm, order=order)
+    return dpnp_array._create_from_usm_ndarray(res_usm)
+
+
 _subtract_docstring_ = """
 subtract(x1, x2, out=None, order="K")
 
diff --git a/dpnp/dpnp_iface_trigonometric.py b/dpnp/dpnp_iface_trigonometric.py
@@ -52,6 +52,7 @@
     dpnp_log,
     dpnp_sin,
     dpnp_sqrt,
+    dpnp_square,
 )
 
 __all__ = [
@@ -1108,19 +1109,40 @@ def sqrt(
     )
 
 
-def square(x1):
+def square(
+    x,
+    /,
+    out=None,
+    *,
+    order="K",
+    where=True,
+    dtype=None,
+    subok=True,
+    **kwargs,
+):
     """
     Return the element-wise square of the input.
 
     For full documentation refer to :obj:`numpy.square`.
 
+    Returns
+    -------
+    y : dpnp.ndarray
+        Element-wise `x * x`, of the same shape and dtype as `x`.
+
     Limitations
     -----------
-    Input array is supported as :obj:`dpnp.ndarray`.
+    Input array is supported as either :class:`dpnp.ndarray` or :class:`dpctl.tensor.usm_ndarray`.
+    Parameter `out` is supported as :class:`dpnp.ndarray`, :class:`dpctl.tensor.usm_ndarray` or
+    with default value ``None``.
+    Parameters `where`, `dtype` and `subok` are supported with their default values.
+    Otherwise the function will be executed sequentially on CPU.
     Input array data types are limited by supported DPNP :ref:`Data types`.
 
     See Also
     --------
+    :obj:`dpnp.linalg.matrix_power` : Raise a square matrix
+                                       to the (integer) power `n`.
     :obj:`dpnp.sqrt` : Return the positive square-root of an array,
                        element-wise.
     :obj:`dpnp.power` : First array elements raised to powers
@@ -1129,20 +1151,23 @@ def square(x1):
     Examples
     --------
     >>> import dpnp as np
-    >>> x = np.array([1, 2, 3])
-    >>> out = np.square(x)
-    >>> [i for i in out]
-    [1, 4, 9]
+    >>> x = np.array([-1j, 1])
+    >>> np.square(x)
+    array([-1.+0.j,  1.+0.j])
 
     """
 
-    x1_desc = dpnp.get_dpnp_descriptor(
-        x1, copy_when_strides=False, copy_when_nondefault_queue=False
+    return check_nd_call_func(
+        numpy.square,
+        dpnp_square,
+        x,
+        out=out,
+        where=where,
+        order=order,
+        dtype=dtype,
+        subok=subok,
+        **kwargs,
     )
-    if x1_desc:
-        return dpnp_square(x1_desc).get_pyobj()
-
-    return call_origin(numpy.square, x1, **kwargs)
 
 
 def tan(x1, out=None, **kwargs):
diff --git a/tests/skipped_tests.tbl b/tests/skipped_tests.tbl
diff --git a/tests/skipped_tests_gpu.tbl b/tests/skipped_tests_gpu.tbl
diff --git a/tests/test_umath.py b/tests/test_umath.py
diff --git a/tests/third_party/cupy/math_tests/test_misc.py b/tests/third_party/cupy/math_tests/test_misc.py