Implementation of corrcoef (#2139)

AlexanderKalistratov · antonwolfy · web-flow · commit ba5fb31b270d · 2024-11-12T21:58:53.000+01:00
* Implementation of corrcoef

* Apply review comments

* Fix docs

---------

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
diff --git a/dpnp/dpnp_iface_statistics.py b/dpnp/dpnp_iface_statistics.py
@@ -65,6 +65,7 @@
     "amin",
     "average",
     "bincount",
+    "corrcoef",
     "correlate",
     "cov",
     "max",
@@ -360,6 +361,127 @@ def bincount(x1, weights=None, minlength=0):
     return call_origin(numpy.bincount, x1, weights=weights, minlength=minlength)
 
 
+def corrcoef(x, y=None, rowvar=True, *, dtype=None):
+    """
+    Return Pearson product-moment correlation coefficients.
+
+    For full documentation refer to :obj:`numpy.corrcoef`.
+
+    Parameters
+    ----------
+    x : {dpnp.ndarray, usm_ndarray}
+        A 1-D or 2-D array containing multiple variables and observations.
+        Each row of `x` represents a variable, and each column a single
+        observation of all those variables. Also see `rowvar` below.
+    y : {None, dpnp.ndarray, usm_ndarray}, optional
+        An additional set of variables and observations. `y` has the same
+        shape as `x`.
+        Default: ``None``.
+    rowvar : {bool}, optional
+        If `rowvar` is ``True``, then each row represents a variable,
+        with observations in the columns. Otherwise, the relationship
+        is transposed: each column represents a variable, while the rows
+        contain observations.
+        Default: ``True``.
+    dtype : {None, dtype}, optional
+        Data-type of the result; forwarded to :obj:`dpnp.cov`.
+        Default: ``None``.
+
+    Returns
+    -------
+    R : {dpnp.ndarray}
+        The correlation coefficient matrix of the variables.
+
+    See Also
+    --------
+    :obj:`dpnp.cov` : Covariance matrix.
+
+    Examples
+    --------
+    In this example we generate two random arrays, ``xarr`` and ``yarr``, and
+    compute the row-wise and column-wise Pearson correlation coefficients,
+    ``R``. Since `rowvar` is true by default, we first find the row-wise
+    Pearson correlation coefficients between the variables of ``xarr``.
+
+    >>> import dpnp as np
+    >>> np.random.seed(123)
+    >>> xarr = np.random.rand(3, 3).astype(np.float32)
+    >>> xarr
+    array([[7.2858386e-17, 2.2066992e-02, 3.9520904e-01],
+           [4.8012391e-01, 5.9377134e-01, 4.5147297e-01],
+           [9.0728188e-01, 9.9387854e-01, 5.8399546e-01]], dtype=float32)
+    >>> R1 = np.corrcoef(xarr)
+    >>> R1
+    array([[ 0.99999994, -0.6173796 , -0.9685411 ],
+           [-0.6173796 ,  1.        ,  0.7937219 ],
+           [-0.9685411 ,  0.7937219 ,  0.9999999 ]], dtype=float32)
+
+    If we add another set of variables and observations ``yarr``, we can
+    compute the row-wise Pearson correlation coefficients between the
+    variables in ``xarr`` and ``yarr``.
+
+    >>> yarr = np.random.rand(3, 3).astype(np.float32)
+    >>> yarr
+    array([[0.17615308, 0.65354985, 0.15716429],
+           [0.09373496, 0.2123185 , 0.84086883],
+           [0.9011005 , 0.45206687, 0.00225109]], dtype=float32)
+    >>> R2 = np.corrcoef(xarr, yarr)
+    >>> R2
+    array([[ 0.99999994, -0.6173796 , -0.968541  , -0.48613155,  0.9951523 ,
+            -0.8900264 ],
+           [-0.6173796 ,  1.        ,  0.7937219 ,  0.9875833 , -0.53702235,
+             0.19083664],
+           [-0.968541  ,  0.7937219 ,  0.9999999 ,  0.6883078 , -0.9393724 ,
+             0.74857277],
+           [-0.48613152,  0.9875833 ,  0.6883078 ,  0.9999999 , -0.39783284,
+             0.0342579 ],
+           [ 0.9951523 , -0.53702235, -0.9393725 , -0.39783284,  0.99999994,
+            -0.9305482 ],
+           [-0.89002645,  0.19083665,  0.7485727 ,  0.0342579 , -0.9305482 ,
+             1.        ]], dtype=float32)
+
+    Finally, if we use the option ``rowvar=False``, the columns are now
+    being treated as the variables and we will find the column-wise Pearson
+    correlation coefficients between variables in ``xarr`` and ``yarr``.
+
+    >>> R3 = np.corrcoef(xarr, yarr, rowvar=False)
+    >>> R3
+    array([[ 1.        ,  0.9724453 , -0.9909503 ,  0.8104691 , -0.46436927,
+            -0.1643624 ],
+           [ 0.9724453 ,  1.        , -0.9949381 ,  0.6515728 , -0.6580445 ,
+             0.07012729],
+           [-0.99095035, -0.994938  ,  1.        , -0.72450536,  0.5790461 ,
+             0.03047091],
+           [ 0.8104691 ,  0.65157276, -0.72450536,  1.        ,  0.14243561,
+            -0.71102554],
+           [-0.4643693 , -0.6580445 ,  0.57904613,  0.1424356 ,  0.99999994,
+            -0.79727215],
+           [-0.1643624 ,  0.07012729,  0.03047091, -0.7110255 , -0.7972722 ,
+             0.99999994]], dtype=float32)
+    """
+
+    out = dpnp.cov(x, y, rowvar, dtype=dtype)
+    if out.ndim == 0:
+        # 0-d (scalar) covariance: there is no diagonal to normalize by, so
+        # divide it by itself -- nan for a nan, inf or 0 value, 1 otherwise
+        return out / out
+
+    d = dpnp.diag(out)
+    # normalize: out[i, j] / (stddev[i] * stddev[j]), done in place
+    stddev = dpnp.sqrt(d.real)
+    out /= stddev[:, None]
+    out /= stddev[None, :]
+
+    # Clip real and imaginary parts to [-1, 1] in place. This does not
+    # guarantee abs(a[i,j]) <= 1 for complex arrays, but is the best we can
+    # do without excessive work.
+    dpnp.clip(out.real, -1, 1, out=out.real)
+    if dpnp.iscomplexobj(out):
+        dpnp.clip(out.imag, -1, 1, out=out.imag)
+
+    return out
+
+
 def correlate(x1, x2, mode="valid"):
     """
     Cross-correlation of two 1-dimensional sequences.
diff --git a/tests/skipped_tests.tbl b/tests/skipped_tests.tbl
@@ -310,11 +310,6 @@ tests/third_party/cupy/random_tests/test_sample.py::TestRandomIntegers2::test_bo
 tests/third_party/cupy/random_tests/test_sample.py::TestRandomIntegers2::test_goodness_of_fit
 tests/third_party/cupy/random_tests/test_sample.py::TestRandomIntegers2::test_goodness_of_fit_2
 
-tests/third_party/cupy/statistics_tests/test_correlation.py::TestCorrcoef::test_corrcoef
-tests/third_party/cupy/statistics_tests/test_correlation.py::TestCorrcoef::test_corrcoef_diag_exception
-tests/third_party/cupy/statistics_tests/test_correlation.py::TestCorrcoef::test_corrcoef_rowvar
-tests/third_party/cupy/statistics_tests/test_correlation.py::TestCorrcoef::test_corrcoef_y
-
 tests/third_party/cupy/statistics_tests/test_order.py::TestOrder::test_percentile_defaults[linear]
 tests/third_party/cupy/statistics_tests/test_order.py::TestOrder::test_percentile_defaults[lower]
 tests/third_party/cupy/statistics_tests/test_order.py::TestOrder::test_percentile_defaults[higher]
diff --git a/tests/skipped_tests_gpu.tbl b/tests/skipped_tests_gpu.tbl
@@ -320,11 +320,6 @@ tests/third_party/cupy/random_tests/test_sample.py::TestRandomIntegers2::test_bo
 tests/third_party/cupy/random_tests/test_sample.py::TestRandomIntegers2::test_goodness_of_fit
 tests/third_party/cupy/random_tests/test_sample.py::TestRandomIntegers2::test_goodness_of_fit_2
 
-tests/third_party/cupy/statistics_tests/test_correlation.py::TestCorrcoef::test_corrcoef
-tests/third_party/cupy/statistics_tests/test_correlation.py::TestCorrcoef::test_corrcoef_diag_exception
-tests/third_party/cupy/statistics_tests/test_correlation.py::TestCorrcoef::test_corrcoef_rowvar
-tests/third_party/cupy/statistics_tests/test_correlation.py::TestCorrcoef::test_corrcoef_y
-
 tests/third_party/cupy/statistics_tests/test_order.py::TestOrder::test_percentile_defaults[linear]
 tests/third_party/cupy/statistics_tests/test_order.py::TestOrder::test_percentile_defaults[lower]
 tests/third_party/cupy/statistics_tests/test_order.py::TestOrder::test_percentile_defaults[higher]
diff --git a/tests/test_statistics.py b/tests/test_statistics.py
@@ -573,6 +573,61 @@ def test_std_error(self):
             dpnp.std(ia, ddof="1")
 
 
+class TestCorrcoef:
+    @pytest.mark.usefixtures(
+        "suppress_divide_invalid_numpy_warnings",
+        "suppress_dof_numpy_warnings",
+    )
+    @pytest.mark.parametrize("dtype", get_all_dtypes())
+    @pytest.mark.parametrize("rowvar", [True, False])
+    def test_corrcoef(self, dtype, rowvar):
+        dp_array = dpnp.array([[0, 1, 2], [3, 4, 0]], dtype=dtype)
+        np_array = dpnp.asnumpy(dp_array)
+        # reference: NumPy on the host copy, per parametrized dtype/rowvar
+        expected = numpy.corrcoef(np_array, rowvar=rowvar)
+        result = dpnp.corrcoef(dp_array, rowvar=rowvar)
+
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.usefixtures(
+        "suppress_divide_invalid_numpy_warnings",
+        "suppress_dof_numpy_warnings",
+        "suppress_mean_empty_slice_numpy_warnings",
+    )
+    @pytest.mark.parametrize("shape", [(2, 0), (0, 2)])
+    def test_corrcoef_empty(self, shape):
+        dp_array = dpnp.empty(shape, dtype=dpnp.int64)
+        np_array = dpnp.asnumpy(dp_array)
+        # zero-length axis: the result must still match NumPy
+        result = dpnp.corrcoef(dp_array)
+        expected = numpy.corrcoef(np_array)
+        assert_dtype_allclose(result, expected)
+
+    @pytest.mark.usefixtures("suppress_complex_warning")
+    @pytest.mark.parametrize("dt_in", get_all_dtypes(no_bool=True))
+    @pytest.mark.parametrize("dt_out", get_float_complex_dtypes())
+    def test_corrcoef_dtype(self, dt_in, dt_out):
+        dp_array = dpnp.array([[0, 1, 2], [3, 4, 0]], dtype=dt_in)
+        np_array = dpnp.asnumpy(dp_array)
+        # explicit dtype=: result dtype and values must both match NumPy
+        expected = numpy.corrcoef(np_array, dtype=dt_out)
+        result = dpnp.corrcoef(dp_array, dtype=dt_out)
+        assert expected.dtype == result.dtype
+        assert_allclose(result, expected, rtol=1e-6)
+
+    @pytest.mark.usefixtures(
+        "suppress_divide_invalid_numpy_warnings",
+        "suppress_dof_numpy_warnings",
+    )
+    def test_corrcoef_scalar(self):
+        dp_array = dpnp.array(5)
+        np_array = dpnp.asnumpy(dp_array)
+        # 0-d input: compare against NumPy for the same scalar array
+        result = dpnp.corrcoef(dp_array)
+        expected = numpy.corrcoef(np_array)
+        assert_dtype_allclose(result, expected)
+
+
 @pytest.mark.usefixtures("allow_fall_back_on_numpy")
 class TestBincount:
     @pytest.mark.parametrize(
diff --git a/tests/test_sycl_queue.py b/tests/test_sycl_queue.py
@@ -442,6 +442,7 @@ def test_meshgrid(device):
         pytest.param("ceil", [-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]),
         pytest.param("conjugate", [[1.0 + 1.0j, 0.0], [0.0, 1.0 + 1.0j]]),
         pytest.param("copy", [1.0, 2.0, 3.0]),
+        pytest.param("corrcoef", [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]),
         pytest.param(
             "cos", [-dpnp.pi / 2, -dpnp.pi / 4, 0.0, dpnp.pi / 4, dpnp.pi / 2]
         ),
@@ -693,6 +694,11 @@ def test_reduce_hypot(device):
         pytest.param("append", [1, 2, 3], [4, 5, 6]),
         pytest.param("arctan2", [-1, +1, +1, -1], [-1, -1, +1, +1]),
         pytest.param("copysign", [0.0, 1.0, 2.0], [-1.0, 0.0, 1.0]),
+        pytest.param(
+            "corrcoef",
+            [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
+            [[0.7, 0.8, 0.9], [1.0, 1.1, 1.2]],
+        ),
         pytest.param("cross", [1.0, 2.0, 3.0], [4.0, 5.0, 6.0]),
         pytest.param("digitize", [0.2, 6.4, 3.0], [0.0, 1.0, 2.5, 4.0]),
         pytest.param(
diff --git a/tests/test_usm_type.py b/tests/test_usm_type.py
@@ -576,6 +576,7 @@ def test_norm(usm_type, ord, axis):
         pytest.param("cbrt", [1, 8, 27]),
         pytest.param("ceil", [-1.7, -1.5, -0.2, 0.2, 1.5, 1.7, 2.0]),
         pytest.param("conjugate", [[1.0 + 1.0j, 0.0], [0.0, 1.0 + 1.0j]]),
+        pytest.param("corrcoef", [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]]),
         pytest.param(
             "cos", [-dp.pi / 2, -dp.pi / 4, 0.0, dp.pi / 4, dp.pi / 2]
         ),
@@ -685,6 +686,11 @@ def test_1in_1out(func, data, usm_type):
         pytest.param("copysign", [0.0, 1.0, 2.0], [-1.0, 0.0, 1.0]),
         pytest.param("cross", [1.0, 2.0, 3.0], [4.0, 5.0, 6.0]),
         pytest.param("digitize", [0.2, 6.4, 3.0], [0.0, 1.0, 2.5, 4.0]),
+        pytest.param(
+            "corrcoef",
+            [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
+            [[0.7, 0.8, 0.9], [1.0, 1.1, 1.2]],
+        ),
         # dpnp.dot has 3 different implementations based on input arrays dtype
         # checking all of them
         pytest.param("dot", [3.0, 4.0, 5.0], [1.0, 2.0, 3.0]),
diff --git a/tests/third_party/cupy/statistics_tests/test_correlation.py b/tests/third_party/cupy/statistics_tests/test_correlation.py
@@ -12,26 +12,26 @@
 
 class TestCorrcoef(unittest.TestCase):
     @testing.for_all_dtypes()
-    @testing.numpy_cupy_allclose()
+    @testing.numpy_cupy_allclose(type_check=has_support_aspect64())
     def test_corrcoef(self, xp, dtype):
         a = testing.shaped_arange((2, 3), xp, dtype)
         return xp.corrcoef(a)
 
     @testing.for_all_dtypes()
-    @testing.numpy_cupy_allclose()
+    @testing.numpy_cupy_allclose(type_check=has_support_aspect64())
     def test_corrcoef_diag_exception(self, xp, dtype):
         a = testing.shaped_arange((1, 3), xp, dtype)
         return xp.corrcoef(a)
 
     @testing.for_all_dtypes()
-    @testing.numpy_cupy_allclose()
+    @testing.numpy_cupy_allclose(type_check=has_support_aspect64())
     def test_corrcoef_y(self, xp, dtype):
         a = testing.shaped_arange((2, 3), xp, dtype)
         y = testing.shaped_arange((2, 3), xp, dtype)
         return xp.corrcoef(a, y=y)
 
     @testing.for_all_dtypes()
-    @testing.numpy_cupy_allclose()
+    @testing.numpy_cupy_allclose(type_check=has_support_aspect64())
     def test_corrcoef_rowvar(self, xp, dtype):
         a = testing.shaped_arange((2, 3), xp, dtype)
         y = testing.shaped_arange((2, 3), xp, dtype)