feat: add absolute and relative tolerance support to DataFrame.compare and Series.compare

diogojarodrigues · tomhoq · diogojarodrigues · commit 144e0b5e8438 · 2024-05-27T15:07:57.000+01:00
Co-authored-by: Tomaz Silva <tomaz.goncalves-silva@tecnico.ulisboa.pt>
diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
@@ -39,11 +39,14 @@ Other enhancements
 - Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
 - :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
 - :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
+- :meth:`DataFrame.compare` now supports comparing floating point values within a tolerance via the new ``check_exact``, ``rtol`` and ``atol`` arguments (:issue:`58827`)
 - :meth:`DataFrame.corrwith` now accepts ``min_periods`` as optional arguments, as in :meth:`DataFrame.corr` and :meth:`Series.corr` (:issue:`9490`)
 - :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
 - :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)
+- :meth:`Series.compare` now supports comparing floating point values within a tolerance via the new ``check_exact``, ``rtol`` and ``atol`` arguments (:issue:`58827`)
 - :meth:`Series.cummin` and :meth:`Series.cummax` now supports :class:`CategoricalDtype` (:issue:`52335`)
 - :meth:`Series.plot` now correctly handle the ``ylabel`` parameter for pie charts, allowing for explicit control over the y-axis label (:issue:`58239`)
+
 - Support reading Stata 110-format (Stata 7) dta files (:issue:`47176`)
 -
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -101,6 +101,7 @@
     is_integer_dtype,
     is_iterator,
     is_list_like,
+    is_number,
     is_scalar,
     is_sequence,
     needs_i8_conversion,
@@ -8460,6 +8461,20 @@ def rpow(
         2    b     b  3.0   3.0  3.0   4.0
         3    b     b  NaN   NaN  4.0   4.0
         4    a     a  5.0   5.0  5.0   5.0
+
+        Compare dataframes with tolerance (float)
+
+        >>> df.compare(df2, atol=1)
+                col1
+          self other
+        0    a     c
+
+        Compare dataframes with tolerance (dict)
+
+        >>> df.compare(df2, atol={{"col3": 1}})
+                col1
+          self other
+        0    a     c
         """
         ),
         klass=_shared_doc_kwargs["klass"],
@@ -8471,13 +8486,31 @@ def compare(
         keep_shape: bool = False,
         keep_equal: bool = False,
         result_names: Suffixes = ("self", "other"),
+        check_exact: bool | lib.NoDefault = lib.no_default,
+        rtol: float | ListLike | dict | lib.NoDefault = lib.no_default,
+        atol: float | ListLike | dict | lib.NoDefault = lib.no_default,
     ) -> DataFrame:
+        if rtol is not lib.no_default:
+            if not (is_number(rtol) or is_dict_like(rtol) or is_list_like(rtol)):
+                raise TypeError(
+                    f"rtol must be a number, list or dict, got {type(rtol)}"
+                )
+
+        if atol is not lib.no_default:
+            if not (is_number(atol) or is_dict_like(atol) or is_list_like(atol)):
+                raise TypeError(
+                    f"atol must be a number, list or dict, got {type(atol)}"
+                )
+
         return super().compare(
             other=other,
             align_axis=align_axis,
             keep_shape=keep_shape,
             keep_equal=keep_equal,
             result_names=result_names,
+            check_exact=check_exact,
+            rtol=rtol,
+            atol=atol,
         )
 
     def combine(
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -114,6 +114,7 @@
     is_bool_dtype,
     is_dict_like,
     is_extension_array_dtype,
+    is_float_dtype,
     is_list_like,
     is_number,
     is_numeric_dtype,
@@ -9205,17 +9206,64 @@ def compare(
         keep_shape: bool = False,
         keep_equal: bool = False,
         result_names: Suffixes = ("self", "other"),
-    ):
+        check_exact: bool | lib.NoDefault = lib.no_default,
+        rtol: float | ListLike | dict | lib.NoDefault = lib.no_default,
+        atol: float | ListLike | dict | lib.NoDefault = lib.no_default,
+    ) -> DataFrame | Series:
+        if (
+            check_exact is lib.no_default
+            and rtol is lib.no_default
+            and atol is lib.no_default
+        ):
+            check_exact = True
+        elif check_exact is lib.no_default:  # tolerance is specified
+            check_exact = False
+
+        rtol = rtol if rtol is not lib.no_default else 1.0e-5
+        atol = atol if atol is not lib.no_default else 1.0e-8
+
         if type(self) is not type(other):
             cls_self, cls_other = type(self).__name__, type(other).__name__
             raise TypeError(
                 f"can only compare '{cls_self}' (not '{cls_other}') with '{cls_self}'"
             )
 
-        # error: Unsupported left operand type for & ("Self")
-        mask = ~((self == other) | (self.isna() & other.isna()))  # type: ignore[operator]
-        mask.fillna(True, inplace=True)
+        if not check_exact:
+            if isinstance(self, ABCDataFrame):
+                mask = np.ones(self.shape, dtype=bool)
 
+                for i, col in enumerate(self.columns):
+                    if is_dict_like(rtol):
+                        r_tol = rtol.get(col, 1.0e-5)
+                    elif is_list_like(rtol):
+                        r_tol = rtol[i]
+                    else:
+                        r_tol = rtol
+
+                    if is_dict_like(atol):
+                        a_tol = atol.get(col, 1.0e-8)
+                    elif is_list_like(atol):
+                        a_tol = atol[i]
+                    else:
+                        a_tol = atol
+
+                    if is_float_dtype(self[col]) and is_float_dtype(other[col]):
+                        mask[:, self.columns.get_loc(col)] = np.isclose(
+                            self[col], other[col], rtol=r_tol, atol=a_tol
+                        )
+                    else:
+                        mask[:, self.columns.get_loc(col)] = self[col] == other[col]
+            # is series
+            else:
+                if is_float_dtype(self):
+                    mask = np.isclose(self, other, rtol=rtol, atol=atol)
+                else:
+                    mask = self == other
+        else:
+            mask = self == other
+
+        mask = ~(mask | (self.isna() & other.isna()))
+        mask.fillna(True, inplace=True)
         if not keep_equal:
             self = self.where(mask)
             other = other.where(mask)
@@ -9229,6 +9277,7 @@ def compare(
             else:
                 self = self[mask]
                 other = other[mask]
+
         if not isinstance(result_names, tuple):
             raise TypeError(
                 f"Passing 'result_names' as a {type(result_names)} is not "
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -71,6 +71,7 @@
     is_integer,
     is_iterator,
     is_list_like,
+    is_number,
     is_object_dtype,
     is_scalar,
     pandas_dtype,
@@ -2986,6 +2987,17 @@ def _append(
         2    c     c
         3    d     b
         4    e     e
+
+        Compare Series with tolerance
+
+
+        >>> s1 = pd.Series([1.0, 2.0])
+        >>> s2 = pd.Series([1.1, 2.2])
+        >>> s1.compare(s2, atol=0.1)
+           self  other
+        1   2.0    2.2
+
         """
         ),
         klass=_shared_doc_kwargs["klass"],
@@ -2997,13 +3009,27 @@ def compare(
         keep_shape: bool = False,
         keep_equal: bool = False,
         result_names: Suffixes = ("self", "other"),
+        check_exact: bool | lib.NoDefault = lib.no_default,
+        rtol: int | float | lib.NoDefault = lib.no_default,
+        atol: int | float | lib.NoDefault = lib.no_default,
     ) -> DataFrame | Series:
+        if rtol is not lib.no_default:
+            if not is_number(rtol):
+                raise TypeError(f"rtol must be a number, got {type(rtol)}")
+
+        if atol is not lib.no_default:
+            if not is_number(atol):
+                raise TypeError(f"atol must be a number, got {type(atol)}")
+
         return super().compare(
             other=other,
             align_axis=align_axis,
             keep_shape=keep_shape,
             keep_equal=keep_equal,
             result_names=result_names,
+            check_exact=check_exact,
+            rtol=rtol,
+            atol=atol,
         )
 
     def combine(
diff --git a/pandas/tests/frame/methods/test_compare.py b/pandas/tests/frame/methods/test_compare.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 
+from pandas._libs import lib
 from pandas.compat.numpy import np_version_gte1p25
 
 import pandas as pd
@@ -214,6 +215,83 @@ def test_compare_result_names():
     tm.assert_frame_equal(result, expected)
 
 
+@pytest.mark.parametrize(
+    "atol, rtol, check_exact, expected_self, expected_other",
+    [
+        (lib.no_default, lib.no_default, True, [1.0, 2.0, 4], [0.4, 1.6, 3.5]),
+        (0, 0, False, [1.0, 2.0, 4], [0.4, 1.6, 3.5]),
+        (0.5, 0, False, [1.0], [0.4]),
+        (0, 0.5, False, [1.0], [0.4]),
+        (0.5, 0.00000001, False, [1.0], [0.4]),
+        (0.00000001, 0.5, False, [1.0], [0.4]),
+        (lib.no_default, lib.no_default, False, [1.0, 2.0, 4], [0.4, 1.6, 3.5]),
+        (0.5, lib.no_default, False, [1.0], [0.4]),
+        (lib.no_default, 0.5, False, [1.0], [0.4]),
+        ("a", lib.no_default, False, None, None),
+    ],
+)
+def test_compare_tolerance_float(
+    atol, rtol, check_exact, expected_self, expected_other
+):
+    df1 = pd.DataFrame(
+        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [1.0, 2.0, 4]}
+    )
+
+    df2 = pd.DataFrame(
+        {"col1": ["a", "b", "c"], "col2": [1.0, 2.0, np.nan], "col3": [0.4, 1.6, 3.5]}
+    )
+
+    if expected_self is None:
+        with pytest.raises(TypeError):
+            df1.compare(df2, atol=atol, rtol=rtol, check_exact=check_exact)
+        return
+
+    result = df1.compare(df2, atol=atol, rtol=rtol, check_exact=check_exact)
+
+    expected_data = {
+        ("col3", "self"): pd.Series(expected_self),
+        ("col3", "other"): pd.Series(expected_other),
+    }
+
+    expected = pd.DataFrame(expected_data)
+
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize(
+    "atol, expected_self, expected_other",
+    [
+        ([0.1, 0.2], [1.0, 2.0], [1.2, 2.2]),
+        ((0.1, 0.2), [1.0, 2.0], [1.2, 2.2]),
+        ({"col1": 0.1, "col2": 0.2}, [1.0, 2.0], [1.2, 2.2]),
+        ({"col2": 0.2}, [1.0, 2.0], [1.2, 2.2]),
+        ({"col1": "a"}, None, None),
+        ((0.1, "a"), None, None),
+        ([0.1, "a"], None, None),
+    ],
+)
+def test_compare_tolerance_dict_or_list(atol, expected_self, expected_other):
+    df1 = pd.DataFrame({"col1": [1.0, 2.0], "col2": [3.0, 4.0]})
+
+    df2 = pd.DataFrame({"col1": [1.2, 2.2], "col2": [3.2, 4.2]})
+
+    if expected_self is None:
+        with pytest.raises(TypeError):
+            df1.compare(df2, atol=atol)
+        return
+
+    result = df1.compare(df2, atol=atol)
+
+    expected_data = {
+        ("col1", "self"): pd.Series(expected_self),
+        ("col1", "other"): pd.Series(expected_other),
+    }
+
+    expected = pd.DataFrame(expected_data)
+
+    tm.assert_frame_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "result_names",
     [
diff --git a/pandas/tests/series/methods/test_compare.py b/pandas/tests/series/methods/test_compare.py
@@ -1,6 +1,8 @@
 import numpy as np
 import pytest
 
+from pandas._libs import lib
+
 import pandas as pd
 import pandas._testing as tm
 
@@ -115,6 +117,41 @@ def test_compare_different_lengths():
         ser1.compare(ser2)
 
 
+@pytest.mark.parametrize(
+    "atol, rtol, check_exact, expected_self, expected_other",
+    [
+        (lib.no_default, lib.no_default, True, [1.0, 2.0, 4], [0.4, 1.6, 3.5]),
+        (0, 0, False, [1.0, 2.0, 4], [0.4, 1.6, 3.5]),
+        (0.5, 0, False, [1.0], [0.4]),
+        (0, 0.5, False, [1.0], [0.4]),
+        (0.5, 0.00000001, False, [1.0], [0.4]),
+        (0.00000001, 0.5, False, [1.0], [0.4]),
+        (lib.no_default, lib.no_default, False, [1.0, 2.0, 4], [0.4, 1.6, 3.5]),
+        (0.5, lib.no_default, False, [1.0], [0.4]),
+        (lib.no_default, 0.5, False, [1.0], [0.4]),
+        ("a", lib.no_default, False, None, None),
+    ],
+)
+def test_compare_tolerance_float(
+    atol, rtol, check_exact, expected_self, expected_other
+):
+    df1 = pd.Series([1.0, 2.0, 4])
+
+    df2 = pd.Series([0.4, 1.6, 3.5])
+
+    if expected_self is None:
+        with pytest.raises(TypeError):
+            df1.compare(df2, atol=atol, rtol=rtol, check_exact=check_exact)
+        return
+
+    result = df1.compare(df2, atol=atol, rtol=rtol, check_exact=check_exact)
+
+    expected_data = {"self": expected_self, "other": expected_other}
+    expected = pd.DataFrame(expected_data)
+
+    tm.assert_frame_equal(result, expected)
+
+
 def test_compare_datetime64_and_string():
     # Issue https://github.com/pandas-dev/pandas/issues/45506
     # Catch OverflowError when comparing datetime64 and string