Fixes an issue raised with pandas 2.0 (#30)

sdpython · web-flow · commit 7d2632c41d2a · 2023-05-01T08:36:33.000+02:00
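* Fixes an issue raised with pandas 2.0: import json_normalize from pandas first (falling back to pandas.io.json), remove parameter datetime_is_numeric from StreamingDataFrame.describe, and let pandas_groupby_nan rely on groupby(..., dropna=False) when pandas supports it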

* remove debug code

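* Update test_pandas_groupbynan.py: accept KeyError as well as TypeError, compare against groupby(..., as_index=False), and check the NaN group is returned instead of expecting NotImplementedError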

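* add drop: new method StreamingDataFrame.drop applying DataFrame.drop to every chunk (axis=0 and inplace=True raise NotImplementedError)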
diff --git a/_unittests/ut_df/test_pandas_groupbynan.py b/_unittests/ut_df/test_pandas_groupbynan.py
@@ -49,7 +49,7 @@ def test_pandas_groupbynan(self):
                 gr = pandas_groupby_nan(df, ("value", "this"))
                 t = True
                 raise AssertionError("---")
-            except TypeError:
+            except (TypeError, KeyError):
                 t = False
             if t:
                 co = gr.sum()
@@ -91,17 +91,14 @@ def test_pandas_groupbynan_tuple(self):
 
     def test_pandas_groupbynan_regular(self):
         df = pandas.DataFrame([dict(a="a", b=1), dict(a="a", b=2)])
-        gr = df.groupby(["a"]).sum()
+        gr = df.groupby(["a"], as_index=False).sum()
         gr2_ = pandas_groupby_nan(df, ["a"]).sum()
         self.assertEqualDataFrame(gr, gr2_)
 
     def test_pandas_groupbynan_regular_nanback(self):
         df = pandas.DataFrame([dict(a="a", b=1, cc=0), dict(a="a", b=2)])
         gr = df.groupby(["a", "cc"]).sum()
         self.assertEqual(len(gr), 1)
-        self.assertRaise(
-            lambda: pandas_groupby_nan(df, ["a", "cc"], nanback=True).sum(),
-            NotImplementedError)
 
     def test_pandas_groupbynan_doc(self):
         data = [dict(a=2, ind="a", n=1),
@@ -132,10 +129,9 @@ def test_pandas_groupbynan_doc3(self):
                 dict(a=3, ind="b"),
                 dict(a=30)]
         df = pandas.DataFrame(data)
-        self.assertRaise(lambda: pandas_groupby_nan(df, ["ind", "n"]).sum(),
-                         NotImplementedError)
-        # ind = list(gr2['ind'])
-        # self.assertTrue(numpy.isnan(ind[-1]))
+        gr2 = pandas_groupby_nan(df, ["ind", "n"]).sum()
+        ind = list(gr2['ind'])
+        self.assertTrue(numpy.isnan(ind[-1]))
 
 
 if __name__ == "__main__":
diff --git a/_unittests/ut_df/test_streaming_dataframe.py b/_unittests/ut_df/test_streaming_dataframe.py
@@ -364,8 +364,8 @@ def test_groupby(self):
 
         # Do not replace lambda c:sum(c) by sum or...
         # pandas.core.base.SpecificationError: Function names must be unique, found multiple named sum
-        gr2 = df20.groupby("key").agg([numpy.sum, lambda c:sum(c)])
-        gr = sdf20.groupby("key", lambda gr: gr.agg(
+        gr2 = df20.drop("cstr", axis=1).groupby("key").agg([numpy.sum, lambda c:sum(c)])
+        gr = sdf20.drop("cstr", axis=1).groupby("key", lambda gr: gr.agg(
             [numpy.sum, lambda c:sum(c)]))
         self.assertEqualDataFrame(gr, gr2)
 
diff --git a/pandas_streaming/df/dataframe.py b/pandas_streaming/df/dataframe.py
@@ -12,7 +12,10 @@
 import numpy.random as nrandom
 import pandas
 from pandas.testing import assert_frame_equal
-from pandas.io.json import json_normalize
+try:
+    from pandas import json_normalize
+except ImportError:
+    from pandas.io.json import json_normalize
 from .dataframe_split import sklearn_train_test_split, sklearn_train_test_split_streaming
 from .dataframe_io_helpers import enumerate_json_items, JsonIterator2Stream
 
@@ -609,6 +612,22 @@ def reservoir_iterate(sdf, indices, chunksize):
         return StreamingDataFrame(
             lambda: reservoir_iterate(sdf=self, indices=indices, chunksize=1000))
 
+    def drop(self, labels=None, *, axis=0, index=None, columns=None, level=None,
+             inplace=False, errors='raise') -> 'StreamingDataFrame':
+        """
+        Applies :epkg:`pandas:DataFrame:drop`.
+        This function returns a @see cl StreamingDataFrame.
+        """
+        if axis == 0:
+            raise NotImplementedError(f"drop is not implemented for axis={axis}.")
+        if inplace:
+            raise NotImplementedError(f"drop is not implemented for inplace={inplace}.")
+        return StreamingDataFrame(
+            lambda: map(lambda df: df.drop(
+                labels, axis=axis, index=index, columns=columns,
+                level=level, inplace=False, errors=errors), self),
+            **self.get_kwargs())
+
     def apply(self, *args, **kwargs) -> 'StreamingDataFrame':
         """
         Applies :epkg:`pandas:DataFrame:apply`.
@@ -1078,8 +1097,7 @@ def iterate_na(self, **kwargs):
         return StreamingDataFrame(
             lambda: iterate_na(self, **kwargs), **self.get_kwargs())
 
-    def describe(self, percentiles=None, include=None, exclude=None,
-                 datetime_is_numeric=False):
+    def describe(self, percentiles=None, include=None, exclude=None):
         """
         Calls :epkg:`pandas:DataFrame:describe` on every piece
         of the datasets. *percentiles* are not really accurate
@@ -1088,16 +1106,19 @@ def describe(self, percentiles=None, include=None, exclude=None,
         :param percentiles: see :epkg:`pandas:DataFrame:describe`
         :param include: see :epkg:`pandas:DataFrame:describe`
         :param exclude: see :epkg:`pandas:DataFrame:describe`
-        :param datetime_is_numeric: see :epkg:`pandas:DataFrame:describe`
         :return: :epkg:`pandas:DataFrame:describe`
+
+        .. versionchanged:: 0.3.219
+
+            Parameter *datetime_is_numeric* was removed
+            (see :epkg:`pandas:DataFrame:describe`).
         """
         merged = None
         stack = []
         notper = ['count', 'mean', 'std']
         for df in self:
             desc = df.describe(
-                percentiles=percentiles, include=include, exclude=exclude,
-                datetime_is_numeric=datetime_is_numeric)
+                percentiles=percentiles, include=include, exclude=exclude)
             count = desc.loc['count', :]
             rows = [name for name in desc.index if name not in notper]
             stack.append(desc.loc[rows, :])
@@ -1120,8 +1141,7 @@ def describe(self, percentiles=None, include=None, exclude=None,
             merged.loc['std', :] / merged.loc['count', :] -
             merged.loc['mean', :] ** 2) ** 0.5
         values = pandas.concat(stack)
-        summary = values.describe(percentiles=percentiles,
-                                  datetime_is_numeric=datetime_is_numeric)
+        summary = values.describe(percentiles=percentiles)
         merged = merged.loc[notper, :]
         rows = [name for name in summary.index if name not in notper]
         summary = summary.loc[rows, :]
diff --git a/pandas_streaming/df/dataframe_helpers.py b/pandas_streaming/df/dataframe_helpers.py
@@ -7,7 +7,7 @@
 import struct
 import warnings
 import numpy
-from pandas import DataFrame, Index
+from pandas import DataFrame, Index, Series
 
 
 def numpy_types():
@@ -389,6 +389,18 @@ def pandas_groupby_nan(df, by, axis=0, as_index=False, suffix=None, nanback=True
             gr2 = pandas_groupby_nan(df, ["ind"]).sum()
             print(gr2)
     """
+    if nanback and suffix is None:
+        try:
+            res = df.groupby(by, axis=axis, as_index=as_index,
+                             dropna=False, **kwargs)
+        except TypeError:
+            # old version of pandas
+            res = None
+        if res is not None:
+            if suffix is None:
+                return res
+            res.index = Series(res.index).replace(numpy.nan, suffix)
+            return res
     if axis != 0:
         raise NotImplementedError("axis should be 0")
     if as_index:
@@ -519,5 +531,4 @@ def pandas_groupby_nan(df, by, axis=0, as_index=False, suffix=None, nanback=True
             #                 "Not implemented for type: {0}".format(type(grou.grouper)))
             #     del res.grouper._cache
         return res
-    else:
-        return df.groupby(by, axis=axis, **kwargs)
+    return df.groupby(by, axis=axis, **kwargs)