Skip to content

Commit 7d2632c

Browse files
authored
Fixes an issue raised with pandas 2.0 (#30)
* Fixes an issue raised with pandas 2.0
* remove debug code
* Update test_pandas_groupbynan.py
* add drop
1 parent 3bc92fa commit 7d2632c

File tree

4 files changed

+49
-22
lines changed

4 files changed

+49
-22
lines changed

_unittests/ut_df/test_pandas_groupbynan.py

Lines changed: 5 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -49,7 +49,7 @@ def test_pandas_groupbynan(self):
4949
gr = pandas_groupby_nan(df, ("value", "this"))
5050
t = True
5151
raise AssertionError("---")
52-
except TypeError:
52+
except (TypeError, KeyError):
5353
t = False
5454
if t:
5555
co = gr.sum()
@@ -91,17 +91,14 @@ def test_pandas_groupbynan_tuple(self):
9191

9292
def test_pandas_groupbynan_regular(self):
9393
df = pandas.DataFrame([dict(a="a", b=1), dict(a="a", b=2)])
94-
gr = df.groupby(["a"]).sum()
94+
gr = df.groupby(["a"], as_index=False).sum()
9595
gr2_ = pandas_groupby_nan(df, ["a"]).sum()
9696
self.assertEqualDataFrame(gr, gr2_)
9797

9898
def test_pandas_groupbynan_regular_nanback(self):
9999
df = pandas.DataFrame([dict(a="a", b=1, cc=0), dict(a="a", b=2)])
100100
gr = df.groupby(["a", "cc"]).sum()
101101
self.assertEqual(len(gr), 1)
102-
self.assertRaise(
103-
lambda: pandas_groupby_nan(df, ["a", "cc"], nanback=True).sum(),
104-
NotImplementedError)
105102

106103
def test_pandas_groupbynan_doc(self):
107104
data = [dict(a=2, ind="a", n=1),
@@ -132,10 +129,9 @@ def test_pandas_groupbynan_doc3(self):
132129
dict(a=3, ind="b"),
133130
dict(a=30)]
134131
df = pandas.DataFrame(data)
135-
self.assertRaise(lambda: pandas_groupby_nan(df, ["ind", "n"]).sum(),
136-
NotImplementedError)
137-
# ind = list(gr2['ind'])
138-
# self.assertTrue(numpy.isnan(ind[-1]))
132+
gr2 = pandas_groupby_nan(df, ["ind", "n"]).sum()
133+
ind = list(gr2['ind'])
134+
self.assertTrue(numpy.isnan(ind[-1]))
139135

140136

141137
if __name__ == "__main__":

_unittests/ut_df/test_streaming_dataframe.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -364,8 +364,8 @@ def test_groupby(self):
364364

365365
# Do not replace lambda c:sum(c) by sum or...
366366
# pandas.core.base.SpecificationError: Function names must be unique, found multiple named sum
367-
gr2 = df20.groupby("key").agg([numpy.sum, lambda c:sum(c)])
368-
gr = sdf20.groupby("key", lambda gr: gr.agg(
367+
gr2 = df20.drop("cstr", axis=1).groupby("key").agg([numpy.sum, lambda c:sum(c)])
368+
gr = sdf20.drop("cstr", axis=1).groupby("key", lambda gr: gr.agg(
369369
[numpy.sum, lambda c:sum(c)]))
370370
self.assertEqualDataFrame(gr, gr2)
371371

pandas_streaming/df/dataframe.py

Lines changed: 28 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -12,7 +12,10 @@
1212
import numpy.random as nrandom
1313
import pandas
1414
from pandas.testing import assert_frame_equal
15-
from pandas.io.json import json_normalize
15+
try:
16+
from pandas import json_normalize
17+
except ImportError:
18+
from pandas.io.json import json_normalize
1619
from .dataframe_split import sklearn_train_test_split, sklearn_train_test_split_streaming
1720
from .dataframe_io_helpers import enumerate_json_items, JsonIterator2Stream
1821

@@ -609,6 +612,22 @@ def reservoir_iterate(sdf, indices, chunksize):
609612
return StreamingDataFrame(
610613
lambda: reservoir_iterate(sdf=self, indices=indices, chunksize=1000))
611614

615+
def drop(self, labels=None, *, axis=0, index=None, columns=None, level=None,
616+
inplace=False, errors='raise') -> 'StreamingDataFrame':
617+
"""
618+
Applies :epkg:`pandas:DataFrame:drop`.
619+
This function returns a @see cl StreamingDataFrame.
620+
"""
621+
if axis == 0:
622+
raise NotImplementedError(f"drop is not implemented for axis={axis}.")
623+
if inplace:
624+
raise NotImplementedError(f"drop is not implemented for inplace={inplace}.")
625+
return StreamingDataFrame(
626+
lambda: map(lambda df: df.drop(
627+
labels, axis=axis, index=index, columns=columns,
628+
level=level, inplace=False, errors=errors), self),
629+
**self.get_kwargs())
630+
612631
def apply(self, *args, **kwargs) -> 'StreamingDataFrame':
613632
"""
614633
Applies :epkg:`pandas:DataFrame:apply`.
@@ -1078,8 +1097,7 @@ def iterate_na(self, **kwargs):
10781097
return StreamingDataFrame(
10791098
lambda: iterate_na(self, **kwargs), **self.get_kwargs())
10801099

1081-
def describe(self, percentiles=None, include=None, exclude=None,
1082-
datetime_is_numeric=False):
1100+
def describe(self, percentiles=None, include=None, exclude=None):
10831101
"""
10841102
Calls :epkg:`pandas:DataFrame:describe` on every piece
10851103
of the datasets. *percentiles* are not really accurate
@@ -1088,16 +1106,19 @@ def describe(self, percentiles=None, include=None, exclude=None,
10881106
:param percentiles: see :epkg:`pandas:DataFrame:describe`
10891107
:param include: see :epkg:`pandas:DataFrame:describe`
10901108
:param exclude: see :epkg:`pandas:DataFrame:describe`
1091-
:param datetime_is_numeric: see :epkg:`pandas:DataFrame:describe`
10921109
:return: :epkg:`pandas:DataFrame:describe`
1110+
1111+
.. versionchanged:: 0.3.219
1112+
1113+
Parameter *datetime_is_numeric* was removed
1114+
(see :epkg:`pandas:DataFrame:describe`).
10931115
"""
10941116
merged = None
10951117
stack = []
10961118
notper = ['count', 'mean', 'std']
10971119
for df in self:
10981120
desc = df.describe(
1099-
percentiles=percentiles, include=include, exclude=exclude,
1100-
datetime_is_numeric=datetime_is_numeric)
1121+
percentiles=percentiles, include=include, exclude=exclude)
11011122
count = desc.loc['count', :]
11021123
rows = [name for name in desc.index if name not in notper]
11031124
stack.append(desc.loc[rows, :])
@@ -1120,8 +1141,7 @@ def describe(self, percentiles=None, include=None, exclude=None,
11201141
merged.loc['std', :] / merged.loc['count', :] -
11211142
merged.loc['mean', :] ** 2) ** 0.5
11221143
values = pandas.concat(stack)
1123-
summary = values.describe(percentiles=percentiles,
1124-
datetime_is_numeric=datetime_is_numeric)
1144+
summary = values.describe(percentiles=percentiles)
11251145
merged = merged.loc[notper, :]
11261146
rows = [name for name in summary.index if name not in notper]
11271147
summary = summary.loc[rows, :]

pandas_streaming/df/dataframe_helpers.py

Lines changed: 14 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -7,7 +7,7 @@
77
import struct
88
import warnings
99
import numpy
10-
from pandas import DataFrame, Index
10+
from pandas import DataFrame, Index, Series
1111

1212

1313
def numpy_types():
@@ -389,6 +389,18 @@ def pandas_groupby_nan(df, by, axis=0, as_index=False, suffix=None, nanback=True
389389
gr2 = pandas_groupby_nan(df, ["ind"]).sum()
390390
print(gr2)
391391
"""
392+
if nanback and suffix is None:
393+
try:
394+
res = df.groupby(by, axis=axis, as_index=as_index,
395+
dropna=False, **kwargs)
396+
except TypeError:
397+
# old version of pandas
398+
res = None
399+
if res is not None:
400+
if suffix is None:
401+
return res
402+
res.index = Series(res.index).replace(numpy.nan, suffix)
403+
return res
392404
if axis != 0:
393405
raise NotImplementedError("axis should be 0")
394406
if as_index:
@@ -519,5 +531,4 @@ def pandas_groupby_nan(df, by, axis=0, as_index=False, suffix=None, nanback=True
519531
# "Not implemented for type: {0}".format(type(grou.grouper)))
520532
# del res.grouper._cache
521533
return res
522-
else:
523-
return df.groupby(by, axis=axis, **kwargs)
534+
return df.groupby(by, axis=axis, **kwargs)

0 commit comments

Comments
 (0)