-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
CLN: Clean groupby/test_function.py #32027
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 15 commits
803a166
5f56e75
cc6b42c
507bad8
761c7e6
bac1eb5
23ad133
7b2cfcb
e8b37c1
fd84996
c3bc093
6a01a6d
4e02e07
20c1aa7
a00df9a
7cf1ae7
b923f5e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,6 +26,18 @@ | |
from pandas.util import _test_decorators as td | ||
|
||
|
||
@pytest.fixture( | ||
params=[np.int32, np.int64, np.float32, np.float64], | ||
ids=["np.int32", "np.int64", "np.float32", "np.float64"], | ||
) | ||
def numpy_dtypes(request): | ||
""" | ||
Fixture of numpy dtypes with min and max values used for testing | ||
cummin and cummax | ||
""" | ||
return request.param | ||
|
||
|
||
@pytest.mark.parametrize("agg_func", ["any", "all"]) | ||
@pytest.mark.parametrize("skipna", [True, False]) | ||
@pytest.mark.parametrize( | ||
|
@@ -174,11 +186,10 @@ def test_arg_passthru(): | |
) | ||
|
||
for attr in ["mean", "median"]: | ||
f = getattr(df.groupby("group"), attr) | ||
result = f() | ||
result = getattr(df.groupby("group"), attr)() | ||
tm.assert_index_equal(result.columns, expected_columns_numeric) | ||
|
||
result = f(numeric_only=False) | ||
result = getattr(df.groupby("group"), attr)(numeric_only=False) | ||
tm.assert_frame_equal(result.reindex_like(expected), expected) | ||
|
||
# TODO: min, max *should* handle | ||
|
@@ -195,11 +206,10 @@ def test_arg_passthru(): | |
] | ||
) | ||
for attr in ["min", "max"]: | ||
f = getattr(df.groupby("group"), attr) | ||
result = f() | ||
result = getattr(df.groupby("group"), attr)() | ||
tm.assert_index_equal(result.columns, expected_columns) | ||
|
||
result = f(numeric_only=False) | ||
result = getattr(df.groupby("group"), attr)(numeric_only=False) | ||
tm.assert_index_equal(result.columns, expected_columns) | ||
|
||
expected_columns = Index( | ||
|
@@ -215,52 +225,47 @@ def test_arg_passthru(): | |
] | ||
) | ||
for attr in ["first", "last"]: | ||
f = getattr(df.groupby("group"), attr) | ||
result = f() | ||
result = getattr(df.groupby("group"), attr)() | ||
tm.assert_index_equal(result.columns, expected_columns) | ||
|
||
result = f(numeric_only=False) | ||
result = getattr(df.groupby("group"), attr)(numeric_only=False) | ||
tm.assert_index_equal(result.columns, expected_columns) | ||
|
||
expected_columns = Index(["int", "float", "string", "category_int", "timedelta"]) | ||
for attr in ["sum"]: | ||
f = getattr(df.groupby("group"), attr) | ||
result = f() | ||
tm.assert_index_equal(result.columns, expected_columns_numeric) | ||
|
||
result = f(numeric_only=False) | ||
tm.assert_index_equal(result.columns, expected_columns) | ||
result = df.groupby("group").sum() | ||
tm.assert_index_equal(result.columns, expected_columns_numeric) | ||
|
||
result = df.groupby("group").sum(numeric_only=False) | ||
tm.assert_index_equal(result.columns, expected_columns) | ||
|
||
expected_columns = Index(["int", "float", "category_int"]) | ||
for attr in ["prod", "cumprod"]: | ||
f = getattr(df.groupby("group"), attr) | ||
result = f() | ||
result = getattr(df.groupby("group"), attr)() | ||
tm.assert_index_equal(result.columns, expected_columns_numeric) | ||
|
||
result = f(numeric_only=False) | ||
result = getattr(df.groupby("group"), attr)(numeric_only=False) | ||
tm.assert_index_equal(result.columns, expected_columns) | ||
|
||
# like min, max, but don't include strings | ||
expected_columns = Index( | ||
["int", "float", "category_int", "datetime", "datetimetz", "timedelta"] | ||
) | ||
for attr in ["cummin", "cummax"]: | ||
f = getattr(df.groupby("group"), attr) | ||
result = f() | ||
result = getattr(df.groupby("group"), attr)() | ||
# GH 15561: numeric_only=False set by default like min/max | ||
tm.assert_index_equal(result.columns, expected_columns) | ||
|
||
result = f(numeric_only=False) | ||
result = getattr(df.groupby("group"), attr)(numeric_only=False) | ||
tm.assert_index_equal(result.columns, expected_columns) | ||
|
||
expected_columns = Index(["int", "float", "category_int", "timedelta"]) | ||
for attr in ["cumsum"]: | ||
f = getattr(df.groupby("group"), attr) | ||
result = f() | ||
tm.assert_index_equal(result.columns, expected_columns_numeric) | ||
|
||
result = f(numeric_only=False) | ||
tm.assert_index_equal(result.columns, expected_columns) | ||
result = getattr(df.groupby("group"), "cumsum")() | ||
tm.assert_index_equal(result.columns, expected_columns_numeric) | ||
|
||
result = getattr(df.groupby("group"), "cumsum")(numeric_only=False) | ||
tm.assert_index_equal(result.columns, expected_columns) | ||
|
||
|
||
def test_non_cython_api(): | ||
|
@@ -691,59 +696,33 @@ def test_numpy_compat(func): | |
reason="https://github.com/pandas-dev/pandas/issues/31992", | ||
strict=False, | ||
) | ||
def test_cummin_cummax(): | ||
def test_cummin(numpy_dtypes): | ||
dtype = numpy_dtypes | ||
min_val = ( | ||
np.iinfo(dtype).min if np.dtype(dtype).kind == "i" else np.finfo(dtype).min | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. for readability, could just include num_mins and num_max in parameterisation. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I think this could be better There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. would now be more difficult if a fixture is used for dtype. (as requested) ( could create a composable min and max fixtures but probably OTT for one test) |
||
) | ||
|
||
# GH 15048 | ||
num_types = [np.int32, np.int64, np.float32, np.float64] | ||
num_mins = [ | ||
np.iinfo(np.int32).min, | ||
np.iinfo(np.int64).min, | ||
np.finfo(np.float32).min, | ||
np.finfo(np.float64).min, | ||
] | ||
num_max = [ | ||
np.iinfo(np.int32).max, | ||
np.iinfo(np.int64).max, | ||
np.finfo(np.float32).max, | ||
np.finfo(np.float64).max, | ||
] | ||
base_df = pd.DataFrame( | ||
{"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]} | ||
) | ||
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] | ||
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] | ||
|
||
for dtype, min_val, max_val in zip(num_types, num_mins, num_max): | ||
df = base_df.astype(dtype) | ||
|
||
# cummin | ||
expected = pd.DataFrame({"B": expected_mins}).astype(dtype) | ||
result = df.groupby("A").cummin() | ||
tm.assert_frame_equal(result, expected) | ||
result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# Test cummin w/ min value for dtype | ||
df.loc[[2, 6], "B"] = min_val | ||
expected.loc[[2, 3, 6, 7], "B"] = min_val | ||
result = df.groupby("A").cummin() | ||
tm.assert_frame_equal(result, expected) | ||
expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
df = base_df.astype(dtype) | ||
|
||
# cummax | ||
expected = pd.DataFrame({"B": expected_maxs}).astype(dtype) | ||
result = df.groupby("A").cummax() | ||
tm.assert_frame_equal(result, expected) | ||
result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
expected = pd.DataFrame({"B": expected_mins}).astype(dtype) | ||
result = df.groupby("A").cummin() | ||
tm.assert_frame_equal(result, expected) | ||
result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# Test cummax w/ max value for dtype | ||
df.loc[[2, 6], "B"] = max_val | ||
expected.loc[[2, 3, 6, 7], "B"] = max_val | ||
result = df.groupby("A").cummax() | ||
tm.assert_frame_equal(result, expected) | ||
expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
# Test w/ min value for dtype | ||
df.loc[[2, 6], "B"] = min_val | ||
expected.loc[[2, 3, 6, 7], "B"] = min_val | ||
result = df.groupby("A").cummin() | ||
tm.assert_frame_equal(result, expected) | ||
expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# Test nan in some values | ||
base_df.loc[[0, 2, 4, 6], "B"] = np.nan | ||
|
@@ -753,41 +732,103 @@ def test_cummin_cummax(): | |
expected = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) | ||
result = base_df.groupby("A").cummax() | ||
tm.assert_frame_equal(result, expected) | ||
expected = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
# GH 15561 | ||
df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"]))) | ||
expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b") | ||
|
||
result = df.groupby("a")["b"].cummin() | ||
tm.assert_series_equal(expected, result) | ||
|
||
# GH 15635 | ||
df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2])) | ||
result = df.groupby("a").b.cummin() | ||
expected = pd.Series([1, 2, 1], name="b") | ||
tm.assert_series_equal(result, expected) | ||
|
||
|
||
@pytest.mark.xfail( | ||
_is_numpy_dev, | ||
reason="https://github.com/pandas-dev/pandas/issues/31992", | ||
strict=False, | ||
) | ||
def test_cummin_all_nan_column(): | ||
base_df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) | ||
|
||
# Test nan in entire column | ||
base_df["B"] = np.nan | ||
expected = pd.DataFrame({"B": [np.nan] * 8}) | ||
result = base_df.groupby("A").cummin() | ||
tm.assert_frame_equal(expected, result) | ||
result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() | ||
tm.assert_frame_equal(expected, result) | ||
|
||
|
||
@pytest.mark.xfail( | ||
_is_numpy_dev, | ||
reason="https://github.com/pandas-dev/pandas/issues/31992", | ||
strict=False, | ||
) | ||
def test_cummax(numpy_dtypes): | ||
dtype = numpy_dtypes | ||
max_val = ( | ||
np.iinfo(dtype).max if np.dtype(dtype).kind == "i" else np.finfo(dtype).max | ||
) | ||
|
||
# GH 15048 | ||
base_df = pd.DataFrame( | ||
{"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]} | ||
) | ||
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] | ||
|
||
df = base_df.astype(dtype) | ||
|
||
expected = pd.DataFrame({"B": expected_maxs}).astype(dtype) | ||
result = df.groupby("A").cummax() | ||
tm.assert_frame_equal(result, expected) | ||
result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# Test w/ max value for dtype | ||
df.loc[[2, 6], "B"] = max_val | ||
expected.loc[[2, 3, 6, 7], "B"] = max_val | ||
result = df.groupby("A").cummax() | ||
tm.assert_frame_equal(result, expected) | ||
expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# Test nan in some values | ||
base_df.loc[[0, 2, 4, 6], "B"] = np.nan | ||
expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) | ||
result = base_df.groupby("A").cummax() | ||
tm.assert_frame_equal(expected, result) | ||
result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() | ||
tm.assert_frame_equal(expected, result) | ||
tm.assert_frame_equal(result, expected) | ||
expected = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# GH 15561 | ||
df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"]))) | ||
expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b") | ||
for method in ["cummax", "cummin"]: | ||
result = getattr(df.groupby("a")["b"], method)() | ||
tm.assert_series_equal(expected, result) | ||
|
||
result = df.groupby("a")["b"].cummax() | ||
tm.assert_series_equal(expected, result) | ||
|
||
# GH 15635 | ||
df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1])) | ||
result = df.groupby("a").b.cummax() | ||
expected = pd.Series([2, 1, 2], name="b") | ||
tm.assert_series_equal(result, expected) | ||
|
||
df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2])) | ||
result = df.groupby("a").b.cummin() | ||
expected = pd.Series([1, 2, 1], name="b") | ||
tm.assert_series_equal(result, expected) | ||
|
||
@pytest.mark.xfail( | ||
_is_numpy_dev, | ||
reason="https://github.com/pandas-dev/pandas/issues/31992", | ||
strict=False, | ||
) | ||
def test_cummax_all_nan_column(): | ||
base_df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8}) | ||
|
||
expected = pd.DataFrame({"B": [np.nan] * 8}) | ||
result = base_df.groupby("A").cummax() | ||
tm.assert_frame_equal(expected, result) | ||
result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() | ||
tm.assert_frame_equal(expected, result) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can you name this something else, maybe numpy_dtypes_for_minmax; also you can include in the return the iinfo or finfo result (e.g. you would return a tuple); i find this a bit more obvious than doing it in the test function.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated based on comments and green
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
not a fan of returning tuples and unpacking in tests in scenarios where composable fixtures could be used instead and could have more utility. However, naming the fixture
numpy_dtypes_for_minmax
somewhat limits the reuse of composible fixtures.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we take the min and max out of the fixture I'd just say compute inside the test, since the min and max are each only used in one test respectively, so there isn't much value in having them in a fixture anyways
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry I wasn't clear. I'm not a fan but no need to change. see #32027 (comment)