-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
CLN: Clean groupby/test_function.py #32027
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 9 commits
803a166
5f56e75
cc6b42c
507bad8
761c7e6
bac1eb5
23ad133
7b2cfcb
e8b37c1
fd84996
c3bc093
6a01a6d
4e02e07
20c1aa7
a00df9a
7cf1ae7
b923f5e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -223,13 +223,13 @@ def test_arg_passthru(): | |
tm.assert_index_equal(result.columns, expected_columns) | ||
|
||
expected_columns = Index(["int", "float", "string", "category_int", "timedelta"]) | ||
for attr in ["sum"]: | ||
f = getattr(df.groupby("group"), attr) | ||
result = f() | ||
tm.assert_index_equal(result.columns, expected_columns_numeric) | ||
|
||
result = f(numeric_only=False) | ||
tm.assert_index_equal(result.columns, expected_columns) | ||
f = getattr(df.groupby("group"), "sum") | ||
result = f() | ||
tm.assert_index_equal(result.columns, expected_columns_numeric) | ||
|
||
result = f(numeric_only=False) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done. I think it's a bit easier to read and understand if we call these functions directly instead of assigning to f and then calling it, so I changed that as well. |
||
tm.assert_index_equal(result.columns, expected_columns) | ||
|
||
expected_columns = Index(["int", "float", "category_int"]) | ||
for attr in ["prod", "cumprod"]: | ||
|
@@ -254,13 +254,13 @@ def test_arg_passthru(): | |
tm.assert_index_equal(result.columns, expected_columns) | ||
|
||
expected_columns = Index(["int", "float", "category_int", "timedelta"]) | ||
for attr in ["cumsum"]: | ||
f = getattr(df.groupby("group"), attr) | ||
result = f() | ||
tm.assert_index_equal(result.columns, expected_columns_numeric) | ||
|
||
result = f(numeric_only=False) | ||
tm.assert_index_equal(result.columns, expected_columns) | ||
f = getattr(df.groupby("group"), "cumsum") | ||
result = f() | ||
tm.assert_index_equal(result.columns, expected_columns_numeric) | ||
|
||
result = f(numeric_only=False) | ||
tm.assert_index_equal(result.columns, expected_columns) | ||
|
||
|
||
def test_non_cython_api(): | ||
|
@@ -686,62 +686,40 @@ def test_numpy_compat(func): | |
getattr(g, func)(foo=1) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"dtype, min_val, max_val", | ||
[ | ||
(np.int32, np.iinfo(np.int32).min, np.iinfo(np.int32).max), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. min_val and max_val can both be computed inside the function. |
||
(np.int64, np.iinfo(np.int64).min, np.iinfo(np.int64).max), | ||
(np.float32, np.finfo(np.float32).min, np.finfo(np.float32).max), | ||
(np.float64, np.finfo(np.float64).min, np.finfo(np.float64).max), | ||
], | ||
) | ||
@pytest.mark.xfail( | ||
_is_numpy_dev, reason="https://github.com/pandas-dev/pandas/issues/31992" | ||
) | ||
def test_cummin_cummax(): | ||
def test_cummin(dtype, min_val, max_val): | ||
# GH 15048 | ||
num_types = [np.int32, np.int64, np.float32, np.float64] | ||
num_mins = [ | ||
np.iinfo(np.int32).min, | ||
np.iinfo(np.int64).min, | ||
np.finfo(np.float32).min, | ||
np.finfo(np.float64).min, | ||
] | ||
num_max = [ | ||
np.iinfo(np.int32).max, | ||
np.iinfo(np.int64).max, | ||
np.finfo(np.float32).max, | ||
np.finfo(np.float64).max, | ||
] | ||
base_df = pd.DataFrame( | ||
{"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]} | ||
) | ||
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] | ||
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] | ||
|
||
for dtype, min_val, max_val in zip(num_types, num_mins, num_max): | ||
df = base_df.astype(dtype) | ||
|
||
# cummin | ||
expected = pd.DataFrame({"B": expected_mins}).astype(dtype) | ||
result = df.groupby("A").cummin() | ||
tm.assert_frame_equal(result, expected) | ||
result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# Test cummin w/ min value for dtype | ||
df.loc[[2, 6], "B"] = min_val | ||
expected.loc[[2, 3, 6, 7], "B"] = min_val | ||
result = df.groupby("A").cummin() | ||
tm.assert_frame_equal(result, expected) | ||
expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
df = base_df.astype(dtype) | ||
|
||
# cummax | ||
expected = pd.DataFrame({"B": expected_maxs}).astype(dtype) | ||
result = df.groupby("A").cummax() | ||
tm.assert_frame_equal(result, expected) | ||
result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
expected = pd.DataFrame({"B": expected_mins}).astype(dtype) | ||
result = df.groupby("A").cummin() | ||
tm.assert_frame_equal(result, expected) | ||
result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# Test cummax w/ max value for dtype | ||
df.loc[[2, 6], "B"] = max_val | ||
expected.loc[[2, 3, 6, 7], "B"] = max_val | ||
result = df.groupby("A").cummax() | ||
tm.assert_frame_equal(result, expected) | ||
expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
# Test w/ min value for dtype | ||
df.loc[[2, 6], "B"] = min_val | ||
expected.loc[[2, 3, 6, 7], "B"] = min_val | ||
result = df.groupby("A").cummin() | ||
tm.assert_frame_equal(result, expected) | ||
expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# Test nan in some values | ||
base_df.loc[[0, 2, 4, 6], "B"] = np.nan | ||
|
@@ -751,6 +729,65 @@ def test_cummin_cummax(): | |
expected = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# Test nan in entire column | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you split this into another test rather than making this test very long? |
||
base_df["B"] = np.nan | ||
expected = pd.DataFrame({"B": [np.nan] * 8}) | ||
result = base_df.groupby("A").cummin() | ||
tm.assert_frame_equal(expected, result) | ||
result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() | ||
tm.assert_frame_equal(expected, result) | ||
|
||
# GH 15561 | ||
df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"]))) | ||
expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b") | ||
|
||
result = df.groupby("a")["b"].cummin() | ||
tm.assert_series_equal(expected, result) | ||
|
||
# GH 15635 | ||
df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2])) | ||
result = df.groupby("a").b.cummin() | ||
expected = pd.Series([1, 2, 1], name="b") | ||
tm.assert_series_equal(result, expected) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Simply make this a fixture at the top of the file rather than repeating it. |
||
"dtype, min_val, max_val", | ||
[ | ||
(np.int32, np.iinfo(np.int32).min, np.iinfo(np.int32).max), | ||
(np.int64, np.iinfo(np.int64).min, np.iinfo(np.int64).max), | ||
(np.float32, np.finfo(np.float32).min, np.finfo(np.float32).max), | ||
(np.float64, np.finfo(np.float64).min, np.finfo(np.float64).max), | ||
], | ||
) | ||
@pytest.mark.xfail( | ||
_is_numpy_dev, reason="https://github.com/pandas-dev/pandas/issues/31992" | ||
) | ||
def test_cummax(dtype, min_val, max_val): | ||
# GH 15048 | ||
base_df = pd.DataFrame( | ||
{"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]} | ||
) | ||
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] | ||
|
||
df = base_df.astype(dtype) | ||
|
||
expected = pd.DataFrame({"B": expected_maxs}).astype(dtype) | ||
result = df.groupby("A").cummax() | ||
tm.assert_frame_equal(result, expected) | ||
result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# Test w/ max value for dtype | ||
df.loc[[2, 6], "B"] = max_val | ||
expected.loc[[2, 3, 6, 7], "B"] = max_val | ||
result = df.groupby("A").cummax() | ||
tm.assert_frame_equal(result, expected) | ||
expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() | ||
tm.assert_frame_equal(result, expected) | ||
|
||
# Test nan in some values | ||
base_df.loc[[0, 2, 4, 6], "B"] = np.nan | ||
expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]}) | ||
result = base_df.groupby("A").cummax() | ||
tm.assert_frame_equal(result, expected) | ||
|
@@ -760,10 +797,6 @@ def test_cummin_cummax(): | |
# Test nan in entire column | ||
base_df["B"] = np.nan | ||
expected = pd.DataFrame({"B": [np.nan] * 8}) | ||
result = base_df.groupby("A").cummin() | ||
tm.assert_frame_equal(expected, result) | ||
result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame() | ||
tm.assert_frame_equal(expected, result) | ||
result = base_df.groupby("A").cummax() | ||
tm.assert_frame_equal(expected, result) | ||
result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame() | ||
|
@@ -772,21 +805,16 @@ def test_cummin_cummax(): | |
# GH 15561 | ||
df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"]))) | ||
expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b") | ||
for method in ["cummax", "cummin"]: | ||
result = getattr(df.groupby("a")["b"], method)() | ||
tm.assert_series_equal(expected, result) | ||
|
||
result = df.groupby("a")["b"].cummax() | ||
tm.assert_series_equal(expected, result) | ||
|
||
# GH 15635 | ||
df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1])) | ||
result = df.groupby("a").b.cummax() | ||
expected = pd.Series([2, 1, 2], name="b") | ||
tm.assert_series_equal(result, expected) | ||
|
||
df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2])) | ||
result = df.groupby("a").b.cummin() | ||
expected = pd.Series([1, 2, 1], name="b") | ||
tm.assert_series_equal(result, expected) | ||
|
||
|
||
@pytest.mark.parametrize( | ||
"in_vals, out_vals", | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just call this directly:
df.groupby('group').sum()