Skip to content

CLN: Clean groupby/test_function.py #32027

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 17 commits into from
Feb 20, 2020
166 changes: 97 additions & 69 deletions pandas/tests/groupby/test_function.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,13 +223,13 @@ def test_arg_passthru():
tm.assert_index_equal(result.columns, expected_columns)

expected_columns = Index(["int", "float", "string", "category_int", "timedelta"])
for attr in ["sum"]:
f = getattr(df.groupby("group"), attr)
result = f()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = f(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)
f = getattr(df.groupby("group"), "sum")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just call this directly: `df.groupby('group').sum()`

result = f()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = f(numeric_only=False)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done. I think it's a bit easier to read and understand if we call these functions directly instead of assigning them to `f` and then calling it, so I changed that as well.

tm.assert_index_equal(result.columns, expected_columns)

expected_columns = Index(["int", "float", "category_int"])
for attr in ["prod", "cumprod"]:
Expand All @@ -254,13 +254,13 @@ def test_arg_passthru():
tm.assert_index_equal(result.columns, expected_columns)

expected_columns = Index(["int", "float", "category_int", "timedelta"])
for attr in ["cumsum"]:
f = getattr(df.groupby("group"), attr)
result = f()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = f(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)
f = getattr(df.groupby("group"), "cumsum")
result = f()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = f(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)


def test_non_cython_api():
Expand Down Expand Up @@ -686,62 +686,40 @@ def test_numpy_compat(func):
getattr(g, func)(foo=1)


@pytest.mark.parametrize(
"dtype, min_val, max_val",
[
(np.int32, np.iinfo(np.int32).min, np.iinfo(np.int32).max),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

`min_val` and `max_val` can both be computed inside the function

(np.int64, np.iinfo(np.int64).min, np.iinfo(np.int64).max),
(np.float32, np.finfo(np.float32).min, np.finfo(np.float32).max),
(np.float64, np.finfo(np.float64).min, np.finfo(np.float64).max),
],
)
@pytest.mark.xfail(
_is_numpy_dev, reason="https://github.com/pandas-dev/pandas/issues/31992"
)
def test_cummin_cummax():
def test_cummin(dtype, min_val, max_val):
# GH 15048
num_types = [np.int32, np.int64, np.float32, np.float64]
num_mins = [
np.iinfo(np.int32).min,
np.iinfo(np.int64).min,
np.finfo(np.float32).min,
np.finfo(np.float64).min,
]
num_max = [
np.iinfo(np.int32).max,
np.iinfo(np.int64).max,
np.finfo(np.float32).max,
np.finfo(np.float64).max,
]
base_df = pd.DataFrame(
{"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}
)
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]

for dtype, min_val, max_val in zip(num_types, num_mins, num_max):
df = base_df.astype(dtype)

# cummin
expected = pd.DataFrame({"B": expected_mins}).astype(dtype)
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

# Test cummin w/ min value for dtype
df.loc[[2, 6], "B"] = min_val
expected.loc[[2, 3, 6, 7], "B"] = min_val
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)
df = base_df.astype(dtype)

# cummax
expected = pd.DataFrame({"B": expected_maxs}).astype(dtype)
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)
expected = pd.DataFrame({"B": expected_mins}).astype(dtype)
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

# Test cummax w/ max value for dtype
df.loc[[2, 6], "B"] = max_val
expected.loc[[2, 3, 6, 7], "B"] = max_val
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)
# Test w/ min value for dtype
df.loc[[2, 6], "B"] = min_val
expected.loc[[2, 3, 6, 7], "B"] = min_val
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

# Test nan in some values
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
Expand All @@ -751,6 +729,65 @@ def test_cummin_cummax():
expected = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

# Test nan in entire column
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you split this into a separate test rather than making this test very long?

base_df["B"] = np.nan
expected = pd.DataFrame({"B": [np.nan] * 8})
result = base_df.groupby("A").cummin()
tm.assert_frame_equal(expected, result)
result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(expected, result)

# GH 15561
df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"])))
expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b")

result = df.groupby("a")["b"].cummin()
tm.assert_series_equal(expected, result)

# GH 15635
df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2]))
result = df.groupby("a").b.cummin()
expected = pd.Series([1, 2, 1], name="b")
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Simply make this a fixture at the top of the file rather than repeating the parametrization.

"dtype, min_val, max_val",
[
(np.int32, np.iinfo(np.int32).min, np.iinfo(np.int32).max),
(np.int64, np.iinfo(np.int64).min, np.iinfo(np.int64).max),
(np.float32, np.finfo(np.float32).min, np.finfo(np.float32).max),
(np.float64, np.finfo(np.float64).min, np.finfo(np.float64).max),
],
)
@pytest.mark.xfail(
_is_numpy_dev, reason="https://github.com/pandas-dev/pandas/issues/31992"
)
def test_cummax(dtype, min_val, max_val):
# GH 15048
base_df = pd.DataFrame(
{"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}
)
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]

df = base_df.astype(dtype)

expected = pd.DataFrame({"B": expected_maxs}).astype(dtype)
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)

# Test w/ max value for dtype
df.loc[[2, 6], "B"] = max_val
expected.loc[[2, 3, 6, 7], "B"] = max_val
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)

# Test nan in some values
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
result = base_df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
Expand All @@ -760,10 +797,6 @@ def test_cummin_cummax():
# Test nan in entire column
base_df["B"] = np.nan
expected = pd.DataFrame({"B": [np.nan] * 8})
result = base_df.groupby("A").cummin()
tm.assert_frame_equal(expected, result)
result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(expected, result)
result = base_df.groupby("A").cummax()
tm.assert_frame_equal(expected, result)
result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
Expand All @@ -772,21 +805,16 @@ def test_cummin_cummax():
# GH 15561
df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"])))
expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b")
for method in ["cummax", "cummin"]:
result = getattr(df.groupby("a")["b"], method)()
tm.assert_series_equal(expected, result)

result = df.groupby("a")["b"].cummax()
tm.assert_series_equal(expected, result)

# GH 15635
df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1]))
result = df.groupby("a").b.cummax()
expected = pd.Series([2, 1, 2], name="b")
tm.assert_series_equal(result, expected)

df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2]))
result = df.groupby("a").b.cummin()
expected = pd.Series([1, 2, 1], name="b")
tm.assert_series_equal(result, expected)


@pytest.mark.parametrize(
"in_vals, out_vals",
Expand Down