CLN: Clean groupby/test_function.py #32027

Merged
merged 17 commits on Feb 20, 2020
219 changes: 130 additions & 89 deletions pandas/tests/groupby/test_function.py
@@ -26,6 +26,18 @@
from pandas.util import _test_decorators as td


@pytest.fixture(
params=[np.int32, np.int64, np.float32, np.float64],
ids=["np.int32", "np.int64", "np.float32", "np.float64"],
)
def numpy_dtypes(request):
Contributor:

Can you name this something else, maybe numpy_dtypes_for_minmax? Also, you could include the iinfo or finfo result in the return (i.e. return a tuple); I find this a bit more obvious than doing it in the test function.
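
A minimal sketch of what this suggestion could look like (illustration only, not part of the diff; the fixture name and tuple layout are taken from this thread):

import numpy as np
import pytest


@pytest.fixture(
    params=[np.int32, np.int64, np.float32, np.float64],
    ids=["np.int32", "np.int64", "np.float32", "np.float64"],
)
def numpy_dtypes_for_minmax(request):
    """Return (dtype, min_val, max_val) so tests don't recompute the bounds."""
    dtype = request.param
    # integer dtypes use np.iinfo; floating dtypes use np.finfo
    info = np.iinfo(dtype) if np.dtype(dtype).kind == "i" else np.finfo(dtype)
    return dtype, info.min, info.max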

Member Author:

Updated based on comments, and CI is green.

Member:

Not a fan of returning tuples and unpacking them in tests in scenarios where composable fixtures could be used instead and could have more utility. However, naming the fixture numpy_dtypes_for_minmax somewhat limits the reuse of composable fixtures.

diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py
index 6205dfb87..348a6c189 100644
--- a/pandas/tests/groupby/test_function.py
+++ b/pandas/tests/groupby/test_function.py
@@ -35,15 +35,31 @@ def numpy_dtypes_for_minmax(request):
     Fixture of numpy dtypes with min and max values used for testing
     cummin and cummax
     """
-    dtype = request.param
-    min_val = (
-        np.iinfo(dtype).min if np.dtype(dtype).kind == "i" else np.finfo(dtype).min
-    )
-    max_val = (
-        np.iinfo(dtype).max if np.dtype(dtype).kind == "i" else np.finfo(dtype).max
+    return request.param
+
+
+@pytest.fixture()
+def dtype_min(numpy_dtypes_for_minmax):
+    """
+    Fixture to return minimum value of dtype.
+    """
+    return (
+        np.iinfo(numpy_dtypes_for_minmax).min
+        if np.dtype(numpy_dtypes_for_minmax).kind == "i"
+        else np.finfo(numpy_dtypes_for_minmax).min
     )
 
-    return (dtype, min_val, max_val)
+
+@pytest.fixture()
+def dtype_max(numpy_dtypes_for_minmax):
+    """
+    Fixture to return maximum value of dtype.
+    """
+    return (
+        np.iinfo(numpy_dtypes_for_minmax).max
+        if np.dtype(numpy_dtypes_for_minmax).kind == "i"
+        else np.finfo(numpy_dtypes_for_minmax).max
+    )
 
 
 @pytest.mark.parametrize("agg_func", ["any", "all"])
@@ -704,9 +720,8 @@ def test_numpy_compat(func):
     reason="https://github.com/pandas-dev/pandas/issues/31992",
     strict=False,
 )
-def test_cummin(numpy_dtypes_for_minmax):
-    dtype = numpy_dtypes_for_minmax[0]
-    min_val = numpy_dtypes_for_minmax[1]
+def test_cummin(numpy_dtypes_for_minmax, dtype_min):
+    dtype = numpy_dtypes_for_minmax
 
     # GH 15048
     base_df = pd.DataFrame(
@@ -723,8 +738,8 @@ def test_cummin(numpy_dtypes_for_minmax):
     tm.assert_frame_equal(result, expected)
 
     # Test w/ min value for dtype
-    df.loc[[2, 6], "B"] = min_val
-    expected.loc[[2, 3, 6, 7], "B"] = min_val
+    df.loc[[2, 6], "B"] = dtype_min
+    expected.loc[[2, 3, 6, 7], "B"] = dtype_min
     result = df.groupby("A").cummin()
     tm.assert_frame_equal(result, expected)
     expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
@@ -772,9 +787,8 @@ def test_cummin_all_nan_column():
     reason="https://github.com/pandas-dev/pandas/issues/31992",
     strict=False,
 )
-def test_cummax(numpy_dtypes_for_minmax):
-    dtype = numpy_dtypes_for_minmax[0]
-    max_val = numpy_dtypes_for_minmax[2]
+def test_cummax(numpy_dtypes_for_minmax, dtype_max):
+    dtype = numpy_dtypes_for_minmax
 
     # GH 15048
     base_df = pd.DataFrame(
@@ -791,8 +805,8 @@ def test_cummax(numpy_dtypes_for_minmax):
     tm.assert_frame_equal(result, expected)
 
     # Test w/ max value for dtype
-    df.loc[[2, 6], "B"] = max_val
-    expected.loc[[2, 3, 6, 7], "B"] = max_val
+    df.loc[[2, 6], "B"] = dtype_max
+    expected.loc[[2, 3, 6, 7], "B"] = dtype_max
     result = df.groupby("A").cummax()
     tm.assert_frame_equal(result, expected)
     expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()

Member Author:

If we take the min and max out of the fixture, I'd just compute them inside the tests: the min and max are each only used in one test, so there isn't much value in having them in a fixture anyway.
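
A rough sketch of that alternative (illustrative only; assumes the fixture yields just the dtype and that numpy is imported as np):

def test_cummin(numpy_dtypes_for_minmax):  # hypothetical: fixture returns only the dtype
    dtype = numpy_dtypes_for_minmax
    # compute the dtype's minimum right where it is needed
    min_val = (
        np.iinfo(dtype).min if np.dtype(dtype).kind == "i" else np.finfo(dtype).min
    )
    # ... rest of the test unchanged ...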

Member:

Sorry, I wasn't clear. I'm not a fan, but there's no need to change it. See #32027 (comment)

"""
Fixture of numpy dtypes with min and max values used for testing
cummin and cummax
"""
return request.param


@pytest.mark.parametrize("agg_func", ["any", "all"])
@pytest.mark.parametrize("skipna", [True, False])
@pytest.mark.parametrize(
@@ -174,11 +186,10 @@ def test_arg_passthru():
)

for attr in ["mean", "median"]:
f = getattr(df.groupby("group"), attr)
result = f()
result = getattr(df.groupby("group"), attr)()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = f(numeric_only=False)
result = getattr(df.groupby("group"), attr)(numeric_only=False)
tm.assert_frame_equal(result.reindex_like(expected), expected)

# TODO: min, max *should* handle
@@ -195,11 +206,10 @@ def test_arg_passthru():
]
)
for attr in ["min", "max"]:
f = getattr(df.groupby("group"), attr)
result = f()
result = getattr(df.groupby("group"), attr)()
tm.assert_index_equal(result.columns, expected_columns)

result = f(numeric_only=False)
result = getattr(df.groupby("group"), attr)(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)

expected_columns = Index(
@@ -215,52 +225,47 @@
]
)
for attr in ["first", "last"]:
f = getattr(df.groupby("group"), attr)
result = f()
result = getattr(df.groupby("group"), attr)()
tm.assert_index_equal(result.columns, expected_columns)

result = f(numeric_only=False)
result = getattr(df.groupby("group"), attr)(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)

expected_columns = Index(["int", "float", "string", "category_int", "timedelta"])
for attr in ["sum"]:
f = getattr(df.groupby("group"), attr)
result = f()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = f(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)
result = df.groupby("group").sum()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = df.groupby("group").sum(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)

expected_columns = Index(["int", "float", "category_int"])
for attr in ["prod", "cumprod"]:
f = getattr(df.groupby("group"), attr)
result = f()
result = getattr(df.groupby("group"), attr)()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = f(numeric_only=False)
result = getattr(df.groupby("group"), attr)(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)

# like min, max, but don't include strings
expected_columns = Index(
["int", "float", "category_int", "datetime", "datetimetz", "timedelta"]
)
for attr in ["cummin", "cummax"]:
f = getattr(df.groupby("group"), attr)
result = f()
result = getattr(df.groupby("group"), attr)()
# GH 15561: numeric_only=False set by default like min/max
tm.assert_index_equal(result.columns, expected_columns)

result = f(numeric_only=False)
result = getattr(df.groupby("group"), attr)(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)

expected_columns = Index(["int", "float", "category_int", "timedelta"])
for attr in ["cumsum"]:
f = getattr(df.groupby("group"), attr)
result = f()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = f(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)
result = getattr(df.groupby("group"), "cumsum")()
tm.assert_index_equal(result.columns, expected_columns_numeric)

result = getattr(df.groupby("group"), "cumsum")(numeric_only=False)
tm.assert_index_equal(result.columns, expected_columns)


def test_non_cython_api():
@@ -691,59 +696,33 @@ def test_numpy_compat(func):
reason="https://github.com/pandas-dev/pandas/issues/31992",
strict=False,
)
def test_cummin_cummax():
def test_cummin(numpy_dtypes):
dtype = numpy_dtypes
min_val = (
np.iinfo(dtype).min if np.dtype(dtype).kind == "i" else np.finfo(dtype).min
Member:

For readability, we could just include num_mins and num_max in the parameterisation.
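
A minimal sketch of that parameterisation (illustrative only, not part of the PR; the precomputed bounds mirror the values already used in the tests):

import numpy as np
import pytest


@pytest.mark.parametrize(
    "dtype, min_val, max_val",
    [
        (np.int32, np.iinfo(np.int32).min, np.iinfo(np.int32).max),
        (np.int64, np.iinfo(np.int64).min, np.iinfo(np.int64).max),
        (np.float32, np.finfo(np.float32).min, np.finfo(np.float32).max),
        (np.float64, np.finfo(np.float64).min, np.finfo(np.float64).max),
    ],
)
def test_cummin(dtype, min_val, max_val):
    # bounds arrive precomputed; no iinfo/finfo branching inside the test body
    ...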

Member Author:

Yes, I think this could be better.

Member:

This would now be more difficult if a fixture is used for the dtype (as requested). We could create composable min and max fixtures, but that's probably OTT for one test.

)

# GH 15048
num_types = [np.int32, np.int64, np.float32, np.float64]
num_mins = [
np.iinfo(np.int32).min,
np.iinfo(np.int64).min,
np.finfo(np.float32).min,
np.finfo(np.float64).min,
]
num_max = [
np.iinfo(np.int32).max,
np.iinfo(np.int64).max,
np.finfo(np.float32).max,
np.finfo(np.float64).max,
]
base_df = pd.DataFrame(
{"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}
)
expected_mins = [3, 3, 3, 2, 2, 2, 2, 1]
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]

for dtype, min_val, max_val in zip(num_types, num_mins, num_max):
df = base_df.astype(dtype)

# cummin
expected = pd.DataFrame({"B": expected_mins}).astype(dtype)
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

# Test cummin w/ min value for dtype
df.loc[[2, 6], "B"] = min_val
expected.loc[[2, 3, 6, 7], "B"] = min_val
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)
df = base_df.astype(dtype)

# cummax
expected = pd.DataFrame({"B": expected_maxs}).astype(dtype)
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)
expected = pd.DataFrame({"B": expected_mins}).astype(dtype)
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

# Test cummax w/ max value for dtype
df.loc[[2, 6], "B"] = max_val
expected.loc[[2, 3, 6, 7], "B"] = max_val
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)
# Test w/ min value for dtype
df.loc[[2, 6], "B"] = min_val
expected.loc[[2, 3, 6, 7], "B"] = min_val
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

# Test nan in some values
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
@@ -753,41 +732,103 @@ def test_cummin_cummax():
expected = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)

expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
result = base_df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
expected = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)
# GH 15561
df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"])))
expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b")

result = df.groupby("a")["b"].cummin()
tm.assert_series_equal(expected, result)

# GH 15635
df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2]))
result = df.groupby("a").b.cummin()
expected = pd.Series([1, 2, 1], name="b")
tm.assert_series_equal(result, expected)


@pytest.mark.xfail(
_is_numpy_dev,
reason="https://github.com/pandas-dev/pandas/issues/31992",
strict=False,
)
def test_cummin_all_nan_column():
base_df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8})

# Test nan in entire column
base_df["B"] = np.nan
expected = pd.DataFrame({"B": [np.nan] * 8})
result = base_df.groupby("A").cummin()
tm.assert_frame_equal(expected, result)
result = base_df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(expected, result)


@pytest.mark.xfail(
_is_numpy_dev,
reason="https://github.com/pandas-dev/pandas/issues/31992",
strict=False,
)
def test_cummax(numpy_dtypes):
dtype = numpy_dtypes
max_val = (
np.iinfo(dtype).max if np.dtype(dtype).kind == "i" else np.finfo(dtype).max
)

# GH 15048
base_df = pd.DataFrame(
{"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]}
)
expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3]

df = base_df.astype(dtype)

expected = pd.DataFrame({"B": expected_maxs}).astype(dtype)
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
result = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)

# Test w/ max value for dtype
df.loc[[2, 6], "B"] = max_val
expected.loc[[2, 3, 6, 7], "B"] = max_val
result = df.groupby("A").cummax()
tm.assert_frame_equal(result, expected)
expected = df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)

# Test nan in some values
base_df.loc[[0, 2, 4, 6], "B"] = np.nan
expected = pd.DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
result = base_df.groupby("A").cummax()
tm.assert_frame_equal(expected, result)
result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(expected, result)
tm.assert_frame_equal(result, expected)
expected = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(result, expected)

# GH 15561
df = pd.DataFrame(dict(a=[1], b=pd.to_datetime(["2001"])))
expected = pd.Series(pd.to_datetime("2001"), index=[0], name="b")
for method in ["cummax", "cummin"]:
result = getattr(df.groupby("a")["b"], method)()
tm.assert_series_equal(expected, result)

result = df.groupby("a")["b"].cummax()
tm.assert_series_equal(expected, result)

# GH 15635
df = pd.DataFrame(dict(a=[1, 2, 1], b=[2, 1, 1]))
result = df.groupby("a").b.cummax()
expected = pd.Series([2, 1, 2], name="b")
tm.assert_series_equal(result, expected)

df = pd.DataFrame(dict(a=[1, 2, 1], b=[1, 2, 2]))
result = df.groupby("a").b.cummin()
expected = pd.Series([1, 2, 1], name="b")
tm.assert_series_equal(result, expected)

@pytest.mark.xfail(
_is_numpy_dev,
reason="https://github.com/pandas-dev/pandas/issues/31992",
strict=False,
)
def test_cummax_all_nan_column():
base_df = pd.DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8})

expected = pd.DataFrame({"B": [np.nan] * 8})
result = base_df.groupby("A").cummax()
tm.assert_frame_equal(expected, result)
result = base_df.groupby("A").B.apply(lambda x: x.cummax()).to_frame()
tm.assert_frame_equal(expected, result)


@pytest.mark.parametrize(