Skip to content

REF: Add tests.groupby.methods #55312

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Oct 12, 2023
Empty file.
24 changes: 24 additions & 0 deletions pandas/tests/groupby/methods/test_corrwith.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
import numpy as np

from pandas import (
DataFrame,
Index,
Series,
)
import pandas._testing as tm


def test_corrwith_with_1_axis():
# GH 47723
df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]})
gb = df.groupby("a")

msg = "DataFrameGroupBy.corrwith with axis=1 is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
result = gb.corrwith(df, axis=1)
index = Index(
data=[(1, 0), (1, 1), (1, 2), (2, 2), (2, 0), (2, 1)],
name=("a", None),
)
expected = Series([np.nan] * 6, index=index)
tm.assert_series_equal(result, expected)
291 changes: 291 additions & 0 deletions pandas/tests/groupby/methods/test_cum.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,291 @@
import numpy as np
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In tests.frame and tests.series we have test_cumulative.py. Can we use that pattern? And if we're really trying to follow the pattern, that file goes outside the methods/ directory.

import pytest

from pandas.errors import UnsupportedFunctionCall
import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
DataFrame,
Series,
)
import pandas._testing as tm


@pytest.fixture(
    params=[np.int32, np.int64, np.float32, np.float64, "Int64", "Float64"],
    ids=["np.int32", "np.int64", "np.float32", "np.float64", "Int64", "Float64"],
)
def dtypes_for_minmax(request):
    """
    Provide ``(dtype, min_val, max_val)`` for each dtype exercised by the
    cummin and cummax tests.
    """
    dtype = request.param

    # The string dtypes have no limits of their own here; look them up on the
    # NumPy type of the same width.
    if dtype == "Int64":
        np_type = np.int64
    elif dtype == "Float64":
        np_type = np.float64
    else:
        np_type = dtype

    # Kind "i" is a signed integer; every other parametrized dtype is a float.
    info = np.iinfo(np_type) if np.dtype(np_type).kind == "i" else np.finfo(np_type)
    return (dtype, info.min, info.max)


def test_groupby_cumprod():
    """Compare groupby ``cumprod`` with a per-group ``apply`` for int and float."""
    # GH 4095
    int_df = DataFrame({"key": ["b"] * 10, "value": 2})

    float_df = DataFrame({"key": ["b"] * 100, "value": 2})
    float_df["value"] = float_df["value"].astype(float)

    # Same check for both frames: the groupby method must match apply.
    for frame in (int_df, float_df):
        actual = frame.groupby("key")["value"].cumprod()
        expected = frame.groupby("key", group_keys=False)["value"].apply(
            lambda ser: ser.cumprod()
        )
        expected.name = "value"
        tm.assert_series_equal(actual, expected)


def test_groupby_cumprod_overflow():
    """Pin the overflowed groupby ``cumprod`` result and match it to ``apply``."""
    # GH#37493 if we overflow we return garbage consistent with numpy
    frame = DataFrame({"key": ["b"] * 4, "value": 100_000})
    actual = frame.groupby("key")["value"].cumprod()

    # The last entry is the overflowed product, pinned literally.
    overflowed = [100_000, 10_000_000_000, 1_000_000_000_000_000, 7766279631452241920]
    tm.assert_series_equal(actual, Series(overflowed, name="value"))

    per_group = frame.groupby("key", group_keys=False)["value"]
    numpy_result = per_group.apply(lambda ser: ser.cumprod())
    numpy_result.name = "value"
    tm.assert_series_equal(actual, numpy_result)


def test_groupby_cumprod_nan_influences_other_columns():
    """Check that a NaN in column "b" leaves the ``cumprod`` of "c" intact."""
    # GH#48064
    data = {"a": 1, "b": [1, np.nan, 2], "c": [1, 2, 3.0]}
    grouped = DataFrame(data).groupby("a")

    result = grouped.cumprod(numeric_only=True, skipna=False)

    # With skipna=False the NaN propagates down "b" only.
    expected = DataFrame({"b": [1, np.nan, np.nan], "c": [1, 2, 6.0]})
    tm.assert_frame_equal(result, expected)


def test_cummin(dtypes_for_minmax):
    """
    Exercise groupby ``cummin`` for the fixture dtype: plain values, values at
    the dtype minimum, NaN entries, a datetime column and interleaved groups.
    """
    dtype, min_val = dtypes_for_minmax[:2]

    def cummin_via_apply(frame):
        # Reference result computed group by group through ``apply``.
        per_group = frame.groupby("A", group_keys=False).B.apply(lambda x: x.cummin())
        return per_group.to_frame()

    # GH 15048
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
    df = base_df.astype(dtype)
    expected = DataFrame({"B": [3, 3, 3, 2, 2, 2, 2, 1]}).astype(dtype)

    tm.assert_frame_equal(df.groupby("A").cummin(), expected)
    tm.assert_frame_equal(cummin_via_apply(df), expected)

    # Test w/ min value for dtype
    df.loc[[2, 6], "B"] = min_val
    df.loc[[1, 5], "B"] = min_val + 1
    expected.loc[[2, 3, 6, 7], "B"] = min_val
    expected.loc[[1, 5], "B"] = min_val + 1  # should not be rounded to min_val
    result = df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected, check_exact=True)
    tm.assert_frame_equal(result, cummin_via_apply(df), check_exact=True)

    # Test nan in some values
    # Explicit cast to float to avoid implicit cast when setting nan
    base_df = base_df.astype({"B": "float"})
    base_df.loc[[0, 2, 4, 6], "B"] = np.nan
    expected = DataFrame({"B": [np.nan, 4, np.nan, 2, np.nan, 3, np.nan, 1]})
    result = base_df.groupby("A").cummin()
    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(result, cummin_via_apply(base_df))

    # GH 15561
    dt_df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
    expected = Series(pd.to_datetime("2001"), index=[0], name="b")
    tm.assert_series_equal(expected, dt_df.groupby("a")["b"].cummin())

    # GH 15635
    df = DataFrame({"a": [1, 2, 1], "b": [1, 2, 2]})
    result = df.groupby("a").b.cummin()
    tm.assert_series_equal(result, Series([1, 2, 1], name="b"))


@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["UInt64", "Int64", "Float64", "float", "boolean"])
def test_cummin_max_all_nan_column(method, dtype):
    """Check that an all-NaN column stays all-NaN for frame and column groupbys."""
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [np.nan] * 8})
    base_df["B"] = base_df["B"].astype(dtype)
    grouped = base_df.groupby("A")

    expected = DataFrame({"B": [np.nan] * 8}, dtype=dtype)

    # DataFrameGroupBy path.
    frame_result = getattr(grouped, method)()
    tm.assert_frame_equal(expected, frame_result)

    # Single-column selection, widened back to a frame for comparison.
    column_result = getattr(grouped["B"], method)().to_frame()
    tm.assert_frame_equal(expected, column_result)


def test_cummax(dtypes_for_minmax):
    """
    Exercise groupby ``cummax`` for the fixture dtype: plain values, values at
    the dtype maximum, NaN entries, a datetime column and interleaved groups.
    """
    dtype, max_val = dtypes_for_minmax[0], dtypes_for_minmax[2]

    def cummax_via_apply(frame):
        # Reference result computed group by group through ``apply``.
        per_group = frame.groupby("A", group_keys=False).B.apply(lambda x: x.cummax())
        return per_group.to_frame()

    # GH 15048
    base_df = DataFrame({"A": [1, 1, 1, 1, 2, 2, 2, 2], "B": [3, 4, 3, 2, 2, 3, 2, 1]})
    df = base_df.astype(dtype)
    expected = DataFrame({"B": [3, 4, 4, 4, 2, 3, 3, 3]}).astype(dtype)

    tm.assert_frame_equal(df.groupby("A").cummax(), expected)
    tm.assert_frame_equal(cummax_via_apply(df), expected)

    # Test w/ max value for dtype
    df.loc[[2, 6], "B"] = max_val
    expected.loc[[2, 3, 6, 7], "B"] = max_val
    result = df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(result, cummax_via_apply(df))

    # Test nan in some values
    # Explicit cast to float to avoid implicit cast when setting nan
    base_df = base_df.astype({"B": "float"})
    base_df.loc[[0, 2, 4, 6], "B"] = np.nan
    expected = DataFrame({"B": [np.nan, 4, np.nan, 4, np.nan, 3, np.nan, 3]})
    result = base_df.groupby("A").cummax()
    tm.assert_frame_equal(result, expected)
    tm.assert_frame_equal(result, cummax_via_apply(base_df))

    # GH 15561
    dt_df = DataFrame({"a": [1], "b": pd.to_datetime(["2001"])})
    expected = Series(pd.to_datetime("2001"), index=[0], name="b")
    tm.assert_series_equal(expected, dt_df.groupby("a")["b"].cummax())

    # GH 15635
    df = DataFrame({"a": [1, 2, 1], "b": [2, 1, 1]})
    result = df.groupby("a").b.cummax()
    tm.assert_series_equal(result, Series([2, 1, 2], name="b"))


def test_cummax_i8_at_implementation_bound():
    """
    Check groupby ``cummax`` on increasing values that start at the integer
    used to represent NaT, for both an int64 and a datetime64[ns] column.
    """
    # the minimum value used to be treated as NPY_NAT+1 instead of NPY_NAT
    # for int64 dtype GH#46382
    ser = Series([pd.NaT._value + n for n in range(5)])
    # Reinterpret the same integers as datetimes through the underlying
    # ndarray; Series.view is deprecated in recent pandas releases.
    as_datetimes = ser.to_numpy().view("M8[ns]")
    df = DataFrame({"A": 1, "B": ser, "C": as_datetimes})
    gb = df.groupby("A")

    res = gb.cummax()
    # The inputs are already increasing, so cummax must return them unchanged.
    exp = df[["B", "C"]]
    tm.assert_frame_equal(res, exp)


@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize("dtype", ["float", "Int64", "Float64"])
@pytest.mark.parametrize(
    "groups,expected_data",
    [
        ([1, 1, 1], [1, None, None]),
        ([1, 2, 3], [1, None, 2]),
        ([1, 3, 3], [1, None, None]),
    ],
)
def test_cummin_max_skipna(method, dtype, groups, expected_data):
    """Check ``skipna=False`` results per grouping and that the input is kept."""
    # GH-34047
    df = DataFrame({"a": Series([1, None, 2], dtype=dtype)})
    snapshot = df.copy()

    column_groupby = df.groupby(groups)["a"]
    result = getattr(column_groupby, method)(skipna=False)

    # check we didn't accidentally alter df
    tm.assert_frame_equal(df, snapshot)

    expected = Series(expected_data, dtype=dtype, name="a")
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["cummin", "cummax"])
def test_cummin_max_skipna_multiple_cols(method):
    """Check that ``skipna=False`` NaN-fills only the column that has the NaN."""
    # Ensure missing value in "a" doesn't cause "b" to be nan-filled
    frame = DataFrame({"a": [np.nan, 2.0, 2.0], "b": [2.0, 2.0, 2.0]})
    both_columns = frame.groupby([1, 1, 1])[["a", "b"]]

    result = getattr(both_columns, method)(skipna=False)

    expected = DataFrame({"a": [np.nan] * 3, "b": [2.0] * 3})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("func", ["cumprod", "cumsum"])
def test_numpy_compat(func):
    """Check that extra positional or keyword arguments raise for groupby ops."""
    # see gh-12811
    grouped = DataFrame({"A": [1, 2, 1], "B": [1, 2, 3]}).groupby("A")
    msg = "numpy operations are not valid with groupby"

    # First call passes stray positionals, second a stray keyword.
    bad_calls = [((1, 2, 3), {}), ((), {"foo": 1})]
    for args, kwargs in bad_calls:
        with pytest.raises(UnsupportedFunctionCall, match=msg):
            getattr(grouped, func)(*args, **kwargs)


@td.skip_if_32bit
@pytest.mark.parametrize("method", ["cummin", "cummax"])
@pytest.mark.parametrize(
    "dtype,val", [("UInt64", np.iinfo("uint64").max), ("Int64", 2**53 + 1)]
)
def test_nullable_int_not_cast_as_float(method, dtype, val):
    """Check that large nullable-integer values come back unchanged."""
    values = [val, pd.NA]
    frame = DataFrame({"grp": [1, 1], "b": values}, dtype=dtype)

    result = frame.groupby("grp").transform(method)

    # Same values and dtype as the input column are expected back.
    expected = DataFrame({"b": values}, dtype=dtype)
    tm.assert_frame_equal(result, expected)
Loading