Skip to content

BUG: Make nullable booleans numeric #34056

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
May 11, 2020
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -800,6 +800,7 @@ ExtensionArray
- Fixed bug that caused :meth:`Series.__repr__()` to crash for extension types whose elements are multidimensional arrays (:issue:`33770`).
- Fixed bug where :meth:`Series.update` would raise a ``ValueError`` for ``ExtensionArray`` dtypes with missing values (:issue:`33980`)
- Fixed bug where :meth:`StringArray.memory_usage` was not implemented (:issue:`33963`)
- Fixed bug where :class:`DataFrameGroupBy` would ignore the ``min_count`` argument for aggregations on nullable boolean dtypes (:issue:`34051`)
- Fixed bug where ``DataFrame(columns=.., dtype='string')`` would fail (:issue:`27953`, :issue:`33623`)


Expand Down
4 changes: 4 additions & 0 deletions pandas/core/arrays/boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,10 @@ def __repr__(self) -> str:
def _is_boolean(self) -> bool:
return True

@property
def _is_numeric(self) -> bool:
    """
    Flag the enclosing type as numeric.

    Returns
    -------
    bool
        Always ``True``.
    """
    return True

def __from_arrow__(
self, array: Union["pyarrow.Array", "pyarrow.ChunkedArray"]
) -> "BooleanArray":
Expand Down
13 changes: 7 additions & 6 deletions pandas/core/dtypes/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,8 @@
from pandas.core.dtypes.inference import is_list_like
from pandas.core.dtypes.missing import isna, notna

import pandas as pd

if TYPE_CHECKING:
from pandas import Series
from pandas.core.arrays import ExtensionArray # noqa: F401
Expand Down Expand Up @@ -312,12 +314,11 @@ def maybe_cast_result_dtype(dtype: DtypeObj, how: str) -> DtypeObj:
DtypeObj
The desired dtype of the result.
"""
d = {
(np.dtype(np.bool), "add"): np.dtype(np.int64),
(np.dtype(np.bool), "cumsum"): np.dtype(np.int64),
(np.dtype(np.bool), "sum"): np.dtype(np.int64),
}
return d.get((dtype, how), dtype)
if how in ["add", "cumsum", "sum"] and (dtype == np.dtype(np.bool)):
return np.dtype(np.int64)
if how in ["add", "cumsum", "sum"] and isinstance(dtype, pd.BooleanDtype):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't use the ``pd`` import anywhere; prefer importing the needed names directly.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm hitting some circular import issues when I try to import these directly. Is it okay to import from within the function itself?

return pd.Int64Dtype()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

don't use pd

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can either import the dtype directly (best) or return the string "Int64".

return dtype


def maybe_cast_to_extension_array(cls: Type["ExtensionArray"], obj, dtype=None):
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,7 @@ def _cython_operation(
values = values.view("int64")
is_numeric = True
elif is_bool_dtype(values.dtype):
values = ensure_float64(values)
values = ensure_int_or_float(values)
elif is_integer_dtype(values):
# we use iNaT for the missing value on ints
# so pre-convert to guard this condition
Expand Down
17 changes: 17 additions & 0 deletions pandas/tests/extension/test_boolean.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,23 @@ def test_in_numeric_groupby(self, data_for_grouping):

tm.assert_index_equal(result, expected)

@pytest.mark.parametrize("min_count", [0, 10])
def test_groupby_sum_mincount(self, data_for_grouping, min_count):
    """
    Check that groupby ``sum`` honours ``min_count`` for the grouping fixture.

    The frame is grouped on column "A", whose keys 1, 2 and 3 occur three,
    two and two times respectively.

    Parameters
    ----------
    data_for_grouping
        Fixture supplying the seven values placed in column "B".
    min_count : int
        Passed through to ``sum``. With 0 the expected result is the
        ``Int64`` column ``[3, 0, 0]``; with 10 (more than any group's
        size) every expected entry is ``pd.NA``.
    """
    df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping})
    result = df.groupby("A").sum(min_count=min_count)

    # Only the expected values depend on min_count; the dtype, column name
    # and index are the same in both cases, so build the frame once.
    if min_count == 0:
        expected_values = [3, 0, 0]
    else:
        expected_values = [pd.NA] * 3

    expected = pd.DataFrame(
        {"B": pd.array(expected_values, dtype="Int64")},
        index=pd.Index([1, 2, 3], name="A"),
    )
    tm.assert_frame_equal(result, expected)


class TestNumericReduce(base.BaseNumericReduceTests):
def check_reduce(self, s, op_name, skipna):
Expand Down