Skip to content

REF: share IntegerArray/FloatingArray coerce_to_array #45596

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jan 27, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
112 changes: 17 additions & 95 deletions pandas/core/arrays/floating.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,9 @@

import numpy as np

from pandas._libs import (
lib,
missing as libmissing,
)
from pandas._typing import DtypeObj
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.common import (
is_bool_dtype,
is_float_dtype,
is_integer_dtype,
is_object_dtype,
is_string_dtype,
)
from pandas.core.dtypes.dtypes import register_extension_dtype

from pandas.core.arrays.numeric import (
Expand All @@ -34,6 +23,8 @@ class FloatingDtype(NumericDtype):
The attributes name & type are set when these subclasses are created.
"""

_default_np_dtype = np.dtype(np.float64)

def __repr__(self) -> str:
    """Return the dtype's ``name`` followed by the literal ``Dtype()``."""
    dtype_name = self.name
    return f"{dtype_name}Dtype()"

Expand Down Expand Up @@ -66,31 +57,8 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
return FLOAT_STR_TO_DTYPE[str(np_dtype)]
return None


def coerce_to_array(
values, dtype=None, mask=None, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
"""
Coerce the input values array to numpy arrays with a mask.

Parameters
----------
values : 1D list-like
dtype : float dtype
mask : bool 1D array, optional
copy : bool, default False
if True, copy the input

Returns
-------
tuple of (values, mask)
"""
# if values is floating numpy array, preserve its dtype
if dtype is None and hasattr(values, "dtype"):
if is_float_dtype(values.dtype):
dtype = values.dtype

if dtype is not None:
@classmethod
def _standardize_dtype(cls, dtype) -> FloatingDtype:
if isinstance(dtype, str) and dtype.startswith("Float"):
# Avoid DeprecationWarning from NumPy about np.dtype("Float64")
# https://github.com/numpy/numpy/pull/7476
Expand All @@ -101,60 +69,18 @@ def coerce_to_array(
dtype = FLOAT_STR_TO_DTYPE[str(np.dtype(dtype))]
except KeyError as err:
raise ValueError(f"invalid dtype specified {dtype}") from err
return dtype

if isinstance(values, FloatingArray):
values, mask = values._data, values._mask
if dtype is not None:
values = values.astype(dtype.numpy_dtype, copy=False)

if copy:
values = values.copy()
mask = mask.copy()
return values, mask

values = np.array(values, copy=copy)
if is_object_dtype(values.dtype) or is_string_dtype(values.dtype):
inferred_type = lib.infer_dtype(values, skipna=True)
if inferred_type == "empty":
pass
elif inferred_type == "boolean":
raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")

elif is_bool_dtype(values) and is_float_dtype(dtype):
values = np.array(values, dtype=float, copy=copy)

elif not (is_integer_dtype(values) or is_float_dtype(values)):
raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")

if values.ndim != 1:
raise TypeError("values must be a 1D list-like")

if mask is None:
mask = libmissing.is_numeric_na(values)

else:
assert len(mask) == len(values)

if not mask.ndim == 1:
raise TypeError("mask must be a 1D list-like")

# infer dtype if needed
if dtype is None:
dtype = np.dtype("float64")
else:
dtype = dtype.type

# if we are float, let's make sure that we can
# safely cast

# we copy as need to coerce here
# TODO should this be a safe cast?
if mask.any():
values = values.copy()
values[mask] = np.nan
values = values.astype(dtype, copy=False) # , casting="safe")
@classmethod
def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
    """
    Cast the values to the given dtype.

    Parameters
    ----------
    values : np.ndarray
        Array to cast.
    dtype : np.dtype
        Target dtype.
    copy : bool
        Forwarded unchanged to ``ndarray.astype``.

    Returns
    -------
    np.ndarray
        The result of ``values.astype(dtype, copy=copy)``.

    Notes
    -----
    "Safe" in the name refers to lossless casting, but no losslessness
    check is performed here: this is a plain ``astype``.
    """
    # Here only for compatibility with IntegerDtype.
    return values.astype(dtype, copy=copy)


class FloatingArray(NumericArray):
Expand Down Expand Up @@ -217,8 +143,10 @@ class FloatingArray(NumericArray):
Length: 3, dtype: Float32
"""

_dtype_cls = FloatingDtype

# The value used to fill '_data' to avoid upcasting
_internal_fill_value = 0.0
_internal_fill_value = np.nan
# Fill values used for any/all
_truthy_value = 1.0
_falsey_value = 0.0
Expand All @@ -239,12 +167,6 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):

super().__init__(values, mask, copy=copy)

@classmethod
def _coerce_to_array(
    cls, value, *, dtype: DtypeObj, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
    """
    Forward to the module-level ``coerce_to_array``.

    ``value``, ``dtype`` and ``copy`` are passed through unchanged, and the
    helper's result is returned as-is.
    """
    coerced = coerce_to_array(value, dtype=dtype, copy=copy)
    return coerced


_dtype_docstring = """
An ExtensionDtype for {dtype} data.
Expand Down
142 changes: 25 additions & 117 deletions pandas/core/arrays/integer.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,10 @@

import numpy as np

from pandas._libs import (
lib,
missing as libmissing,
)
from pandas._typing import DtypeObj
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.base import register_extension_dtype
from pandas.core.dtypes.common import (
is_bool_dtype,
is_float_dtype,
is_integer_dtype,
is_object_dtype,
is_string_dtype,
)

from pandas.core.arrays.masked import BaseMaskedDtype
from pandas.core.arrays.numeric import (
Expand All @@ -35,6 +24,8 @@ class _IntegerDtype(NumericDtype):
The attributes name & type are set when these subclasses are created.
"""

_default_np_dtype = np.dtype(np.int64)

def __repr__(self) -> str:
    """
    Return e.g. ``Int64Dtype()`` or ``UInt8Dtype()``.

    The number is ``8 * self.itemsize``; a leading ``U`` is added when
    ``self.is_unsigned_integer`` is truthy.
    """
    if self.is_unsigned_integer:
        prefix = "U"
    else:
        prefix = ""
    bit_width = 8 * self.itemsize
    return f"{prefix}Int{bit_width}Dtype()"
Expand Down Expand Up @@ -94,49 +85,8 @@ def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None:
return FLOAT_STR_TO_DTYPE[str(np_dtype)]
return None


def safe_cast(values, dtype, copy: bool):
    """
    Cast ``values`` to ``dtype``, accepting only value-preserving casts.

    A cast that NumPy itself considers "safe" is returned directly.
    Otherwise the cast is performed anyway and accepted only if every
    element compares equal to the original, e.g. floats that all hold
    integral values.

    Parameters
    ----------
    values : np.ndarray
        Array to cast.
    dtype : dtype-like
        Target dtype; anything accepted by ``ndarray.astype``.
    copy : bool
        Forwarded to ``ndarray.astype``.

    Returns
    -------
    np.ndarray
        ``values`` cast to ``dtype``.

    Raises
    ------
    TypeError
        If the cast would change at least one value.
    """
    try:
        return values.astype(dtype, casting="safe", copy=copy)
    except TypeError as err:
        # NumPy rejected this dtype pair under casting="safe". Cast anyway
        # and keep the result only if it round-trips element-wise.
        # NaN/inf have no integer representation; NumPy may warn about the
        # invalid cast. The equality check below rejects them regardless,
        # so the warning is silenced rather than leaked next to the error.
        with np.errstate(invalid="ignore"):
            casted = values.astype(dtype, copy=copy)
        if (casted == values).all():
            return casted

        raise TypeError(
            f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}"
        ) from err


def coerce_to_array(
values, dtype, mask=None, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
"""
Coerce the input values array to numpy arrays with a mask.

Parameters
----------
values : 1D list-like
dtype : integer dtype
mask : bool 1D array, optional
copy : bool, default False
if True, copy the input

Returns
-------
tuple of (values, mask)
"""
# if values is integer numpy array, preserve its dtype
if dtype is None and hasattr(values, "dtype"):
if is_integer_dtype(values.dtype):
dtype = values.dtype

if dtype is not None:
@classmethod
def _standardize_dtype(cls, dtype) -> _IntegerDtype:
if isinstance(dtype, str) and (
dtype.startswith("Int") or dtype.startswith("UInt")
):
Expand All @@ -149,64 +99,26 @@ def coerce_to_array(
dtype = INT_STR_TO_DTYPE[str(np.dtype(dtype))]
except KeyError as err:
raise ValueError(f"invalid dtype specified {dtype}") from err
return dtype

@classmethod
def _safe_cast(cls, values: np.ndarray, dtype: np.dtype, copy: bool) -> np.ndarray:
"""
Safely cast the values to the given dtype.

"safe" in this context means the casting is lossless. e.g. if 'values'
has a floating dtype, each value must be an integer.
"""
try:
return values.astype(dtype, casting="safe", copy=copy)
except TypeError as err:
casted = values.astype(dtype, copy=copy)
if (casted == values).all():
return casted

if isinstance(values, IntegerArray):
values, mask = values._data, values._mask
if dtype is not None:
values = values.astype(dtype.numpy_dtype, copy=False)

if copy:
values = values.copy()
mask = mask.copy()
return values, mask

values = np.array(values, copy=copy)
inferred_type = None
if is_object_dtype(values.dtype) or is_string_dtype(values.dtype):
inferred_type = lib.infer_dtype(values, skipna=True)
if inferred_type == "empty":
pass
elif inferred_type == "boolean":
raise TypeError(f"{values.dtype} cannot be converted to a FloatingDtype")

elif is_bool_dtype(values) and is_integer_dtype(dtype):
values = np.array(values, dtype=int, copy=copy)

elif not (is_integer_dtype(values) or is_float_dtype(values)):
raise TypeError(f"{values.dtype} cannot be converted to an IntegerDtype")

if values.ndim != 1:
raise TypeError("values must be a 1D list-like")

if mask is None:
mask = libmissing.is_numeric_na(values)
else:
assert len(mask) == len(values)

if mask.ndim != 1:
raise TypeError("mask must be a 1D list-like")

# infer dtype if needed
if dtype is None:
dtype = np.dtype("int64")
else:
dtype = dtype.type

# if we are float, let's make sure that we can
# safely cast

# we copy as need to coerce here
if mask.any():
values = values.copy()
values[mask] = 1
if inferred_type in ("string", "unicode"):
# casts from str are always safe since they raise
# a ValueError if the str cannot be parsed into an int
values = values.astype(dtype, copy=copy)
else:
values = safe_cast(values, dtype, copy=False)

return values, mask
raise TypeError(
f"cannot safely cast non-equivalent {values.dtype} to {np.dtype(dtype)}"
) from err


class IntegerArray(NumericArray):
Expand Down Expand Up @@ -277,6 +189,8 @@ class IntegerArray(NumericArray):
Length: 3, dtype: UInt16
"""

_dtype_cls = _IntegerDtype

# The value used to fill '_data' to avoid upcasting
_internal_fill_value = 1
# Fill values used for any/all
Expand All @@ -295,12 +209,6 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
)
super().__init__(values, mask, copy=copy)

@classmethod
def _coerce_to_array(
    cls, value, *, dtype: DtypeObj, copy: bool = False
) -> tuple[np.ndarray, np.ndarray]:
    """
    Forward to the module-level ``coerce_to_array``.

    ``value``, ``dtype`` and ``copy`` are passed through unchanged, and the
    helper's result is returned as-is.
    """
    coerced = coerce_to_array(value, dtype=dtype, copy=copy)
    return coerced


_dtype_docstring = """
An ExtensionDtype for {dtype} integer data.
Expand Down
Loading