Sync Fork from Upstream Repo #184

Merged
10 commits merged on Apr 19, 2021
16 changes: 10 additions & 6 deletions pandas/core/algorithms.py
@@ -1876,29 +1876,33 @@ def _sort_tuples(values: np.ndarray) -> np.ndarray:
     return values[indexer]


-def union_with_duplicates(lvals: np.ndarray, rvals: np.ndarray) -> np.ndarray:
+def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike:
     """
     Extracts the union from lvals and rvals with respect to duplicates and nans in
     both arrays.

     Parameters
     ----------
-    lvals: np.ndarray
+    lvals: np.ndarray or ExtensionArray
         left values which is ordered in front.
-    rvals: np.ndarray
+    rvals: np.ndarray or ExtensionArray
         right values ordered after lvals.

     Returns
     -------
-    np.ndarray containing the unsorted union of both arrays
+    np.ndarray or ExtensionArray
+        Containing the unsorted union of both arrays.
     """
     indexer = []
     l_count = value_counts(lvals, dropna=False)
     r_count = value_counts(rvals, dropna=False)
     l_count, r_count = l_count.align(r_count, fill_value=0)
     unique_array = unique(np.append(lvals, rvals))
-    if is_extension_array_dtype(lvals) or is_extension_array_dtype(rvals):
-        unique_array = pd_array(unique_array)
+    if not isinstance(lvals, np.ndarray):
+        # i.e. ExtensionArray
+        # Note: we only get here with lvals.dtype == rvals.dtype
+        # TODO: are there any cases where union won't be type/dtype preserving?
+        unique_array = type(lvals)._from_sequence(unique_array, dtype=lvals.dtype)
     for i, value in enumerate(unique_array):
         indexer += [i] * int(max(l_count[value], r_count[value]))
     return unique_array.take(indexer)
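
For context, this internal helper backs Index.union when either side contains duplicates. A minimal sketch of the behavior (union_with_duplicates is private, so the import path and exact return type are implementation details):

    import numpy as np
    from pandas.core.algorithms import union_with_duplicates

    lvals = np.array([1.0, 1.0, 2.0])
    rvals = np.array([2.0, 2.0, 3.0])
    # Each distinct value is repeated max(left_count, right_count) times,
    # in order of first appearance.
    union_with_duplicates(lvals, rvals)  # array([1., 1., 2., 2., 3.])
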
15 changes: 10 additions & 5 deletions pandas/core/arrays/string_arrow.py
@@ -675,13 +675,18 @@ def value_counts(self, dropna: bool = True) -> Series:

         vc = self._data.value_counts()

-        # Index cannot hold ExtensionArrays yet
-        index = Index(type(self)(vc.field(0)).astype(object))
+        values = vc.field(0)
+        counts = vc.field(1)
+        if dropna and self._data.null_count > 0:
+            mask = values.is_valid()
+            values = values.filter(mask)
+            counts = counts.filter(mask)
+
         # No missing values so we can adhere to the interface and return a numpy array.
-        counts = np.array(vc.field(1))
+        counts = np.array(counts)

-        if dropna and self._data.null_count > 0:
-            raise NotImplementedError("yo")
+        # Index cannot hold ExtensionArrays yet
+        index = Index(type(self)(values)).astype(object)

         return Series(counts, index=index).astype("Int64")

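The net effect: ArrowStringArray.value_counts now honors dropna instead of raising. A quick illustration (a sketch; assumes pyarrow is installed, and uses the private _from_sequence constructor since the public dtype alias for this array has varied across versions):

    from pandas.core.arrays.string_arrow import ArrowStringArray

    arr = ArrowStringArray._from_sequence(["a", "a", None])
    arr.value_counts(dropna=False)  # includes the missing value in the counts
    arr.value_counts(dropna=True)   # previously raised NotImplementedError("yo")
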
5 changes: 3 additions & 2 deletions pandas/core/computation/engines.py
@@ -12,6 +12,7 @@
     align_terms,
     reconstruct_object,
 )
+from pandas.core.computation.expr import Expr
 from pandas.core.computation.ops import (
     MATHOPS,
     REDUCTIONS,
@@ -26,13 +27,13 @@ class NumExprClobberingError(NameError):
     pass


-def _check_ne_builtin_clash(expr):
+def _check_ne_builtin_clash(expr: Expr) -> None:
     """
     Attempt to prevent foot-shooting in a helpful way.

     Parameters
     ----------
-    terms : Term
+    expr : Expr
         Terms can contain
     """
     names = expr.names
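This is the check that surfaces NumExprClobberingError when a name in an expression shadows a numexpr builtin; roughly (a hypothetical session; requires numexpr to be installed):

    import pandas as pd

    df = pd.DataFrame({"sin": [1, 2, 3]})
    # The column name clashes with the numexpr builtin sin()
    df.query("sin > 1", engine="numexpr")  # raises NumExprClobberingError
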
13 changes: 8 additions & 5 deletions pandas/core/computation/eval.py
@@ -1,9 +1,9 @@
 """
 Top level ``eval`` module.
 """
+from __future__ import annotations

 import tokenize
-from typing import Optional
 import warnings

 from pandas._libs.lib import no_default
@@ -14,13 +14,14 @@
     PARSERS,
     Expr,
 )
+from pandas.core.computation.ops import BinOp
 from pandas.core.computation.parsing import tokenize_string
 from pandas.core.computation.scope import ensure_scope

 from pandas.io.formats.printing import pprint_thing


-def _check_engine(engine: Optional[str]) -> str:
+def _check_engine(engine: str | None) -> str:
     """
     Make sure a valid engine is passed.

@@ -161,9 +162,9 @@ def _check_for_locals(expr: str, stack_level: int, parser: str):


 def eval(
-    expr,
-    parser="pandas",
-    engine: Optional[str] = None,
+    expr: str | BinOp,  # we leave BinOp out of the docstr bc it isn't for users
+    parser: str = "pandas",
+    engine: str | None = None,
     truediv=no_default,
     local_dict=None,
     global_dict=None,
@@ -309,10 +310,12 @@ def eval(
         stacklevel=2,
     )

+    exprs: list[str | BinOp]
     if isinstance(expr, str):
         _check_expression(expr)
         exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""]
     else:
+        # ops.BinOp; for internal compat, not intended to be passed by users
         exprs = [expr]
     multi_line = len(exprs) > 1

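The exprs handling above is what lets eval accept a multi-line string, one expression per non-empty line; for instance (a sketch; multi-line input must consist entirely of assignments):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    # Each non-empty line is stripped and parsed as its own expression;
    # the assignments are written back to df via inplace=True.
    df.eval(
        """
        b = a * 2
        c = b + a
        """,
        inplace=True,
    )
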
4 changes: 3 additions & 1 deletion pandas/core/computation/pytables.py
@@ -546,6 +546,7 @@ class PyTablesExpr(expr.Expr):

     _visitor: PyTablesExprVisitor | None
     env: PyTablesScope
+    expr: str

     def __init__(
         self,
@@ -570,7 +571,7 @@ def __init__(
             local_dict = where.env.scope
             _where = where.expr

-        elif isinstance(where, (list, tuple)):
+        elif is_list_like(where):
             where = list(where)
             for idx, w in enumerate(where):
                 if isinstance(w, PyTablesExpr):
@@ -580,6 +581,7 @@ def __init__(
                     where[idx] = w
             _where = " & ".join(f"({w})" for w in com.flatten(where))
         else:
+            # _validate_where ensures we otherwise have a string
             _where = where

         self.expr = _where
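With is_list_like, the where conditions for an HDFStore query can come from any list-like container, not just a list or tuple; e.g. (a sketch; store.h5 is a hypothetical file and PyTables must be installed):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"A": range(10)})
    with pd.HDFStore("store.h5") as store:
        store.append("df", df, data_columns=True)
        # where terms supplied as an ndarray instead of a list/tuple;
        # they are joined into "(A > 3) & (A < 8)"
        store.select("df", where=np.array(["A > 3", "A < 8"]))
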
17 changes: 10 additions & 7 deletions pandas/core/computation/scope.py
@@ -106,9 +106,13 @@ class Scope:
     """

     __slots__ = ["level", "scope", "target", "resolvers", "temps"]
+    level: int
+    scope: DeepChainMap
+    resolvers: DeepChainMap
+    temps: dict

     def __init__(
-        self, level, global_dict=None, local_dict=None, resolvers=(), target=None
+        self, level: int, global_dict=None, local_dict=None, resolvers=(), target=None
     ):
         self.level = level + 1

@@ -146,8 +150,7 @@ def __init__(

         # assumes that resolvers are going from outermost scope to inner
         if isinstance(local_dict, Scope):
-            # error: Cannot determine type of 'resolvers'
-            resolvers += tuple(local_dict.resolvers.maps)  # type: ignore[has-type]
+            resolvers += tuple(local_dict.resolvers.maps)
         self.resolvers = DeepChainMap(*resolvers)
         self.temps = {}

@@ -212,7 +215,7 @@ def resolve(self, key: str, is_local: bool):

         raise UndefinedVariableError(key, is_local) from err

-    def swapkey(self, old_key: str, new_key: str, new_value=None):
+    def swapkey(self, old_key: str, new_key: str, new_value=None) -> None:
         """
         Replace a variable name, with a potentially new value.

@@ -238,7 +241,7 @@ def swapkey(self, old_key: str, new_key: str, new_value=None):
             mapping[new_key] = new_value  # type: ignore[index]
             return

-    def _get_vars(self, stack, scopes: list[str]):
+    def _get_vars(self, stack, scopes: list[str]) -> None:
         """
         Get specifically scoped variables from a list of stack frames.

@@ -263,7 +266,7 @@ def _get_vars(self, stack, scopes: list[str]):
             # scope after the loop
             del frame

-    def _update(self, level: int):
+    def _update(self, level: int) -> None:
         """
         Update the current scope by going back `level` levels.

@@ -313,7 +316,7 @@ def ntemps(self) -> int:
         return len(self.temps)

     @property
-    def full_scope(self):
+    def full_scope(self) -> DeepChainMap:
         """
         Return the full scope for use with passing to engines transparently
         as a mapping.
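Scope is the machinery that resolves names for eval and query, including variables pulled from enclosing Python frames with the @ prefix; for example (a sketch):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    threshold = 1
    # Scope._get_vars walks the calling stack frames so that
    # @threshold resolves to the local variable above.
    df.query("a > @threshold")
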
14 changes: 6 additions & 8 deletions pandas/core/dtypes/cast.py
@@ -55,7 +55,6 @@
     ensure_str,
     is_bool,
     is_bool_dtype,
-    is_categorical_dtype,
     is_complex,
     is_complex_dtype,
     is_datetime64_dtype,
@@ -79,6 +78,7 @@
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
     DatetimeTZDtype,
     ExtensionDtype,
     IntervalDtype,
@@ -359,15 +359,15 @@ def trans(x):
     return result


-def maybe_cast_result(
+def maybe_cast_pointwise_result(
     result: ArrayLike,
     dtype: DtypeObj,
     numeric_only: bool = False,
-    how: str = "",
     same_dtype: bool = True,
 ) -> ArrayLike:
     """
-    Try casting result to a different type if appropriate
+    Try casting result of a pointwise operation back to the original dtype if
+    appropriate.

     Parameters
     ----------
@@ -377,8 +377,6 @@ def maybe_cast_result(
         Input Series from which result was calculated.
     numeric_only : bool, default False
         Whether to cast only numerics or datetimes as well.
-    how : str, default ""
-        How the result was computed.
     same_dtype : bool, default True
         Specify dtype when calling _from_sequence

@@ -387,12 +385,12 @@ def maybe_cast_result(
     result : array-like
         result maybe casted to the dtype.
     """
-    dtype = maybe_cast_result_dtype(dtype, how)

     assert not is_scalar(result)

     if isinstance(dtype, ExtensionDtype):
-        if not is_categorical_dtype(dtype) and dtype.kind != "M":
+        if not isinstance(dtype, (CategoricalDtype, DatetimeTZDtype)):
+            # TODO: avoid this special-casing
             # We have to special case categorical so as not to upcast
             # things like counts back to categorical

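maybe_cast_pointwise_result runs after a pure-Python (pointwise) aggregation and tries to cast the object-dtype result back to the inputs' dtype; the effect is visible through groupby (a sketch):

    import pandas as pd

    ser = pd.Series([1, 2, 3], dtype="Int64")
    # The Python-level fallback produces an object-dtype result, which is
    # then cast back toward the original nullable Int64 dtype.
    ser.groupby([0, 0, 1]).agg(lambda x: x.sum())
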
45 changes: 0 additions & 45 deletions pandas/core/dtypes/common.py
@@ -128,51 +128,6 @@ def ensure_str(value: Union[bytes, Any]) -> str:
     return value


-def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.ndarray:
-    """
-    Ensure that an dtype array of some integer dtype
-    has an int64 dtype if possible.
-    If it's not possible, potentially because of overflow,
-    convert the array to float64 instead.
-
-    Parameters
-    ----------
-    arr : array-like
-        The array whose data type we want to enforce.
-    copy: bool
-        Whether to copy the original array or reuse
-        it in place, if possible.
-
-    Returns
-    -------
-    out_arr : The input array cast as int64 if
-        possible without overflow.
-        Otherwise the input array cast to float64.
-
-    Notes
-    -----
-    If the array is explicitly of type uint64 the type
-    will remain unchanged.
-    """
-    # TODO: GH27506 potential bug with ExtensionArrays
-    try:
-        # error: Unexpected keyword argument "casting" for "astype"
-        return arr.astype("int64", copy=copy, casting="safe")  # type: ignore[call-arg]
-    except TypeError:
-        pass
-    try:
-        # error: Unexpected keyword argument "casting" for "astype"
-        return arr.astype("uint64", copy=copy, casting="safe")  # type: ignore[call-arg]
-    except TypeError:
-        if is_extension_array_dtype(arr.dtype):
-            # pandas/core/dtypes/common.py:168: error: Item "ndarray" of
-            # "Union[ExtensionArray, ndarray]" has no attribute "to_numpy"  [union-attr]
-            return arr.to_numpy(  # type: ignore[union-attr]
-                dtype="float64", na_value=np.nan
-            )
-        return arr.astype("float64", copy=copy)
-
-
 def ensure_python_int(value: Union[int, np.integer]) -> int:
     """
     Ensure that a value is a python int.
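Callers of the deleted helper now inline the conversion; for nullable (masked) arrays the equivalent is to_numpy with an explicit na_value, as in the groupby change below (a sketch):

    import numpy as np
    import pandas as pd

    arr = pd.array([1, 2, None], dtype="Int64")
    # Roughly what ensure_int_or_float did for nullable integer input:
    arr.to_numpy(dtype="float64", na_value=np.nan)  # array([ 1.,  2., nan])
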
13 changes: 7 additions & 6 deletions pandas/core/groupby/ops.py
@@ -36,14 +36,13 @@
 from pandas.util._decorators import cache_readonly

 from pandas.core.dtypes.cast import (
-    maybe_cast_result,
+    maybe_cast_pointwise_result,
     maybe_cast_result_dtype,
     maybe_downcast_to_dtype,
 )
 from pandas.core.dtypes.common import (
     ensure_float64,
     ensure_int64,
-    ensure_int_or_float,
     ensure_platform_int,
     is_bool_dtype,
     is_categorical_dtype,
@@ -582,7 +581,7 @@ def _ea_wrap_cython_operation(

         elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype):
             # IntegerArray or BooleanArray
-            values = ensure_int_or_float(values)
+            values = values.to_numpy("float64", na_value=np.nan)
             res_values = self._cython_operation(
                 kind, values, how, axis, min_count, **kwargs
             )
@@ -660,9 +659,11 @@ def _cython_operation(
             values = values.view("int64")
             is_numeric = True
         elif is_bool_dtype(dtype):
-            values = ensure_int_or_float(values)
+            values = values.astype("int64")
         elif is_integer_dtype(dtype):
-            values = ensure_int_or_float(values)
+            # e.g. uint8 -> uint64, int16 -> int64
+            dtype = dtype.kind + "8"
+            values = values.astype(dtype, copy=False)
         elif is_numeric:
             if not is_complex_dtype(dtype):
                 values = ensure_float64(values)
@@ -797,7 +798,7 @@ def _aggregate_series_pure_python(self, obj: Series, func: F):
             result[label] = res

         out = lib.maybe_convert_objects(result, try_float=False)
-        out = maybe_cast_result(out, obj.dtype, numeric_only=True)
+        out = maybe_cast_pointwise_result(out, obj.dtype, numeric_only=True)

         return out, counts

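The integer upcast in _cython_operation keeps narrow integer groupby reductions from overflowing; for example (a sketch):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {"key": [0, 0, 1], "val": np.array([200, 200, 1], dtype="uint8")}
    )
    # uint8 values are upcast to uint64 ("u8") before the cython op,
    # so the group sum does not wrap around at 255.
    df.groupby("key")["val"].sum()  # key 0 -> 400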