Sync Fork from Upstream Repo #184

Merged
10 commits merged on Apr 19, 2021
16 changes: 10 additions & 6 deletions pandas/core/algorithms.py
@@ -1876,29 +1876,33 @@ def _sort_tuples(values: np.ndarray) -> np.ndarray:
     return values[indexer]


-def union_with_duplicates(lvals: np.ndarray, rvals: np.ndarray) -> np.ndarray:
+def union_with_duplicates(lvals: ArrayLike, rvals: ArrayLike) -> ArrayLike:
     """
     Extracts the union from lvals and rvals with respect to duplicates and nans in
     both arrays.

     Parameters
     ----------
-    lvals: np.ndarray
+    lvals: np.ndarray or ExtensionArray
         left values which is ordered in front.
-    rvals: np.ndarray
+    rvals: np.ndarray or ExtensionArray
         right values ordered after lvals.

     Returns
     -------
-    np.ndarray containing the unsorted union of both arrays
+    np.ndarray or ExtensionArray
+        Containing the unsorted union of both arrays.
     """
     indexer = []
     l_count = value_counts(lvals, dropna=False)
     r_count = value_counts(rvals, dropna=False)
     l_count, r_count = l_count.align(r_count, fill_value=0)
     unique_array = unique(np.append(lvals, rvals))
-    if is_extension_array_dtype(lvals) or is_extension_array_dtype(rvals):
-        unique_array = pd_array(unique_array)
+    if not isinstance(lvals, np.ndarray):
+        # i.e. ExtensionArray
+        # Note: we only get here with lvals.dtype == rvals.dtype
+        # TODO: are there any cases where union won't be type/dtype preserving?
+        unique_array = type(lvals)._from_sequence(unique_array, dtype=lvals.dtype)
     for i, value in enumerate(unique_array):
         indexer += [i] * int(max(l_count[value], r_count[value]))
     return unique_array.take(indexer)
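
For context, this internal helper backs Index.union when either side contains duplicates. A minimal sketch of the behavior (union_with_duplicates is private, so the import path and exact return type are implementation details):

    import numpy as np
    from pandas.core.algorithms import union_with_duplicates

    lvals = np.array([1.0, 1.0, 2.0])
    rvals = np.array([2.0, 2.0, 3.0])
    # Each distinct value is repeated max(left_count, right_count) times,
    # in order of first appearance.
    union_with_duplicates(lvals, rvals)  # array([1., 1., 2., 2., 3.])
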
15 changes: 10 additions & 5 deletions pandas/core/arrays/string_arrow.py
@@ -675,13 +675,18 @@ def value_counts(self, dropna: bool = True) -> Series:

         vc = self._data.value_counts()

-        # Index cannot hold ExtensionArrays yet
-        index = Index(type(self)(vc.field(0)).astype(object))
+        values = vc.field(0)
+        counts = vc.field(1)
+        if dropna and self._data.null_count > 0:
+            mask = values.is_valid()
+            values = values.filter(mask)
+            counts = counts.filter(mask)
+
         # No missing values so we can adhere to the interface and return a numpy array.
-        counts = np.array(vc.field(1))
+        counts = np.array(counts)

-        if dropna and self._data.null_count > 0:
-            raise NotImplementedError("yo")
+        # Index cannot hold ExtensionArrays yet
+        index = Index(type(self)(values)).astype(object)

         return Series(counts, index=index).astype("Int64")

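The net effect: ArrowStringArray.value_counts now honors dropna instead of raising. A quick illustration (a sketch; assumes pyarrow is installed, and uses the private _from_sequence constructor since the public dtype alias for this array has varied across versions):

    from pandas.core.arrays.string_arrow import ArrowStringArray

    arr = ArrowStringArray._from_sequence(["a", "a", None])
    arr.value_counts(dropna=False)  # includes the missing value in the counts
    arr.value_counts(dropna=True)   # previously raised NotImplementedError("yo")
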
5 changes: 3 additions & 2 deletions pandas/core/computation/engines.py
@@ -12,6 +12,7 @@
     align_terms,
     reconstruct_object,
 )
+from pandas.core.computation.expr import Expr
 from pandas.core.computation.ops import (
     MATHOPS,
     REDUCTIONS,
@@ -26,13 +27,13 @@ class NumExprClobberingError(NameError):
     pass


-def _check_ne_builtin_clash(expr):
+def _check_ne_builtin_clash(expr: Expr) -> None:
     """
     Attempt to prevent foot-shooting in a helpful way.

     Parameters
     ----------
-    terms : Term
+    expr : Expr
         Terms can contain
     """
     names = expr.names
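This is the check that surfaces NumExprClobberingError when a name in an expression shadows a numexpr builtin; roughly (a hypothetical session; requires numexpr to be installed):

    import pandas as pd

    df = pd.DataFrame({"sin": [1, 2, 3]})
    # The column name clashes with the numexpr builtin sin()
    df.query("sin > 1", engine="numexpr")  # raises NumExprClobberingError
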
13 changes: 8 additions & 5 deletions pandas/core/computation/eval.py
@@ -1,9 +1,9 @@
 """
 Top level ``eval`` module.
 """
+from __future__ import annotations

 import tokenize
-from typing import Optional
 import warnings

 from pandas._libs.lib import no_default
@@ -14,13 +14,14 @@
     PARSERS,
     Expr,
 )
+from pandas.core.computation.ops import BinOp
 from pandas.core.computation.parsing import tokenize_string
 from pandas.core.computation.scope import ensure_scope

 from pandas.io.formats.printing import pprint_thing


-def _check_engine(engine: Optional[str]) -> str:
+def _check_engine(engine: str | None) -> str:
     """
     Make sure a valid engine is passed.

@@ -161,9 +162,9 @@ def _check_for_locals(expr: str, stack_level: int, parser: str):


 def eval(
-    expr,
-    parser="pandas",
-    engine: Optional[str] = None,
+    expr: str | BinOp,  # we leave BinOp out of the docstr bc it isn't for users
+    parser: str = "pandas",
+    engine: str | None = None,
     truediv=no_default,
     local_dict=None,
     global_dict=None,
@@ -309,10 +310,12 @@ def eval(
         stacklevel=2,
     )

+    exprs: list[str | BinOp]
     if isinstance(expr, str):
         _check_expression(expr)
         exprs = [e.strip() for e in expr.splitlines() if e.strip() != ""]
     else:
+        # ops.BinOp; for internal compat, not intended to be passed by users
         exprs = [expr]
     multi_line = len(exprs) > 1

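The exprs handling above is what lets eval accept a multi-line string, one expression per non-empty line; for instance (a sketch; multi-line input must consist entirely of assignments):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    # Each non-empty line is stripped and parsed as its own expression;
    # the assignments are written back to df via inplace=True.
    df.eval(
        """
        b = a * 2
        c = b + a
        """,
        inplace=True,
    )
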
4 changes: 3 additions & 1 deletion pandas/core/computation/pytables.py
@@ -546,6 +546,7 @@ class PyTablesExpr(expr.Expr):

     _visitor: PyTablesExprVisitor | None
     env: PyTablesScope
+    expr: str

     def __init__(
         self,
@@ -570,7 +571,7 @@ def __init__(
             local_dict = where.env.scope
             _where = where.expr

-        elif isinstance(where, (list, tuple)):
+        elif is_list_like(where):
             where = list(where)
             for idx, w in enumerate(where):
                 if isinstance(w, PyTablesExpr):
@@ -580,6 +581,7 @@ def __init__(
                     where[idx] = w
             _where = " & ".join(f"({w})" for w in com.flatten(where))
         else:
+            # _validate_where ensures we otherwise have a string
             _where = where

         self.expr = _where
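With is_list_like, the where conditions for an HDFStore query can come from any list-like container, not just a list or tuple; e.g. (a sketch; store.h5 is a hypothetical file and PyTables must be installed):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"A": range(10)})
    with pd.HDFStore("store.h5") as store:
        store.append("df", df, data_columns=True)
        # where terms supplied as an ndarray instead of a list/tuple;
        # they are joined into "(A > 3) & (A < 8)"
        store.select("df", where=np.array(["A > 3", "A < 8"]))
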
17 changes: 10 additions & 7 deletions pandas/core/computation/scope.py
@@ -106,9 +106,13 @@ class Scope:
     """

     __slots__ = ["level", "scope", "target", "resolvers", "temps"]
+    level: int
+    scope: DeepChainMap
+    resolvers: DeepChainMap
+    temps: dict

     def __init__(
-        self, level, global_dict=None, local_dict=None, resolvers=(), target=None
+        self, level: int, global_dict=None, local_dict=None, resolvers=(), target=None
     ):
         self.level = level + 1

@@ -146,8 +150,7 @@ def __init__(

         # assumes that resolvers are going from outermost scope to inner
         if isinstance(local_dict, Scope):
-            # error: Cannot determine type of 'resolvers'
-            resolvers += tuple(local_dict.resolvers.maps)  # type: ignore[has-type]
+            resolvers += tuple(local_dict.resolvers.maps)
         self.resolvers = DeepChainMap(*resolvers)
         self.temps = {}

@@ -212,7 +215,7 @@ def resolve(self, key: str, is_local: bool):

         raise UndefinedVariableError(key, is_local) from err

-    def swapkey(self, old_key: str, new_key: str, new_value=None):
+    def swapkey(self, old_key: str, new_key: str, new_value=None) -> None:
         """
         Replace a variable name, with a potentially new value.

@@ -238,7 +241,7 @@ def swapkey(self, old_key: str, new_key: str, new_value=None):
             mapping[new_key] = new_value  # type: ignore[index]
             return

-    def _get_vars(self, stack, scopes: list[str]):
+    def _get_vars(self, stack, scopes: list[str]) -> None:
         """
         Get specifically scoped variables from a list of stack frames.

@@ -263,7 +266,7 @@ def _get_vars(self, stack, scopes: list[str]):
             # scope after the loop
             del frame

-    def _update(self, level: int):
+    def _update(self, level: int) -> None:
         """
         Update the current scope by going back `level` levels.

@@ -313,7 +316,7 @@ def ntemps(self) -> int:
         return len(self.temps)

     @property
-    def full_scope(self):
+    def full_scope(self) -> DeepChainMap:
         """
         Return the full scope for use with passing to engines transparently
         as a mapping.
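Scope is the machinery that resolves names for eval and query, including variables pulled from enclosing Python frames with the @ prefix; for example (a sketch):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2, 3]})
    threshold = 1
    # Scope._get_vars walks the calling stack frames so that
    # @threshold resolves to the local variable above.
    df.query("a > @threshold")
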
14 changes: 6 additions & 8 deletions pandas/core/dtypes/cast.py
@@ -55,7 +55,6 @@
     ensure_str,
     is_bool,
     is_bool_dtype,
-    is_categorical_dtype,
     is_complex,
     is_complex_dtype,
     is_datetime64_dtype,
@@ -79,6 +78,7 @@
     pandas_dtype,
 )
 from pandas.core.dtypes.dtypes import (
+    CategoricalDtype,
     DatetimeTZDtype,
     ExtensionDtype,
     IntervalDtype,
@@ -359,15 +359,15 @@ def trans(x):
     return result


-def maybe_cast_result(
+def maybe_cast_pointwise_result(
     result: ArrayLike,
     dtype: DtypeObj,
     numeric_only: bool = False,
-    how: str = "",
     same_dtype: bool = True,
 ) -> ArrayLike:
     """
-    Try casting result to a different type if appropriate
+    Try casting result of a pointwise operation back to the original dtype if
+    appropriate.

     Parameters
     ----------
@@ -377,8 +377,6 @@ def maybe_cast_result(
         Input Series from which result was calculated.
     numeric_only : bool, default False
         Whether to cast only numerics or datetimes as well.
-    how : str, default ""
-        How the result was computed.
     same_dtype : bool, default True
         Specify dtype when calling _from_sequence

@@ -387,12 +385,12 @@ def maybe_cast_result(
     result : array-like
         result maybe casted to the dtype.
     """
-    dtype = maybe_cast_result_dtype(dtype, how)

     assert not is_scalar(result)

     if isinstance(dtype, ExtensionDtype):
-        if not is_categorical_dtype(dtype) and dtype.kind != "M":
+        if not isinstance(dtype, (CategoricalDtype, DatetimeTZDtype)):
+            # TODO: avoid this special-casing
             # We have to special case categorical so as not to upcast
             # things like counts back to categorical

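maybe_cast_pointwise_result runs after a pure-Python (pointwise) aggregation and tries to cast the object-dtype result back to the inputs' dtype; the effect is visible through groupby (a sketch):

    import pandas as pd

    ser = pd.Series([1, 2, 3], dtype="Int64")
    # The Python-level fallback produces an object-dtype result, which is
    # then cast back toward the original nullable Int64 dtype.
    ser.groupby([0, 0, 1]).agg(lambda x: x.sum())
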
45 changes: 0 additions & 45 deletions pandas/core/dtypes/common.py
@@ -128,51 +128,6 @@ def ensure_str(value: Union[bytes, Any]) -> str:
     return value


-def ensure_int_or_float(arr: ArrayLike, copy: bool = False) -> np.ndarray:
-    """
-    Ensure that an dtype array of some integer dtype
-    has an int64 dtype if possible.
-    If it's not possible, potentially because of overflow,
-    convert the array to float64 instead.
-
-    Parameters
-    ----------
-    arr : array-like
-        The array whose data type we want to enforce.
-    copy: bool
-        Whether to copy the original array or reuse
-        it in place, if possible.
-
-    Returns
-    -------
-    out_arr : The input array cast as int64 if
-        possible without overflow.
-        Otherwise the input array cast to float64.
-
-    Notes
-    -----
-    If the array is explicitly of type uint64 the type
-    will remain unchanged.
-    """
-    # TODO: GH27506 potential bug with ExtensionArrays
-    try:
-        # error: Unexpected keyword argument "casting" for "astype"
-        return arr.astype("int64", copy=copy, casting="safe")  # type: ignore[call-arg]
-    except TypeError:
-        pass
-    try:
-        # error: Unexpected keyword argument "casting" for "astype"
-        return arr.astype("uint64", copy=copy, casting="safe")  # type: ignore[call-arg]
-    except TypeError:
-        if is_extension_array_dtype(arr.dtype):
-            # pandas/core/dtypes/common.py:168: error: Item "ndarray" of
-            # "Union[ExtensionArray, ndarray]" has no attribute "to_numpy"  [union-attr]
-            return arr.to_numpy(  # type: ignore[union-attr]
-                dtype="float64", na_value=np.nan
-            )
-        return arr.astype("float64", copy=copy)
-
-
 def ensure_python_int(value: Union[int, np.integer]) -> int:
     """
     Ensure that a value is a python int.
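Callers of the deleted helper now inline the conversion; for nullable (masked) arrays the equivalent is to_numpy with an explicit na_value, as in the groupby change below (a sketch):

    import numpy as np
    import pandas as pd

    arr = pd.array([1, 2, None], dtype="Int64")
    # Roughly what ensure_int_or_float did for nullable integer input:
    arr.to_numpy(dtype="float64", na_value=np.nan)  # array([ 1.,  2., nan])
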
13 changes: 7 additions & 6 deletions pandas/core/groupby/ops.py
@@ -36,14 +36,13 @@
 from pandas.util._decorators import cache_readonly

 from pandas.core.dtypes.cast import (
-    maybe_cast_result,
+    maybe_cast_pointwise_result,
     maybe_cast_result_dtype,
     maybe_downcast_to_dtype,
 )
 from pandas.core.dtypes.common import (
     ensure_float64,
     ensure_int64,
-    ensure_int_or_float,
     ensure_platform_int,
     is_bool_dtype,
     is_categorical_dtype,
@@ -582,7 +581,7 @@ def _ea_wrap_cython_operation(

         elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype):
             # IntegerArray or BooleanArray
-            values = ensure_int_or_float(values)
+            values = values.to_numpy("float64", na_value=np.nan)
             res_values = self._cython_operation(
                 kind, values, how, axis, min_count, **kwargs
             )
@@ -660,9 +659,11 @@ def _cython_operation(
             values = values.view("int64")
             is_numeric = True
         elif is_bool_dtype(dtype):
-            values = ensure_int_or_float(values)
+            values = values.astype("int64")
         elif is_integer_dtype(dtype):
-            values = ensure_int_or_float(values)
+            # e.g. uint8 -> uint64, int16 -> int64
+            dtype = dtype.kind + "8"
+            values = values.astype(dtype, copy=False)
         elif is_numeric:
             if not is_complex_dtype(dtype):
                 values = ensure_float64(values)
@@ -797,7 +798,7 @@ def _aggregate_series_pure_python(self, obj: Series, func: F):
             result[label] = res

         out = lib.maybe_convert_objects(result, try_float=False)
-        out = maybe_cast_result(out, obj.dtype, numeric_only=True)
+        out = maybe_cast_pointwise_result(out, obj.dtype, numeric_only=True)

         return out, counts

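The integer upcast in _cython_operation keeps narrow integer groupby reductions from overflowing; for example (a sketch):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame(
        {"key": [0, 0, 1], "val": np.array([200, 200, 1], dtype="uint8")}
    )
    # uint8 values are upcast to uint64 ("u8") before the cython op,
    # so the group sum does not wrap around at 255.
    df.groupby("key")["val"].sum()  # key 0 -> 400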