Skip to content

PERF: avoid object dtype cast for Categorical in _ensure_data #46208

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Mar 3, 2022
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 13 additions & 20 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,6 @@ def _ensure_data(values: ArrayLike) -> np.ndarray:
# extract_array would raise
values = extract_array(values, extract_numpy=True)

# we check some simple dtypes first
if is_object_dtype(values.dtype):
return ensure_object(np.asarray(values))

Expand All @@ -153,17 +152,19 @@ def _ensure_data(values: ArrayLike) -> np.ndarray:
return _ensure_data(values._data)
return np.asarray(values)

elif is_categorical_dtype(values.dtype):
# NB: cases that go through here should NOT be using _reconstruct_data
# on the back-end.
values = cast("Categorical", values)
return values.codes

elif is_bool_dtype(values.dtype):
if isinstance(values, np.ndarray):
# i.e. actually dtype == np.dtype("bool")
return np.asarray(values).view("uint8")
else:
# i.e. all-bool Categorical, BooleanArray
try:
return np.asarray(values).astype("uint8", copy=False)
except (TypeError, ValueError):
# GH#42107 we have pd.NAs present
return np.asarray(values)
# e.g. Sparse[bool, False] # TODO: no test cases get here
return np.asarray(values).astype("uint8", copy=False)

elif is_integer_dtype(values.dtype):
return np.asarray(values)
Expand All @@ -178,10 +179,7 @@ def _ensure_data(values: ArrayLike) -> np.ndarray:
return np.asarray(values)

elif is_complex_dtype(values.dtype):
# Incompatible return value type (got "Tuple[Union[Any, ExtensionArray,
# ndarray[Any, Any]], Union[Any, ExtensionDtype]]", expected
# "Tuple[ndarray[Any, Any], Union[dtype[Any], ExtensionDtype]]")
return values # type: ignore[return-value]
return cast(np.ndarray, values)

# datetimelike
elif needs_i8_conversion(values.dtype):
Expand All @@ -191,11 +189,6 @@ def _ensure_data(values: ArrayLike) -> np.ndarray:
npvalues = cast(np.ndarray, npvalues)
return npvalues

elif is_categorical_dtype(values.dtype):
values = cast("Categorical", values)
values = values.codes
return values

# we have failed, return object
values = np.asarray(values, dtype=object)
return ensure_object(values)
Expand All @@ -222,7 +215,8 @@ def _reconstruct_data(
return values

if not isinstance(dtype, np.dtype):
# i.e. ExtensionDtype
# i.e. ExtensionDtype; note we have ruled out above the possibility
# that values.dtype == dtype
cls = dtype.construct_array_type()

values = cls._from_sequence(values, dtype=dtype)
Expand Down Expand Up @@ -949,9 +943,8 @@ def mode(values: ArrayLike, dropna: bool = True) -> ArrayLike:
if needs_i8_conversion(values.dtype):
# Got here with ndarray; dispatch to DatetimeArray/TimedeltaArray.
values = ensure_wrapped_if_datetimelike(values)
# error: Item "ndarray[Any, Any]" of "Union[ExtensionArray,
# ndarray[Any, Any]]" has no attribute "_mode"
return values._mode(dropna=dropna) # type: ignore[union-attr]
values = cast("ExtensionArray", values)
return values._mode(dropna=dropna)

values = _ensure_data(values)

Expand Down