Sync Fork from Upstream Repo #222

Merged (4 commits, Jul 16, 2021)
8 changes: 4 additions & 4 deletions doc/source/user_guide/categorical.rst
@@ -777,8 +777,8 @@ value is included in the ``categories``:
    df
    try:
        df.iloc[2:4, :] = [["c", 3], ["c", 3]]
-   except ValueError as e:
-       print("ValueError:", str(e))
+   except TypeError as e:
+       print("TypeError:", str(e))

 Setting values by assigning categorical data will also check that the ``categories`` match:

@@ -788,8 +788,8 @@ Setting values by assigning categorical data will also check that the ``categories`` match:
    df
    try:
        df.loc["j":"k", "cats"] = pd.Categorical(["b", "b"], categories=["a", "b", "c"])
-   except ValueError as e:
-       print("ValueError:", str(e))
+   except TypeError as e:
+       print("TypeError:", str(e))

 Assigning a ``Categorical`` to parts of a column of other types will use the values:

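For reference, a minimal runnable sketch of the behavior the doc change above describes (assuming pandas >= 1.4, where this PR's changes apply): assigning a value outside the declared categories now surfaces as ``TypeError`` rather than ``ValueError``.

```python
import pandas as pd

# Hedged sketch of the documented behavior change: writing a value that is
# not among the categories raises TypeError (ValueError in pandas < 1.4).
cats = pd.Categorical(["a", "b", "b", "a"], categories=["a", "b"])
df = pd.DataFrame({"cats": cats, "values": [1, 2, 2, 1]})

try:
    df.iloc[0, 0] = "c"  # "c" is not among the declared categories
except TypeError as e:
    print("TypeError:", e)
```
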
3 changes: 1 addition & 2 deletions doc/source/user_guide/groupby.rst
@@ -391,7 +391,6 @@ something different for each of the columns. Thus, using ``[]`` similar to
 getting a column from a DataFrame, you can do:

 .. ipython:: python
-   :suppress:

    df = pd.DataFrame(
        {
@@ -402,7 +401,7 @@ getting a column from a DataFrame, you can do:
        }
    )

-.. ipython:: python
+   df

    grouped = df.groupby(["A"])
    grouped_C = grouped["C"]
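
The doc change above merges the two ipython blocks and un-suppresses the DataFrame construction so readers can see ``df``. A self-contained version of the snippet (the column names and data here are illustrative stand-ins, since the folded dict body is not shown in this diff):

```python
import numpy as np
import pandas as pd

# Stand-in for the folded DataFrame literal in the doc. Indexing the
# GroupBy with [] selects a single column, mirroring DataFrame indexing.
df = pd.DataFrame(
    {
        "A": ["foo", "bar", "foo", "bar"],
        "C": np.random.randn(4),
    }
)

grouped = df.groupby(["A"])
grouped_C = grouped["C"]
print(grouped_C.sum())
```
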
7 changes: 6 additions & 1 deletion doc/source/whatsnew/v1.4.0.rst
@@ -174,7 +174,11 @@ Bug fixes

 Categorical
 ^^^^^^^^^^^
--
+- Bug in setting dtype-incompatible values into a :class:`Categorical` (or ``Series`` or ``DataFrame`` backed by ``Categorical``) raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
+- Bug in :meth:`Categorical.searchsorted` when passing a dtype-incompatible value raising ``KeyError`` instead of ``TypeError`` (:issue:`41919`)
+- Bug in :meth:`Series.where` with ``CategoricalDtype`` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
+- Bug in :meth:`Categorical.fillna` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
+- Bug in :meth:`Categorical.fillna` with a tuple-like category raising ``ValueError`` instead of ``TypeError`` when filling with a non-category tuple (:issue:`41919`)
 -

 Datetimelike
@@ -229,6 +233,7 @@ Missing

 MultiIndex
 ^^^^^^^^^^
+- Bug in :meth:`MultiIndex.get_loc` where the first level is a :class:`DatetimeIndex` and a string key is passed (:issue:`42465`)
 - Bug in :meth:`MultiIndex.reindex` when passing a ``level`` that corresponds to an ``ExtensionDtype`` level (:issue:`42043`)
 -

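The ``Categorical`` entries above all record the same normalization: dtype-incompatible values now raise ``TypeError`` consistently. A quick sketch of the new behavior (assumes pandas >= 1.4; earlier versions raised ``ValueError`` or ``KeyError`` depending on the method):

```python
import pandas as pd

ser = pd.Series(["a", "b", None], dtype="category")

# Each call passes a value that is not a category; per the notes above,
# all of these now raise TypeError.
for label, fn in [
    ("fillna", lambda: ser.fillna("d")),
    ("where", lambda: ser.where([True, False, True], "d")),
    ("searchsorted", lambda: ser.searchsorted("d")),
]:
    try:
        fn()
    except TypeError as e:
        print(f"{label}: TypeError: {e}")
```
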
35 changes: 15 additions & 20 deletions pandas/core/arrays/categorical.py
@@ -1394,17 +1394,14 @@ def map(self, mapper):
     # -------------------------------------------------------------
     # Validators; ideally these can be de-duplicated

-    def _validate_searchsorted_value(self, value):
-        # searchsorted is very performance sensitive. By converting codes
-        # to same dtype as self.codes, we get much faster performance.
-        if is_scalar(value):
-            codes = self._unbox_scalar(value)
+    def _validate_setitem_value(self, value):
+        if not is_hashable(value):
+            # wrap scalars and hashable-listlikes in list
+            return self._validate_listlike(value)
         else:
-            locs = [self.categories.get_loc(x) for x in value]
-            # error: Incompatible types in assignment (expression has type
-            # "ndarray", variable has type "int")
-            codes = np.array(locs, dtype=self.codes.dtype)  # type: ignore[assignment]
-        return codes
+            return self._validate_scalar(value)
+
+    _validate_searchsorted_value = _validate_setitem_value

     def _validate_scalar(self, fill_value):
         """
@@ -1430,8 +1427,8 @@ def _validate_scalar(self, fill_value):
             fill_value = self._unbox_scalar(fill_value)
         else:
             raise TypeError(
-                f"'fill_value={fill_value}' is not present "
-                "in this Categorical's categories"
+                "Cannot setitem on a Categorical with a new "
+                f"category ({fill_value}), set the categories first"
             )
         return fill_value

@@ -2016,37 +2013,35 @@ def __getitem__(self, key):
             deprecate_ndim_indexing(result)
         return result

-    def _validate_setitem_value(self, value):
+    def _validate_listlike(self, value):
+        # NB: here we assume scalar-like tuples have already been excluded
         value = extract_array(value, extract_numpy=True)

         # require identical categories set
         if isinstance(value, Categorical):
             if not is_dtype_equal(self.dtype, value.dtype):
-                raise ValueError(
+                raise TypeError(
                     "Cannot set a Categorical with another, "
                     "without identical categories"
                 )
             # is_dtype_equal implies categories_match_up_to_permutation
             value = self._encode_with_my_categories(value)
             return value._codes

-        # wrap scalars and hashable-listlikes in list
-        rvalue = value if not is_hashable(value) else [value]
-
         from pandas import Index

         # tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914
-        to_add = Index(rvalue, tupleize_cols=False).difference(self.categories)
+        to_add = Index(value, tupleize_cols=False).difference(self.categories)

         # no assignments of values not in categories, but it's always ok to set
         # something to np.nan
         if len(to_add) and not isna(to_add).all():
-            raise ValueError(
+            raise TypeError(
                 "Cannot setitem on a Categorical with a new "
                 "category, set the categories first"
             )

-        codes = self.categories.get_indexer(rvalue)
+        codes = self.categories.get_indexer(value)
         return codes.astype(self._ndarray.dtype, copy=False)

     def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:
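
A user-visible effect of the split into ``_validate_scalar`` and ``_validate_listlike`` above (a sketch, not part of the diff; assumes pandas >= 1.4): hashable values such as tuples now take the scalar path, so a tuple that is a category fills correctly, while a non-category tuple is rejected with ``TypeError``.

```python
import pandas as pd

cat = pd.Categorical([("a", "b"), None], categories=[("a", "b")])

# A tuple that IS a category is treated as a scalar fill value.
print(cat.fillna(("a", "b")))

# A tuple that is NOT a category now raises TypeError (GH#41919).
try:
    cat.fillna(("c", "d"))
except TypeError as e:
    print("TypeError:", e)
```
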
2 changes: 1 addition & 1 deletion pandas/core/frame.py
@@ -3452,7 +3452,7 @@ def __getitem__(self, key):
         else:
             if is_iterator(key):
                 key = list(key)
-            indexer = self.loc._get_listlike_indexer(key, axis=1)[1]
+            indexer = self.columns._get_indexer_strict(key, "columns")[1]

         # take() does not accept boolean indexers
         if getattr(indexer, "dtype", None) == bool:
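
Behavior preserved by this swap (a small sketch): list-of-labels column selection still raises ``KeyError`` for missing labels, now validated through ``Index._get_indexer_strict`` on ``df.columns``.

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3, 4]})
print(df[["a", "b"]])  # list-of-labels selection, validated strictly

try:
    df[["a", "c"]]  # "c" is absent from the columns
except KeyError as e:
    print("KeyError:", e)  # ['c'] not in index
```
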
83 changes: 83 additions & 0 deletions pandas/core/indexes/base.py
@@ -5384,6 +5384,89 @@ def get_indexer_for(self, target) -> npt.NDArray[np.intp]:
         indexer, _ = self.get_indexer_non_unique(target)
         return indexer

+    def _get_indexer_strict(self, key, axis_name: str_t) -> tuple[Index, np.ndarray]:
+        """
+        Analogue to get_indexer that raises if any elements are missing.
+        """
+        keyarr = key
+        if not isinstance(keyarr, Index):
+            keyarr = com.asarray_tuplesafe(keyarr)
+
+        if self._index_as_unique:
+            indexer = self.get_indexer_for(keyarr)
+            keyarr = self.reindex(keyarr)[0]
+        else:
+            keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
+
+        self._raise_if_missing(keyarr, indexer, axis_name)
+
+        if (
+            needs_i8_conversion(self.dtype)
+            or is_categorical_dtype(self.dtype)
+            or is_interval_dtype(self.dtype)
+        ):
+            # For CategoricalIndex take instead of reindex to preserve dtype.
+            # For IntervalIndex this is to map integers to the Intervals they match to.
+            keyarr = self.take(indexer)
+            if keyarr.dtype.kind in ["m", "M"]:
+                # DTI/TDI.take can infer a freq in some cases when we don't want one
+                if isinstance(key, list) or (
+                    isinstance(key, type(self))
+                    # "Index" has no attribute "freq"
+                    and key.freq is None  # type: ignore[attr-defined]
+                ):
+                    keyarr = keyarr._with_freq(None)
+
+        return keyarr, indexer
+
+    def _raise_if_missing(self, key, indexer, axis_name: str_t):
+        """
+        Check that indexer can be used to return a result.
+
+        e.g. at least one element was found,
+        unless the list of keys was actually empty.
+
+        Parameters
+        ----------
+        key : list-like
+            Targeted labels (only used to show correct error message).
+        indexer : array-like of ints
+            Indices corresponding to the key,
+            (with -1 indicating not found).
+        axis_name : str
+
+        Raises
+        ------
+        KeyError
+            If at least one key was requested but none was found.
+        """
+        if len(key) == 0:
+            return
+
+        # Count missing values
+        missing_mask = indexer < 0
+        nmissing = missing_mask.sum()
+
+        if nmissing:
+
+            # TODO: remove special-case; this is just to keep exception
+            #  message tests from raising while debugging
+            use_interval_msg = is_interval_dtype(self.dtype) or (
+                is_categorical_dtype(self.dtype)
+                # "Index" has no attribute "categories"  [attr-defined]
+                and is_interval_dtype(
+                    self.categories.dtype  # type: ignore[attr-defined]
+                )
+            )
+
+            if nmissing == len(indexer):
+                if use_interval_msg:
+                    key = list(key)
+                raise KeyError(f"None of [{key}] are in the [{axis_name}]")
+
+            not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
+            raise KeyError(f"{not_found} not in index")
+
     @overload
     def _get_indexer_non_comparable(
         self, target: Index, method, unique: Literal[True] = ...
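
To see the two error shapes ``_raise_if_missing`` produces, here is a sketch that calls the private helper directly; in normal use it is reached through ``.loc`` and ``DataFrame.__getitem__`` (private API, exercised here only for illustration; assumes pandas >= 1.4):

```python
import pandas as pd

idx = pd.Index(["a", "b", "c"])

# All keys present: returns the matched labels and their integer positions.
keyarr, indexer = idx._get_indexer_strict(["b", "c"], "columns")
print(list(keyarr), list(indexer))  # ['b', 'c'] [1, 2]

try:
    idx._get_indexer_strict(["b", "z"], "columns")  # some keys missing
except KeyError as e:
    print(e)  # "['z'] not in index"

try:
    idx._get_indexer_strict(["y", "z"], "columns")  # all keys missing
except KeyError as e:
    print(e)  # "None of [...] are in the [columns]"
```
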
38 changes: 22 additions & 16 deletions pandas/core/indexes/multi.py
@@ -2541,24 +2541,28 @@ def _get_values_for_loc(self, series: Series, loc, key):
         new_ser = series._constructor(new_values, index=new_index, name=series.name)
         return new_ser.__finalize__(series)

-    def _convert_listlike_indexer(self, keyarr) -> np.ndarray | None:
-        """
-        Analogous to get_indexer when we are partial-indexing on our first level.
-
-        Parameters
-        ----------
-        keyarr : Index, np.ndarray, or ExtensionArray
-            Indexer to convert.
-
-        Returns
-        -------
-        np.ndarray[intp] or None
-        """
-        indexer = None
+    def _get_indexer_strict(self, key, axis_name: str) -> tuple[Index, np.ndarray]:
+
+        keyarr = key
+        if not isinstance(keyarr, Index):
+            keyarr = com.asarray_tuplesafe(keyarr)

         # are we indexing a specific level
         if len(keyarr) and not isinstance(keyarr[0], tuple):
             indexer = self._get_indexer_level_0(keyarr)
+
+            self._raise_if_missing(key, indexer, axis_name)
+            return self[indexer], indexer
+
+        return super()._get_indexer_strict(key, axis_name)
+
+    def _raise_if_missing(self, key, indexer, axis_name: str):
+        keyarr = key
+        if not isinstance(key, Index):
+            keyarr = com.asarray_tuplesafe(key)
+
+        if len(keyarr) and not isinstance(keyarr[0], tuple):
+            # i.e. same condition for special case in MultiIndex._get_indexer_strict

             mask = indexer == -1
             if mask.any():
                 check = self.levels[0].get_indexer(keyarr)
@@ -2568,8 +2572,8 @@ def _convert_listlike_indexer(self, keyarr) -> np.ndarray | None:
                 # We get here when levels still contain values which are not
                 # actually in Index anymore
                 raise KeyError(f"{keyarr} not in index")
-
-        return indexer
+        else:
+            return super()._raise_if_missing(key, indexer, axis_name)

     def _get_indexer_level_0(self, target) -> np.ndarray:
         """
@@ -3160,10 +3164,12 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):
             if level > 0 or self._lexsort_depth == 0:
                 # Desired level is not sorted
+                if isinstance(idx, slice):
+                    # test_get_loc_partial_timestamp_multiindex
+                    locs = (level_codes >= idx.start) & (level_codes < idx.stop)
+                    return locs
+
                 locs = np.array(level_codes == idx, dtype=bool, copy=False)

                 if not locs.any():
                     # The label is present in self.levels[level] but unused:
                     raise KeyError(key)
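
A sketch of the fix recorded in the whatsnew entry (GH#42465, hedged; assumes pandas >= 1.4): partial string indexing on an unsorted MultiIndex whose first level is datetime-like resolves the string key to a slice of the level, which the unsorted-level branch above now turns into a boolean mask instead of failing.

```python
import pandas as pd

# Unsorted first level => _lexsort_depth == 0, exercising the patched branch.
mi = pd.MultiIndex.from_arrays(
    [pd.to_datetime(["2021-01-02", "2021-01-01"]), ["y", "x"]]
)
ser = pd.Series([2, 1], index=mi)

# The string "2021-01-01" resolves to a slice of the DatetimeIndex level.
print(ser.loc["2021-01-01"])
```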