Skip to content

Commit da933b9

Browse files
authored
Merge pull request #222 from pandas-dev/master
Sync Fork from Upstream Repo
2 parents 6d5c84a + ddd90b0 commit da933b9

File tree

23 files changed

+225
-160
lines changed

23 files changed

+225
-160
lines changed

doc/source/user_guide/categorical.rst

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -777,8 +777,8 @@ value is included in the ``categories``:
777777
df
778778
try:
779779
df.iloc[2:4, :] = [["c", 3], ["c", 3]]
780-
except ValueError as e:
781-
print("ValueError:", str(e))
780+
except TypeError as e:
781+
print("TypeError:", str(e))
782782
783783
Setting values by assigning categorical data will also check that the ``categories`` match:
784784

@@ -788,8 +788,8 @@ Setting values by assigning categorical data will also check that the ``categori
788788
df
789789
try:
790790
df.loc["j":"k", "cats"] = pd.Categorical(["b", "b"], categories=["a", "b", "c"])
791-
except ValueError as e:
792-
print("ValueError:", str(e))
791+
except TypeError as e:
792+
print("TypeError:", str(e))
793793
794794
Assigning a ``Categorical`` to parts of a column of other types will use the values:
795795

doc/source/user_guide/groupby.rst

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -391,7 +391,6 @@ something different for each of the columns. Thus, using ``[]`` similar to
391391
getting a column from a DataFrame, you can do:
392392

393393
.. ipython:: python
394-
:suppress:
395394
396395
df = pd.DataFrame(
397396
{
@@ -402,7 +401,7 @@ getting a column from a DataFrame, you can do:
402401
}
403402
)
404403
405-
.. ipython:: python
404+
df
406405
407406
grouped = df.groupby(["A"])
408407
grouped_C = grouped["C"]

doc/source/whatsnew/v1.4.0.rst

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,7 +174,11 @@ Bug fixes
174174

175175
Categorical
176176
^^^^^^^^^^^
177-
-
177+
- Bug in setting dtype-incompatible values into a :class:`Categorical` (or ``Series`` or ``DataFrame`` backed by ``Categorical``) raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
178+
- Bug in :meth:`Categorical.searchsorted` when passing a dtype-incompatible value raising ``KeyError`` instead of ``TypeError`` (:issue:`41919`)
179+
- Bug in :meth:`Series.where` with ``CategoricalDtype`` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
180+
- Bug in :meth:`Categorical.fillna` when passing a dtype-incompatible value raising ``ValueError`` instead of ``TypeError`` (:issue:`41919`)
181+
- Bug in :meth:`Categorical.fillna` with a tuple-like category raising ``ValueError`` instead of ``TypeError`` when filling with a non-category tuple (:issue:`41919`)
178182
-
179183

180184
Datetimelike
@@ -229,6 +233,7 @@ Missing
229233

230234
MultiIndex
231235
^^^^^^^^^^
236+
- Bug in :meth:`MultiIndex.get_loc` where the first level is a :class:`DatetimeIndex` and a string key is passed (:issue:`42465`)
232237
- Bug in :meth:`MultiIndex.reindex` when passing a ``level`` that corresponds to an ``ExtensionDtype`` level (:issue:`42043`)
233238
-
234239

pandas/core/arrays/categorical.py

Lines changed: 15 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1394,17 +1394,14 @@ def map(self, mapper):
13941394
# -------------------------------------------------------------
13951395
# Validators; ideally these can be de-duplicated
13961396

1397-
def _validate_searchsorted_value(self, value):
1398-
# searchsorted is very performance sensitive. By converting codes
1399-
# to same dtype as self.codes, we get much faster performance.
1400-
if is_scalar(value):
1401-
codes = self._unbox_scalar(value)
1397+
def _validate_setitem_value(self, value):
1398+
if not is_hashable(value):
1399+
# unhashable values are handled as list-likes; hashable values as scalars
1400+
return self._validate_listlike(value)
14021401
else:
1403-
locs = [self.categories.get_loc(x) for x in value]
1404-
# error: Incompatible types in assignment (expression has type
1405-
# "ndarray", variable has type "int")
1406-
codes = np.array(locs, dtype=self.codes.dtype) # type: ignore[assignment]
1407-
return codes
1402+
return self._validate_scalar(value)
1403+
1404+
_validate_searchsorted_value = _validate_setitem_value
14081405

14091406
def _validate_scalar(self, fill_value):
14101407
"""
@@ -1430,8 +1427,8 @@ def _validate_scalar(self, fill_value):
14301427
fill_value = self._unbox_scalar(fill_value)
14311428
else:
14321429
raise TypeError(
1433-
f"'fill_value={fill_value}' is not present "
1434-
"in this Categorical's categories"
1430+
"Cannot setitem on a Categorical with a new "
1431+
f"category ({fill_value}), set the categories first"
14351432
)
14361433
return fill_value
14371434

@@ -2016,37 +2013,35 @@ def __getitem__(self, key):
20162013
deprecate_ndim_indexing(result)
20172014
return result
20182015

2019-
def _validate_setitem_value(self, value):
2016+
def _validate_listlike(self, value):
2017+
# NB: here we assume scalar-like tuples have already been excluded
20202018
value = extract_array(value, extract_numpy=True)
20212019

20222020
# require identical categories set
20232021
if isinstance(value, Categorical):
20242022
if not is_dtype_equal(self.dtype, value.dtype):
2025-
raise ValueError(
2023+
raise TypeError(
20262024
"Cannot set a Categorical with another, "
20272025
"without identical categories"
20282026
)
20292027
# is_dtype_equal implies categories_match_up_to_permutation
20302028
value = self._encode_with_my_categories(value)
20312029
return value._codes
20322030

2033-
# wrap scalars and hashable-listlikes in list
2034-
rvalue = value if not is_hashable(value) else [value]
2035-
20362031
from pandas import Index
20372032

20382033
# tupleize_cols=False for e.g. test_fillna_iterable_category GH#41914
2039-
to_add = Index(rvalue, tupleize_cols=False).difference(self.categories)
2034+
to_add = Index(value, tupleize_cols=False).difference(self.categories)
20402035

20412036
# no assignments of values not in categories, but it's always ok to set
20422037
# something to np.nan
20432038
if len(to_add) and not isna(to_add).all():
2044-
raise ValueError(
2039+
raise TypeError(
20452040
"Cannot setitem on a Categorical with a new "
20462041
"category, set the categories first"
20472042
)
20482043

2049-
codes = self.categories.get_indexer(rvalue)
2044+
codes = self.categories.get_indexer(value)
20502045
return codes.astype(self._ndarray.dtype, copy=False)
20512046

20522047
def _reverse_indexer(self) -> dict[Hashable, npt.NDArray[np.intp]]:

pandas/core/frame.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3452,7 +3452,7 @@ def __getitem__(self, key):
34523452
else:
34533453
if is_iterator(key):
34543454
key = list(key)
3455-
indexer = self.loc._get_listlike_indexer(key, axis=1)[1]
3455+
indexer = self.columns._get_indexer_strict(key, "columns")[1]
34563456

34573457
# take() does not accept boolean indexers
34583458
if getattr(indexer, "dtype", None) == bool:

pandas/core/indexes/base.py

Lines changed: 83 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5384,6 +5384,89 @@ def get_indexer_for(self, target) -> npt.NDArray[np.intp]:
53845384
indexer, _ = self.get_indexer_non_unique(target)
53855385
return indexer
53865386

5387+
def _get_indexer_strict(self, key, axis_name: str_t) -> tuple[Index, np.ndarray]:
5388+
"""
5389+
Analogue to get_indexer that raises if any elements are missing.
5390+
"""
5391+
keyarr = key
5392+
if not isinstance(keyarr, Index):
5393+
keyarr = com.asarray_tuplesafe(keyarr)
5394+
5395+
if self._index_as_unique:
5396+
indexer = self.get_indexer_for(keyarr)
5397+
keyarr = self.reindex(keyarr)[0]
5398+
else:
5399+
keyarr, indexer, new_indexer = self._reindex_non_unique(keyarr)
5400+
5401+
self._raise_if_missing(keyarr, indexer, axis_name)
5402+
5403+
if (
5404+
needs_i8_conversion(self.dtype)
5405+
or is_categorical_dtype(self.dtype)
5406+
or is_interval_dtype(self.dtype)
5407+
):
5408+
# For CategoricalIndex take instead of reindex to preserve dtype.
5409+
# For IntervalIndex this is to map integers to the Intervals they match to.
5410+
keyarr = self.take(indexer)
5411+
if keyarr.dtype.kind in ["m", "M"]:
5412+
# DTI/TDI.take can infer a freq in some cases when we don't want one
5413+
if isinstance(key, list) or (
5414+
isinstance(key, type(self))
5415+
# "Index" has no attribute "freq"
5416+
and key.freq is None # type: ignore[attr-defined]
5417+
):
5418+
keyarr = keyarr._with_freq(None)
5419+
5420+
return keyarr, indexer
5421+
5422+
def _raise_if_missing(self, key, indexer, axis_name: str_t):
5423+
"""
5424+
Check that indexer can be used to return a result.
5425+
5426+
e.g. at least one element was found,
5427+
unless the list of keys was actually empty.
5428+
5429+
Parameters
5430+
----------
5431+
key : list-like
5432+
Targeted labels (only used to show correct error message).
5433+
indexer : array-like of integers
5434+
Indices corresponding to the key,
5435+
(with -1 indicating not found).
5436+
axis_name : str
5437+
5438+
Raises
5439+
------
5440+
KeyError
5441+
If at least one key was requested but none was found, or if only some of the requested keys were found.
5442+
"""
5443+
if len(key) == 0:
5444+
return
5445+
5446+
# Count missing values
5447+
missing_mask = indexer < 0
5448+
nmissing = missing_mask.sum()
5449+
5450+
if nmissing:
5451+
5452+
# TODO: remove special-case; this is just to keep exception
5453+
# message tests from raising while debugging
5454+
use_interval_msg = is_interval_dtype(self.dtype) or (
5455+
is_categorical_dtype(self.dtype)
5456+
# "Index" has no attribute "categories" [attr-defined]
5457+
and is_interval_dtype(
5458+
self.categories.dtype # type: ignore[attr-defined]
5459+
)
5460+
)
5461+
5462+
if nmissing == len(indexer):
5463+
if use_interval_msg:
5464+
key = list(key)
5465+
raise KeyError(f"None of [{key}] are in the [{axis_name}]")
5466+
5467+
not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
5468+
raise KeyError(f"{not_found} not in index")
5469+
53875470
@overload
53885471
def _get_indexer_non_comparable(
53895472
self, target: Index, method, unique: Literal[True] = ...

pandas/core/indexes/multi.py

Lines changed: 22 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2541,24 +2541,28 @@ def _get_values_for_loc(self, series: Series, loc, key):
25412541
new_ser = series._constructor(new_values, index=new_index, name=series.name)
25422542
return new_ser.__finalize__(series)
25432543

2544-
def _convert_listlike_indexer(self, keyarr) -> np.ndarray | None:
2545-
"""
2546-
Analogous to get_indexer when we are partial-indexing on our first level.
2547-
2548-
Parameters
2549-
----------
2550-
keyarr : Index, np.ndarray, or ExtensionArray
2551-
Indexer to convert.
2544+
def _get_indexer_strict(self, key, axis_name: str) -> tuple[Index, np.ndarray]:
25522545

2553-
Returns
2554-
-------
2555-
np.ndarray[intp] or None
2556-
"""
2557-
indexer = None
2546+
keyarr = key
2547+
if not isinstance(keyarr, Index):
2548+
keyarr = com.asarray_tuplesafe(keyarr)
25582549

2559-
# are we indexing a specific level
25602550
if len(keyarr) and not isinstance(keyarr[0], tuple):
25612551
indexer = self._get_indexer_level_0(keyarr)
2552+
2553+
self._raise_if_missing(key, indexer, axis_name)
2554+
return self[indexer], indexer
2555+
2556+
return super()._get_indexer_strict(key, axis_name)
2557+
2558+
def _raise_if_missing(self, key, indexer, axis_name: str):
2559+
keyarr = key
2560+
if not isinstance(key, Index):
2561+
keyarr = com.asarray_tuplesafe(key)
2562+
2563+
if len(keyarr) and not isinstance(keyarr[0], tuple):
2564+
# i.e. same condition for special case in MultiIndex._get_indexer_strict
2565+
25622566
mask = indexer == -1
25632567
if mask.any():
25642568
check = self.levels[0].get_indexer(keyarr)
@@ -2568,8 +2572,8 @@ def _convert_listlike_indexer(self, keyarr) -> np.ndarray | None:
25682572
# We get here when levels still contain values which are not
25692573
# actually in Index anymore
25702574
raise KeyError(f"{keyarr} not in index")
2571-
2572-
return indexer
2575+
else:
2576+
return super()._raise_if_missing(key, indexer, axis_name)
25732577

25742578
def _get_indexer_level_0(self, target) -> np.ndarray:
25752579
"""
@@ -3160,10 +3164,12 @@ def convert_indexer(start, stop, step, indexer=indexer, codes=level_codes):
31603164
if level > 0 or self._lexsort_depth == 0:
31613165
# Desired level is not sorted
31623166
if isinstance(idx, slice):
3167+
# test_get_loc_partial_timestamp_multiindex
31633168
locs = (level_codes >= idx.start) & (level_codes < idx.stop)
31643169
return locs
31653170

31663171
locs = np.array(level_codes == idx, dtype=bool, copy=False)
3172+
31673173
if not locs.any():
31683174
# The label is present in self.levels[level] but unused:
31693175
raise KeyError(key)

0 commit comments

Comments
 (0)