Skip to content

Commit 39773bc

Browse files
BUG: CategoricalIndex allowed reindexing duplicate sources, but not duplicate targets: this is the wrong way around
1 parent 2164af5 commit 39773bc

File tree

4 files changed

+75
-63
lines changed

4 files changed

+75
-63
lines changed

doc/source/whatsnew/v1.0.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ Categorical
106106
^^^^^^^^^^^
107107

108108
- Added test to assert the :func:`fillna` raises the correct ValueError message when the value isn't a value from categories (:issue:`13628`)
109-
-
109+
- Bug in :meth:`DataFrame.reindex` with a :class:`CategoricalIndex` where reindexing would fail when the targets contained duplicates, and wouldn't fail if the source contained duplicates (:issue:`28107`)
110110
-
111111

112112

pandas/core/indexes/category.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -551,10 +551,6 @@ def get_value(self, series: AnyArrayLike, key: Any):
551551
# we might be a positional indexer
552552
return super().get_value(series, key)
553553

554-
def _can_reindex(self, indexer):
555-
""" always allow reindexing """
556-
pass
557-
558554
@Appender(_index_shared_docs["where"])
559555
def where(self, cond, other=None):
560556
# TODO: Investigate an alternative implementation with
@@ -579,7 +575,6 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
579575
Indices of output values in original index
580576
581577
"""
582-
583578
if method is not None:
584579
raise NotImplementedError(
585580
"argument method is not implemented for CategoricalIndex.reindex"
@@ -599,9 +594,6 @@ def reindex(self, target, method=None, level=None, limit=None, tolerance=None):
599594
indexer = None
600595
missing = []
601596
else:
602-
if not target.is_unique:
603-
raise ValueError("cannot reindex with a non-unique indexer")
604-
605597
indexer, missing = self.get_indexer_non_unique(np.array(target))
606598

607599
if len(self.codes) and indexer is not None:

pandas/tests/indexes/test_category.py

Lines changed: 33 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -574,41 +574,56 @@ def test_reindexing(self):
574574
tm.assert_numpy_array_equal(expected, actual)
575575

576576
def test_reindex_dtype(self):
577-
c = CategoricalIndex(["a", "b", "c", "a"])
577+
c = CategoricalIndex(["a", "b", "c"])
578578
res, indexer = c.reindex(["a", "c"])
579-
tm.assert_index_equal(res, Index(["a", "a", "c"]), exact=True)
580-
tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
579+
tm.assert_index_equal(res, Index(["a", "c"]), exact=True)
580+
tm.assert_numpy_array_equal(indexer, np.array([0, 2], dtype=np.intp))
581581

582-
c = CategoricalIndex(["a", "b", "c", "a"])
582+
c = CategoricalIndex(["a", "b", "c"])
583583
res, indexer = c.reindex(Categorical(["a", "c"]))
584584

585-
exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"])
585+
exp = CategoricalIndex(["a", "c"], categories=["a", "c"])
586586
tm.assert_index_equal(res, exp, exact=True)
587-
tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
587+
tm.assert_numpy_array_equal(indexer, np.array([0, 2], dtype=np.intp))
588588

589-
c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
589+
c = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"])
590590
res, indexer = c.reindex(["a", "c"])
591-
exp = Index(["a", "a", "c"], dtype="object")
591+
exp = Index(["a", "c"], dtype="object")
592592
tm.assert_index_equal(res, exp, exact=True)
593-
tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
593+
tm.assert_numpy_array_equal(indexer, np.array([0, 2], dtype=np.intp))
594594

595-
c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
595+
c = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"])
596596
res, indexer = c.reindex(Categorical(["a", "c"]))
597-
exp = CategoricalIndex(["a", "a", "c"], categories=["a", "c"])
597+
exp = CategoricalIndex(["a", "c"], categories=["a", "c"])
598598
tm.assert_index_equal(res, exp, exact=True)
599-
tm.assert_numpy_array_equal(indexer, np.array([0, 3, 2], dtype=np.intp))
599+
tm.assert_numpy_array_equal(indexer, np.array([0, 2], dtype=np.intp))
600600

601-
def test_reindex_duplicate_target(self):
601+
def test_reindex_duplicate_source(self):
602602
# See GH23963
603603
c = CategoricalIndex(["a", "b", "c", "a"], categories=["a", "b", "c", "d"])
604-
with pytest.raises(ValueError, match="non-unique indexer"):
605-
c.reindex(["a", "a", "c"])
604+
with pytest.raises(ValueError, match="duplicate axis"):
605+
c._can_reindex(["a", "c"])
606606

607-
with pytest.raises(ValueError, match="non-unique indexer"):
608-
c.reindex(
609-
CategoricalIndex(["a", "a", "c"], categories=["a", "b", "c", "d"])
607+
with pytest.raises(ValueError, match="duplicate axis"):
608+
c._can_reindex(
609+
CategoricalIndex(["a", "c"], categories=["a", "b", "c", "d"])
610610
)
611611

612+
def test_reindex_duplicate_target(self):
613+
# See GH25459
614+
c = CategoricalIndex(["a", "b", "c"], categories=["a", "b", "c", "d"])
615+
res, indexer = c.reindex(["a", "c", "c"])
616+
exp = Index(["a", "c", "c"], dtype="object")
617+
tm.assert_index_equal(res, exp, exact=True)
618+
tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp))
619+
620+
res, indexer = c.reindex(
621+
CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"])
622+
)
623+
exp = CategoricalIndex(["a", "c", "c"], categories=["a", "b", "c", "d"])
624+
tm.assert_index_equal(res, exp, exact=True)
625+
tm.assert_numpy_array_equal(indexer, np.array([0, 2, 2], dtype=np.intp))
626+
612627
def test_reindex_empty_index(self):
613628
# See GH16770
614629
c = CategoricalIndex([])

pandas/tests/indexing/test_categorical.py

Lines changed: 41 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,18 @@ def setup_method(self, method):
4646
"B": (Series([1, 1, 2, 1, 3, 2]).astype(CDT([3, 2, 1], ordered=False))),
4747
}
4848
).set_index("B")
49+
self.df5 = DataFrame(
50+
{
51+
"A": np.arange(3, dtype="int64"),
52+
"B": Series(list("abc")).astype(CDT(list("cabe"))),
53+
}
54+
).set_index("B")
55+
self.df6 = DataFrame(
56+
{
57+
"A": np.arange(3, dtype="int64"),
58+
"B": (Series([1, 3, 2]).astype(CDT([3, 2, 1], ordered=False))),
59+
}
60+
).set_index("B")
4961

5062
def test_loc_scalar(self):
5163
result = self.df.loc["a"]
@@ -564,89 +576,82 @@ def test_reindexing(self):
564576

565577
# reindexing
566578
# convert to a regular index
567-
result = self.df2.reindex(["a", "b", "e"])
568-
expected = DataFrame(
569-
{"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))}
570-
).set_index("B")
579+
result = self.df5.reindex(["a", "b", "e"])
580+
expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index(
581+
"B"
582+
)
571583
assert_frame_equal(result, expected, check_index_type=True)
572584

573-
result = self.df2.reindex(["a", "b"])
574-
expected = DataFrame(
575-
{"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))}
576-
).set_index("B")
585+
result = self.df5.reindex(["a", "b"])
586+
expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
577587
assert_frame_equal(result, expected, check_index_type=True)
578588

579-
result = self.df2.reindex(["e"])
589+
result = self.df5.reindex(["e"])
580590
expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
581591
assert_frame_equal(result, expected, check_index_type=True)
582592

583-
result = self.df2.reindex(["d"])
593+
result = self.df5.reindex(["d"])
584594
expected = DataFrame({"A": [np.nan], "B": Series(["d"])}).set_index("B")
585595
assert_frame_equal(result, expected, check_index_type=True)
586596

587597
# since we are actually reindexing with a Categorical
588598
# then return a Categorical
589599
cats = list("cabe")
590600

591-
result = self.df2.reindex(Categorical(["a", "d"], categories=cats))
601+
result = self.df5.reindex(Categorical(["a", "e"], categories=cats))
592602
expected = DataFrame(
593-
{"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(cats))}
603+
{"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats))}
594604
).set_index("B")
595605
assert_frame_equal(result, expected, check_index_type=True)
596606

597-
result = self.df2.reindex(Categorical(["a"], categories=cats))
607+
result = self.df5.reindex(Categorical(["a"], categories=cats))
598608
expected = DataFrame(
599-
{"A": [0, 1, 5], "B": Series(list("aaa")).astype(CDT(cats))}
609+
{"A": [0], "B": Series(list("a")).astype(CDT(cats))}
600610
).set_index("B")
601611
assert_frame_equal(result, expected, check_index_type=True)
602612

603-
result = self.df2.reindex(["a", "b", "e"])
604-
expected = DataFrame(
605-
{"A": [0, 1, 5, 2, 3, np.nan], "B": Series(list("aaabbe"))}
606-
).set_index("B")
613+
result = self.df5.reindex(["a", "b", "e"])
614+
expected = DataFrame({"A": [0, 1, np.nan], "B": Series(list("abe"))}).set_index(
615+
"B"
616+
)
607617
assert_frame_equal(result, expected, check_index_type=True)
608618

609-
result = self.df2.reindex(["a", "b"])
610-
expected = DataFrame(
611-
{"A": [0, 1, 5, 2, 3], "B": Series(list("aaabb"))}
612-
).set_index("B")
619+
result = self.df5.reindex(["a", "b"])
620+
expected = DataFrame({"A": [0, 1], "B": Series(list("ab"))}).set_index("B")
613621
assert_frame_equal(result, expected, check_index_type=True)
614622

615-
result = self.df2.reindex(["e"])
623+
result = self.df5.reindex(["e"])
616624
expected = DataFrame({"A": [np.nan], "B": Series(["e"])}).set_index("B")
617625
assert_frame_equal(result, expected, check_index_type=True)
618626

619627
# give back the type of categorical that we received
620-
result = self.df2.reindex(
621-
Categorical(["a", "d"], categories=cats, ordered=True)
628+
result = self.df5.reindex(
629+
Categorical(["a", "e"], categories=cats, ordered=True)
622630
)
623631
expected = DataFrame(
624-
{
625-
"A": [0, 1, 5, np.nan],
626-
"B": Series(list("aaad")).astype(CDT(cats, ordered=True)),
627-
}
632+
{"A": [0, np.nan], "B": Series(list("ae")).astype(CDT(cats, ordered=True))}
628633
).set_index("B")
629634
assert_frame_equal(result, expected, check_index_type=True)
630635

631-
result = self.df2.reindex(Categorical(["a", "d"], categories=["a", "d"]))
636+
result = self.df5.reindex(Categorical(["a", "d"], categories=["a", "d"]))
632637
expected = DataFrame(
633-
{"A": [0, 1, 5, np.nan], "B": Series(list("aaad")).astype(CDT(["a", "d"]))}
638+
{"A": [0, np.nan], "B": Series(list("ad")).astype(CDT(["a", "d"]))}
634639
).set_index("B")
635640
assert_frame_equal(result, expected, check_index_type=True)
636641

637642
# reindexing from a source axis that contains duplicates is not allowed
638-
msg = "cannot reindex with a non-unique indexer"
643+
msg = "cannot reindex from a duplicate axis"
639644
with pytest.raises(ValueError, match=msg):
640-
self.df2.reindex(["a", "a"])
645+
self.df2.reindex(["a", "b"])
641646

642647
# args NotImplemented ATM
643648
msg = r"argument {} is not implemented for CategoricalIndex\.reindex"
644649
with pytest.raises(NotImplementedError, match=msg.format("method")):
645-
self.df2.reindex(["a"], method="ffill")
650+
self.df5.reindex(["a"], method="ffill")
646651
with pytest.raises(NotImplementedError, match=msg.format("level")):
647-
self.df2.reindex(["a"], level=1)
652+
self.df5.reindex(["a"], level=1)
648653
with pytest.raises(NotImplementedError, match=msg.format("limit")):
649-
self.df2.reindex(["a"], limit=2)
654+
self.df5.reindex(["a"], limit=2)
650655

651656
def test_loc_slice(self):
652657
# slicing

0 commit comments

Comments
 (0)