-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
BUG: df.loc setitem-with-expansion with duplicate index #40096
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
3507373
09eb6c5
05e9fc3
8a9ee17
0afec63
a280932
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1641,7 +1641,12 @@ def _setitem_with_indexer(self, indexer, value, name="iloc"): | |
# so the object is the same | ||
index = self.obj._get_axis(i) | ||
labels = index.insert(len(index), key) | ||
self.obj._mgr = self.obj.reindex(labels, axis=i)._mgr | ||
taker = list(range(len(index))) + [-1] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add comments on what is happening here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. prob doesn't make much difference but can you just use np.arange here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. wait, better idea, never mind. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
reindexers = {i: (labels, taker)} | ||
new_obj = self.obj._reindex_with_indexers( | ||
reindexers, allow_dups=True | ||
) | ||
self.obj._mgr = new_obj._mgr | ||
self.obj._maybe_update_cacher(clear=True) | ||
self.obj._is_copy = None | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,20 +37,24 @@ def setup_method(self, method): | |
) | ||
|
||
def test_loc_scalar(self): | ||
dtype = CDT(list("cab")) | ||
result = self.df.loc["a"] | ||
expected = DataFrame( | ||
{"A": [0, 1, 5], "B": (Series(list("aaa")).astype(CDT(list("cab"))))} | ||
).set_index("B") | ||
bidx = Series(list("aaa"), name="B").astype(dtype) | ||
assert bidx.dtype == dtype | ||
|
||
expected = DataFrame({"A": [0, 1, 5]}, index=Index(bidx)) | ||
tm.assert_frame_equal(result, expected) | ||
|
||
df = self.df.copy() | ||
df.loc["a"] = 20 | ||
bidx2 = Series(list("aabbca"), name="B").astype(dtype) | ||
assert bidx2.dtype == dtype | ||
expected = DataFrame( | ||
{ | ||
"A": [20, 20, 2, 3, 4, 20], | ||
"B": (Series(list("aabbca")).astype(CDT(list("cab")))), | ||
} | ||
).set_index("B") | ||
}, | ||
index=Index(bidx2), | ||
) | ||
tm.assert_frame_equal(df, expected) | ||
|
||
# value not in the categories | ||
|
@@ -64,11 +68,28 @@ def test_loc_scalar(self): | |
df2.loc["d"] = 10 | ||
tm.assert_frame_equal(df2, expected) | ||
|
||
msg = "'fill_value=d' is not present in this Categorical's categories" | ||
with pytest.raises(TypeError, match=msg): | ||
df.loc["d", "A"] = 10 | ||
with pytest.raises(TypeError, match=msg): | ||
df.loc["d", "C"] = 10 | ||
# Setting-with-expansion with a new key "d" that is not among caegories | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. might be worth splitting this test (e.g. errors to a new tests) and even these cases |
||
df3 = df.copy() | ||
df3.loc["d", "A"] = 10 | ||
bidx3 = Index(list("aabbcad"), name="B") | ||
expected3 = DataFrame( | ||
{ | ||
"A": [20, 20, 2, 3, 4, 20, 10.0], | ||
}, | ||
index=Index(bidx3), | ||
) | ||
tm.assert_frame_equal(df3, expected3) | ||
|
||
df4 = df.copy() | ||
df4.loc["d", "C"] = 10 | ||
expected3 = DataFrame( | ||
{ | ||
"A": [20, 20, 2, 3, 4, 20, np.nan], | ||
"C": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, 10], | ||
}, | ||
index=Index(bidx3), | ||
) | ||
tm.assert_frame_equal(df4, expected3) | ||
|
||
with pytest.raises(KeyError, match="^1$"): | ||
df.loc[1] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,7 @@ | |
DatetimeIndex, | ||
Index, | ||
IndexSlice, | ||
IntervalIndex, | ||
MultiIndex, | ||
Period, | ||
Series, | ||
|
@@ -1657,6 +1658,56 @@ def test_loc_setitem_with_expansion_inf_upcast_empty(self): | |
expected = pd.Float64Index([0, 1, np.inf]) | ||
tm.assert_index_equal(result, expected) | ||
|
||
@pytest.mark.filterwarnings("ignore:indexing past lexsort depth") | ||
def test_loc_setitem_with_expansion_nonunique_index(self, index, request): | ||
# GH#40096 | ||
if not len(index): | ||
return | ||
if isinstance(index, IntervalIndex): | ||
mark = pytest.mark.xfail(reason="IntervalIndex raises") | ||
request.node.add_marker(mark) | ||
|
||
index = index.repeat(2) # ensure non-unique | ||
N = len(index) | ||
arr = np.arange(N).astype(np.int64) | ||
|
||
orig = DataFrame(arr, index=index, columns=[0]) | ||
|
||
# key that will requiring object-dtype casting in the index | ||
key = "kapow" | ||
assert key not in index # otherwise test is invalid | ||
# TODO: using a tuple key breaks here in many cases | ||
|
||
exp_index = index.insert(len(index), key) | ||
if isinstance(index, MultiIndex): | ||
exp_index = index.insert(len(index), key) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this is not needed? LGTM otherwise There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yep, thanks |
||
assert exp_index[-1][0] == key | ||
else: | ||
assert exp_index[-1] == key | ||
exp_data = np.arange(N + 1).astype(np.float64) | ||
expected = DataFrame(exp_data, index=exp_index, columns=[0]) | ||
|
||
# Add new row, but no new columns | ||
df = orig.copy() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same might be worth splitting this up a bit |
||
df.loc[key, 0] = N | ||
tm.assert_frame_equal(df, expected) | ||
|
||
# add new row on a Series | ||
ser = orig.copy()[0] | ||
ser.loc[key] = N | ||
# the series machinery lets us preserve int dtype instead of float | ||
expected = expected[0].astype(np.int64) | ||
tm.assert_series_equal(ser, expected) | ||
|
||
# add new row and new column | ||
df = orig.copy() | ||
df.loc[key, 1] = N | ||
expected = DataFrame( | ||
{0: list(arr) + [np.nan], 1: [np.nan] * N + [float(N)]}, | ||
index=exp_index, | ||
) | ||
tm.assert_frame_equal(df, expected) | ||
|
||
|
||
class TestLocCallable: | ||
def test_frame_loc_getitem_callable(self): | ||
|
Uh oh!
There was an error while loading. Please reload this page.