Skip to content

Commit 5c85eca

Browse files
Nico Cernek, Marco Gorelli
authored and committed
add failing test to check row order preservation
correct the imports
broken commit with a bunch of print statements and comments
add test for left merge
swap left and right keys when how == "right"
correct old test: right-merge row order is now the same as the right df
clean up spacing and delete temp code
add whatsnew
replace .from_records with default constructor
add GH issue # to tests
revert commit ed54bec
change logic to swap left and right if how==right
clean formatting
rename vars and add comment for clarity
combine tests into one
update whatsnew
Update doc/source/whatsnew/v1.0.0.rst
Co-Authored-By: William Ayd <[email protected]>
add before and after examples
linting cleanup
changes requested by jreback
update docs
1 parent 761bceb commit 5c85eca

File tree

3 files changed

+87
-14
lines changed

3 files changed

+87
-14
lines changed

doc/source/whatsnew/v1.0.0.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1245,8 +1245,13 @@ Reshaping
12451245
- Bug in :func:`melt` where supplying mixed strings and numeric values for ``id_vars`` or ``value_vars`` would incorrectly raise a ``ValueError`` (:issue:`29718`)
12461246
- Dtypes are now preserved when transposing a ``DataFrame`` where each column is the same extension dtype (:issue:`30091`)
12471247
- Bug in :func:`merge_asof` merging on a tz-aware ``left_index`` and ``right_on`` a tz-aware column (:issue:`29864`)
1248+
<<<<<<< HEAD
12481249
- Improved error message and docstring in :func:`cut` and :func:`qcut` when `labels=True` (:issue:`13318`)
12491250
- Bug in missing `fill_na` parameter to :meth:`DataFrame.unstack` with list of levels (:issue:`30740`)
1251+
- :meth:`DataFrame.merge` now preserves right frame's row order when executing a right merge (:issue:`27453`)
1252+
=======
1253+
>>>>>>> 2b1b67592... changes requested by jreback
1254+
-
12501255

12511256
Sparse
12521257
^^^^^^

pandas/core/reshape/merge.py

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -568,10 +568,10 @@ def __init__(
568568
indicator: bool = False,
569569
validate=None,
570570
):
571-
_left = _validate_operand(left)
572-
_right = _validate_operand(right)
573-
self.left = self.orig_left = _left
574-
self.right = self.orig_right = _right
571+
left = validate_operand(left)
572+
right = validate_operand(right)
573+
self.left = self.orig_left = left
574+
self.right = self.orig_right = right
575575
self.how = how
576576
self.axis = axis
577577

@@ -1295,6 +1295,9 @@ def _get_join_indexers(
12951295
right_keys
12961296
), "left_key and right_keys must be the same length"
12971297

1298+
# bind `sort` arg. of _factorize_keys
1299+
fkeys = partial(_factorize_keys, sort=sort)
1300+
12981301
# get left & right join labels and num. of levels at each location
12991302
mapped = (
13001303
_factorize_keys(left_keys[n], right_keys[n], sort=sort)
@@ -1309,15 +1312,20 @@ def _get_join_indexers(
13091312
# factorize keys to a dense i8 space
13101313
# `count` is the num. of unique keys
13111314
# set(lkey) | set(rkey) == range(count)
1312-
lkey, rkey, count = _factorize_keys(lkey, rkey, sort=sort)
13131315

1316+
# flip left and right keys if performing a right merge
1317+
# to preserve right merge row order (GH 27453)
1318+
if how == "right":
1319+
factorized_rkey, factorized_lkey, count = fkeys(rkey, lkey)
1320+
else:
1321+
factorized_lkey, factorized_rkey, count = fkeys(lkey, rkey)
13141322
# preserve left frame order if how == 'left' and sort == False
13151323
kwargs = copy.copy(kwargs)
13161324
if how == "left":
13171325
kwargs["sort"] = sort
13181326
join_func = _join_functions[how]
13191327

1320-
return join_func(lkey, rkey, count, **kwargs)
1328+
return join_func(factorized_lkey, factorized_rkey, count, **kwargs)
13211329

13221330

13231331
def _restore_dropped_levels_multijoin(

pandas/tests/reshape/merge/test_merge.py

Lines changed: 68 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1288,17 +1288,17 @@ def test_merge_on_index_with_more_values(self, how, index, expected_index):
12881288
# GH 24212
12891289
# pd.merge gets [0, 1, 2, -1, -1, -1] as left_indexer, ensure that
12901290
# -1 is interpreted as a missing value instead of the last element
1291-
df1 = pd.DataFrame({"a": [1, 2, 3], "key": [0, 2, 2]}, index=index)
1292-
df2 = pd.DataFrame({"b": [1, 2, 3, 4, 5]})
1291+
df1 = pd.DataFrame({"a": [0, 1, 2], "key": [0, 1, 2]}, index=index)
1292+
df2 = pd.DataFrame({"b": [0, 1, 2, 3, 4, 5]})
12931293
result = df1.merge(df2, left_on="key", right_index=True, how=how)
12941294
expected = pd.DataFrame(
12951295
[
1296-
[1.0, 0, 1],
1297-
[2.0, 2, 3],
1298-
[3.0, 2, 3],
1299-
[np.nan, 1, 2],
1300-
[np.nan, 3, 4],
1301-
[np.nan, 4, 5],
1296+
[0, 0, 0],
1297+
[1, 1, 1],
1298+
[2, 2, 2],
1299+
[np.nan, 3, 3],
1300+
[np.nan, 4, 4],
1301+
[np.nan, 5, 5],
13021302
],
13031303
columns=["a", "key", "b"],
13041304
)
@@ -2169,3 +2169,63 @@ def test_merge_datetime_upcast_dtype():
21692169
}
21702170
)
21712171
tm.assert_frame_equal(result, expected)
2172+
2173+
2174+
@pytest.mark.parametrize("how", ["left", "right"])
2175+
def test_merge_preserves_row_order(how):
2176+
# GH 27453
2177+
population = [
2178+
("Jenn", "Jamaica", 3),
2179+
("Beth", "Bulgaria", 7),
2180+
("Carl", "Canada", 30),
2181+
]
2182+
columns = ["name", "country", "population"]
2183+
population_df = DataFrame(population, columns=columns)
2184+
2185+
people = [("Abe", "America"), ("Beth", "Bulgaria"), ("Carl", "Canada")]
2186+
columns = ["name", "country"]
2187+
people_df = DataFrame(people, columns=columns)
2188+
2189+
expected_data = [
2190+
("Abe", "America", np.nan),
2191+
("Beth", "Bulgaria", 7),
2192+
("Carl", "Canada", 30),
2193+
]
2194+
expected_cols = ["name", "country", "population"]
2195+
expected = DataFrame(expected_data, columns=expected_cols)
2196+
2197+
result = pop.merge(ppl, on=("name", "country"), how="right")
2198+
2199+
tm.assert_frame_equal(result, expected)
2200+
2201+
2202+
def test_left_merge_preserves_row_order():
2203+
# GH 27453
2204+
population = [
2205+
("Jenn", "Jamaica", 3),
2206+
("Beth", "Bulgaria", 7),
2207+
("Carl", "Canada", 30),
2208+
]
2209+
columns = ["name", "country", "population"]
2210+
pop = DataFrame(population, columns=columns)
2211+
2212+
people = [("Abe", "America"), ("Beth", "Bulgaria"), ("Carl", "Canada")]
2213+
columns = ["name", "country"]
2214+
ppl = DataFrame(people, columns=columns)
2215+
2216+
expected_data = [
2217+
("Abe", "America", np.nan),
2218+
("Beth", "Bulgaria", 7),
2219+
("Carl", "Canada", 30),
2220+
]
2221+
expected_cols = ["name", "country", "population"]
2222+
expected = DataFrame(expected_data, columns=expected_cols)
2223+
2224+
result = ppl.merge(pop, on=("name", "country"), how="left")
2225+
if how == "right":
2226+
left_df, right_df = population_df, people_df
2227+
elif how == "left":
2228+
left_df, right_df = people_df, population_df
2229+
2230+
result = left_df.merge(right_df, on=("name", "country"), how=how)
2231+
tm.assert_frame_equal(expected, result)

0 commit comments

Comments
 (0)