Skip to content

Commit 202b587

Browse files
authored
Merge branch 'main' into 2024-01-11-fix_56147
2 parents 35149d2 + c778746 commit 202b587

38 files changed

+599
-540
lines changed

ci/code_checks.sh

Lines changed: 0 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -71,21 +71,12 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
7171

7272
MSG='Partially validate docstrings (EX03)' ; echo $MSG
7373
$BASE_DIR/scripts/validate_docstrings.py --format=actions --errors=EX03 --ignore_functions \
74-
pandas.Series.dt.day_name \
75-
pandas.Series.str.len \
76-
pandas.Series.cat.set_categories \
77-
pandas.Series.plot.bar \
78-
pandas.Series.plot.hist \
7974
pandas.Series.plot.line \
8075
pandas.Series.to_sql \
8176
pandas.Series.to_latex \
82-
pandas.errors.CategoricalConversionWarning \
83-
pandas.errors.ChainedAssignmentError \
84-
pandas.errors.ClosedFileError \
8577
pandas.errors.DatabaseError \
8678
pandas.errors.IndexingError \
8779
pandas.errors.InvalidColumnName \
88-
pandas.errors.NumExprClobberingError \
8980
pandas.errors.PossibleDataLossError \
9081
pandas.errors.PossiblePrecisionLoss \
9182
pandas.errors.SettingWithCopyError \
@@ -106,7 +97,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
10697
pandas.DataFrame.to_sql \
10798
pandas.read_stata \
10899
pandas.core.resample.Resampler.pipe \
109-
pandas.core.resample.Resampler.fillna \
110100
pandas.core.resample.Resampler.interpolate \
111101
pandas.plotting.scatter_matrix \
112102
pandas.pivot \
@@ -115,26 +105,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
115105
pandas.Index.rename \
116106
pandas.Index.droplevel \
117107
pandas.Index.isin \
118-
pandas.CategoricalIndex.set_categories \
119108
pandas.MultiIndex.names \
120109
pandas.MultiIndex.droplevel \
121110
pandas.IndexSlice \
122-
pandas.DatetimeIndex.month_name \
123-
pandas.DatetimeIndex.day_name \
124-
pandas.core.window.rolling.Rolling.corr \
125111
pandas.Grouper \
126-
pandas.core.groupby.SeriesGroupBy.apply \
127-
pandas.core.groupby.DataFrameGroupBy.apply \
128-
pandas.core.groupby.SeriesGroupBy.transform \
129-
pandas.core.groupby.SeriesGroupBy.pipe \
130-
pandas.core.groupby.DataFrameGroupBy.pipe \
131-
pandas.core.groupby.DataFrameGroupBy.describe \
132-
pandas.core.groupby.DataFrameGroupBy.idxmax \
133-
pandas.core.groupby.DataFrameGroupBy.idxmin \
134-
pandas.core.groupby.DataFrameGroupBy.value_counts \
135-
pandas.core.groupby.SeriesGroupBy.describe \
136-
pandas.core.groupby.DataFrameGroupBy.boxplot \
137-
pandas.core.groupby.DataFrameGroupBy.hist \
138112
pandas.io.formats.style.Styler.map \
139113
pandas.io.formats.style.Styler.apply_index \
140114
pandas.io.formats.style.Styler.map_index \
@@ -152,20 +126,12 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
152126
pandas.io.formats.style.Styler.text_gradient \
153127
pandas.DataFrame.values \
154128
pandas.DataFrame.groupby \
155-
pandas.DataFrame.skew \
156-
pandas.DataFrame.var \
157129
pandas.DataFrame.idxmax \
158130
pandas.DataFrame.idxmin \
159-
pandas.DataFrame.last \
160131
pandas.DataFrame.pivot \
161132
pandas.DataFrame.sort_values \
162-
pandas.DataFrame.tz_convert \
163-
pandas.DataFrame.tz_localize \
164-
pandas.DataFrame.plot.bar \
165133
pandas.DataFrame.plot.hexbin \
166-
pandas.DataFrame.plot.hist \
167134
pandas.DataFrame.plot.line \
168-
pandas.DataFrame.hist \
169135
RET=$(($RET + $?)) ; echo $MSG "DONE"
170136

171137
fi

doc/source/whatsnew/v2.3.0.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ Deprecations
101101

102102
Performance improvements
103103
~~~~~~~~~~~~~~~~~~~~~~~~
104+
- Performance improvement in :meth:`DataFrame.join` for sorted but non-unique indexes (:issue:`56941`)
105+
- Performance improvement in :meth:`DataFrame.join` when left and/or right are non-unique and ``how`` is ``"left"``, ``"right"``, or ``"inner"`` (:issue:`56817`)
104106
- Performance improvement in :meth:`Index.take` when ``indices`` is a full range indexer from zero to length of index (:issue:`56806`)
105107
-
106108

@@ -119,6 +121,7 @@ Categorical
119121

120122
Datetimelike
121123
^^^^^^^^^^^^
124+
- Bug in :func:`date_range` where the last valid timestamp would sometimes not be produced (:issue:`56134`)
122125
- Bug in :func:`date_range` where a timestamp out of the valid range would be produced with a negative ``freq`` parameter (:issue:`56147`)
123126
-
124127

pandas/_libs/tslibs/offsets.pyx

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4860,15 +4860,15 @@ cpdef to_offset(freq, bint is_period=False):
48604860

48614861
tups = zip(split[0::4], split[1::4], split[2::4])
48624862
for n, (sep, stride, name) in enumerate(tups):
4863-
if is_period is False and name in c_OFFSET_DEPR_FREQSTR:
4863+
if is_period is False and name.upper() in c_OFFSET_DEPR_FREQSTR:
48644864
warnings.warn(
48654865
f"\'{name}\' is deprecated and will be removed "
48664866
f"in a future version, please use "
4867-
f"\'{c_OFFSET_DEPR_FREQSTR.get(name)}\' instead.",
4867+
f"\'{c_OFFSET_DEPR_FREQSTR.get(name.upper())}\' instead.",
48684868
FutureWarning,
48694869
stacklevel=find_stack_level(),
48704870
)
4871-
name = c_OFFSET_DEPR_FREQSTR[name]
4871+
name = c_OFFSET_DEPR_FREQSTR[name.upper()]
48724872
if is_period is True and name in c_REVERSE_OFFSET_DEPR_FREQSTR:
48734873
if name.startswith("Y"):
48744874
raise ValueError(

pandas/conftest.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1973,4 +1973,7 @@ def warsaw(request) -> str:
19731973

19741974
@pytest.fixture
19751975
def arrow_string_storage():
1976+
"""
1977+
Fixture that lists possible PyArrow values for StringDtype storage field.
1978+
"""
19761979
return ("pyarrow", "pyarrow_numpy")

pandas/core/arrays/categorical.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1082,7 +1082,7 @@ def set_categories(
10821082
For :class:`pandas.Series`:
10831083
10841084
>>> raw_cat = pd.Categorical(['a', 'b', 'c', 'A'],
1085-
... categories=['a', 'b', 'c'], ordered=True)
1085+
... categories=['a', 'b', 'c'], ordered=True)
10861086
>>> ser = pd.Series(raw_cat)
10871087
>>> ser
10881088
0 a

pandas/core/arrays/datetimes.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1365,7 +1365,7 @@ def day_name(self, locale=None) -> npt.NDArray[np.object_]:
13651365
>>> idx
13661366
DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'],
13671367
dtype='datetime64[ns]', freq='D')
1368-
>>> idx.day_name(locale='pt_BR.utf8') # doctest: +SKIP
1368+
>>> idx.day_name(locale='pt_BR.utf8') # doctest: +SKIP
13691369
Index(['Segunda', 'Terça', 'Quarta'], dtype='object')
13701370
"""
13711371
values = self._local_timestamps()
@@ -2780,11 +2780,6 @@ def _generate_range(
27802780
else:
27812781
start = offset.rollback(start) # type: ignore[assignment]
27822782

2783-
elif end and not offset.is_on_offset(end):
2784-
# Incompatible types in assignment (expression has type "datetime",
2785-
# variable has type "Optional[Timestamp]")
2786-
end = offset.rollback(end) # type: ignore[assignment]
2787-
27882783
# Unsupported operand types for < ("Timestamp" and "None")
27892784
if periods is None and end < start and offset.n >= 0: # type: ignore[operator]
27902785
end = None

pandas/core/frame.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9223,11 +9223,11 @@ def groupby(
92239223
You could also assign a list of column names or a list of index names.
92249224
92259225
>>> df = pd.DataFrame({
9226-
... "lev1": [1, 1, 1, 2, 2, 2],
9227-
... "lev2": [1, 1, 2, 1, 1, 2],
9228-
... "lev3": [1, 2, 1, 2, 1, 2],
9229-
... "lev4": [1, 2, 3, 4, 5, 6],
9230-
... "values": [0, 1, 2, 3, 4, 5]})
9226+
... "lev1": [1, 1, 1, 2, 2, 2],
9227+
... "lev2": [1, 1, 2, 1, 1, 2],
9228+
... "lev3": [1, 2, 1, 2, 1, 2],
9229+
... "lev4": [1, 2, 3, 4, 5, 6],
9230+
... "values": [0, 1, 2, 3, 4, 5]})
92319231
>>> df
92329232
lev1 lev2 lev3 lev4 values
92339233
0 1 1 1 1 0

pandas/core/indexes/base.py

Lines changed: 27 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -4809,11 +4809,18 @@ def _join_non_unique(
48094809
left_idx, right_idx = get_join_indexers_non_unique(
48104810
self._values, other._values, how=how, sort=sort
48114811
)
4812-
mask = left_idx == -1
48134812

4814-
join_idx = self.take(left_idx)
4815-
right = other.take(right_idx)
4816-
join_index = join_idx.putmask(mask, right)
4813+
if how == "right":
4814+
join_index = other.take(right_idx)
4815+
else:
4816+
join_index = self.take(left_idx)
4817+
4818+
if how == "outer":
4819+
mask = left_idx == -1
4820+
if mask.any():
4821+
right = other.take(right_idx)
4822+
join_index = join_index.putmask(mask, right)
4823+
48174824
if isinstance(join_index, ABCMultiIndex) and how == "outer":
48184825
# test_join_index_levels
48194826
join_index = join_index._sort_levels_monotonic()
@@ -4989,35 +4996,29 @@ def _join_monotonic(
49894996
ridx: npt.NDArray[np.intp] | None
49904997
lidx: npt.NDArray[np.intp] | None
49914998

4992-
if self.is_unique and other.is_unique:
4993-
# We can perform much better than the general case
4994-
if how == "left":
4999+
if how == "left":
5000+
if other.is_unique:
5001+
# We can perform much better than the general case
49955002
join_index = self
49965003
lidx = None
49975004
ridx = self._left_indexer_unique(other)
4998-
elif how == "right":
5005+
else:
5006+
join_array, lidx, ridx = self._left_indexer(other)
5007+
join_index = self._wrap_joined_index(join_array, other, lidx, ridx)
5008+
elif how == "right":
5009+
if self.is_unique:
5010+
# We can perform much better than the general case
49995011
join_index = other
50005012
lidx = other._left_indexer_unique(self)
50015013
ridx = None
5002-
elif how == "inner":
5003-
join_array, lidx, ridx = self._inner_indexer(other)
5004-
join_index = self._wrap_joined_index(join_array, other, lidx, ridx)
5005-
elif how == "outer":
5006-
join_array, lidx, ridx = self._outer_indexer(other)
5007-
join_index = self._wrap_joined_index(join_array, other, lidx, ridx)
5008-
else:
5009-
if how == "left":
5010-
join_array, lidx, ridx = self._left_indexer(other)
5011-
elif how == "right":
5014+
else:
50125015
join_array, ridx, lidx = other._left_indexer(self)
5013-
elif how == "inner":
5014-
join_array, lidx, ridx = self._inner_indexer(other)
5015-
elif how == "outer":
5016-
join_array, lidx, ridx = self._outer_indexer(other)
5017-
5018-
assert lidx is not None
5019-
assert ridx is not None
5020-
5016+
join_index = self._wrap_joined_index(join_array, other, lidx, ridx)
5017+
elif how == "inner":
5018+
join_array, lidx, ridx = self._inner_indexer(other)
5019+
join_index = self._wrap_joined_index(join_array, other, lidx, ridx)
5020+
elif how == "outer":
5021+
join_array, lidx, ridx = self._outer_indexer(other)
50215022
join_index = self._wrap_joined_index(join_array, other, lidx, ridx)
50225023

50235024
lidx = None if lidx is None else ensure_platform_int(lidx)

pandas/core/resample.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1039,15 +1039,15 @@ def interpolate(
10391039
2023-03-01 07:00:04 3
10401040
Freq: s, dtype: int64
10411041
1042-
Upsample the dataframe to 0.5Hz by providing the period time of 2s.
1042+
Downsample the series to 0.5Hz by providing the period time of 2s.
10431043
10441044
>>> series.resample("2s").interpolate("linear")
10451045
2023-03-01 07:00:00 1
10461046
2023-03-01 07:00:02 2
10471047
2023-03-01 07:00:04 3
10481048
Freq: 2s, dtype: int64
10491049
1050-
Downsample the dataframe to 2Hz by providing the period time of 500ms.
1050+
Upsample the series to 2Hz by providing the period time of 500ms.
10511051
10521052
>>> series.resample("500ms").interpolate("linear")
10531053
2023-03-01 07:00:00.000 1.0

pandas/core/shared_docs.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -839,7 +839,7 @@
839839
Consider a dataset containing food consumption in Argentina.
840840
841841
>>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48],
842-
... 'co2_emissions': [37.2, 19.66, 1712]}},
842+
... 'co2_emissions': [37.2, 19.66, 1712]}},
843843
... index=['Pork', 'Wheat Products', 'Beef'])
844844
845845
>>> df
@@ -904,7 +904,7 @@
904904
Consider a dataset containing food consumption in Argentina.
905905
906906
>>> df = pd.DataFrame({{'consumption': [10.51, 103.11, 55.48],
907-
... 'co2_emissions': [37.2, 19.66, 1712]}},
907+
... 'co2_emissions': [37.2, 19.66, 1712]}},
908908
... index=['Pork', 'Wheat Products', 'Beef'])
909909
910910
>>> df

pandas/core/strings/accessor.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3055,11 +3055,11 @@ def len(self):
30553055
number of entries for dictionaries, lists or tuples.
30563056
30573057
>>> s = pd.Series(['dog',
3058-
... '',
3059-
... 5,
3060-
... {'foo' : 'bar'},
3061-
... [2, 3, 5, 7],
3062-
... ('one', 'two', 'three')])
3058+
... '',
3059+
... 5,
3060+
... {'foo' : 'bar'},
3061+
... [2, 3, 5, 7],
3062+
... ('one', 'two', 'three')])
30633063
>>> s
30643064
0 dog
30653065
1

pandas/errors/__init__.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -469,7 +469,7 @@ class ChainedAssignmentError(Warning):
469469
--------
470470
>>> pd.options.mode.copy_on_write = True
471471
>>> df = pd.DataFrame({'A': [1, 1, 1, 2, 2]}, columns=['A'])
472-
>>> df["A"][0:3] = 10 # doctest: +SKIP
472+
>>> df["A"][0:3] = 10 # doctest: +SKIP
473473
... # ChainedAssignmentError: ...
474474
>>> pd.options.mode.copy_on_write = False
475475
"""
@@ -561,10 +561,10 @@ class NumExprClobberingError(NameError):
561561
Examples
562562
--------
563563
>>> df = pd.DataFrame({'abs': [1, 1, 1]})
564-
>>> df.query("abs > 2") # doctest: +SKIP
564+
>>> df.query("abs > 2") # doctest: +SKIP
565565
... # NumExprClobberingError: Variables in expression "(abs) > (2)" overlap...
566566
>>> sin, a = 1, 2
567-
>>> pd.eval("sin + a", engine='numexpr') # doctest: +SKIP
567+
>>> pd.eval("sin + a", engine='numexpr') # doctest: +SKIP
568568
... # NumExprClobberingError: Variables in expression "(sin) + (a)" overlap...
569569
"""
570570

@@ -677,9 +677,9 @@ class ClosedFileError(Exception):
677677
678678
Examples
679679
--------
680-
>>> store = pd.HDFStore('my-store', 'a') # doctest: +SKIP
681-
>>> store.close() # doctest: +SKIP
682-
>>> store.keys() # doctest: +SKIP
680+
>>> store = pd.HDFStore('my-store', 'a') # doctest: +SKIP
681+
>>> store.close() # doctest: +SKIP
682+
>>> store.keys() # doctest: +SKIP
683683
... # ClosedFileError: my-store file is not open!
684684
"""
685685

@@ -773,9 +773,9 @@ class CategoricalConversionWarning(Warning):
773773
Examples
774774
--------
775775
>>> from pandas.io.stata import StataReader
776-
>>> with StataReader('dta_file', chunksize=2) as reader: # doctest: +SKIP
777-
... for i, block in enumerate(reader):
778-
... print(i, block)
776+
>>> with StataReader('dta_file', chunksize=2) as reader: # doctest: +SKIP
777+
... for i, block in enumerate(reader):
778+
... print(i, block)
779779
... # CategoricalConversionWarning: One or more series with value labels...
780780
"""
781781

pandas/plotting/_core.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1114,7 +1114,7 @@ def line(
11141114
.. plot::
11151115
:context: close-figs
11161116
1117-
>>> df = pd.DataFrame({'lab':['A', 'B', 'C'], 'val':[10, 30, 20]})
1117+
>>> df = pd.DataFrame({'lab': ['A', 'B', 'C'], 'val': [10, 30, 20]})
11181118
>>> ax = df.plot.bar(x='lab', y='val', rot=0)
11191119
11201120
Plot a whole dataframe to a bar plot. Each column is assigned a
@@ -1195,7 +1195,7 @@ def bar( # pylint: disable=disallowed-name
11951195
"""
11961196
See Also
11971197
--------
1198-
DataFrame.plot.bar: Vertical bar plot.
1198+
DataFrame.plot.bar : Vertical bar plot.
11991199
DataFrame.plot : Make plots of DataFrame using matplotlib.
12001200
matplotlib.axes.Axes.bar : Plot a vertical bar plot using matplotlib.
12011201

0 commit comments

Comments
 (0)