Skip to content

Commit 9fd186b

Browse files
committed
Merge remote-tracking branch 'upstream/main' into ref/cython3changes
2 parents 5203e41 + bb0fcc2 commit 9fd186b

File tree

17 files changed

+162
-56
lines changed

17 files changed

+162
-56
lines changed

.github/ISSUE_TEMPLATE/pdep_vote.yaml

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
name: PDEP Vote
2+
description: Call for a vote on a PDEP
3+
title: "VOTE: "
4+
labels: [Vote]
5+
6+
body:
7+
- type: markdown
8+
attributes:
9+
value: >
10+
As per [PDEP-1](https://pandas.pydata.org/pdeps/0001-purpose-and-guidelines.html), the following issue template should be used when a
11+
maintainer has opened a PDEP discussion and is ready to call for a vote.
12+
- type: checkboxes
13+
attributes:
14+
label: Locked issue
15+
options:
16+
- label: >
17+
I locked this voting issue so that only voting members are able to cast their votes or
18+
comment on this issue.
19+
required: true
20+
- type: input
21+
id: PDEP-name
22+
attributes:
23+
label: PDEP number and title
24+
placeholder: >
25+
PDEP-1: Purpose and guidelines
26+
validations:
27+
required: true
28+
- type: input
29+
id: PDEP-link
30+
attributes:
31+
label: Pull request with discussion
32+
description: e.g. https://github.com/pandas-dev/pandas/pull/47444
33+
validations:
34+
required: true
35+
- type: input
36+
id: PDEP-rendered-link
37+
attributes:
38+
label: Rendered PDEP for easy reading
39+
description: e.g. https://github.com/pandas-dev/pandas/pull/47444/files?short_path=7c449e6#diff-7c449e698132205b235c501f7e47ebba38da4d2b7f9492c98f16745dba787041
40+
validations:
41+
required: true
42+
- type: input
43+
id: PDEP-number-of-discussion-participants
44+
attributes:
45+
label: Discussion participants
46+
description: >
47+
You may find it useful to list or total the number of participating members in the
48+
PDEP discussion PR. This would be the maximum possible disapprove votes.
49+
placeholder: >
50+
14 voting members participated in the PR discussion thus far.
51+
- type: input
52+
id: PDEP-vote-end
53+
attributes:
54+
label: Voting will close in 15 days.
55+
description: The voting period end date. ('Voting will close in 15 days.' will be automatically written)
56+
- type: markdown
57+
attributes:
58+
value: ---
59+
- type: textarea
60+
id: Vote
61+
attributes:
62+
label: Vote
63+
value: |
64+
Cast your vote in a comment below.
65+
* +1: approve.
66+
* 0: abstain.
67+
* Reason: A one sentence reason is required.
68+
* -1: disapprove
69+
* Reason: A one sentence reason is required.
70+
A disapprove vote requires prior participation in the linked discussion PR.
71+
72+
@pandas-dev/pandas-core
73+
validations:
74+
required: true

asv_bench/benchmarks/frame_methods.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -862,4 +862,28 @@ def time_last_valid_index(self, dtype):
862862
self.df.last_valid_index()
863863

864864

865+
class Update:
866+
def setup(self):
867+
rng = np.random.default_rng()
868+
self.df = DataFrame(rng.uniform(size=(1_000_000, 10)))
869+
870+
idx = rng.choice(range(1_000_000), size=1_000_000, replace=False)
871+
self.df_random = DataFrame(self.df, index=idx)
872+
873+
idx = rng.choice(range(1_000_000), size=100_000, replace=False)
874+
cols = rng.choice(range(10), size=2, replace=False)
875+
self.df_sample = DataFrame(
876+
rng.uniform(size=(100_000, 2)), index=idx, columns=cols
877+
)
878+
879+
def time_to_update_big_frame_small_arg(self):
880+
self.df.update(self.df_sample)
881+
882+
def time_to_update_random_indices(self):
883+
self.df_random.update(self.df_sample)
884+
885+
def time_to_update_small_frame_big_arg(self):
886+
self.df_sample.update(self.df)
887+
888+
865889
from .pandas_vb_common import setup # noqa: F401 isort:skip

ci/code_checks.sh

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,8 +153,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
153153
-i "pandas.DatetimeTZDtype SA01" \
154154
-i "pandas.DatetimeTZDtype.tz SA01" \
155155
-i "pandas.DatetimeTZDtype.unit SA01" \
156-
-i "pandas.Float32Dtype SA01" \
157-
-i "pandas.Float64Dtype SA01" \
158156
-i "pandas.Grouper PR02,SA01" \
159157
-i "pandas.HDFStore.append PR01,SA01" \
160158
-i "pandas.HDFStore.get SA01" \

doc/source/whatsnew/v3.0.0.rst

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ enhancement2
2828

2929
Other enhancements
3030
^^^^^^^^^^^^^^^^^^
31+
- :class:`pandas.api.typing.FrozenList` is available for typing the outputs of :attr:`MultiIndex.names`, :attr:`MultiIndex.codes` and :attr:`MultiIndex.levels` (:issue:`58237`)
3132
- :func:`DataFrame.to_excel` now raises a ``UserWarning`` when the character count in a cell exceeds Excel's limitation of 32767 characters (:issue:`56954`)
3233
- :func:`read_stata` now returns ``datetime64`` resolutions better matching those natively stored in the stata format (:issue:`55642`)
3334
- :meth:`Styler.set_tooltips` provides an alternative method for storing tooltips by using the title attribute of td elements. (:issue:`56981`)
@@ -330,6 +331,7 @@ Performance improvements
330331
- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`)
331332
- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`)
332333
- Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`)
334+
- Performance improvement in :meth:`to_hdf` avoids unnecessary reopenings of the HDF5 file to speed up data addition to files with a very large number of groups. (:issue:`58248`)
333335
- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
334336
- Performance improvement in indexing operations for string dtypes (:issue:`56997`)
335337
- Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57825`)
@@ -385,7 +387,7 @@ Interval
385387

386388
Indexing
387389
^^^^^^^^
388-
-
390+
- Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`)
389391
-
390392

391393
Missing
@@ -405,7 +407,6 @@ I/O
405407
- Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
406408
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)
407409

408-
409410
Period
410411
^^^^^^
411412
-

pandas/_libs/tslibs/offsets.pyx

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -219,8 +219,7 @@ cdef _get_calendar(weekmask, holidays, calendar):
219219
holidays = holidays + calendar.holidays().tolist()
220220
except AttributeError:
221221
pass
222-
holidays = [_to_dt64D(dt) for dt in holidays]
223-
holidays = tuple(sorted(holidays))
222+
holidays = tuple(sorted(_to_dt64D(dt) for dt in holidays))
224223

225224
kwargs = {"weekmask": weekmask}
226225
if holidays:
@@ -419,11 +418,10 @@ cdef class BaseOffset:
419418

420419
if "holidays" in all_paras and not all_paras["holidays"]:
421420
all_paras.pop("holidays")
422-
exclude = ["kwds", "name", "calendar"]
423-
attrs = [(k, v) for k, v in all_paras.items()
424-
if (k not in exclude) and (k[0] != "_")]
425-
attrs = sorted(set(attrs))
426-
params = tuple([str(type(self))] + attrs)
421+
exclude = {"kwds", "name", "calendar"}
422+
attrs = {(k, v) for k, v in all_paras.items()
423+
if (k not in exclude) and (k[0] != "_")}
424+
params = tuple([str(type(self))] + sorted(attrs))
427425
return params
428426

429427
@property

pandas/api/typing/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
DataFrameGroupBy,
1010
SeriesGroupBy,
1111
)
12+
from pandas.core.indexes.frozen import FrozenList
1213
from pandas.core.resample import (
1314
DatetimeIndexResamplerGroupby,
1415
PeriodIndexResamplerGroupby,
@@ -38,6 +39,7 @@
3839
"ExpandingGroupby",
3940
"ExponentialMovingWindow",
4041
"ExponentialMovingWindowGroupby",
42+
"FrozenList",
4143
"JsonReader",
4244
"NaTType",
4345
"NAType",

pandas/core/arrays/datetimelike.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1787,7 +1787,7 @@ def strftime(self, date_format: str) -> npt.NDArray[np.object_]:
17871787
----------
17881788
freq : str or Offset
17891789
The frequency level to {op} the index to. Must be a fixed
1790-
frequency like 'S' (second) not 'ME' (month end). See
1790+
frequency like 's' (second) not 'ME' (month end). See
17911791
:ref:`frequency aliases <timeseries.offset_aliases>` for
17921792
a list of possible `freq` values.
17931793
ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise'

pandas/core/arrays/floating.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,12 @@ class FloatingArray(NumericArray):
135135
-------
136136
None
137137
138+
See Also
139+
--------
140+
CategoricalDtype : Type for categorical data with the categories and orderedness.
141+
IntegerDtype : An ExtensionDtype to hold a single size & kind of integer dtype.
142+
StringDtype : An ExtensionDtype for string data.
143+
138144
Examples
139145
--------
140146
For Float32Dtype:

pandas/core/frame.py

Lines changed: 17 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2301,8 +2301,8 @@ def maybe_reorder(
23012301
exclude.update(index)
23022302

23032303
if any(exclude):
2304-
arr_exclude = [x for x in exclude if x in arr_columns]
2305-
to_remove = [arr_columns.get_loc(col) for col in arr_exclude]
2304+
arr_exclude = (x for x in exclude if x in arr_columns)
2305+
to_remove = {arr_columns.get_loc(col) for col in arr_exclude}
23062306
arrays = [v for i, v in enumerate(arrays) if i not in to_remove]
23072307

23082308
columns = columns.drop(exclude)
@@ -3705,7 +3705,7 @@ def transpose(
37053705
nv.validate_transpose(args, {})
37063706
# construct the args
37073707

3708-
dtypes = list(self.dtypes)
3708+
first_dtype = self.dtypes.iloc[0] if len(self.columns) else None
37093709

37103710
if self._can_fast_transpose:
37113711
# Note: tests pass without this, but this improves perf quite a bit.
@@ -3723,11 +3723,11 @@ def transpose(
37233723

37243724
elif (
37253725
self._is_homogeneous_type
3726-
and dtypes
3727-
and isinstance(dtypes[0], ExtensionDtype)
3726+
and first_dtype is not None
3727+
and isinstance(first_dtype, ExtensionDtype)
37283728
):
37293729
new_values: list
3730-
if isinstance(dtypes[0], BaseMaskedDtype):
3730+
if isinstance(first_dtype, BaseMaskedDtype):
37313731
# We have masked arrays with the same dtype. We can transpose faster.
37323732
from pandas.core.arrays.masked import (
37333733
transpose_homogeneous_masked_arrays,
@@ -3736,7 +3736,7 @@ def transpose(
37363736
new_values = transpose_homogeneous_masked_arrays(
37373737
cast(Sequence[BaseMaskedArray], self._iter_column_arrays())
37383738
)
3739-
elif isinstance(dtypes[0], ArrowDtype):
3739+
elif isinstance(first_dtype, ArrowDtype):
37403740
# We have arrow EAs with the same dtype. We can transpose faster.
37413741
from pandas.core.arrays.arrow.array import (
37423742
ArrowExtensionArray,
@@ -3748,10 +3748,11 @@ def transpose(
37483748
)
37493749
else:
37503750
# We have other EAs with the same dtype. We preserve dtype in transpose.
3751-
dtyp = dtypes[0]
3752-
arr_typ = dtyp.construct_array_type()
3751+
arr_typ = first_dtype.construct_array_type()
37533752
values = self.values
3754-
new_values = [arr_typ._from_sequence(row, dtype=dtyp) for row in values]
3753+
new_values = [
3754+
arr_typ._from_sequence(row, dtype=first_dtype) for row in values
3755+
]
37553756

37563757
result = type(self)._from_arrays(
37573758
new_values,
@@ -3855,8 +3856,10 @@ def __getitem__(self, key):
38553856
key = lib.item_from_zerodim(key)
38563857
key = com.apply_if_callable(key, self)
38573858

3858-
if is_hashable(key) and not is_iterator(key):
3859+
if is_hashable(key) and not is_iterator(key) and not isinstance(key, slice):
38593860
# is_iterator to exclude generator e.g. test_getitem_listlike
3861+
# As of Python 3.12, slice is hashable which breaks MultiIndex (GH#57500)
3862+
38603863
# shortcut if the key is in columns
38613864
is_mi = isinstance(self.columns, MultiIndex)
38623865
# GH#45316 Return view if key is not duplicated
@@ -5880,7 +5883,7 @@ def set_index(
58805883
else:
58815884
arrays.append(self.index)
58825885

5883-
to_remove: list[Hashable] = []
5886+
to_remove: set[Hashable] = set()
58845887
for col in keys:
58855888
if isinstance(col, MultiIndex):
58865889
arrays.extend(col._get_level_values(n) for n in range(col.nlevels))
@@ -5907,7 +5910,7 @@ def set_index(
59075910
arrays.append(frame[col])
59085911
names.append(col)
59095912
if drop:
5910-
to_remove.append(col)
5913+
to_remove.add(col)
59115914

59125915
if len(arrays[-1]) != len(self):
59135916
# check newest element against length of calling frame, since
@@ -5924,7 +5927,7 @@ def set_index(
59245927
raise ValueError(f"Index has duplicate keys: {duplicates}")
59255928

59265929
# use set to handle duplicate column names gracefully in case of drop
5927-
for c in set(to_remove):
5930+
for c in to_remove:
59285931
del frame[c]
59295932

59305933
# clear up memory usage

pandas/core/generic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2045,7 +2045,7 @@ def __setstate__(self, state) -> None:
20452045
# e.g. say fill_value needing _mgr to be
20462046
# defined
20472047
meta = set(self._internal_names + self._metadata)
2048-
for k in list(meta):
2048+
for k in meta:
20492049
if k in state and k != "_flags":
20502050
v = state[k]
20512051
object.__setattr__(self, k, v)

pandas/core/internals/construction.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -567,7 +567,7 @@ def _extract_index(data) -> Index:
567567
if len(data) == 0:
568568
return default_index(0)
569569

570-
raw_lengths = []
570+
raw_lengths = set()
571571
indexes: list[list[Hashable] | Index] = []
572572

573573
have_raw_arrays = False
@@ -583,7 +583,7 @@ def _extract_index(data) -> Index:
583583
indexes.append(list(val.keys()))
584584
elif is_list_like(val) and getattr(val, "ndim", 1) == 1:
585585
have_raw_arrays = True
586-
raw_lengths.append(len(val))
586+
raw_lengths.add(len(val))
587587
elif isinstance(val, np.ndarray) and val.ndim > 1:
588588
raise ValueError("Per-column arrays must each be 1-dimensional")
589589

@@ -596,24 +596,23 @@ def _extract_index(data) -> Index:
596596
index = union_indexes(indexes, sort=False)
597597

598598
if have_raw_arrays:
599-
lengths = list(set(raw_lengths))
600-
if len(lengths) > 1:
599+
if len(raw_lengths) > 1:
601600
raise ValueError("All arrays must be of the same length")
602601

603602
if have_dicts:
604603
raise ValueError(
605604
"Mixing dicts with non-Series may lead to ambiguous ordering."
606605
)
607-
606+
raw_length = raw_lengths.pop()
608607
if have_series:
609-
if lengths[0] != len(index):
608+
if raw_length != len(index):
610609
msg = (
611-
f"array length {lengths[0]} does not match index "
610+
f"array length {raw_length} does not match index "
612611
f"length {len(index)}"
613612
)
614613
raise ValueError(msg)
615614
else:
616-
index = default_index(lengths[0])
615+
index = default_index(raw_length)
617616

618617
return ensure_index(index)
619618

pandas/core/tools/datetimes.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1124,18 +1124,18 @@ def f(value):
11241124

11251125
# we require at least Ymd
11261126
required = ["year", "month", "day"]
1127-
req = sorted(set(required) - set(unit_rev.keys()))
1127+
req = set(required) - set(unit_rev.keys())
11281128
if len(req):
1129-
_required = ",".join(req)
1129+
_required = ",".join(sorted(req))
11301130
raise ValueError(
11311131
"to assemble mappings requires at least that "
11321132
f"[year, month, day] be specified: [{_required}] is missing"
11331133
)
11341134

11351135
# keys we don't recognize
1136-
excess = sorted(set(unit_rev.keys()) - set(_unit_map.values()))
1136+
excess = set(unit_rev.keys()) - set(_unit_map.values())
11371137
if len(excess):
1138-
_excess = ",".join(excess)
1138+
_excess = ",".join(sorted(excess))
11391139
raise ValueError(
11401140
f"extra keys have been passed to the datetime assemblage: [{_excess}]"
11411141
)

0 commit comments

Comments
 (0)