Skip to content

Commit 9a7a8e4

Browse files
committed
Merge remote-tracking branch 'upstream/master' into styler_format_index
2 parents b87ef09 + 6a683a2 commit 9a7a8e4

File tree

25 files changed

+458
-129
lines changed

25 files changed

+458
-129
lines changed

asv_bench/benchmarks/sparse.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -67,16 +67,28 @@ def time_sparse_series_from_coo(self):
6767

6868

6969
class ToCoo:
70-
def setup(self):
70+
params = [True, False]
71+
param_names = ["sort_labels"]
72+
73+
def setup(self, sort_labels):
7174
s = Series([np.nan] * 10000)
7275
s[0] = 3.0
7376
s[100] = -1.0
7477
s[999] = 12.1
75-
s.index = MultiIndex.from_product([range(10)] * 4)
76-
self.ss = s.astype("Sparse")
7778

78-
def time_sparse_series_to_coo(self):
79-
self.ss.sparse.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True)
79+
s_mult_lvl = s.set_axis(MultiIndex.from_product([range(10)] * 4))
80+
self.ss_mult_lvl = s_mult_lvl.astype("Sparse")
81+
82+
s_two_lvl = s.set_axis(MultiIndex.from_product([range(100)] * 2))
83+
self.ss_two_lvl = s_two_lvl.astype("Sparse")
84+
85+
def time_sparse_series_to_coo(self, sort_labels):
86+
self.ss_mult_lvl.sparse.to_coo(
87+
row_levels=[0, 1], column_levels=[2, 3], sort_labels=sort_labels
88+
)
89+
90+
def time_sparse_series_to_coo_single_level(self, sort_labels):
91+
self.ss_two_lvl.sparse.to_coo(sort_labels=sort_labels)
8092

8193

8294
class Arithmetic:

doc/source/whatsnew/v1.3.3.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,11 @@ Fixed regressions
1717
- Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and length-one list of :class:`Timestamp` (:issue:`42810`)
1818
- Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`)
1919
- Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`)
20+
- Fixed regression in :meth:`.GroupBy.quantile` which was failing with ``pandas.NA`` (:issue:`42849`)
2021
- Fixed regression in :meth:`merge` where ``on`` columns with ``ExtensionDtype`` or ``bool`` data types were cast to ``object`` in ``right`` and ``outer`` merge (:issue:`40073`)
2122
- Fixed regression in :meth:`RangeIndex.where` and :meth:`RangeIndex.putmask` raising ``AssertionError`` when result did not represent a :class:`RangeIndex` (:issue:`43240`)
2223
- Fixed regression in :meth:`read_parquet` where the ``fastparquet`` engine would not work properly with fastparquet 0.7.0 (:issue:`43075`)
24+
- Fixed regression in :func:`is_list_like` where objects with ``__iter__`` set to ``None`` would be identified as iterable (:issue:`43373`)
2325

2426
.. ---------------------------------------------------------------------------
2527

doc/source/whatsnew/v1.4.0.rst

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,13 +76,15 @@ Styler
7676
- :meth:`.Styler.bar` introduces additional arguments to control alignment and display (:issue:`26070`, :issue:`36419`), and it also validates the input arguments ``width`` and ``height`` (:issue:`42511`).
7777
- :meth:`.Styler.to_latex` introduces keyword argument ``environment``, which also allows a specific "longtable" entry through a separate jinja2 template (:issue:`41866`).
7878
- :meth:`.Styler.to_html` introduces keyword arguments ``sparse_index``, ``sparse_columns``, ``bold_headers``, ``caption`` (:issue:`41946`, :issue:`43149`).
79-
- Keyword argument ``level`` is added to :meth:`.Styler.hide_index` and :meth:`.Styler.hide_columns` for optionally controlling hidden levels in a MultiIndex (:issue:`25475`)
79+
- Keyword arguments ``level`` and ``names`` added to :meth:`.Styler.hide_index` and :meth:`.Styler.hide_columns` for additional control of visibility of MultiIndexes and index names (:issue:`25475`, :issue:`43404`, :issue:`43346`)
8080
- Global options have been extended to configure default ``Styler`` properties, including formatting, encoding, mathjax and LaTeX options (:issue:`41395`)
8181

8282
Formerly Styler relied on ``display.html.use_mathjax``, which has now been replaced by ``styler.html.mathjax``.
8383

8484
There are also bug fixes and deprecations listed below.
8585

86+
The ``caption`` argument is now validated (:issue:`43368`)
87+
8688
.. _whatsnew_140.enhancements.pyarrow_csv_engine:
8789

8890
Multithreaded CSV reading with a new CSV Engine based on pyarrow
@@ -272,6 +274,7 @@ Other Deprecations
272274
- Deprecated dropping of nuisance columns in :class:`Rolling`, :class:`Expanding`, and :class:`EWM` aggregations (:issue:`42738`)
273275
- Deprecated :meth:`Index.reindex` with a non-unique index (:issue:`42568`)
274276
- Deprecated :meth:`.Styler.render` in favour of :meth:`.Styler.to_html` (:issue:`42140`)
277+
- Deprecated passing in a string column label into ``times`` in :meth:`DataFrame.ewm` (:issue:`43265`)
275278

276279
.. ---------------------------------------------------------------------------
277280
@@ -287,6 +290,8 @@ Performance improvements
287290
- Performance improvement in some :meth:`GroupBy.apply` operations (:issue:`42992`)
288291
- Performance improvement in :func:`read_stata` (:issue:`43059`)
289292
- Performance improvement in :meth:`to_datetime` with ``uint`` dtypes (:issue:`42606`)
293+
- Performance improvement in :meth:`Series.sparse.to_coo` (:issue:`42880`)
294+
-
290295

291296
.. ---------------------------------------------------------------------------
292297
@@ -377,6 +382,7 @@ I/O
377382
- Bug in :func:`read_fwf`, where difference in lengths of ``colspecs`` and ``names`` was not raising ``ValueError`` (:issue:`40830`)
378383
- Bug in :func:`Series.to_json` and :func:`DataFrame.to_json` where some attributes were skipped when serialising plain Python objects to JSON (:issue:`42768`, :issue:`33043`)
379384
- Column headers are dropped when constructing a :class:`DataFrame` from a sqlalchemy ``Row`` object (:issue:`40682`)
385+
- Bug in unpickling an :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`)
380386
-
381387

382388
Period
@@ -393,6 +399,7 @@ Groupby/resample/rolling
393399
^^^^^^^^^^^^^^^^^^^^^^^^
394400
- Fixed bug in :meth:`SeriesGroupBy.apply` where passing an unrecognized string argument failed to raise ``TypeError`` when the underlying ``Series`` is empty (:issue:`42021`)
395401
- Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`42287`)
402+
- Bug in :meth:`GroupBy.max` and :meth:`GroupBy.min` with nullable integer dtypes losing precision (:issue:`41743`)
396403
- Bug in :meth:`DataFrame.groupby.rolling.var` would calculate the rolling variance only on the first group (:issue:`42442`)
397404
- Bug in :meth:`GroupBy.shift` that would return the grouping columns if ``fill_value`` was not None (:issue:`41556`)
398405
- Bug in :meth:`SeriesGroupBy.nlargest` and :meth:`SeriesGroupBy.nsmallest` would have an inconsistent index when the input Series was sorted and ``n`` was greater than or equal to all group sizes (:issue:`15272`, :issue:`16345`, :issue:`29129`)
@@ -406,6 +413,7 @@ Reshaping
406413
- Improved error message when creating a :class:`DataFrame` column from a multi-dimensional :class:`numpy.ndarray` (:issue:`42463`)
407414
- :func:`concat` creating :class:`MultiIndex` with duplicate level entries when concatenating a :class:`DataFrame` with duplicates in :class:`Index` and multiple keys (:issue:`42651`)
408415
- Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`)
416+
- Bug in :meth:`DataFrame.append` failing to retain dtypes when appended columns do not match (:issue:`43392`)
409417
-
410418

411419
Sparse

pandas/_libs/groupby.pyi

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,13 +123,17 @@ def group_max(
123123
values: np.ndarray, # ndarray[groupby_t, ndim=2]
124124
labels: np.ndarray, # const int64_t[:]
125125
min_count: int = ...,
126+
mask: np.ndarray | None = ...,
127+
result_mask: np.ndarray | None = ...,
126128
) -> None: ...
127129
def group_min(
128130
out: np.ndarray, # groupby_t[:, ::1]
129131
counts: np.ndarray, # int64_t[::1]
130132
values: np.ndarray, # ndarray[groupby_t, ndim=2]
131133
labels: np.ndarray, # const int64_t[:]
132134
min_count: int = ...,
135+
mask: np.ndarray | None = ...,
136+
result_mask: np.ndarray | None = ...,
133137
) -> None: ...
134138
def group_cummin(
135139
out: np.ndarray, # groupby_t[:, ::1]

pandas/_libs/groupby.pyx

Lines changed: 31 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1182,7 +1182,9 @@ cdef group_min_max(groupby_t[:, ::1] out,
11821182
const intp_t[::1] labels,
11831183
Py_ssize_t min_count=-1,
11841184
bint is_datetimelike=False,
1185-
bint compute_max=True):
1185+
bint compute_max=True,
1186+
const uint8_t[:, ::1] mask=None,
1187+
uint8_t[:, ::1] result_mask=None):
11861188
"""
11871189
Compute minimum/maximum of columns of `values`, in row groups `labels`.
11881190
@@ -1203,6 +1205,12 @@ cdef group_min_max(groupby_t[:, ::1] out,
12031205
True if `values` contains datetime-like entries.
12041206
compute_max : bint, default True
12051207
True to compute group-wise max, False to compute min
1208+
mask : ndarray[bool, ndim=2], optional
1209+
If not None, indices represent missing values,
1210+
otherwise the mask will not be used
1211+
result_mask : ndarray[bool, ndim=2], optional
1212+
If not None, these specify locations in the output that are NA.
1213+
Modified in-place.
12061214
12071215
Notes
12081216
-----
@@ -1215,6 +1223,8 @@ cdef group_min_max(groupby_t[:, ::1] out,
12151223
ndarray[groupby_t, ndim=2] group_min_or_max
12161224
bint runtime_error = False
12171225
int64_t[:, ::1] nobs
1226+
bint uses_mask = mask is not None
1227+
bint isna_entry
12181228

12191229
# TODO(cython 3.0):
12201230
# Instead of `labels.shape[0]` use `len(labels)`
@@ -1249,7 +1259,12 @@ cdef group_min_max(groupby_t[:, ::1] out,
12491259
for j in range(K):
12501260
val = values[i, j]
12511261

1252-
if not _treat_as_na(val, is_datetimelike):
1262+
if uses_mask:
1263+
isna_entry = mask[i, j]
1264+
else:
1265+
isna_entry = _treat_as_na(val, is_datetimelike)
1266+
1267+
if not isna_entry:
12531268
nobs[lab, j] += 1
12541269
if compute_max:
12551270
if val > group_min_or_max[lab, j]:
@@ -1265,7 +1280,10 @@ cdef group_min_max(groupby_t[:, ::1] out,
12651280
runtime_error = True
12661281
break
12671282
else:
1268-
out[i, j] = nan_val
1283+
if uses_mask:
1284+
result_mask[i, j] = True
1285+
else:
1286+
out[i, j] = nan_val
12691287
else:
12701288
out[i, j] = group_min_or_max[i, j]
12711289

@@ -1282,7 +1300,9 @@ def group_max(groupby_t[:, ::1] out,
12821300
ndarray[groupby_t, ndim=2] values,
12831301
const intp_t[::1] labels,
12841302
Py_ssize_t min_count=-1,
1285-
bint is_datetimelike=False) -> None:
1303+
bint is_datetimelike=False,
1304+
const uint8_t[:, ::1] mask=None,
1305+
uint8_t[:, ::1] result_mask=None) -> None:
12861306
"""See group_min_max.__doc__"""
12871307
group_min_max(
12881308
out,
@@ -1292,6 +1312,8 @@ def group_max(groupby_t[:, ::1] out,
12921312
min_count=min_count,
12931313
is_datetimelike=is_datetimelike,
12941314
compute_max=True,
1315+
mask=mask,
1316+
result_mask=result_mask,
12951317
)
12961318

12971319

@@ -1302,7 +1324,9 @@ def group_min(groupby_t[:, ::1] out,
13021324
ndarray[groupby_t, ndim=2] values,
13031325
const intp_t[::1] labels,
13041326
Py_ssize_t min_count=-1,
1305-
bint is_datetimelike=False) -> None:
1327+
bint is_datetimelike=False,
1328+
const uint8_t[:, ::1] mask=None,
1329+
uint8_t[:, ::1] result_mask=None) -> None:
13061330
"""See group_min_max.__doc__"""
13071331
group_min_max(
13081332
out,
@@ -1312,6 +1336,8 @@ def group_min(groupby_t[:, ::1] out,
13121336
min_count=min_count,
13131337
is_datetimelike=is_datetimelike,
13141338
compute_max=False,
1339+
mask=mask,
1340+
result_mask=result_mask,
13151341
)
13161342

13171343

pandas/_libs/index.pyx

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -603,35 +603,34 @@ cdef class BaseMultiIndexCodesEngine:
603603
def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray:
604604
raise NotImplementedError("Implemented by subclass")
605605

606-
def _extract_level_codes(self, ndarray[object] target) -> np.ndarray:
606+
def _extract_level_codes(self, target) -> np.ndarray:
607607
"""
608608
Map the requested list of (tuple) keys to their integer representations
609609
for searching in the underlying integer index.
610610

611611
Parameters
612612
----------
613-
target : ndarray[object]
614-
Each key is a tuple, with a label for each level of the index.
613+
target : MultiIndex
615614

616615
Returns
617616
-------
618617
int_keys : 1-dimensional array of dtype uint64 or object
619618
Integers representing one combination each
620619
"""
620+
zt = [target._get_level_values(i) for i in range(target.nlevels)]
621621
level_codes = [lev.get_indexer(codes) + 1 for lev, codes
622-
in zip(self.levels, zip(*target))]
622+
in zip(self.levels, zt)]
623623
return self._codes_to_ints(np.array(level_codes, dtype='uint64').T)
624624

625-
def get_indexer(self, ndarray[object] target) -> np.ndarray:
625+
def get_indexer(self, target) -> np.ndarray:
626626
"""
627627
Returns an array giving the positions of each value of `target` in
628628
`self.values`, where -1 represents a value in `target` which does not
629629
appear in `self.values`
630630

631631
Parameters
632632
----------
633-
target : ndarray[object]
634-
Each key is a tuple, with a label for each level of the index
633+
target : MultiIndex
635634

636635
Returns
637636
-------
@@ -742,8 +741,8 @@ cdef class BaseMultiIndexCodesEngine:
742741

743742
return self._base.get_loc(self, lab_int)
744743

745-
def get_indexer_non_unique(self, ndarray[object] target):
746-
744+
def get_indexer_non_unique(self, target):
745+
# target: MultiIndex
747746
lab_ints = self._extract_level_codes(target)
748747
indexer = self._base.get_indexer_non_unique(self, lab_ints)
749748

pandas/_libs/lib.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1092,7 +1092,7 @@ def is_list_like(obj: object, allow_sets: bool = True) -> bool:
10921092
cdef inline bint c_is_list_like(object obj, bint allow_sets) except -1:
10931093
return (
10941094
# equiv: `isinstance(obj, abc.Iterable)`
1095-
hasattr(obj, "__iter__") and not isinstance(obj, type)
1095+
getattr(obj, "__iter__", None) is not None and not isinstance(obj, type)
10961096
# we do not count strings/unicode/bytes as list-like
10971097
and not isinstance(obj, (str, bytes))
10981098
# exclude zero-dimensional numpy arrays, effectively scalars

pandas/core/arrays/masked.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,8 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False):
123123
raise ValueError("values must be a 1D array")
124124
if mask.ndim != 1:
125125
raise ValueError("mask must be a 1D array")
126+
if values.shape != mask.shape:
127+
raise ValueError("values and mask must have same shape")
126128

127129
if copy:
128130
values = values.copy()

pandas/core/arrays/sparse/accessor.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False):
113113
column_levels : tuple/list
114114
sort_labels : bool, default False
115115
Sort the row and column labels before forming the sparse matrix.
116+
When `row_levels` and/or `column_levels` refer to a single level,
117+
set to `True` for a faster execution.
116118
117119
Returns
118120
-------

0 commit comments

Comments
 (0)