Skip to content

Commit ca00c4d

Browse files
committed
PERF: improves performance in SeriesGroupBy.count
BUG: closes bug in Series.count when index has nulls
1 parent f82e177 commit ca00c4d

File tree

5 files changed

+34
-33
lines changed

5 files changed

+34
-33
lines changed

doc/source/whatsnew/v0.17.0.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -814,6 +814,8 @@ Bug Fixes
814814
- Bug in ``BinGrouper.group_info`` where returned values are not compatible with base class (:issue:`10914`)
815815
- Bug in clearing the cache on ``DataFrame.pop`` and a subsequent inplace op (:issue:`10912`)
816816
- Bug in indexing with a mixed-integer ``Index`` causing an ``ImportError`` (:issue:`10610`)
817+
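- Bug in ``Series.count`` with ``level`` specified when that index level contains nulls; values under a null label are now counted in a separate ``NaN`` entry (:issue:`10946`)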
818+
817819
- Bug causing ``DataFrame.where`` to not respect the ``axis`` parameter when the frame has a symmetric shape. (:issue:`9736`)
818820

819821
- Bug in ``Table.select_column`` where name is not preserved (:issue:`10392`)

pandas/core/groupby.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2684,6 +2684,15 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
26842684

26852685
return Series(out, index=mi)
26862686

2687+
def count(self):
    """
    Compute the number of non-null values in each group.

    Returns
    -------
    Series
        int64 counts indexed by ``self.grouper.result_index`` and named
        ``self.name``.
    """
    ids, _, ngroups = self.grouper.group_info
    val = self.obj.get_values()

    # Rows whose group id is -1 and rows holding a null value are left out
    # of the tally.
    mask = (ids != -1) & ~isnull(val)

    if ngroups != 0:
        out = np.bincount(ids[mask], minlength=ngroups)
    else:
        # No groups: bincount is bypassed here (as in the original code),
        # but keep an integer array instead of a bare list so the empty
        # result has the same dtype as the non-empty one.
        out = np.array([], dtype='int64')

    return Series(out, index=self.grouper.result_index, name=self.name,
                  dtype='int64')
2695+
26872696
def _apply_to_column_groupbys(self, func):
26882697
""" return a pass thru """
26892698
return func(self)

pandas/core/series.py

Lines changed: 13 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1117,27 +1117,24 @@ def count(self, level=None):
11171117
-------
11181118
nobs : int or Series (if level specified)
11191119
"""
1120-
if level is not None:
1121-
mask = notnull(self.values)
1120+
from pandas.core.index import _get_na_value
11221121

1123-
if isinstance(level, compat.string_types):
1124-
level = self.index._get_level_number(level)
1122+
if level is None:
1123+
return notnull(_values_from_object(self)).sum()
11251124

1126-
level_index = self.index.levels[level]
1125+
if isinstance(level, compat.string_types):
1126+
level = self.index._get_level_number(level)
11271127

1128-
if len(self) == 0:
1129-
return self._constructor(0, index=level_index)\
1130-
.__finalize__(self)
1128+
lev = self.index.levels[level]
1129+
lab = np.array(self.index.labels[level], subok=False, copy=True)
11311130

1132-
# call cython function
1133-
max_bin = len(level_index)
1134-
labels = com._ensure_int64(self.index.labels[level])
1135-
counts = lib.count_level_1d(mask.view(np.uint8),
1136-
labels, max_bin)
1137-
return self._constructor(counts,
1138-
index=level_index).__finalize__(self)
1131+
mask = lab == -1
1132+
if mask.any():
1133+
lab[mask] = cnt = len(lev)
1134+
lev = lev.insert(cnt, _get_na_value(lev.dtype.type))
11391135

1140-
return notnull(_values_from_object(self)).sum()
1136+
out = np.bincount(lab[notnull(self.values)], minlength=len(lev))
1137+
return self._constructor(out, index=lev).__finalize__(self)
11411138

11421139
def mode(self):
11431140
"""Returns the mode(s) of the dataset.

pandas/lib.pyx

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1253,23 +1253,6 @@ def lookup_values(ndarray[object] values, dict mapping):
12531253
return maybe_convert_objects(result)
12541254

12551255

1256-
def count_level_1d(ndarray[uint8_t, cast=True] mask,
                   ndarray[int64_t] labels, Py_ssize_t max_bin):
    """
    Count the set entries of ``mask`` per label.

    Parameters
    ----------
    mask : ndarray[uint8]
        Non-zero where the observation should be counted.
    labels : ndarray[int64]
        Bin number of each observation, read at the same positions as
        ``mask``. Negative labels are skipped.
    max_bin : Py_ssize_t
        Number of bins, i.e. the length of the returned array.

    Returns
    -------
    counts : ndarray[int64]
        ``counts[k]`` is the number of positions ``i`` with ``mask[i]`` set
        and ``labels[i] == k``.
    """
    cdef:
        Py_ssize_t i, n
        int64_t lab
        ndarray[int64_t] counts

    counts = np.zeros(max_bin, dtype='i8')

    n = len(mask)

    for i in range(n):
        lab = labels[i]
        # A negative label must never be used as an index: it would wrap
        # around and be tallied into the last bin.
        # NOTE(review): callers appear to use -1 for null index labels --
        # confirm against the call site.
        if mask[i] and lab >= 0:
            counts[lab] += 1

    return counts
1271-
1272-
12731256
def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
12741257
ndarray[int64_t] labels, Py_ssize_t max_bin):
12751258
cdef:

pandas/tests/test_series.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4740,6 +4740,16 @@ def test_count(self):
47404740

47414741
self.assertEqual(self.ts.count(), np.isfinite(self.ts).sum())
47424742

4743+
mi = MultiIndex.from_arrays([list('aabbcc'), [1, 2, 2, nan, 1, 2]])
4744+
ts = Series(np.arange(len(mi)), index=mi)
4745+
4746+
left = ts.count(level=1)
4747+
right = Series([2, 3, 1], index=[1, 2, nan])
4748+
assert_series_equal(left, right)
4749+
4750+
ts.iloc[[0, 3, 5]] = nan
4751+
assert_series_equal(ts.count(level=1), right - 1)
4752+
47434753
def test_dtype(self):
47444754

47454755
self.assertEqual(self.ts.dtype, np.dtype('float64'))

0 commit comments

Comments
 (0)