Skip to content

Commit 650ae77

Browse files
committed
Merge pull request #5462 from jreback/perf_isnull
PERF: performance improvements on isnull/notnull for large pandas objects
2 parents b89f02d + 6e68a8f commit 650ae77

File tree

7 files changed

+57
-25
lines changed

7 files changed

+57
-25
lines changed

doc/source/release.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -803,6 +803,7 @@ Bug Fixes
803803
- Make tests create temp files in temp directory by default. (:issue:`5419`)
804804
- ``pd.to_timedelta`` of a scalar returns a scalar (:issue:`5410`)
805805
- ``pd.to_timedelta`` accepts ``NaN`` and ``NaT``, returning ``NaT`` instead of raising (:issue:`5437`)
806+
- Performance improvements in ``isnull``/``notnull`` on large pandas objects
806807

807808
pandas 0.12.0
808809
-------------

pandas/core/common.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def _isnull_new(obj):
128128
elif isinstance(obj, (ABCSeries, np.ndarray)):
129129
return _isnull_ndarraylike(obj)
130130
elif isinstance(obj, ABCGeneric):
131-
return obj.apply(isnull)
131+
return obj._constructor(obj._data.apply(lambda x: isnull(x.values)))
132132
elif isinstance(obj, list) or hasattr(obj, '__array__'):
133133
return _isnull_ndarraylike(np.asarray(obj))
134134
else:
@@ -155,7 +155,7 @@ def _isnull_old(obj):
155155
elif isinstance(obj, (ABCSeries, np.ndarray)):
156156
return _isnull_ndarraylike_old(obj)
157157
elif isinstance(obj, ABCGeneric):
158-
return obj.apply(_isnull_old)
158+
return obj._constructor(obj._data.apply(lambda x: _isnull_old(x.values)))
159159
elif isinstance(obj, list) or hasattr(obj, '__array__'):
160160
return _isnull_ndarraylike_old(np.asarray(obj))
161161
else:

pandas/core/generic.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2208,13 +2208,13 @@ def isnull(self):
22082208
"""
22092209
Return a boolean same-sized object indicating if the values are null
22102210
"""
2211-
return self.__class__(isnull(self),**self._construct_axes_dict()).__finalize__(self)
2211+
return isnull(self).__finalize__(self)
22122212

22132213
def notnull(self):
22142214
"""
22152215
Return a boolean same-sized object indicating if the values are not null
22162216
"""
2217-
return self.__class__(notnull(self),**self._construct_axes_dict()).__finalize__(self)
2217+
return notnull(self).__finalize__(self)
22182218

22192219
def clip(self, lower=None, upper=None, out=None):
22202220
"""

pandas/core/internals.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2153,6 +2153,13 @@ def apply(self, f, *args, **kwargs):
21532153
continue
21542154
if callable(f):
21552155
applied = f(blk, *args, **kwargs)
2156+
2157+
# if we are not a block, try to coerce
2158+
if not isinstance(applied, Block):
2159+
applied = make_block(applied,
2160+
blk.items,
2161+
blk.ref_items)
2162+
21562163
else:
21572164
applied = getattr(blk, f)(*args, **kwargs)
21582165

pandas/tests/test_common.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -75,17 +75,28 @@ def test_isnull():
7575
assert not isnull(np.inf)
7676
assert not isnull(-np.inf)
7777

78+
# series
7879
for s in [tm.makeFloatSeries(),tm.makeStringSeries(),
7980
tm.makeObjectSeries(),tm.makeTimeSeries(),tm.makePeriodSeries()]:
8081
assert(isinstance(isnull(s), Series))
8182

82-
# call on DataFrame
83-
df = DataFrame(np.random.randn(10, 5))
84-
df['foo'] = 'bar'
85-
result = isnull(df)
86-
expected = result.apply(isnull)
87-
tm.assert_frame_equal(result, expected)
88-
83+
# frame
84+
for df in [tm.makeTimeDataFrame(),tm.makePeriodFrame(),tm.makeMixedDataFrame()]:
85+
result = isnull(df)
86+
expected = df.apply(isnull)
87+
tm.assert_frame_equal(result, expected)
88+
89+
# panel
90+
for p in [ tm.makePanel(), tm.makePeriodPanel(), tm.add_nans(tm.makePanel()) ]:
91+
result = isnull(p)
92+
expected = p.apply(isnull)
93+
tm.assert_panel_equal(result, expected)
94+
95+
# panel 4d
96+
for p in [ tm.makePanel4D(), tm.add_nans_panel4d(tm.makePanel4D()) ]:
97+
result = isnull(p)
98+
expected = p.apply(isnull)
99+
tm.assert_panel4d_equal(result, expected)
89100

90101
def test_isnull_lists():
91102
result = isnull([[False]])

pandas/util/testing.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -531,6 +531,9 @@ def assert_copy(iter1, iter2, **eql_kwargs):
531531
def getCols(k):
532532
return string.ascii_uppercase[:k]
533533

534+
def getArangeMat():
535+
return np.arange(N * K).reshape((N, K))
536+
534537

535538
# make index
536539
def makeStringIndex(k=10):
@@ -601,24 +604,20 @@ def getTimeSeriesData(nper=None):
601604
return dict((c, makeTimeSeries(nper)) for c in getCols(K))
602605

603606

607+
def getPeriodData(nper=None):
608+
return dict((c, makePeriodSeries(nper)) for c in getCols(K))
609+
610+
# make frame
604611
def makeTimeDataFrame(nper=None):
605612
data = getTimeSeriesData(nper)
606613
return DataFrame(data)
607614

608615

609-
def getPeriodData(nper=None):
610-
return dict((c, makePeriodSeries(nper)) for c in getCols(K))
611-
612-
# make frame
613616
def makeDataFrame():
614617
data = getSeriesData()
615618
return DataFrame(data)
616619

617620

618-
def getArangeMat():
619-
return np.arange(N * K).reshape((N, K))
620-
621-
622621
def getMixedTypeDict():
623622
index = Index(['a', 'b', 'c', 'd', 'e'])
624623

@@ -631,6 +630,8 @@ def getMixedTypeDict():
631630

632631
return index, data
633632

633+
def makeMixedDataFrame():
634+
return DataFrame(getMixedTypeDict()[1])
634635

635636
def makePeriodFrame(nper=None):
636637
data = getPeriodData(nper)
@@ -827,13 +828,13 @@ def add_nans(panel):
827828
dm = panel[item]
828829
for j, col in enumerate(dm.columns):
829830
dm[col][:i + j] = np.NaN
830-
831+
return panel
831832

832833
def add_nans_panel4d(panel4d):
833834
for l, label in enumerate(panel4d.labels):
834835
panel = panel4d[label]
835836
add_nans(panel)
836-
837+
return panel4d
837838

838839
class TestSubDict(dict):
839840

vb_suite/frame_methods.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,9 @@ def f(K=100):
221221

222222
frame_xs_col = Benchmark('df.xs(50000,axis = 1)', setup)
223223

224+
#----------------------------------------------------------------------
225+
# nulls/masking
226+
224227
## masking
225228
setup = common_setup + """
226229
data = np.random.randn(1000, 500)
@@ -230,8 +233,17 @@ def f(K=100):
230233
mask = isnull(df)
231234
"""
232235

233-
mask_bools = Benchmark('bools.mask(mask)', setup,
234-
start_date=datetime(2013,1,1))
236+
frame_mask_bools = Benchmark('bools.mask(mask)', setup,
237+
start_date=datetime(2013,1,1))
238+
239+
frame_mask_floats = Benchmark('bools.astype(float).mask(mask)', setup,
240+
start_date=datetime(2013,1,1))
241+
242+
## isnull
243+
setup = common_setup + """
244+
data = np.random.randn(1000, 1000)
245+
df = DataFrame(data)
246+
"""
247+
frame_isnull = Benchmark('isnull(df)', setup,
248+
start_date=datetime(2012,1,1))
235249

236-
mask_floats = Benchmark('bools.astype(float).mask(mask)', setup,
237-
start_date=datetime(2013,1,1))

0 commit comments

Comments
 (0)