Skip to content

Commit d33b537

Browse files
committed
BUG: fix HDFStore iterator to handle a where properly (GH8014)
1 parent 441c585 commit d33b537

File tree

3 files changed

+180
-59
lines changed

3 files changed

+180
-59
lines changed

doc/source/v0.15.0.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -454,7 +454,7 @@ Bug Fixes
454454
- Bug in pickle deserialization that failed for pre-0.14.1 containers with dup items trying to avoid ambiguity
455455
when matching block and manager items, when there's only one block there's no ambiguity (:issue:`7794`)
456456

457-
457+
- Bug in HDFStore iteration when passing a where (:issue:`8014`)
458458

459459
- Bug in repeated timeseries line and area plot may result in ``ValueError`` or incorrect kind (:issue:`7733`)
460460

pandas/io/pytables.py

Lines changed: 63 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -662,21 +662,18 @@ def select(self, key, where=None, start=None, stop=None, columns=None,
662662
s = self._create_storer(group)
663663
s.infer_axes()
664664

665-
# what we are actually going to do for a chunk
666-
def func(_start, _stop):
667-
return s.read(where=where, start=_start, stop=_stop,
665+
# function to call on iteration
666+
def func(_start, _stop, _where):
667+
return s.read(start=_start, stop=_stop,
668+
where=_where,
668669
columns=columns, **kwargs)
669670

670-
if iterator or chunksize is not None:
671-
if not s.is_table:
672-
raise TypeError(
673-
"can only use an iterator or chunksize on a table")
674-
return TableIterator(self, func, nrows=s.nrows, start=start,
675-
stop=stop, chunksize=chunksize,
676-
auto_close=auto_close)
671+
# create the iterator
672+
it = TableIterator(self, s, func, where=where, nrows=s.nrows, start=start,
673+
stop=stop, iterator=iterator, chunksize=chunksize,
674+
auto_close=auto_close)
677675

678-
return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop,
679-
auto_close=auto_close).get_values()
676+
return it.get_result()
680677

681678
def select_as_coordinates(
682679
self, key, where=None, start=None, stop=None, **kwargs):
@@ -779,26 +776,22 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None,
779776
# axis is the concentation axes
780777
axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0]
781778

782-
def func(_start, _stop):
783-
if where is not None:
784-
c = s.read_coordinates(where=where, start=_start, stop=_stop, **kwargs)
785-
else:
786-
c = None
779+
def func(_start, _stop, _where):
787780

788-
objs = [t.read(where=c, start=_start, stop=_stop,
789-
columns=columns, **kwargs) for t in tbls]
781+
# retrieve the objs, _where is always passed as a set of coordinates here
782+
objs = [t.read(where=_where, columns=columns, **kwargs) for t in tbls]
790783

791784
# concat and return
792785
return concat(objs, axis=axis,
793786
verify_integrity=False).consolidate()
794787

795-
if iterator or chunksize is not None:
796-
return TableIterator(self, func, nrows=nrows, start=start,
797-
stop=stop, chunksize=chunksize,
798-
auto_close=auto_close)
788+
# create the iterator
789+
it = TableIterator(self, s, func, where=where, nrows=nrows, start=start,
790+
stop=stop, iterator=iterator, chunksize=chunksize,
791+
auto_close=auto_close)
792+
793+
return it.get_result(coordinates=True)
799794

800-
return TableIterator(self, func, nrows=nrows, start=start, stop=stop,
801-
auto_close=auto_close).get_values()
802795

803796
def put(self, key, value, format=None, append=False, **kwargs):
804797
"""
@@ -1293,57 +1286,86 @@ class TableIterator(object):
12931286
----------
12941287
12951288
store : the reference store
1296-
func : the function to get results
1289+
s : the referred storer
1290+
func : the function to execute the query
1291+
where : the where of the query
12971292
nrows : the rows to iterate on
12981293
start : the passed start value (default is None)
12991294
stop : the passed stop value (default is None)
1300-
chunksize : the passed chunking valeu (default is 50000)
1295+
iterator : boolean, whether to use the default iterator
1296+
chunksize : the passed chunking value (default is 50000)
13011297
auto_close : boolean, automatically close the store at the end of
13021298
iteration, default is False
13031299
kwargs : the passed kwargs
13041300
"""
13051301

1306-
def __init__(self, store, func, nrows, start=None, stop=None,
1307-
chunksize=None, auto_close=False):
1302+
def __init__(self, store, s, func, where, nrows, start=None, stop=None,
1303+
iterator=False, chunksize=None, auto_close=False):
13081304
self.store = store
1309-
self.func = func
1305+
self.s = s
1306+
self.func = func
1307+
self.where = where
13101308
self.nrows = nrows or 0
13111309
self.start = start or 0
13121310

13131311
if stop is None:
13141312
stop = self.nrows
13151313
self.stop = min(self.nrows, stop)
13161314

1317-
if chunksize is None:
1318-
chunksize = 100000
1315+
self.coordinates = None
1316+
if iterator or chunksize is not None:
1317+
if chunksize is None:
1318+
chunksize = 100000
1319+
self.chunksize = int(chunksize)
1320+
else:
1321+
self.chunksize = None
13191322

1320-
self.chunksize = chunksize
13211323
self.auto_close = auto_close
13221324

13231325
def __iter__(self):
1326+
1327+
# iterate
13241328
current = self.start
13251329
while current < self.stop:
1326-
stop = current + self.chunksize
1327-
v = self.func(current, stop)
1328-
current = stop
13291330

1330-
if v is None:
1331+
stop = min(current + self.chunksize, self.stop)
1332+
value = self.func(None, None, self.coordinates[current:stop])
1333+
if value is None:
13311334
continue
13321335

1333-
yield v
1336+
current = current + self.chunksize
1337+
1338+
yield value
13341339

13351340
self.close()
13361341

13371342
def close(self):
13381343
if self.auto_close:
13391344
self.store.close()
13401345

1341-
def get_values(self):
1342-
results = self.func(self.start, self.stop)
1346+
def get_result(self, coordinates=False):
1347+
1348+
# return the actual iterator
1349+
if self.chunksize is not None:
1350+
if not self.s.is_table:
1351+
raise TypeError(
1352+
"can only use an iterator or chunksize on a table")
1353+
1354+
self.coordinates = self.s.read_coordinates(where=self.where)
1355+
1356+
return self
1357+
1358+
# if specified, read via coordinates (necessary for multiple selections)
1359+
if coordinates:
1360+
where = self.s.read_coordinates(where=self.where)
1361+
else:
1362+
where = self.where
1363+
1364+
# directly return the result
1365+
results = self.func(self.start, self.stop, where)
13431366
self.close()
13441367
return results
13451368

1346-
13471369
class IndexCol(StringMixin):
13481370

13491371
""" an index column description class

pandas/io/tests/test_pytables.py

Lines changed: 116 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -3264,21 +3264,16 @@ def test_select_iterator(self):
32643264

32653265
expected = store.select('df')
32663266

3267-
results = []
3268-
for s in store.select('df',iterator=True):
3269-
results.append(s)
3267+
results = [ s for s in store.select('df',iterator=True) ]
32703268
result = concat(results)
32713269
tm.assert_frame_equal(expected, result)
3272-
results = []
3273-
for s in store.select('df',chunksize=100):
3274-
results.append(s)
3270+
3271+
results = [ s for s in store.select('df',chunksize=100) ]
32753272
self.assertEqual(len(results), 5)
32763273
result = concat(results)
32773274
tm.assert_frame_equal(expected, result)
32783275

3279-
results = []
3280-
for s in store.select('df',chunksize=150):
3281-
results.append(s)
3276+
results = [ s for s in store.select('df',chunksize=150) ]
32823277
result = concat(results)
32833278
tm.assert_frame_equal(result, expected)
32843279

@@ -3294,12 +3289,10 @@ def test_select_iterator(self):
32943289
df = tm.makeTimeDataFrame(500)
32953290
df.to_hdf(path,'df',format='table')
32963291

3297-
results = []
3298-
for x in read_hdf(path,'df',chunksize=100):
3299-
results.append(x)
3292+
results = [ s for s in read_hdf(path,'df',chunksize=100) ]
3293+
result = concat(results)
33003294

33013295
self.assertEqual(len(results), 5)
3302-
result = concat(results)
33033296
tm.assert_frame_equal(result, df)
33043297
tm.assert_frame_equal(result, read_hdf(path,'df'))
33053298

@@ -3318,10 +3311,8 @@ def test_select_iterator(self):
33183311
# full selection
33193312
expected = store.select_as_multiple(
33203313
['df1', 'df2'], selector='df1')
3321-
results = []
3322-
for s in store.select_as_multiple(
3323-
['df1', 'df2'], selector='df1', chunksize=150):
3324-
results.append(s)
3314+
results = [ s for s in store.select_as_multiple(
3315+
['df1', 'df2'], selector='df1', chunksize=150) ]
33253316
result = concat(results)
33263317
tm.assert_frame_equal(expected, result)
33273318

@@ -3335,6 +3326,114 @@ def test_select_iterator(self):
33353326
#result = concat(results)
33363327
#tm.assert_frame_equal(expected, result)
33373328

3329+
def test_select_iterator_complete_8014(self):
3330+
3331+
# GH 8014
3332+
# using iterator and where clause
3333+
chunksize=1e4
3334+
3335+
# no iterator
3336+
with ensure_clean_store(self.path) as store:
3337+
3338+
expected = tm.makeTimeDataFrame(100064, 'S')
3339+
_maybe_remove(store, 'df')
3340+
store.append('df',expected)
3341+
3342+
beg_dt = expected.index[0]
3343+
end_dt = expected.index[-1]
3344+
3345+
# select w/o iteration and no where clause works
3346+
result = store.select('df')
3347+
tm.assert_frame_equal(expected, result)
3348+
3349+
# select w/o iterator and where clause, single term, begin
3350+
# of range, works
3351+
where = "index >= '%s'" % beg_dt
3352+
result = store.select('df',where=where)
3353+
tm.assert_frame_equal(expected, result)
3354+
3355+
# select w/o iterator and where clause, single term, end
3356+
# of range, works
3357+
where = "index <= '%s'" % end_dt
3358+
result = store.select('df',where=where)
3359+
tm.assert_frame_equal(expected, result)
3360+
3361+
# select w/o iterator and where clause, inclusive range,
3362+
# works
3363+
where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt)
3364+
result = store.select('df',where=where)
3365+
tm.assert_frame_equal(expected, result)
3366+
3367+
# with iterator, full range
3368+
with ensure_clean_store(self.path) as store:
3369+
3370+
expected = tm.makeTimeDataFrame(100064, 'S')
3371+
_maybe_remove(store, 'df')
3372+
store.append('df',expected)
3373+
3374+
beg_dt = expected.index[0]
3375+
end_dt = expected.index[-1]
3376+
3377+
# select w/iterator and no where clause works
3378+
results = [ s for s in store.select('df',chunksize=chunksize) ]
3379+
result = concat(results)
3380+
tm.assert_frame_equal(expected, result)
3381+
3382+
# select w/iterator and where clause, single term, begin of range
3383+
where = "index >= '%s'" % beg_dt
3384+
results = [ s for s in store.select('df',where=where,chunksize=chunksize) ]
3385+
result = concat(results)
3386+
tm.assert_frame_equal(expected, result)
3387+
3388+
# select w/iterator and where clause, single term, end of range
3389+
where = "index <= '%s'" % end_dt
3390+
results = [ s for s in store.select('df',where=where,chunksize=chunksize) ]
3391+
result = concat(results)
3392+
tm.assert_frame_equal(expected, result)
3393+
3394+
# select w/iterator and where clause, inclusive range
3395+
where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt)
3396+
results = [ s for s in store.select('df',where=where,chunksize=chunksize) ]
3397+
result = concat(results)
3398+
tm.assert_frame_equal(expected, result)
3399+
3400+
def test_select_iterator_non_complete_8014(self):
3401+
3402+
# GH 8014
3403+
# using iterator and where clause
3404+
chunksize=1e4
3405+
3406+
# with iterator, non complete range
3407+
with ensure_clean_store(self.path) as store:
3408+
3409+
expected = tm.makeTimeDataFrame(100064, 'S')
3410+
_maybe_remove(store, 'df')
3411+
store.append('df',expected)
3412+
3413+
beg_dt = expected.index[1]
3414+
end_dt = expected.index[-2]
3415+
3416+
# select w/iterator and where clause, single term, begin of range
3417+
where = "index >= '%s'" % beg_dt
3418+
results = [ s for s in store.select('df',where=where,chunksize=chunksize) ]
3419+
result = concat(results)
3420+
rexpected = expected[expected.index >= beg_dt]
3421+
tm.assert_frame_equal(rexpected, result)
3422+
3423+
# select w/iterator and where clause, single term, end of range
3424+
where = "index <= '%s'" % end_dt
3425+
results = [ s for s in store.select('df',where=where,chunksize=chunksize) ]
3426+
result = concat(results)
3427+
rexpected = expected[expected.index <= end_dt]
3428+
tm.assert_frame_equal(rexpected, result)
3429+
3430+
# select w/iterator and where clause, inclusive range
3431+
where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt)
3432+
results = [ s for s in store.select('df',where=where,chunksize=chunksize) ]
3433+
result = concat(results)
3434+
rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)]
3435+
tm.assert_frame_equal(rexpected, result)
3436+
33383437
def test_retain_index_attributes(self):
33393438

33403439
# GH 3499, losing frequency info on index recreation

0 commit comments

Comments
 (0)