Skip to content

Commit d0a8b79

Browse files
author
Brendan Boerner
committed
Merge branch 'hdf_iterator' of https://github.com/jreback/pandas into hdf_iterator
* 'hdf_iterator' of https://github.com/jreback/pandas: BUG: fix HDFStore iterator to handle a where properly (GH8014)
2 parents 7f7e1b5 + d33b537 commit d0a8b79

File tree

2 files changed

+101
-157
lines changed

2 files changed

+101
-157
lines changed

pandas/io/pytables.py

Lines changed: 61 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -662,28 +662,18 @@ def select(self, key, where=None, start=None, stop=None, columns=None,
662662
s = self._create_storer(group)
663663
s.infer_axes()
664664

665-
def func(_start, _stop):
665+
# function to call on iteration
666+
def func(_start, _stop, _where):
666667
return s.read(start=_start, stop=_stop,
667-
where=where,
668+
where=_where,
668669
columns=columns, **kwargs)
669670

670-
if iterator or chunksize is not None:
671-
if not s.is_table:
672-
raise TypeError(
673-
"can only use an iterator or chunksize on a table")
671+
# create the iterator
672+
it = TableIterator(self, s, func, where=where, nrows=s.nrows, start=start,
673+
stop=stop, iterator=iterator, chunksize=chunksize,
674+
auto_close=auto_close)
674675

675-
# read the coordinates & iterate
676-
if where is not None:
677-
c = s.read_coordinates(where=where, **kwargs)
678-
def func(_start, _stop):
679-
return s.read(where=c[_start:_stop], columns=columns, **kwargs)
680-
681-
return TableIterator(self, func, nrows=s.nrows, start=start,
682-
stop=stop, chunksize=chunksize,
683-
auto_close=auto_close)
684-
685-
return TableIterator(self, func, nrows=s.nrows, start=start, stop=stop,
686-
auto_close=auto_close).get_values()
676+
return it.get_result()
687677

688678
def select_as_coordinates(
689679
self, key, where=None, start=None, stop=None, **kwargs):
@@ -786,34 +776,22 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None,
786776
# axis is the concatenation axis
787777
axis = list(set([t.non_index_axes[0][0] for t in tbls]))[0]
788778

789-
# for a not-none where, select the coordinates and chunk on those
790-
if where is not None:
791-
c = s.read_coordinates(where=where, **kwargs)
779+
def func(_start, _stop, _where):
792780

793-
def func(_start, _stop):
794-
objs = [t.read(where=c[_start:_stop], columns=columns, **kwargs) for t in tbls]
781+
# retrieve the objs, _where is always passed as a set of coordinates here
782+
objs = [t.read(where=_where, columns=columns, **kwargs) for t in tbls]
795783

796-
# concat and return
797-
return concat(objs, axis=axis,
798-
verify_integrity=False).consolidate()
784+
# concat and return
785+
return concat(objs, axis=axis,
786+
verify_integrity=False).consolidate()
799787

800-
else:
801-
802-
def func(_start, _stop):
803-
objs = [t.read(start=_start, stop=_stop,
804-
columns=columns, **kwargs) for t in tbls]
788+
# create the iterator
789+
it = TableIterator(self, s, func, where=where, nrows=nrows, start=start,
790+
stop=stop, iterator=iterator, chunksize=chunksize,
791+
auto_close=auto_close)
805792

806-
# concat and return
807-
return concat(objs, axis=axis,
808-
verify_integrity=False).consolidate()
809-
810-
if iterator or chunksize is not None:
811-
return TableIterator(self, func, nrows=nrows, start=start,
812-
stop=stop, chunksize=chunksize,
813-
auto_close=auto_close)
793+
return it.get_result(coordinates=True)
814794

815-
return TableIterator(self, func, nrows=nrows, start=start, stop=stop,
816-
auto_close=auto_close).get_values()
817795

818796
def put(self, key, value, format=None, append=False, **kwargs):
819797
"""
@@ -1308,42 +1286,54 @@ class TableIterator(object):
13081286
----------
13091287
13101288
store : the reference store
1311-
func : the function to get results
1289+
s : the referred storer
1290+
func : the function to execute the query
1291+
where : the where of the query
13121292
nrows : the rows to iterate on
13131293
start : the passed start value (default is None)
13141294
stop : the passed stop value (default is None)
1315-
chunksize : the passed chunking valeu (default is 50000)
1295+
iterator : boolean, whether to use the default iterator
1296+
chunksize : the passed chunking value (default is 100000 when iterating)
13161297
auto_close : boolean, automatically close the store at the end of
13171298
iteration, default is False
13181299
kwargs : the passed kwargs
13191300
"""
13201301

1321-
def __init__(self, store, func, nrows, start=None, stop=None,
1322-
chunksize=None, auto_close=False):
1302+
def __init__(self, store, s, func, where, nrows, start=None, stop=None,
1303+
iterator=False, chunksize=None, auto_close=False):
13231304
self.store = store
1324-
self.func = func
1305+
self.s = s
1306+
self.func = func
1307+
self.where = where
13251308
self.nrows = nrows or 0
13261309
self.start = start or 0
13271310

13281311
if stop is None:
13291312
stop = self.nrows
13301313
self.stop = min(self.nrows, stop)
13311314

1332-
if chunksize is None:
1333-
chunksize = 100000
1315+
self.coordinates = None
1316+
if iterator or chunksize is not None:
1317+
if chunksize is None:
1318+
chunksize = 100000
1319+
self.chunksize = int(chunksize)
1320+
else:
1321+
self.chunksize = None
13341322

1335-
self.chunksize = int(chunksize)
13361323
self.auto_close = auto_close
13371324

13381325
def __iter__(self):
1326+
1327+
# iterate
13391328
current = self.start
13401329
while current < self.stop:
13411330

13421331
stop = min(current + self.chunksize, self.stop)
1343-
value = self.func(current, stop)
1332+
value = self.func(None, None, self.coordinates[current:stop])
13441333
if value is None:
13451334
continue
1346-
current = current + min(self.chunksize,len(value))
1335+
1336+
current = current + self.chunksize
13471337

13481338
yield value
13491339

@@ -1353,12 +1343,29 @@ def close(self):
13531343
if self.auto_close:
13541344
self.store.close()
13551345

1356-
def get_values(self):
1357-
results = self.func(self.start, self.stop)
1346+
def get_result(self, coordinates=False):
1347+
1348+
# return the actual iterator
1349+
if self.chunksize is not None:
1350+
if not self.s.is_table:
1351+
raise TypeError(
1352+
"can only use an iterator or chunksize on a table")
1353+
1354+
self.coordinates = self.s.read_coordinates(where=self.where)
1355+
1356+
return self
1357+
1358+
# if specified read via coordinates (necessary for multiple selections)
1359+
if coordinates:
1360+
where = self.s.read_coordinates(where=self.where)
1361+
else:
1362+
where = self.where
1363+
1364+
# directly return the result
1365+
results = self.func(self.start, self.stop, where)
13581366
self.close()
13591367
return results
13601368

1361-
13621369
class IndexCol(StringMixin):
13631370

13641371
""" an index column description class

0 commit comments

Comments
 (0)