Skip to content

Commit b63fc2f

Browse files
committed
ENH: Initial support for reading Open Document Format ods spreadsheet (GH2311)
1 parent 7eb5668 commit b63fc2f

File tree

2 files changed

+239
-6
lines changed

2 files changed

+239
-6
lines changed

pandas/io/api.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
from pandas.io.parsers import read_csv, read_table, read_fwf
66
from pandas.io.clipboard import read_clipboard
77
from pandas.io.excel import ExcelFile, ExcelWriter, read_excel
8+
from pandas.io.ods import OdsFile, read_ods
89
from pandas.io.pytables import HDFStore, Term, get_store, read_hdf
910
from pandas.io.json import read_json
1011
from pandas.io.html import read_html

pandas/io/excel.py

Lines changed: 238 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from warnings import warn
2323
from distutils.version import LooseVersion
2424

25-
__all__ = ["read_excel", "ExcelWriter", "ExcelFile"]
25+
__all__ = ["read_excel", "ExcelWriter", "ExcelFile", "OdsFile"]
2626

2727
_writer_extensions = ["xlsx", "xls", "xlsm"]
2828
_writers = {}
@@ -67,11 +67,12 @@ def get_writer(engine_name):
6767

6868

6969
def read_excel(io, sheetname=0, **kwds):
70-
"""Read an Excel table into a pandas DataFrame
70+
"""Read an Excel/ods table into a pandas DataFrame
7171
7272
Parameters
7373
----------
74-
io : string, file-like object, or xlrd workbook.
74+
io : string, file-like object, or xlrd workbook for MS Excel files. For an
75+
ods file (Open Document Format), string or ezodf workbook is required.
7576
The string could be a URL. Valid URL schemes include http, ftp, s3,
7677
and file. For file URLs, a host is expected. For instance, a local
7778
file could be file://localhost/path/to/workbook.xlsx
@@ -104,7 +105,7 @@ def read_excel(io, sheetname=0, **kwds):
104105
converters : dict, default None
105106
Dict of functions for converting values in certain columns. Keys can
106107
either be integers or column labels, values are functions that take one
107-
input argument, the Excel cell content, and return the transformed
108+
input argument, the Excel/ods cell content, and return the transformed
108109
content.
109110
index_col : int, default None
110111
Column to use as the row labels of the DataFrame. Pass None if
@@ -124,10 +125,10 @@ def read_excel(io, sheetname=0, **kwds):
124125
Indicate number of NA values placed in non-numeric columns
125126
engine: string, default None
126127
If io is not a buffer or path, this must be set to identify io.
127-
Acceptable values are None or xlrd
128+
Acceptable values are None, xlrd, or ezodf
128129
convert_float : boolean, default True
129130
convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
130-
data will be read in as floats: Excel stores all numbers as floats
131+
data will be read in as floats: Excel/ods stores all numbers as floats
131132
internally
132133
has_index_names : boolean, default False
133134
True if the cols defined in index_col have an index name and are
@@ -148,9 +149,240 @@ def read_excel(io, sheetname=0, **kwds):
148149

149150
engine = kwds.pop('engine', None)
150151

152+
if engine == 'ezodf':
153+
return OdsFile(io).parse(sheetname=sheetname, **kwds)
154+
155+
# figure out if the file is an MS Excel or ODF ODS type
156+
# code is duplicated here: it is very similar to OdsFile.__init__. Is there a
157+
# better way?
158+
if isinstance(io, compat.string_types):
159+
if io[-4:] == '.ods':
160+
try:
161+
return OdsFile(io).parse(sheetname=sheetname, **kwds)
162+
except Exception as e:
163+
print('ods support requires ezodf, please install ezodf first')
164+
raise e
165+
elif io[-4:] in ['xls', 'xlsx', 'xlsm']:
166+
return ExcelFile(io, engine=engine).parse(sheetname=sheetname, **kwds)
167+
try:
168+
import ezodf
169+
if isinstance(io, ezodf.document.PackagedDocument):
170+
return OdsFile(io).parse(sheetname=sheetname, **kwds)
171+
except ImportError:
172+
pass
151173
return ExcelFile(io, engine=engine).parse(sheetname=sheetname, **kwds)
152174

153175

176+
class OdsFile(object):
    """
    Class for parsing tabular ods sheets into DataFrame objects.
    Uses ezodf. See OdsFile.parse for more documentation

    Parameters
    ----------
    io : string or ezodf workbook
        If a string, expected to be a path (or URL) to an ods file
    """

    def __init__(self, io, **kwds):
        import ezodf  # throw an ImportError if we need to
        # ezodf does not have a __version__ or similar attribute

        self.io = io

        if isinstance(io, compat.string_types):
            if _is_url(io):
                # NOTE(review): the downloaded payload is handed straight to
                # ezodf.opendoc; confirm the installed ezodf accepts in-memory
                # data here and not only a file name.
                data = _urlopen(io).read()
                self.book = ezodf.opendoc(data)
            else:
                self.book = ezodf.opendoc(io)
        elif isinstance(io, ezodf.document.PackagedDocument):
            # this is the corresponding ezodf instance of a workbook
            self.book = io
        else:
            raise ValueError('IO must be a path or ods workbook')

    def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
              index_col=None, parse_cols=None, parse_dates=False,
              date_parser=None, na_values=None, thousands=None, chunksize=None,
              convert_float=True, has_index_names=False, converters=None,
              **kwds):
        """Read an ods table into DataFrame

        Parameters
        ----------
        sheetname : string or integer
            Name of ods sheet or the page number of the sheet
        header : int, default 0
            Row to use for the column labels of the parsed DataFrame
        skiprows : list-like
            Rows to skip at the beginning (0-indexed)
        skip_footer : int, default 0
            Rows at the end to skip (0-indexed)
        converters : dict, default None
            Dict of functions for converting values in certain columns. Keys
            can either be integers or column labels
        index_col : int, default None
            Column to use as the row labels of the DataFrame. Pass None if
            there is no such column
        parse_cols : int or list, default None
            * If None then parse all columns
            * If int then indicates last column to be parsed
            * If list of ints then indicates list of column numbers to be
              parsed
            * If string then indicates comma separated list of column names
              and column ranges (e.g. "A:E" or "A,C,E:F")

            NOTE: the ods reader currently accepts this argument but does not
            apply it; all columns are parsed.
        parse_dates : boolean, default False
            Parse date ods values
        date_parser : function default None
            Date parsing function
        na_values : list-like, default None
            List of additional strings to recognize as NA/NaN
        thousands : str, default None
            Thousands separator
        chunksize : int, default None
            Size of file chunk to read for lazy evaluation.
        convert_float : boolean, default True
            convert integral floats to int (i.e., 1.0 --> 1). If False, all
            numeric data will be read in as floats: ods stores all numbers as
            floats internally.
        has_index_names : boolean, default False
            True if the cols defined in index_col have an index name and are
            not in the header

        Returns
        -------
        parsed : DataFrame
            DataFrame parsed from the ods file
        """
        # accept the alternative spelling ``skipfooter`` as an alias
        skipfooter = kwds.pop('skipfooter', None)
        if skipfooter is not None:
            skip_footer = skipfooter

        return self._parse_ods(sheetname=sheetname, header=header,
                               skiprows=skiprows,
                               index_col=index_col,
                               has_index_names=has_index_names,
                               parse_cols=parse_cols,
                               parse_dates=parse_dates,
                               date_parser=date_parser, na_values=na_values,
                               thousands=thousands, chunksize=chunksize,
                               skip_footer=skip_footer,
                               convert_float=convert_float,
                               converters=converters,
                               **kwds)

    def _print_cellinfo(self, cell):
        """Debugging aid: print the attributes of ``cell`` used by the parser."""
        print(' plaintext:', cell.plaintext())  # no formatting
        # formatted, but what is difference with value?
        print('display_form:', cell.display_form)  # format, ?=plaintext
        print(' value:', cell.value)  # data handled
        print(' value_type:', cell.value_type)  # data type
        print(' formula:', cell.formula)

    def _parse_duration(self, raw):
        """
        Convert a ``PT<h>H<m>M<s>S`` string into a datetime or timedelta.

        Up to 23 hours the result is a ``datetime.datetime`` (as produced by
        ``strptime``); above that a ``datetime.timedelta`` is returned,
        because such a value cannot be a time of day.
        """
        # TODO: should return a time object, not datetime?
        # TODO: now timedelta objects will be mixed with normal time
        hours_part, rest = raw[2:].split('H', 1)  # drop the leading 'PT'
        minutes_part, seconds_part = rest.split('M', 1)
        hours = int(hours_part)
        if hours > 23:
            return datetime.timedelta(hours=hours,
                                      minutes=int(minutes_part),
                                      seconds=int(seconds_part[:-1]))
        return datetime.datetime.strptime(raw, 'PT%HH%MM%SS')

    def _parse_datetime(self, cell):
        """
        Parse the date or time of ``cell`` to a datetime object.

        The cell formula (``of:=TIME(h;m;s)`` / ``of:=DATE(y;m;d)``) is tried
        first. When the cell has no formula, or the formula has another shape,
        the stored ``cell.value`` is used instead: time values go through
        ``_parse_duration``, date values are returned unchanged. Cells of any
        other value type yield None.
        """
        kind = cell.value_type
        if kind not in ('time', 'date'):
            return None

        formula = cell.formula

        if kind == 'time':
            if formula is not None:
                try:
                    return datetime.datetime.strptime(formula,
                                                      'of:=TIME(%H;%M;%S)')
                except ValueError:
                    # hours can be more than 23, or the formula is not a
                    # plain TIME() call: fall back to the stored value
                    pass
            if cell.value is None:
                return None
            # TODO: this does not cover all scenarios
            return self._parse_duration(cell.value)

        if formula is not None:
            try:
                return datetime.datetime.strptime(formula,
                                                  'of:=DATE(%Y;%m;%d)')
            except (ValueError, TypeError):
                # TODO: parsing other scenarios
                pass
        return cell.value

    def _cell_value(self, cell, convert_float=True):
        """
        Translate one ezodf cell into the Python value handed to TextParser.

        Floats are optionally narrowed to int when integral, strings and
        booleans are passed through, date/time cells go through
        ``_parse_datetime`` and everything else (including empty cells)
        becomes NaN.
        """
        raw = cell.value
        kind = cell.value_type

        if isinstance(raw, float):
            if convert_float:
                # GH5394 - Excel and ODS 'numbers' are always floats
                # it's a minimal perf hit and less surprising
                # FIXME: this goes wrong when int(raw) returns a long (>1e18)
                as_int = int(raw)
                if as_int == raw:
                    return as_int
            return raw
        if kind == 'string':
            return raw
        if kind == 'date' or kind == 'time':
            return self._parse_datetime(cell)
        if isinstance(raw, bool):
            return raw
        # empty cell (value_type None) or a type that is not handled yet
        return np.nan

    def _parse_ods(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
                   index_col=None, has_index_names=None, parse_cols=None,
                   parse_dates=False, date_parser=None, na_values=None,
                   thousands=None, chunksize=None, convert_float=True,
                   **kwds):
        """
        Read sheet ``sheetname`` cell by cell and feed the rows to TextParser.

        ``parse_cols`` is accepted for signature compatibility but is not
        applied. Extra keyword arguments (e.g. ``converters``) are forwarded
        to TextParser unchanged.
        """
        # sheetname can be index or string
        sheet = self.book.sheets[sheetname]

        data = [[self._cell_value(cell, convert_float)
                 for cell in sheet.row(i)]
                for i in range(sheet.nrows())]

        parser = TextParser(data, header=header, index_col=index_col,
                            has_index_names=has_index_names,
                            na_values=na_values,
                            thousands=thousands,
                            parse_dates=parse_dates,
                            date_parser=date_parser,
                            skiprows=skiprows,
                            skip_footer=skip_footer,
                            chunksize=chunksize,
                            **kwds)

        return parser.read()

    @property
    def sheet_names(self):
        """List with the names of all sheets in the workbook."""
        # book.sheets.names() is a generator
        return list(self.book.sheets.names())

    def close(self):
        """close io if necessary"""
        if hasattr(self.io, 'close'):
            self.io.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
384+
385+
154386
class ExcelFile(object):
155387
"""
156388
Class for parsing tabular excel sheets into DataFrame objects.

0 commit comments

Comments
 (0)