22
22
from warnings import warn
23
23
from distutils .version import LooseVersion
24
24
25
- __all__ = ["read_excel" , "ExcelWriter" , "ExcelFile" ]
25
+ __all__ = ["read_excel" , "ExcelWriter" , "ExcelFile" , "OdsFile" ]
26
26
27
27
_writer_extensions = ["xlsx" , "xls" , "xlsm" ]
28
28
_writers = {}
@@ -67,11 +67,12 @@ def get_writer(engine_name):
67
67
68
68
69
69
def read_excel (io , sheetname = 0 , ** kwds ):
70
- """Read an Excel table into a pandas DataFrame
70
+ """Read an Excel/ods table into a pandas DataFrame
71
71
72
72
Parameters
73
73
----------
74
- io : string, file-like object, or xlrd workbook.
74
+ io : string, file-like object, or xlrd workbook for MS Excel files. For an
75
+ ods file (Open Document Format), string or ezodf workbook is required.
75
76
The string could be a URL. Valid URL schemes include http, ftp, s3,
76
77
and file. For file URLs, a host is expected. For instance, a local
77
78
file could be file://localhost/path/to/workbook.xlsx
@@ -104,7 +105,7 @@ def read_excel(io, sheetname=0, **kwds):
104
105
converters : dict, default None
105
106
Dict of functions for converting values in certain columns. Keys can
106
107
either be integers or column labels, values are functions that take one
107
- input argument, the Excel cell content, and return the transformed
108
+ input argument, the Excel/ods cell content, and return the transformed
108
109
content.
109
110
index_col : int, default None
110
111
Column to use as the row labels of the DataFrame. Pass None if
@@ -124,10 +125,10 @@ def read_excel(io, sheetname=0, **kwds):
124
125
Indicate number of NA values placed in non-numeric columns
125
126
engine: string, default None
126
127
If io is not a buffer or path, this must be set to identify io.
127
- Acceptable values are None or xlrd
128
+ Acceptable values are None, xlrd, or ezodf
128
129
convert_float : boolean, default True
129
130
convert integral floats to int (i.e., 1.0 --> 1). If False, all numeric
130
- data will be read in as floats: Excel stores all numbers as floats
131
+ data will be read in as floats: Excel/ods stores all numbers as floats
131
132
internally
132
133
has_index_names : boolean, default False
133
134
True if the cols defined in index_col have an index name and are
@@ -148,9 +149,240 @@ def read_excel(io, sheetname=0, **kwds):
148
149
149
150
engine = kwds .pop ('engine' , None )
150
151
152
+ if engine == 'ezodf' :
153
+ return OdsFile (io ).parse (sheetname = sheetname , ** kwds )
154
+
155
+ # figure out if the file is an MS Excel or ODF ODS type
156
+ # code is doubled here: it is very similar to OdsFile.__init__. Is there a
157
+ # better way?
158
+ if isinstance (io , compat .string_types ):
159
+ if io [- 4 :] == '.ods' :
160
+ try :
161
+ return OdsFile (io ).parse (sheetname = sheetname , ** kwds )
162
+ except Exception as e :
163
+ print ('ods support requires ezodf, please install ezodf first' )
164
+ raise e
165
+ elif io [- 4 :] in ['xls' , 'xlsx' , 'xlsm' ]:
166
+ return ExcelFile (io , engine = engine ).parse (sheetname = sheetname , ** kwds )
167
+ try :
168
+ import ezodf
169
+ if isinstance (io , ezodf .document .PackagedDocument ):
170
+ return OdsFile (io ).parse (sheetname = sheetname , ** kwds )
171
+ except ImportError :
172
+ pass
151
173
return ExcelFile (io , engine = engine ).parse (sheetname = sheetname , ** kwds )
152
174
153
175
176
class OdsFile(object):
    """
    Class for parsing tabular ods sheets into DataFrame objects.
    Uses ezodf. See OdsFile.parse for more documentation

    Parameters
    ----------
    io : string or ezodf workbook
        If a string, expected to be a path (or URL) to an ods file
    """
    def __init__(self, io, **kwds):

        import ezodf  # throw an ImportError if we need to
        # ezodf does not have a __version__ or similar attribute

        self.io = io

        if isinstance(io, compat.string_types):
            if _is_url(io):
                # NOTE(review): assumes ezodf.opendoc accepts the raw bytes
                # of the document as well as a file name -- confirm.
                data = _urlopen(io).read()
                self.book = ezodf.opendoc(data)
            else:
                self.book = ezodf.opendoc(io)
        elif isinstance(io, ezodf.document.PackagedDocument):
            # already an opened ezodf workbook
            self.book = io
        else:
            raise ValueError('IO must be a path or ods workbook')

    def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
              index_col=None, parse_cols=None, parse_dates=False,
              date_parser=None, na_values=None, thousands=None,
              chunksize=None, convert_float=True, has_index_names=False,
              converters=None, **kwds):
        """Read an ods table into DataFrame

        Parameters
        ----------
        sheetname : string or integer
            Name of ods sheet or the page number of the sheet
        header : int, default 0
            Row to use for the column labels of the parsed DataFrame
        skiprows : list-like
            Rows to skip at the beginning (0-indexed)
        skip_footer : int, default 0
            Rows at the end to skip (0-indexed). The keyword ``skipfooter``
            is accepted as an alias and takes precedence when given.
        converters : dict, default None
            Dict of functions for converting values in certain columns. Keys
            can either be integers or column labels
        index_col : int, default None
            Column to use as the row labels of the DataFrame. Pass None if
            there is no such column
        parse_cols : int or list, default None
            Accepted for signature compatibility with the Excel reader; the
            ods reader currently reads every column and does not apply it.
        parse_dates : boolean, default False
            Parse date ods values,
        date_parser : function default None
            Date parsing function
        na_values : list-like, default None
            List of additional strings to recognize as NA/NaN
        thousands : str, default None
            Thousands separator
        chunksize : int, default None
            Size of file chunk to read for lazy evaluation.
        convert_float : boolean, default True
            convert integral floats to int (i.e., 1.0 --> 1). If False, all
            numeric data will be read in as floats: ods stores all numbers as
            floats internally.
        has_index_names : boolean, default False
            True if the cols defined in index_col have an index name and are
            not in the header

        Returns
        -------
        parsed : DataFrame
            DataFrame parsed from the ods file
        """
        skipfooter = kwds.pop('skipfooter', None)
        if skipfooter is not None:
            skip_footer = skipfooter

        return self._parse_ods(sheetname=sheetname, header=header,
                               skiprows=skiprows,
                               index_col=index_col,
                               has_index_names=has_index_names,
                               parse_cols=parse_cols,
                               parse_dates=parse_dates,
                               date_parser=date_parser, na_values=na_values,
                               thousands=thousands, chunksize=chunksize,
                               skip_footer=skip_footer,
                               convert_float=convert_float,
                               converters=converters,
                               **kwds)

    def _print_cellinfo(self, cell):
        """Debugging aid: print the attributes of an ezodf cell."""
        print(' plaintext:', cell.plaintext())  # no formatting
        # formatted, but what is difference with value?
        print('display_form:', cell.display_form)  # format, ?=plaintext
        print(' value:', cell.value)  # data handled
        print(' value_type:', cell.value_type)  # data type
        print(' formula:', cell.formula)

    @staticmethod
    def _split_duration(text):
        """
        Split a ``PT<h>H<m>M<s>S`` duration string into its components.

        Returns a ``(hours, minutes, seconds, microseconds)`` tuple of ints,
        or None when ``text`` is not a string of that form. Missing fields
        count as zero and the seconds field may carry a fractional part
        written with either ``.`` or ``,``.
        """
        try:
            if not text.startswith('PT'):
                return None
            rest = text[2:]
            fields = []
            for unit in 'HMS':
                head, sep, rest = rest.partition(unit)
                if not sep:
                    # unit absent: partition put everything into ``head``
                    head, rest = '0', head
                fields.append(head)
            if rest:
                # trailing characters we do not understand
                return None
            hours = int(fields[0])
            minutes = int(fields[1])
            whole, _, frac = fields[2].replace(',', '.').partition('.')
            seconds = int(whole)
            # right-pad / truncate the fraction to exactly six digits
            micros = int((frac + '000000')[:6])
        except (AttributeError, TypeError, ValueError):
            return None
        return hours, minutes, seconds, micros

    def _parse_time(self, cell):
        """
        Convert a cell of value type 'time' to a datetime or timedelta.

        A ``of:=TIME(h;m;s)`` formula is preferred when present; otherwise
        the duration string in ``cell.value`` is used, so that cells without
        a formula are no longer dropped. Durations of 24 hours or more become
        a ``datetime.timedelta``; shorter ones become a ``datetime.datetime``
        on 1900-01-01 (the date strptime assigns when none is given). The raw
        ``cell.value`` is returned when nothing can be parsed.
        """
        formula = cell.formula
        if formula is not None:
            try:
                return datetime.datetime.strptime(formula,
                                                  'of:=TIME(%H;%M;%S)')
            except (ValueError, TypeError):
                # not a plain TIME() formula, or hours > 23
                pass

        parts = self._split_duration(cell.value)
        if parts is None:
            return cell.value
        hours, minutes, seconds, micros = parts
        if hours > 23:
            # TODO: now timedelta objects will be mixed with normal time
            return datetime.timedelta(hours=hours, minutes=minutes,
                                      seconds=seconds, microseconds=micros)
        try:
            # TODO: should return a time object, not datetime?
            return datetime.datetime(1900, 1, 1, hours, minutes, seconds,
                                     micros)
        except ValueError:
            # out-of-range field (e.g. 75 minutes): hand back the raw value
            return cell.value

    def _parse_date(self, cell):
        """
        Convert a cell of value type 'date' to a datetime.

        A ``of:=DATE(y;m;d)`` formula is preferred when present; otherwise
        ``cell.value`` is parsed as ``YYYY-MM-DD`` optionally followed by
        ``THH:MM:SS[.ffffff]``. The raw ``cell.value`` is returned when none
        of these match.
        """
        formula = cell.formula
        if formula is not None:
            try:
                return datetime.datetime.strptime(formula,
                                                  'of:=DATE(%Y;%m;%d)')
            except (ValueError, TypeError):
                pass

        raw = cell.value
        for fmt in ('%Y-%m-%d', '%Y-%m-%dT%H:%M:%S',
                    '%Y-%m-%dT%H:%M:%S.%f'):
            try:
                return datetime.datetime.strptime(raw, fmt)
            except (ValueError, TypeError):
                continue
        # TODO: parsing other scenarios
        return raw

    def _parse_datetime(self, cell):
        """
        Parse the date or time to a datetime object

        Returns None for cells whose value type is neither 'date' nor
        'time'; see _parse_date and _parse_time for the conversions.
        """
        typ = cell.value_type
        if typ == 'time':
            return self._parse_time(cell)
        if typ == 'date':
            return self._parse_date(cell)
        return None

    def _convert_cell(self, cell, convert_float):
        """
        Map a single ezodf cell to the Python value handed to the parser.

        Floats are returned as-is, or as int when ``convert_float`` is set
        and the value is integral. 'string' and 'boolean' cells return
        ``cell.value``, 'date'/'time' cells go through _parse_datetime, and
        everything else (including empty cells) becomes NaN.
        """
        value = cell.value
        if isinstance(value, float):
            # GH5394 - Excel and ODS 'numbers' are always floats
            # it's a minimal perf hit and less surprising.
            # is_integer() is False for inf/nan, which int() would choke on.
            if convert_float and value.is_integer():
                return int(value)
            return value

        typ = cell.value_type
        if typ == 'string' or typ == 'boolean':
            return value
        if typ == 'date' or typ == 'time':
            return self._parse_datetime(cell)
        # empty cell (value_type None) or a type we do not handle
        return np.nan

    def _parse_ods(self, sheetname=0, header=0, skiprows=None, skip_footer=0,
                   index_col=None, has_index_names=None, parse_cols=None,
                   parse_dates=False, date_parser=None, na_values=None,
                   thousands=None, chunksize=None, convert_float=True,
                   **kwds):
        """
        Read one sheet cell by cell and feed the rows to TextParser.

        ``parse_cols`` is accepted but not applied. Any extra keywords (such
        as ``converters``) are forwarded to TextParser unchanged.
        """
        # sheetname can be index or string
        sheet = self.book.sheets[sheetname]

        data = [[self._convert_cell(cell, convert_float)
                 for cell in sheet.row(i)]
                for i in range(sheet.nrows())]

        parser = TextParser(data, header=header, index_col=index_col,
                            has_index_names=has_index_names,
                            na_values=na_values,
                            thousands=thousands,
                            parse_dates=parse_dates,
                            date_parser=date_parser,
                            skiprows=skiprows,
                            skip_footer=skip_footer,
                            chunksize=chunksize,
                            **kwds)

        return parser.read()

    @property
    def sheet_names(self):
        """List of the sheet names in the workbook."""
        # book.sheets.names() is a generator
        return list(self.book.sheets.names())

    def close(self):
        """close io if necessary"""
        if hasattr(self.io, 'close'):
            self.io.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
384
+
385
+
154
386
class ExcelFile (object ):
155
387
"""
156
388
Class for parsing tabular excel sheets into DataFrame objects.
0 commit comments