Skip to content

Commit 61c54bf

Browse files
committed
Move compression inference to io/parsers
1 parent 2557bd7 commit 61c54bf

File tree

2 files changed

+34
-38
lines changed

2 files changed

+34
-38
lines changed

pandas/io/common.py

Lines changed: 4 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -63,13 +63,6 @@ def urlopen(*args, **kwargs):
6363
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
6464
_VALID_URLS.discard('')
6565

66-
_compression_to_extension = {
67-
'gzip': '.gz',
68-
'bz2': '.bz2',
69-
'zip': '.zip',
70-
'xz': '.xz',
71-
}
72-
7366

7467
class ParserError(ValueError):
7568
"""
@@ -245,13 +238,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
245238
if _is_url(filepath_or_buffer):
246239
url = str(filepath_or_buffer)
247240
req = _urlopen(url)
248-
if compression == 'infer':
249-
for compression, extension in _compression_to_extension.items():
250-
if url.endswith(extension):
251-
break
252-
else:
253-
content_encoding = req.headers.get('Content-Encoding', None)
254-
compression = 'gzip' if content_encoding == 'gzip' else None
241+
content_encoding = req.headers.get('Content-Encoding', None)
242+
if content_encoding == 'gzip':
243+
# Override compression based on Content-Encoding header
244+
compression = 'gzip'
255245
reader, encoding = maybe_read_encoded_stream(req, encoding, compression)
256246
return reader, encoding, compression
257247

pandas/io/parsers.py

Lines changed: 30 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -348,38 +348,44 @@ def _validate_nrows(nrows):
348348
return nrows
349349

350350

351+
_compression_to_extension = {
352+
'gzip': '.gz',
353+
'bz2': '.bz2',
354+
'zip': '.zip',
355+
'xz': '.xz',
356+
}
357+
358+
def _infer_compression(filepath_or_buffer):
359+
"""
360+
Infer compression of a filepath or buffer. In case of buffer, compression
361+
is None. Otherwise, inference is performed using the extension of the
362+
filename or URL.
363+
"""
364+
if not isinstance(filepath_or_buffer, compat.string_types):
365+
return None
366+
filepath = str(filepath_or_buffer)
367+
for compression, extension in _compression_to_extension.items():
368+
if filepath.endswith(extension):
369+
return compression
370+
return None
371+
351372
def _read(filepath_or_buffer, kwds):
352-
"Generic reader of line files."
373+
"""Generic reader of line files."""
353374
encoding = kwds.get('encoding', None)
354375
if encoding is not None:
355376
encoding = re.sub('_', '-', encoding).lower()
356377
kwds['encoding'] = encoding
357378

358-
# If the input could be a filename, check for a recognizable compression
359-
# extension. If we're reading from a URL, the `get_filepath_or_buffer`
360-
# will use header info to determine compression, so use what it finds in
361-
# that case.
362-
inferred_compression = kwds.get('compression')
363-
if inferred_compression == 'infer':
364-
if isinstance(filepath_or_buffer, compat.string_types):
365-
if filepath_or_buffer.endswith('.gz'):
366-
inferred_compression = 'gzip'
367-
elif filepath_or_buffer.endswith('.bz2'):
368-
inferred_compression = 'bz2'
369-
elif filepath_or_buffer.endswith('.zip'):
370-
inferred_compression = 'zip'
371-
elif filepath_or_buffer.endswith('.xz'):
372-
inferred_compression = 'xz'
373-
else:
374-
inferred_compression = None
375-
else:
376-
inferred_compression = None
379+
compression = kwds.get('compression')
380+
if compression not in set(_compression_to_extension) | {None, 'infer'}:
381+
raise ValueError('"{}" is not a valid compression'.format(compression))
382+
383+
if compression == 'infer':
384+
compression = _infer_compression(filepath_or_buffer)
377385

378386
filepath_or_buffer, _, compression = get_filepath_or_buffer(
379-
filepath_or_buffer, encoding,
380-
compression=kwds.get('compression', None))
381-
kwds['compression'] = (inferred_compression if compression == 'infer'
382-
else compression)
387+
filepath_or_buffer, encoding, compression)
388+
kwds['compression'] = compression
383389

384390
if kwds.get('date_parser', None) is not None:
385391
if isinstance(kwds['parse_dates'], bool):

0 commit comments

Comments
 (0)