Skip to content

BUG: (GH4626) Fix decoding based on a passed in non-default encoding in pd.read_stata #4643

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 26, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,7 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
the DateOffset from being cached (:issue:`4609`)
- Fix boolean comparison with a DataFrame on the lhs, and a list/tuple on the rhs (:issue:`4576`)
- Fix error/dtype conversion with setitem of ``None`` on ``Series/DataFrame`` (:issue:`4667`)
- Fix decoding based on a passed in non-default encoding in ``pd.read_stata`` (:issue:`4626`)

pandas 0.12
===========
Expand Down
38 changes: 27 additions & 11 deletions pandas/io/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,32 @@ def _is_s3_url(url):
return False


def maybe_read_encoded_stream(reader, encoding=None):
    """
    Decode the contents of ``reader`` into a text stream when required.

    Decoding happens on Python 3, or on any Python when an ``encoding`` is
    passed explicitly. Otherwise ``reader`` is handed back untouched.

    Parameters
    ----------
    reader : a streamable file-like object
        Must provide ``read()`` returning an object with a ``decode`` method.
    encoding : optional, the encoding to attempt to read
        When given, bytes are decoded with ``errors='strict'``. When omitted
        (and decoding is still required), ``'utf-8'`` is used with
        ``errors='replace'``.

    Returns
    -------
    a tuple of (stream, encoding)
        ``stream`` is a ``StringIO`` over the decoded text, or the original
        ``reader`` when nothing was decoded; ``encoding`` is the encoding that
        was applied, or ``None`` when nothing was decoded.
    """
    must_decode = compat.PY3 or encoding is not None
    if must_decode:  # pragma: no cover
        # An explicit encoding is decoded strictly; with none given, fall
        # back to utf-8 and replace undecodable bytes instead of raising.
        error_policy = 'strict' if encoding else 'replace'
        encoding = encoding or 'utf-8'
        payload = reader.read()
        return StringIO(payload.decode(encoding, error_policy)), encoding

    # No decoding was performed, so no encoding is reported.
    return reader, None


def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
"""
If the filepath_or_buffer is a url, translate and return the buffer
Expand All @@ -83,17 +109,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):

if _is_url(filepath_or_buffer):
req = _urlopen(str(filepath_or_buffer))
if compat.PY3: # pragma: no cover
if encoding:
errors = 'strict'
else:
errors = 'replace'
encoding = 'utf-8'
out = StringIO(req.read().decode(encoding, errors))
else:
encoding = None
out = req
return out, encoding
return maybe_read_encoded_stream(req,encoding)

if _is_s3_url(filepath_or_buffer):
try:
Expand Down
26 changes: 12 additions & 14 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
from pandas.compat import StringIO, long, lrange, lmap, lzip
from pandas import isnull
from pandas.io.parsers import _parser_params, Appender
from pandas.io.common import get_filepath_or_buffer
from pandas.io.common import get_filepath_or_buffer, maybe_read_encoded_stream


_read_stata_doc = """
Expand Down Expand Up @@ -203,11 +203,10 @@ def __repr__(self):


class StataParser(object):
def __init__(self, encoding):
if(encoding is None):
self._encoding = 'cp1252'
else:
self._encoding = encoding
_default_encoding = 'cp1252'

def __init__(self, encoding=None):
self._encoding = encoding

#type code.
#--------------------
Expand Down Expand Up @@ -256,7 +255,7 @@ def __init__(self, encoding):
}

def _decode_bytes(self, str, errors=None):
    """
    Decode ``str`` with the parser's encoding when decoding is required.

    Decoding is performed on Python 3, or whenever an encoding was set on
    the parser; otherwise ``str`` is returned unchanged.

    Parameters
    ----------
    str : the raw byte string to decode
    errors : optional, the codec error handler passed on to ``decode``;
        when None the codec's own default handler is used

    Returns
    -------
    the decoded text, or ``str`` itself when no decoding takes place
    """
    if compat.PY3 or self._encoding is not None:
        # self._encoding may be None on Python 3; fall back to the class
        # default the same way _null_terminate and _write do.
        encoding = self._encoding or self._default_encoding
        if errors is None:
            # ``decode`` rejects None as an error handler, so omit it.
            return str.decode(encoding)
        return str.decode(encoding, errors)
    else:
        return str
Expand Down Expand Up @@ -286,7 +285,8 @@ class StataReader(StataParser):
Encoding used to parse the files. Note that Stata doesn't
support unicode. None defaults to cp1252.
"""
def __init__(self, path_or_buf, encoding=None):

def __init__(self, path_or_buf, encoding='cp1252'):
super(StataReader, self).__init__(encoding)
self.col_sizes = ()
self._has_string_data = False
Expand All @@ -295,8 +295,6 @@ def __init__(self, path_or_buf, encoding=None):
self._value_labels_read = False
if isinstance(path_or_buf, str):
path_or_buf, encoding = get_filepath_or_buffer(path_or_buf, encoding='cp1252')
if encoding is not None:
self._encoding = encoding

if isinstance(path_or_buf, (str, compat.text_type, bytes)):
self.path_or_buf = open(path_or_buf, 'rb')
Expand Down Expand Up @@ -403,13 +401,13 @@ def _unpack(self, fmt, byt):
return d

def _null_terminate(self, s):
if compat.PY3: # have bytes not strings, so must decode
if compat.PY3 or self._encoding is not None: # have bytes not strings, so must decode
null_byte = b"\0"
try:
s = s[:s.index(null_byte)]
except:
pass
return s.decode(self._encoding)
return s.decode(self._encoding or self._default_encoding)
else:
null_byte = "\0"
try:
Expand Down Expand Up @@ -744,15 +742,15 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, encoding="
if byteorder is None:
byteorder = sys.byteorder
self._byteorder = _set_endianness(byteorder)
self._file = _open_file_binary_write(fname, self._encoding)
self._file = _open_file_binary_write(fname, self._encoding or self._default_encoding)
self.type_converters = {253: np.long, 252: int}

def _write(self, to_write):
    """
    Helper to call encode before writing to file for Python 3 compat.

    On Python 3 ``to_write`` is encoded with the configured encoding
    (falling back to the class default when none is set) and written
    exactly once; on Python 2 it is written as-is.
    """
    if compat.PY3:
        encoding = self._encoding or self._default_encoding
        self._file.write(to_write.encode(encoding))
    else:
        self._file.write(to_write)

Expand Down
Binary file added pandas/io/tests/data/stata1_encoding.dta
Binary file not shown.
18 changes: 17 additions & 1 deletion pandas/io/tests/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from pandas.io.stata import read_stata, StataReader
import pandas.util.testing as tm
from pandas.util.misc import is_little_endian

from pandas import compat

class StataTests(unittest.TestCase):

Expand All @@ -32,6 +32,7 @@ def setUp(self):
self.csv8 = os.path.join(self.dirpath, 'tbl19-3.csv')
self.dta9 = os.path.join(self.dirpath, 'lbw.dta')
self.csv9 = os.path.join(self.dirpath, 'lbw.csv')
self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta')

def read_dta(self, file):
    """Return the result of ``read_stata`` on ``file`` with ``convert_dates=True``."""
    parsed = read_stata(file, convert_dates=True)
    return parsed
Expand Down Expand Up @@ -202,6 +203,21 @@ def test_stata_doc_examples(self):
df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
df.to_stata(path)

def test_encoding(self):
    """Reading with an explicit encoding must yield decoded text (GH 4626)."""

    # GH 4626, proper encoding handling
    raw = read_stata(self.dta_encoding)
    encoded = read_stata(self.dta_encoding, encoding="latin-1")
    result = encoded.kreis1849[0]

    if compat.PY3:
        # The value read without an explicit encoding is compared directly.
        expected = raw.kreis1849[0]
        text_type = compat.string_types
    else:
        # The value read without an explicit encoding is decoded by hand
        # before comparing.
        expected = raw.kreis1849.str.decode("latin-1")[0]
        text_type = unicode

    # assertEqual/assertTrue replace the deprecated ``assert_`` alias.
    self.assertEqual(result, expected)
    self.assertTrue(isinstance(result, text_type))


if __name__ == '__main__':
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down