Skip to content

Commit 4cf2030

Browse files
committed
Merge pull request #4643 from jreback/stata_encode
BUG: (GH4626) Fix decoding based on a passed-in non-default encoding in pd.read_stata
2 parents 920512c + f58fb0c commit 4cf2030

File tree

5 files changed

+57
-26
lines changed

5 files changed

+57
-26
lines changed

doc/source/release.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,7 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
286286
the DateOffset from being cached (:issue:`4609`)
287287
- Fix boolean comparison with a DataFrame on the lhs, and a list/tuple on the rhs (:issue:`4576`)
288288
- Fix error/dtype conversion with setitem of ``None`` on ``Series/DataFrame`` (:issue:`4667`)
289+
- Fix decoding based on a passed-in non-default encoding in ``pd.read_stata`` (:issue:`4626`)
289290

290291
pandas 0.12
291292
===========

pandas/io/common.py

Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,32 @@ def _is_s3_url(url):
6666
return False
6767

6868

69+
def maybe_read_encoded_stream(reader, encoding=None):
70+
""" read an encoded stream from the reader and transform the bytes to unicode
71+
if required based on the encoding
72+
73+
Parameters
74+
----------
75+
reader : a streamable file-like object
76+
encoding : optional, the encoding to attempt to read
77+
78+
Returns
79+
-------
80+
a tuple of (a stream of decoded bytes, the encoding which was used)
81+
82+
"""
83+
84+
if compat.PY3 or encoding is not None: # pragma: no cover
85+
if encoding:
86+
errors = 'strict'
87+
else:
88+
errors = 'replace'
89+
encoding = 'utf-8'
90+
reader = StringIO(reader.read().decode(encoding, errors))
91+
else:
92+
encoding = None
93+
return reader, encoding
94+
6995
def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
7096
"""
7197
If the filepath_or_buffer is a url, translate and return the buffer
@@ -83,17 +109,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
83109

84110
if _is_url(filepath_or_buffer):
85111
req = _urlopen(str(filepath_or_buffer))
86-
if compat.PY3: # pragma: no cover
87-
if encoding:
88-
errors = 'strict'
89-
else:
90-
errors = 'replace'
91-
encoding = 'utf-8'
92-
out = StringIO(req.read().decode(encoding, errors))
93-
else:
94-
encoding = None
95-
out = req
96-
return out, encoding
112+
return maybe_read_encoded_stream(req,encoding)
97113

98114
if _is_s3_url(filepath_or_buffer):
99115
try:

pandas/io/stata.py

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
from pandas.compat import StringIO, long, lrange, lmap, lzip
2525
from pandas import isnull
2626
from pandas.io.parsers import _parser_params, Appender
27-
from pandas.io.common import get_filepath_or_buffer
27+
from pandas.io.common import get_filepath_or_buffer, maybe_read_encoded_stream
2828

2929

3030
_read_stata_doc = """
@@ -203,11 +203,10 @@ def __repr__(self):
203203

204204

205205
class StataParser(object):
206-
def __init__(self, encoding):
207-
if(encoding is None):
208-
self._encoding = 'cp1252'
209-
else:
210-
self._encoding = encoding
206+
_default_encoding = 'cp1252'
207+
208+
def __init__(self, encoding=None):
209+
self._encoding = encoding
211210

212211
#type code.
213212
#--------------------
@@ -256,7 +255,7 @@ def __init__(self, encoding):
256255
}
257256

258257
def _decode_bytes(self, str, errors=None):
259-
if compat.PY3:
258+
if compat.PY3 or self._encoding is not None:
260259
return str.decode(self._encoding, errors)
261260
else:
262261
return str
@@ -286,7 +285,8 @@ class StataReader(StataParser):
286285
Encoding used to parse the files. Note that Stata doesn't
287286
support unicode. None defaults to cp1252.
288287
"""
289-
def __init__(self, path_or_buf, encoding=None):
288+
289+
def __init__(self, path_or_buf, encoding='cp1252'):
290290
super(StataReader, self).__init__(encoding)
291291
self.col_sizes = ()
292292
self._has_string_data = False
@@ -295,8 +295,6 @@ def __init__(self, path_or_buf, encoding=None):
295295
self._value_labels_read = False
296296
if isinstance(path_or_buf, str):
297297
path_or_buf, encoding = get_filepath_or_buffer(path_or_buf, encoding='cp1252')
298-
if encoding is not None:
299-
self._encoding = encoding
300298

301299
if isinstance(path_or_buf, (str, compat.text_type, bytes)):
302300
self.path_or_buf = open(path_or_buf, 'rb')
@@ -403,13 +401,13 @@ def _unpack(self, fmt, byt):
403401
return d
404402

405403
def _null_terminate(self, s):
406-
if compat.PY3: # have bytes not strings, so must decode
404+
if compat.PY3 or self._encoding is not None: # have bytes not strings, so must decode
407405
null_byte = b"\0"
408406
try:
409407
s = s[:s.index(null_byte)]
410408
except:
411409
pass
412-
return s.decode(self._encoding)
410+
return s.decode(self._encoding or self._default_encoding)
413411
else:
414412
null_byte = "\0"
415413
try:
@@ -744,15 +742,15 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, encoding="
744742
if byteorder is None:
745743
byteorder = sys.byteorder
746744
self._byteorder = _set_endianness(byteorder)
747-
self._file = _open_file_binary_write(fname, self._encoding)
745+
self._file = _open_file_binary_write(fname, self._encoding or self._default_encoding)
748746
self.type_converters = {253: np.long, 252: int}
749747

750748
def _write(self, to_write):
751749
"""
752750
Helper to call encode before writing to file for Python 3 compat.
753751
"""
754752
if compat.PY3:
755-
self._file.write(to_write.encode(self._encoding))
753+
self._file.write(to_write.encode(self._encoding or self._default_encoding))
756754
else:
757755
self._file.write(to_write)
758756

3.42 KB
Binary file not shown.

pandas/io/tests/test_stata.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from pandas.io.stata import read_stata, StataReader
1414
import pandas.util.testing as tm
1515
from pandas.util.misc import is_little_endian
16-
16+
from pandas import compat
1717

1818
class StataTests(unittest.TestCase):
1919

@@ -32,6 +32,7 @@ def setUp(self):
3232
self.csv8 = os.path.join(self.dirpath, 'tbl19-3.csv')
3333
self.dta9 = os.path.join(self.dirpath, 'lbw.dta')
3434
self.csv9 = os.path.join(self.dirpath, 'lbw.csv')
35+
self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta')
3536

3637
def read_dta(self, file):
3738
return read_stata(file, convert_dates=True)
@@ -202,6 +203,21 @@ def test_stata_doc_examples(self):
202203
df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
203204
df.to_stata(path)
204205

206+
def test_encoding(self):
207+
208+
# GH 4626, proper encoding handling
209+
raw = read_stata(self.dta_encoding)
210+
encoded = read_stata(self.dta_encoding, encoding="latin-1")
211+
result = encoded.kreis1849[0]
212+
213+
if compat.PY3:
214+
expected = raw.kreis1849[0]
215+
self.assert_(result == expected)
216+
self.assert_(isinstance(result,compat.string_types))
217+
else:
218+
expected = raw.kreis1849.str.decode("latin-1")[0]
219+
self.assert_(result == expected)
220+
self.assert_(isinstance(result,unicode))
205221

206222
if __name__ == '__main__':
207223
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)