Merge pull request #4643 from jreback/stata_encode

jreback · jreback · commit 4cf2030bd992 · 2013-08-26T08:21:55.000-07:00
BUG: (GH4626) Fix decoding based on a passed in non-default encoding in pd.read_stata
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -286,6 +286,7 @@ See :ref:`Internal Refactoring<whatsnew_0130.refactoring>`
     the DateOffset from being cached (:issue:`4609`)
   - Fix boolean comparison with a DataFrame on the lhs, and a list/tuple on the rhs (:issue:`4576`)
   - Fix error/dtype conversion with setitem of ``None`` on ``Series/DataFrame`` (:issue:`4667`)
+  - Fix decoding in ``pd.read_stata`` when a non-default encoding is passed in (:issue:`4626`)
 
 pandas 0.12
 ===========
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -66,6 +66,32 @@ def _is_s3_url(url):
         return False
 
 
+def maybe_read_encoded_stream(reader, encoding=None):
+    """ When ``compat.PY3`` is set or ``encoding`` is not None, read all of
+        ``reader``, decode it and wrap the resulting text in a StringIO.
+
+        Parameters
+        ----------
+        reader : file-like object whose ``read()`` result has ``decode``
+        encoding : optional codec name; if falsy, utf-8 is used when decoding
+
+        Returns
+        -------
+        a tuple of (StringIO over the decoded text, the encoding used), or
+        (the reader exactly as passed in, None) when nothing was decoded
+        """
+
+    if compat.PY3 or encoding is not None:  # pragma: no cover
+        if encoding:
+            errors = 'strict'  # a codec was named: undecodable bytes raise
+        else:
+            errors = 'replace'  # no codec given: substitute undecodable bytes
+            encoding = 'utf-8'
+        reader = StringIO(reader.read().decode(encoding, errors))
+    else:
+        encoding = None
+    return reader, encoding
+
 def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
     """
     If the filepath_or_buffer is a url, translate and return the buffer
@@ -83,17 +109,7 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None):
 
     if _is_url(filepath_or_buffer):
         req = _urlopen(str(filepath_or_buffer))
-        if compat.PY3:  # pragma: no cover
-            if encoding:
-                errors = 'strict'
-            else:
-                errors = 'replace'
-                encoding = 'utf-8'
-            out = StringIO(req.read().decode(encoding, errors))
-        else:
-            encoding = None
-            out = req
-        return out, encoding
+        return maybe_read_encoded_stream(req,encoding)
 
     if _is_s3_url(filepath_or_buffer):
         try:
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
@@ -24,7 +24,7 @@
 from pandas.compat import StringIO, long, lrange, lmap, lzip
 from pandas import isnull
 from pandas.io.parsers import _parser_params, Appender
-from pandas.io.common import get_filepath_or_buffer
+from pandas.io.common import get_filepath_or_buffer, maybe_read_encoded_stream
 
 
 _read_stata_doc = """
@@ -203,11 +203,10 @@ def __repr__(self):
 
 
 class StataParser(object):
-    def __init__(self, encoding):
-        if(encoding is None):
-            self._encoding = 'cp1252'
-        else:
-            self._encoding = encoding
+    _default_encoding = 'cp1252'
+
+    def __init__(self, encoding=None):
+        self._encoding = encoding
 
         #type          code.
         #--------------------
@@ -256,7 +255,7 @@ def __init__(self, encoding):
             }
 
     def _decode_bytes(self, str, errors=None):
-        if compat.PY3:
+        if compat.PY3 or self._encoding is not None:  # an explicit encoding forces decoding even without PY3
             return str.decode(self._encoding, errors)
         else:
             return str
@@ -286,7 +285,8 @@ class StataReader(StataParser):
         Encoding used to parse the files. Note that Stata doesn't
         support unicode. None defaults to cp1252.
     """
-    def __init__(self, path_or_buf, encoding=None):
+
+    def __init__(self, path_or_buf, encoding='cp1252'):
         super(StataReader, self).__init__(encoding)
         self.col_sizes = ()
         self._has_string_data = False
@@ -295,8 +295,6 @@ def __init__(self, path_or_buf, encoding=None):
         self._value_labels_read = False
         if isinstance(path_or_buf, str):
             path_or_buf, encoding = get_filepath_or_buffer(path_or_buf, encoding='cp1252')
-            if encoding is not None:
-                self._encoding = encoding
 
         if isinstance(path_or_buf, (str, compat.text_type, bytes)):
             self.path_or_buf = open(path_or_buf, 'rb')
@@ -403,13 +401,13 @@ def _unpack(self, fmt, byt):
         return d
 
     def _null_terminate(self, s):
-        if compat.PY3:  # have bytes not strings, so must decode
+        if compat.PY3 or self._encoding is not None:  # have bytes not strings, so must decode
             null_byte = b"\0"
             try:
                 s = s[:s.index(null_byte)]
             except:
                 pass
-            return s.decode(self._encoding)
+            return s.decode(self._encoding or self._default_encoding)
         else:
             null_byte = "\0"
             try:
@@ -744,15 +742,15 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, encoding="
         if byteorder is None:
             byteorder = sys.byteorder
         self._byteorder = _set_endianness(byteorder)
-        self._file = _open_file_binary_write(fname, self._encoding)
+        self._file = _open_file_binary_write(fname, self._encoding or self._default_encoding)
         self.type_converters = {253: np.long, 252: int}
 
     def _write(self, to_write):
         """
         Helper to call encode before writing to file for Python 3 compat.
         """
         if compat.PY3:
-            self._file.write(to_write.encode(self._encoding))
+            self._file.write(to_write.encode(self._encoding or self._default_encoding))  # falsy _encoding -> _default_encoding
         else:
             self._file.write(to_write)
 
diff --git a/pandas/io/tests/data/stata1_encoding.dta b/pandas/io/tests/data/stata1_encoding.dta
diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py
@@ -13,7 +13,7 @@
 from pandas.io.stata import read_stata, StataReader
 import pandas.util.testing as tm
 from pandas.util.misc import is_little_endian
-
+from pandas import compat
 
 class StataTests(unittest.TestCase):
 
@@ -32,6 +32,7 @@ def setUp(self):
         self.csv8 = os.path.join(self.dirpath, 'tbl19-3.csv')
         self.dta9 = os.path.join(self.dirpath, 'lbw.dta')
         self.csv9 = os.path.join(self.dirpath, 'lbw.csv')
+        self.dta_encoding = os.path.join(self.dirpath, 'stata1_encoding.dta')
 
     def read_dta(self, file):
         return read_stata(file, convert_dates=True)
@@ -202,6 +203,21 @@ def test_stata_doc_examples(self):
             df = DataFrame(np.random.randn(10, 2), columns=list('AB'))
             df.to_stata(path)
 
+    def test_encoding(self):
+        # GH 4626: read_stata must honour a non-default ``encoding`` argument;
+        # the same file is read once with the default and once with latin-1
+        raw = read_stata(self.dta_encoding)
+        encoded = read_stata(self.dta_encoding, encoding="latin-1")
+        result = encoded.kreis1849[0]
+
+        if compat.PY3:
+            expected = raw.kreis1849[0]  # default read is compared as-is
+            self.assert_(result == expected)
+            self.assert_(isinstance(result,compat.string_types))
+        else:
+            expected = raw.kreis1849.str.decode("latin-1")[0]  # decode the default read by hand
+            self.assert_(result == expected)
+            self.assert_(isinstance(result,unicode))
 
 if __name__ == '__main__':
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],