BUG: unicode characters when reading JSON lines

rouzazari · jreback · commit 7d5c354953aa · 2017-01-19T08:53:23.000-05:00
Fixes a UnicodeDecodeError raised when reading JSON lines input with the ASCII decoder, which is often the default setting in Python 2.7. The fix avoids issues caused by mixing unicode and ASCII strings. Closes #15132. Author: Rouz Azari <rouz.azari@gmail.com>. Closes #15149 from rouzazari/GH_15132_json_lines_with_unicode_chars_py2 and squashes the following commits: e117889 [Rouz Azari] BUG: unicode characters when reading JSON lines
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -436,3 +436,4 @@ Bug Fixes
 - Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`)
 
 - Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`)
+- Bug in ``.read_json()`` for Python 2 where ``lines=True`` and contents contain non-ascii unicode characters (:issue:`15132`)
diff --git a/pandas/io/json.py b/pandas/io/json.py
@@ -274,7 +274,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True,
         # If given a json lines file, we break the string into lines, add
         # commas and put it in a json list to make a valid json object.
         lines = list(StringIO(json.strip()))
-        json = u'[' + u','.join(lines) + u']'
+        json = '[' + ','.join(lines) + ']'
 
     obj = None
     if typ == 'frame':
diff --git a/pandas/io/tests/json/test_pandas.py b/pandas/io/tests/json/test_pandas.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
 # pylint: disable-msg=W0612,E1101
 import nose
 from pandas.compat import range, lrange, StringIO, OrderedDict
@@ -960,6 +961,25 @@ def test_read_jsonl(self):
         expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])
         assert_frame_equal(result, expected)
 
+    def test_read_jsonl_unicode_chars(self):
+        # GH15132: non-ascii unicode characters
+        # \u201d == RIGHT DOUBLE QUOTATION MARK
+
+        # simulate file handle
+        json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
+        json = StringIO(json)
+        result = read_json(json, lines=True)
+        expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
+                             columns=['a', 'b'])
+        assert_frame_equal(result, expected)
+
+        # simulate string
+        json = '{"a": "foo”", "b": "bar"}\n{"a": "foo", "b": "bar"}\n'
+        result = read_json(json, lines=True)
+        expected = DataFrame([[u"foo\u201d", "bar"], ["foo", "bar"]],
+                             columns=['a', 'b'])
+        assert_frame_equal(result, expected)
+
     def test_to_jsonl(self):
         # GH9180
         df = DataFrame([[1, 2], [1, 2]], columns=['a', 'b'])

Original file line number	Diff line number	Diff line change
`@@ -436,3 +436,4 @@ Bug Fixes`
`436`	`436`	- Bug in ``pd.read_csv()`` for the C engine where ``usecols`` were being indexed incorrectly with ``parse_dates`` (:issue:`14792`)
`437`	`437`
`438`	`438`	- Bug in ``Series.dt.round`` inconsistent behaviour on NAT's with different arguments (:issue:`14940`)
	`439`	+- Bug in ``.read_json()`` for Python 2 where ``lines=True`` and contents contain non-ascii unicode characters (:issue:`15132`)