Skip to content

Commit b8d78c4

Browse files
committed
BUG: read_csv throws UnicodeDecodeError with unicode aliases
See issue pandas-dev#13549: read_csv with engine='c' raises UnicodeDecodeError when the encoding name contains an underscore or uppercase letters (e.g. encoding='UTF_16').
1 parent d38ee27 commit b8d78c4

File tree

3 files changed

+37
-0
lines changed

3 files changed

+37
-0
lines changed

doc/source/whatsnew/v0.19.0.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,3 +81,4 @@ Performance Improvements
8181

8282
Bug Fixes
8383
~~~~~~~~~
84+
- Bug in ``read_csv()`` where alternative spellings of the ``utf-xx`` encoding names (e.g. ``UTF-xx``, ``UTF_xx``, ``utf_xx``) raised ``UnicodeDecodeError`` (:issue:`13549`)

pandas/io/parsers.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,10 @@ def _validate_nrows(nrows):
339339
def _read(filepath_or_buffer, kwds):
340340
"Generic reader of line files."
341341
encoding = kwds.get('encoding', None)
342+
if encoding is not None:
343+
encoding = re.sub('_', '-', encoding).lower()
344+
kwds['encoding'] = encoding
345+
342346
skipfooter = kwds.pop('skipfooter', None)
343347
if skipfooter is not None:
344348
kwds['skip_footer'] = skipfooter
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
import pandas, os, nose
2+
3+
def test_read_csv():
    """Check that ``read_csv`` accepts every spelling of the UTF-16 codec name.

    Regression test for gh issue 13549: a UTF-16 encoded CSV file is written
    once and then read back with each combination of encoding alias
    (hyphen/underscore, lower/upper case) and parser engine. The test fails
    if any combination raises ``UnicodeDecodeError``.
    """
    # see gh issue 13549
    test_encodings = ['utf-16', 'utf_16', 'UTF_16', 'UTF-16']
    engines = ['c', 'python', None]
    path = "test.csv"
    pandas.DataFrame({"A": [0, 1], "B": [2, 3]}).to_csv(
        path, encoding="utf-16")

    # Collect every failing combination instead of stopping at the first one,
    # so the assertion message shows the full picture.
    failures = []
    try:
        for encoding in test_encodings:
            for engine in engines:
                try:
                    # Pass the loop's engine through; hard-coding 'c' here
                    # would leave the other engines untested.
                    pandas.io.parsers.read_csv(
                        path,
                        engine=engine,
                        encoding=encoding)
                except UnicodeDecodeError:
                    failures.append((encoding, engine))
    finally:
        # Remove the scratch file even when a read raised something other
        # than UnicodeDecodeError.
        os.remove(path)

    assert not failures, (
        "read_csv raised UnicodeDecodeError for (encoding, engine) pairs: "
        "%r" % (failures,))
29+
30+
if __name__ == '__main__':
31+
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
32+
exit=False)

0 commit comments

Comments
 (0)