pandas-dev · jreback · Jun 21, 2018 · May 24, 2018 · May 24, 2018 · Jun 12, 2018
diff --git a/pandas/_libs/hashing.pyx b/pandas/_libs/hashing.pyx
@@ -8,8 +8,7 @@ import numpy as np
 from numpy cimport ndarray, uint8_t, uint32_t, uint64_t
 
 from util cimport _checknull
-from cpython cimport (PyString_Check,
-                      PyBytes_Check,
+from cpython cimport (PyBytes_Check,
                       PyUnicode_Check)
 from libc.stdlib cimport malloc, free
 
@@ -62,9 +61,7 @@ def hash_object_array(ndarray[object] arr, object key, object encoding='utf8'):
     cdef list datas = []
     for i in range(n):
         val = arr[i]
-        if PyString_Check(val):
-            data = <bytes>val.encode(encoding)
-        elif PyBytes_Check(val):
+        if PyBytes_Check(val):
             data = <bytes>val
         elif PyUnicode_Check(val):
             data = <bytes>val.encode(encoding)

diff --git a/pandas/tests/series/test_repr.py b/pandas/tests/series/test_repr.py
@@ -5,6 +5,7 @@
 
 import sys
 
+import pytest
 import numpy as np
 import pandas as pd
 
@@ -202,6 +203,35 @@ def test_latex_repr(self):
 
 class TestCategoricalRepr(object):
 
+    @pytest.mark.skipif(compat.PY3, reason="Decoding failure only in PY2")
+    def test_categorical_repr_unicode(self):
+        # GH#21002: in PY2, with sys.getdefaultencoding() == 'ascii' and
+        # len(index) > 60, rendering a Categorical could raise
+        # UnicodeDecodeError by trying to decode when it shouldn't.
+        from pandas.core.base import StringMixin
+
+        class County(StringMixin):
+            name = u'San Sebastián'
+            state = u'PR'
+
+            def __unicode__(self):
+                return self.name + u', ' + self.state
+
+        cat = pd.Categorical([County() for n in range(61)])  # len > 60
+        idx = pd.Index(cat)
+        ser = idx.to_series()
+
+        # temporarily force the default encoding to ascii; restored in finally
+        enc = sys.getdefaultencoding()
+        reload(sys)  # noqa:F821
+        sys.setdefaultencoding('ascii')
+        try:
+            repr(ser)  # passes as long as rendering does not raise
+            str(ser)  # likewise: no assertion on the rendered text
+        finally:
+            # restore the original default encoding even if rendering raised
+            sys.setdefaultencoding(enc)
+
     def test_categorical_repr(self):
         a = Series(Categorical([1, 2, 3, 4]))
         exp = u("0    1\n1    2\n2    3\n3    4\n" +