Use hash for comparison (need to update docs)

TomAugspurger · TomAugspurger · commit f144db2cfe83 · 2017-05-30T11:42:07.000-05:00
diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py
@@ -170,14 +170,32 @@ def __new__(cls, categories=None, ordered=False, fastpath=False):
             # Can just inline _validate_* if needed
             cls._validate_categories(categories, fastpath=fastpath)
             cls._validate_ordered(ordered)
-            hashed = cls._hash_categories(categories, ordered)
+
+            # We have a choice when hashing unordered categoricals: we can
+            # ignore the order of the categories or not. I.e. should
+            # [a, b, c] hash the same as [b, a, c] when both are unordered?
+            # Ignoring the order is confusing: a user who asks for
+            # CategoricalDtype(['a', 'b']) expects to get that back, but if
+            # CategoricalDtype(['b', 'a']) was created first, then
+            # CategoricalDtype(['a', 'b']) gives CategoricalDtype(['b', 'a']),
+            # which is surprising. For this reason we include the order in
+            # this hash, even when the dtype is unordered.
+
+            hashed = cls._hash_categories(categories, ordered=True)
         else:
             hashed = None
         return cls._get_or_create(categories, ordered, hashed)
 
     def __hash__(self):
-        # make myself hashable
-        return hash(str(self))
+        # _hash_categories returns a uint64, so use the negative
+        # space for when we have unknown categories to avoid a conflict
+        if self.categories is None:
+            if self.ordered:
+                return -1
+            else:
+                return -2
+        # We *do* want to include the real self.ordered here
+        return int(self._hash_categories(self.categories, self.ordered))
 
     def __eq__(self, other):
         if isinstance(other, compat.string_types):
@@ -186,7 +204,7 @@ def __eq__(self, other):
         return isinstance(other, CategoricalDtype)
 
     @staticmethod
-    def _hash_categories(categories, ordered):
+    def _hash_categories(categories, ordered=True):
         from pandas.core.util.hashing import hash_array, _combine_hash_arrays
         cat_array = hash_array(np.asarray(categories), categorize=False)
         if ordered:
@@ -195,7 +213,6 @@ def _hash_categories(categories, ordered):
             ])
         else:
             cat_array = [cat_array]
-
         hashed = _combine_hash_arrays(iter(cat_array),
                                       num_items=len(cat_array))
         hashed = np.bitwise_xor.reduce(hashed)
diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py
@@ -559,8 +559,7 @@ def test_order_matters(self):
     def test_unordered_same(self):
         c1 = CategoricalDtype(['a', 'b'])
         c2 = CategoricalDtype(['b', 'a'])
-        assert c1 is c2
-        tm.assert_index_equal(c1.categories, c2.categories)
+        assert hash(c1) == hash(c2)
 
     def test_categories(self):
         result = CategoricalDtype(['a', 'b', 'c'])