@@ -170,14 +170,32 @@ def __new__(cls, categories=None, ordered=False, fastpath=False):
170
170
# Can just inline _validate_* if needed
171
171
cls ._validate_categories (categories , fastpath = fastpath )
172
172
cls ._validate_ordered (ordered )
173
- hashed = cls ._hash_categories (categories , ordered )
173
+
174
+ # We have a choice when hashing pandas unordered categoricals
175
+ # We can completely ignore the order, or not. I.e. should
176
+ # [a, b, c] hash the same as [b, a, c], when both are unordered?
177
+ # Ignoring the order can cause confusion. Normally a user who says
178
+ # CategoricalDtype(['a', 'b']) gets exactly that back. But if they
179
+ # first do CategoricalDtype(['b', 'a']), then a later
180
+ # CategoricalDtype(['a', 'b']) gives CategoricalDtype(['b', 'a']),
181
+ # which is surprising. For this reason, we choose to include order
182
+ # in the hashing, even if the dtype is unordered.
183
+
184
+ hashed = cls ._hash_categories (categories , ordered = True )
174
185
else :
175
186
hashed = None
176
187
return cls ._get_or_create (categories , ordered , hashed )
177
188
178
189
def __hash__ (self ):
179
- # make myself hashable
180
- return hash (str (self ))
190
+ # _hash_categories returns a uint64, so use the negative
191
+ # space for when we have unknown categories to avoid a conflict
192
+ if self .categories is None :
193
+ if self .ordered :
194
+ return - 1
195
+ else :
196
+ return - 2
197
+ # We *do* want to include the real self.ordered here
198
+ return int (self ._hash_categories (self .categories , self .ordered ))
181
199
182
200
def __eq__ (self , other ):
183
201
if isinstance (other , compat .string_types ):
@@ -186,7 +204,7 @@ def __eq__(self, other):
186
204
return isinstance (other , CategoricalDtype )
187
205
188
206
@staticmethod
189
- def _hash_categories (categories , ordered ):
207
+ def _hash_categories (categories , ordered = True ):
190
208
from pandas .core .util .hashing import hash_array , _combine_hash_arrays
191
209
cat_array = hash_array (np .asarray (categories ), categorize = False )
192
210
if ordered :
@@ -195,7 +213,6 @@ def _hash_categories(categories, ordered):
195
213
])
196
214
else :
197
215
cat_array = [cat_array ]
198
-
199
216
hashed = _combine_hash_arrays (iter (cat_array ),
200
217
num_items = len (cat_array ))
201
218
hashed = np .bitwise_xor .reduce (hashed )
0 commit comments