Skip to content

Commit f144db2

Browse files
committed
Use hash for comparison (need to update docs)
1 parent 436b43c commit f144db2

File tree

2 files changed

+23
-7
lines changed

2 files changed

+23
-7
lines changed

pandas/core/dtypes/dtypes.py

Lines changed: 22 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -170,14 +170,32 @@ def __new__(cls, categories=None, ordered=False, fastpath=False):
170170
# Can just inline _validate_* if needed
171171
cls._validate_categories(categories, fastpath=fastpath)
172172
cls._validate_ordered(ordered)
173-
hashed = cls._hash_categories(categories, ordered)
173+
174+
# We have a choice when hashing unordered categoricals: we can
175+
# either ignore the order of the categories or not. I.e. should
176+
# [a, b, c] hash the same as [b, a, c] when both are unordered?
177+
# Ignoring the order can be confusing: a user who requests
178+
# CategoricalDtype(['a', 'b']) expects to get that back, but if
179+
# CategoricalDtype(['b', 'a']) was created first, then
180+
# CategoricalDtype(['a', 'b']) gives CategoricalDtype(['b', 'a']),
181+
# which is surprising. For this reason, we include the category order
182+
# in this hash even when the dtype is unordered.
183+
184+
hashed = cls._hash_categories(categories, ordered=True)
174185
else:
175186
hashed = None
176187
return cls._get_or_create(categories, ordered, hashed)
177188

178189
def __hash__(self):
179-
# make myself hashable
180-
return hash(str(self))
190+
# _hash_categories returns a uint64, so use negative values for
191+
# unknown categories (categories is None) to avoid a conflict
192+
if self.categories is None:
193+
if self.ordered:
194+
return -1
195+
else:
196+
return -2
197+
# We *do* want to include the real self.ordered here
198+
return int(self._hash_categories(self.categories, self.ordered))
181199

182200
def __eq__(self, other):
183201
if isinstance(other, compat.string_types):
@@ -186,7 +204,7 @@ def __eq__(self, other):
186204
return isinstance(other, CategoricalDtype)
187205

188206
@staticmethod
189-
def _hash_categories(categories, ordered):
207+
def _hash_categories(categories, ordered=True):
190208
from pandas.core.util.hashing import hash_array, _combine_hash_arrays
191209
cat_array = hash_array(np.asarray(categories), categorize=False)
192210
if ordered:
@@ -195,7 +213,6 @@ def _hash_categories(categories, ordered):
195213
])
196214
else:
197215
cat_array = [cat_array]
198-
199216
hashed = _combine_hash_arrays(iter(cat_array),
200217
num_items=len(cat_array))
201218
hashed = np.bitwise_xor.reduce(hashed)

pandas/tests/dtypes/test_dtypes.py

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -559,8 +559,7 @@ def test_order_matters(self):
559559
def test_unordered_same(self):
560560
c1 = CategoricalDtype(['a', 'b'])
561561
c2 = CategoricalDtype(['b', 'a'])
562-
assert c1 is c2
563-
tm.assert_index_equal(c1.categories, c2.categories)
562+
assert hash(c1) == hash(c2)
564563

565564
def test_categories(self):
566565
result = CategoricalDtype(['a', 'b', 'c'])

0 commit comments

Comments
 (0)