Skip to content

Commit 434c8c1

Browse files
committed
consistent string hashing
1 parent d9615d5 commit 434c8c1

File tree

2 files changed

+10
-2
lines changed

2 files changed

+10
-2
lines changed

pandas/core/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
from pandas.util.decorators import (Appender, cache_readonly,
1717
deprecate_kwarg, Substitution)
1818
from pandas.core.common import AbstractMethodError
19-
from pandas.tools.hashing import hash_pandas_object
2019
from pandas.formats.printing import pprint_thing
2120

2221
_shared_docs = dict()
@@ -838,6 +837,7 @@ def hash(self, index=True):
838837
9751253963311919054], dtype=uint64)
839838
840839
"""
840+
from pandas.tools.hashing import hash_pandas_object
841841
return hash_pandas_object(self, index=index)
842842

843843

pandas/tools/hashing.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22
data hashing for pandas / numpy objects
33
"""
44

5+
from hashlib import md5
56
import numpy as np
7+
from pandas import Series
68
from pandas.types.generic import ABCIndexClass, ABCSeries, ABCDataFrame
79
from pandas.types.common import is_categorical_dtype
810

@@ -71,7 +73,13 @@ def hash_array(vals):
7173

7274
vals = vals.view('u{}'.format(vals.dtype.itemsize)).astype('u8')
7375
else:
74-
vals = np.array([hash(x) for x in vals], dtype=np.uint64)
76+
77+
# stringify the values first, then hash them with a scheme that is
78+
# consistent across interpreter runs (builtin hash() of a str may vary)
79+
def f(v):
80+
return int(md5(v).hexdigest(), 16) % (10 ** 8)
81+
vals = Series(vals).astype(str).str.encode('utf8').values
82+
vals = np.array([f(v) for v in vals], dtype='uint64')
7583

7684
# Then, redistribute these 64-bit ints within the space of 64-bit ints
7785
vals ^= vals >> 30

0 commit comments

Comments
 (0)