Skip to content

Commit e52c872

Browse files
author
Mike Graham
committed
Steal the algorithm used to combine hashes from tupleobject.c
1 parent 5f40950 commit e52c872

File tree

1 file changed

+36
-21
lines changed

1 file changed

+36
-21
lines changed

pandas/tools/hashing.py

Lines changed: 36 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
22
data hash pandas / numpy objects
33
"""
4+
import itertools
45

56
import numpy as np
67
from pandas import _hash, Series, factorize, Categorical, Index
@@ -13,6 +14,22 @@
1314
_default_hash_key = '0123456789123456'
1415

1516

17+
def _combine_hash_arrays(arrays, num_items):
18+
"Should be the same as CPython's tupleobject.c"
19+
first = next(arrays)
20+
arrays = itertools.chain([first], arrays)
21+
22+
mult = np.zeros_like(first) + np.uint64(1000003)
23+
out = np.zeros_like(first) + np.uint64(0x345678)
24+
for i, a in enumerate(arrays):
25+
inverse_i = num_items - i
26+
out ^= a
27+
out *= mult
28+
mult += np.uint64(82520 + inverse_i + inverse_i)
29+
assert i+1 == num_items, 'Fed in wrong num_items'
30+
out += np.uint64(97531)
31+
return out
32+
1633
def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
1734
categorize=True):
1835
"""
@@ -41,10 +58,6 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
4158
if hash_key is None:
4259
hash_key = _default_hash_key
4360

44-
def adder(h, hashed_to_add):
45-
h = np.multiply(h, np.uint(3), h)
46-
return np.add(h, hashed_to_add, h)
47-
4861
if isinstance(obj, ABCIndexClass):
4962
h = hash_array(obj.values, encoding, hash_key,
5063
categorize).astype('uint64')
@@ -53,26 +66,28 @@ def adder(h, hashed_to_add):
5366
h = hash_array(obj.values, encoding, hash_key,
5467
categorize).astype('uint64')
5568
if index:
56-
h = adder(h, hash_pandas_object(obj.index,
57-
index=False,
58-
encoding=encoding,
59-
hash_key=hash_key,
60-
categorize=categorize).values)
69+
h = _combine_hash_arrays(iter([
70+
h,
71+
hash_pandas_object(obj.index,
72+
index=False,
73+
encoding=encoding,
74+
hash_key=hash_key,
75+
categorize=categorize).values]),
76+
2)
6177
h = Series(h, index=obj.index, dtype='uint64')
6278
elif isinstance(obj, ABCDataFrame):
63-
cols = obj.iteritems()
64-
first_series = next(cols)[1]
65-
h = hash_array(first_series.values, encoding,
66-
hash_key, categorize).astype('uint64')
67-
for _, col in cols:
68-
h = adder(h, hash_array(col.values, encoding, hash_key,
69-
categorize))
79+
hashes = (hash_array(series.values) for _, series in obj.iteritems())
80+
num_items = len(obj.columns)
7081
if index:
71-
h = adder(h, hash_pandas_object(obj.index,
72-
index=False,
73-
encoding=encoding,
74-
hash_key=hash_key,
75-
categorize=categorize).values)
82+
index_hash_generator = (hash_pandas_object(obj.index,
83+
index=False,
84+
encoding=encoding,
85+
hash_key=hash_key,
86+
categorize=categorize).values
87+
for _ in [None])
88+
num_items += 1
89+
hashes = itertools.chain(hashes, index_hash_generator)
90+
h = _combine_hash_arrays(hashes, num_items)
7691

7792
h = Series(h, index=obj.index, dtype='uint64')
7893
else:

0 commit comments

Comments
 (0)