1
1
"""
2
2
data hash pandas / numpy objects
3
3
"""
4
+ import itertools
4
5
5
6
import numpy as np
6
7
from pandas import _hash , Series , factorize , Categorical , Index
13
14
_default_hash_key = '0123456789123456'
14
15
15
16
17
def _combine_hash_arrays(arrays, num_items):
    """
    Combine several hash arrays element-wise into a single hash array.

    Should be the same as CPython's tupleobject.c

    Parameters
    ----------
    arrays : iterator of ndarray
        Iterator (``next`` is called on it directly) yielding the arrays
        to combine. It is consumed by this function. The arrays are
        expected to be uint64 and mutually broadcastable -- the callers
        in this module pass equal-length uint64 hash arrays.
    num_items : int
        Number of arrays that ``arrays`` will yield. It feeds the
        per-step multiplier update, so it must be known up front.

    Returns
    -------
    ndarray
        A new array shaped like the first input array. The inputs are
        not modified. If ``arrays`` yields nothing, an empty uint64
        array is returned.

    Raises
    ------
    AssertionError
        If the number of arrays actually yielded differs from
        ``num_items``.
    """
    try:
        first = next(arrays)
    except StopIteration:
        # Nothing to combine (e.g. a frame with no columns and no index
        # hash): return an empty result instead of leaking StopIteration.
        return np.array([], dtype=np.uint64)

    # Put the peeked element back so the loop below sees every array.
    arrays = itertools.chain([first], arrays)

    # The multiplier is the same for every element, so a scalar suffices.
    mult = np.uint64(1000003)
    # zeros_like + scalar allocates a fresh accumulator, so the in-place
    # operations below never touch the caller's arrays.
    out = np.zeros_like(first) + np.uint64(0x345678)
    for i, a in enumerate(arrays):
        inverse_i = num_items - i
        out ^= a
        out *= mult
        mult += np.uint64(82520 + inverse_i + inverse_i)
    assert i + 1 == num_items, 'Fed in wrong num_items'
    out += np.uint64(97531)
    return out
32
+
16
33
def hash_pandas_object (obj , index = True , encoding = 'utf8' , hash_key = None ,
17
34
categorize = True ):
18
35
"""
@@ -41,10 +58,6 @@ def hash_pandas_object(obj, index=True, encoding='utf8', hash_key=None,
41
58
if hash_key is None :
42
59
hash_key = _default_hash_key
43
60
44
- def adder (h , hashed_to_add ):
45
- h = np .multiply (h , np .uint (3 ), h )
46
- return np .add (h , hashed_to_add , h )
47
-
48
61
if isinstance (obj , ABCIndexClass ):
49
62
h = hash_array (obj .values , encoding , hash_key ,
50
63
categorize ).astype ('uint64' )
@@ -53,26 +66,28 @@ def adder(h, hashed_to_add):
53
66
h = hash_array (obj .values , encoding , hash_key ,
54
67
categorize ).astype ('uint64' )
55
68
if index :
56
- h = adder (h , hash_pandas_object (obj .index ,
57
- index = False ,
58
- encoding = encoding ,
59
- hash_key = hash_key ,
60
- categorize = categorize ).values )
69
+ h = _combine_hash_arrays (iter ([
70
+ h ,
71
+ hash_pandas_object (obj .index ,
72
+ index = False ,
73
+ encoding = encoding ,
74
+ hash_key = hash_key ,
75
+ categorize = categorize ).values ]),
76
+ 2 )
61
77
h = Series (h , index = obj .index , dtype = 'uint64' )
62
78
elif isinstance (obj , ABCDataFrame ):
63
- cols = obj .iteritems ()
64
- first_series = next (cols )[1 ]
65
- h = hash_array (first_series .values , encoding ,
66
- hash_key , categorize ).astype ('uint64' )
67
- for _ , col in cols :
68
- h = adder (h , hash_array (col .values , encoding , hash_key ,
69
- categorize ))
79
+ hashes = (hash_array (series .values ) for _ , series in obj .iteritems ())
80
+ num_items = len (obj .columns )
70
81
if index :
71
- h = adder (h , hash_pandas_object (obj .index ,
72
- index = False ,
73
- encoding = encoding ,
74
- hash_key = hash_key ,
75
- categorize = categorize ).values )
82
+ index_hash_generator = (hash_pandas_object (obj .index ,
83
+ index = False ,
84
+ encoding = encoding ,
85
+ hash_key = hash_key ,
86
+ categorize = categorize ).values
87
+ for _ in [None ])
88
+ num_items += 1
89
+ hashes = itertools .chain (hashes , index_hash_generator )
90
+ h = _combine_hash_arrays (hashes , num_items )
76
91
77
92
h = Series (h , index = obj .index , dtype = 'uint64' )
78
93
else :
0 commit comments