Skip to content

Commit dae46b7

Browse files
committed
feature: optional pandas and polars support
1 parent 1846b7b commit dae46b7

File tree

3 files changed

+117
-2
lines changed

3 files changed

+117
-2
lines changed

deepdiff/deephash.py

Lines changed: 25 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,17 @@
1414
number_to_string, datetime_normalize, KEY_TO_VAL_STR, short_repr,
1515
get_truncate_datetime, dict_, add_root_to_paths)
1616
from deepdiff.base import Base
17+
18+
try:
19+
import pandas
20+
except ImportError:
21+
pandas = False
22+
23+
try:
24+
import polars
25+
except ImportError:
26+
polars = False
27+
1728
logger = logging.getLogger(__name__)
1829

1930
UNPROCESSED_KEY = object()
@@ -448,7 +459,6 @@ def _prep_path(self, obj):
448459
type_ = obj.__class__.__name__
449460
return KEY_TO_VAL_STR.format(type_, obj)
450461

451-
452462
def _prep_number(self, obj):
453463
type_ = "number" if self.ignore_numeric_type_changes else obj.__class__.__name__
454464
if self.significant_digits is not None:
@@ -479,7 +489,7 @@ def _prep_tuple(self, obj, parent, parents_ids):
479489
return result, counts
480490

481491
def _hash(self, obj, parent, parents_ids=EMPTY_FROZENSET):
482-
"""The main diff method"""
492+
"""The main hash method"""
483493
counts = 1
484494

485495
if isinstance(obj, bool):
@@ -529,6 +539,19 @@ def _hash(self, obj, parent, parents_ids=EMPTY_FROZENSET):
529539
elif isinstance(obj, tuple):
530540
result, counts = self._prep_tuple(obj=obj, parent=parent, parents_ids=parents_ids)
531541

542+
elif (pandas and isinstance(obj, pandas.DataFrame)):
543+
def gen():
544+
yield ('dtype', obj.dtypes)
545+
yield ('index', obj.index)
546+
yield from obj.items() # which contains (column name, series tuples)
547+
result, counts = self._prep_iterable(obj=gen(), parent=parent, parents_ids=parents_ids)
548+
elif (polars and isinstance(obj, polars.DataFrame)):
549+
def gen():
550+
yield from obj.columns
551+
yield from list(obj.schema.items())
552+
yield from obj.rows()
553+
result, counts = self._prep_iterable(obj=gen(), parent=parent, parents_ids=parents_ids)
554+
532555
elif isinstance(obj, Iterable):
533556
result, counts = self._prep_iterable(obj=obj, parent=parent, parents_ids=parents_ids)
534557

requirements-dev.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,3 +20,5 @@ tomli==2.0.1
2020
tomli-w==1.0.0
2121
pydantic==2.7.4
2222
pytest-benchmark==4.0.0
23+
pandas>=1.6
24+
polars>=0.19.11

tests/test_hash.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -744,6 +744,96 @@ def test_hash_numpy_array2_multi_dimensional_can_not_retrieve_individual_array_i
744744
except Exception as e:
745745
assert str(e).strip("'") == HASH_LOOKUP_ERR_MSG.format(t1[0])
746746

747+
def test_pandas(self):
    """Check DeepHashPrep results for pandas DataFrames.

    Frames built from identical data must produce equal hashes, while a
    change in row count, column name, cell value, dtype, or index must
    produce a different hash.
    """
    import pandas as pd

    def prep_hash(frame):
        # Hash the frame and pull out the entry stored under the frame itself.
        return DeepHashPrep(frame)[frame]

    # Identical content vs. an extra row vs. a renamed column.
    base = pd.DataFrame({"a": [1]})
    twin = pd.DataFrame({"a": [1]})
    longer = pd.DataFrame({"a": [1, 2]})
    renamed = pd.DataFrame({"b": [1]})
    base_hash, twin_hash, longer_hash, renamed_hash = [
        prep_hash(frame) for frame in (base, twin, longer, renamed)
    ]
    assert base_hash == twin_hash
    assert base_hash != longer_hash
    assert base_hash != renamed_hash

    # Mixed column types: int, str, and tuple cells.
    mixed = pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
    mixed_twin = pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
    mixed_other_str = pd.DataFrame({'a': [1], 'b': ['one'], 'c': [(1, 2)]})
    mixed_other_tuple = pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 3)]})
    mixed_hash, mixed_twin_hash, other_str_hash, other_tuple_hash = [
        prep_hash(frame)
        for frame in (mixed, mixed_twin, mixed_other_str, mixed_other_tuple)
    ]
    assert mixed_hash == mixed_twin_hash
    assert mixed_hash != other_str_hash
    assert mixed_hash != other_tuple_hash

    # Same numeric value stored with different dtypes.
    as_u8 = pd.DataFrame({'a': np.array([1], dtype=np.uint8)})
    as_u16 = pd.DataFrame({'a': np.array([1], dtype=np.uint16)})
    as_f32 = pd.DataFrame({'a': np.array([1], dtype=np.float32)})
    u8_hash, u16_hash, f32_hash = [
        prep_hash(frame) for frame in (as_u8, as_u16, as_f32)
    ]
    assert u8_hash != f32_hash
    assert u8_hash != u16_hash

    # Same column data but a different index label.
    indexed = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 3])
    reindexed = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 4])
    indexed_hash = prep_hash(indexed)
    reindexed_hash = prep_hash(reindexed)
    assert indexed_hash != reindexed_hash
787+
788+
def test_polars(self):
789+
import polars as pl
790+
df = pl.DataFrame({"a": [1]})
791+
equal_df = pl.DataFrame({"a": [1]})
792+
df_same_column_names = pl.DataFrame({"a": [1, 2]})
793+
other_df = pl.DataFrame({"b": [1]})
794+
df_hash = DeepHashPrep(df)[df]
795+
equal_df_hash = DeepHashPrep(equal_df)[equal_df]
796+
df_same_column_names_hash = DeepHashPrep(df_same_column_names)[df_same_column_names]
797+
other_df_hash = DeepHashPrep(other_df)[other_df]
798+
assert df_hash == equal_df_hash
799+
assert df_hash != df_same_column_names_hash
800+
assert df_hash != other_df_hash
801+
802+
df_mixed = pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
803+
df_mixed_2 = pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
804+
df_mixed_3 = pl.DataFrame({'a': [1], 'b': ['one'], 'c': [(1, 2)]})
805+
df_mixed_4 = pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 3)]})
806+
df_mixed_hash = DeepHashPrep(df_mixed)[df_mixed]
807+
df_mixed_2_hash = DeepHashPrep(df_mixed_2)[df_mixed_2]
808+
df_mixed_3_hash = DeepHashPrep(df_mixed_3)[df_mixed_3]
809+
df_mixed_4_hash = DeepHashPrep(df_mixed_4)[df_mixed_4]
810+
assert df_mixed_hash == df_mixed_2_hash
811+
assert df_mixed_hash != df_mixed_3_hash
812+
assert df_mixed_hash != df_mixed_4_hash
813+
814+
df_u8 = pl.DataFrame({'a': np.array([1], dtype=np.uint8)})
815+
df_u16 = pl.DataFrame({'a': np.array([1], dtype=np.uint16)})
816+
df_float = pl.DataFrame({'a': np.array([1], dtype=np.float32)})
817+
df_u8_hash = DeepHashPrep(df_u8)[df_u8]
818+
df_u16_hash = DeepHashPrep(df_u16)[df_u16]
819+
df_float_hash = DeepHashPrep(df_float)[df_float]
820+
assert df_u8_hash != df_float_hash
821+
assert df_u8_hash != df_u16_hash
822+
823+
lazy_1 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2]}).lazy()
824+
lazy_2 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2]}).lazy()
825+
lazy_3 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2], "foobar": 5}).lazy()
826+
with pytest.raises(TypeError):
827+
DeepHashPrep(lazy_1)[lazy_1] # lazy dfs can not be compared
828+
df_1 = lazy_1.collect()
829+
df_2 = lazy_2.collect()
830+
df_3 = lazy_3.collect()
831+
df_1_hash = DeepHashPrep(df_1)[df_1]
832+
df_2_hash = DeepHashPrep(df_2)[df_2]
833+
df_3_hash = DeepHashPrep(df_3)[df_3]
834+
assert df_1_hash == df_2_hash
835+
assert df_1_hash != df_3_hash
836+
747837

748838
class TestDeepHashSHA:
749839
"""DeepHash with SHA Tests."""

0 commit comments

Comments
 (0)