Skip to content

feature: optional pandas and polars support #467

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jul 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 25 additions & 2 deletions deepdiff/deephash.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,17 @@
number_to_string, datetime_normalize, KEY_TO_VAL_STR, short_repr,
get_truncate_datetime, dict_, add_root_to_paths)
from deepdiff.base import Base

try:
import pandas
except ImportError:
pandas = False

try:
import polars
except ImportError:
polars = False

logger = logging.getLogger(__name__)

UNPROCESSED_KEY = object()
Expand Down Expand Up @@ -448,7 +459,6 @@ def _prep_path(self, obj):
type_ = obj.__class__.__name__
return KEY_TO_VAL_STR.format(type_, obj)


def _prep_number(self, obj):
type_ = "number" if self.ignore_numeric_type_changes else obj.__class__.__name__
if self.significant_digits is not None:
Expand Down Expand Up @@ -479,7 +489,7 @@ def _prep_tuple(self, obj, parent, parents_ids):
return result, counts

def _hash(self, obj, parent, parents_ids=EMPTY_FROZENSET):
"""The main diff method"""
"""The main hash method"""
counts = 1

if isinstance(obj, bool):
Expand Down Expand Up @@ -529,6 +539,19 @@ def _hash(self, obj, parent, parents_ids=EMPTY_FROZENSET):
elif isinstance(obj, tuple):
result, counts = self._prep_tuple(obj=obj, parent=parent, parents_ids=parents_ids)

elif (pandas and isinstance(obj, pandas.DataFrame)):
def gen():
yield ('dtype', obj.dtypes)
yield ('index', obj.index)
yield from obj.items() # which contains (column name, series tuples)
result, counts = self._prep_iterable(obj=gen(), parent=parent, parents_ids=parents_ids)
elif (polars and isinstance(obj, polars.DataFrame)):
def gen():
yield from obj.columns
yield from list(obj.schema.items())
yield from obj.rows()
result, counts = self._prep_iterable(obj=gen(), parent=parent, parents_ids=parents_ids)

elif isinstance(obj, Iterable):
result, counts = self._prep_iterable(obj=obj, parent=parent, parents_ids=parents_ids)

Expand Down
2 changes: 2 additions & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,5 @@ tomli==2.0.1
tomli-w==1.0.0
pydantic==2.7.4
pytest-benchmark==4.0.0
pandas>=1.6
polars>=0.19.11
90 changes: 90 additions & 0 deletions tests/test_hash.py
Original file line number Diff line number Diff line change
Expand Up @@ -744,6 +744,96 @@ def test_hash_numpy_array2_multi_dimensional_can_not_retrieve_individual_array_i
except Exception as e:
assert str(e).strip("'") == HASH_LOOKUP_ERR_MSG.format(t1[0])

def test_pandas(self):
    """Check that DeepHashPrep distinguishes pandas DataFrames by content.

    Equal frames must produce equal hashes; frames that differ in rows,
    column names, cell values, column dtypes, or index labels must not.

    pandas is an optional dependency, so the test is skipped (rather than
    failing with ImportError) when it is not installed.
    """
    pd = pytest.importorskip("pandas")

    def prep_hash(frame):
        # Hash the frame and look up the entry stored for the frame itself.
        return DeepHashPrep(frame)[frame]

    # Identical content hashes the same; extra rows or a renamed column do not.
    df_hash = prep_hash(pd.DataFrame({"a": [1]}))
    equal_df_hash = prep_hash(pd.DataFrame({"a": [1]}))
    df_same_column_names_hash = prep_hash(pd.DataFrame({"a": [1, 2]}))
    other_df_hash = prep_hash(pd.DataFrame({"b": [1]}))
    assert df_hash == equal_df_hash
    assert df_hash != df_same_column_names_hash
    assert df_hash != other_df_hash

    # Mixed column types: a change in a string cell or inside a tuple cell
    # must change the hash.
    df_mixed_hash = prep_hash(pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]}))
    df_mixed_2_hash = prep_hash(pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]}))
    df_mixed_3_hash = prep_hash(pd.DataFrame({'a': [1], 'b': ['one'], 'c': [(1, 2)]}))
    df_mixed_4_hash = prep_hash(pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 3)]}))
    assert df_mixed_hash == df_mixed_2_hash
    assert df_mixed_hash != df_mixed_3_hash
    assert df_mixed_hash != df_mixed_4_hash

    # Same numeric value stored with different dtypes must hash differently.
    df_u8_hash = prep_hash(pd.DataFrame({'a': np.array([1], dtype=np.uint8)}))
    df_u16_hash = prep_hash(pd.DataFrame({'a': np.array([1], dtype=np.uint16)}))
    df_float_hash = prep_hash(pd.DataFrame({'a': np.array([1], dtype=np.float32)}))
    assert df_u8_hash != df_float_hash
    assert df_u8_hash != df_u16_hash

    # Same data under different index labels must hash differently.
    df_index_hash = prep_hash(pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 3]))
    df_index_diff_hash = prep_hash(pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 4]))
    assert df_index_hash != df_index_diff_hash

def test_polars(self):
    """Check that DeepHashPrep distinguishes polars DataFrames by content.

    Equal frames must produce equal hashes; frames that differ in rows,
    column names, cell values, or column dtypes must not. Hashing a
    LazyFrame is expected to raise TypeError; the collected frames hash
    normally.

    polars is an optional dependency, so the test is skipped (rather than
    failing with ImportError) when it is not installed.
    """
    pl = pytest.importorskip("polars")

    def prep_hash(frame):
        # Hash the frame and look up the entry stored for the frame itself.
        return DeepHashPrep(frame)[frame]

    # Identical content hashes the same; extra rows or a renamed column do not.
    df_hash = prep_hash(pl.DataFrame({"a": [1]}))
    equal_df_hash = prep_hash(pl.DataFrame({"a": [1]}))
    df_same_column_names_hash = prep_hash(pl.DataFrame({"a": [1, 2]}))
    other_df_hash = prep_hash(pl.DataFrame({"b": [1]}))
    assert df_hash == equal_df_hash
    assert df_hash != df_same_column_names_hash
    assert df_hash != other_df_hash

    # Mixed column types: a change in a string cell or inside a nested cell
    # must change the hash.
    df_mixed_hash = prep_hash(pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]}))
    df_mixed_2_hash = prep_hash(pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]}))
    df_mixed_3_hash = prep_hash(pl.DataFrame({'a': [1], 'b': ['one'], 'c': [(1, 2)]}))
    df_mixed_4_hash = prep_hash(pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 3)]}))
    assert df_mixed_hash == df_mixed_2_hash
    assert df_mixed_hash != df_mixed_3_hash
    assert df_mixed_hash != df_mixed_4_hash

    # Same numeric value stored with different dtypes must hash differently.
    df_u8_hash = prep_hash(pl.DataFrame({'a': np.array([1], dtype=np.uint8)}))
    df_u16_hash = prep_hash(pl.DataFrame({'a': np.array([1], dtype=np.uint16)}))
    df_float_hash = prep_hash(pl.DataFrame({'a': np.array([1], dtype=np.float32)}))
    assert df_u8_hash != df_float_hash
    assert df_u8_hash != df_u16_hash

    lazy_1 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2]}).lazy()
    lazy_2 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2]}).lazy()
    lazy_3 = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2], "foobar": 5}).lazy()
    with pytest.raises(TypeError):
        DeepHashPrep(lazy_1)[lazy_1]  # lazy dfs can not be compared

    # Once collected into eager DataFrames they hash like any other frame.
    df_1_hash = prep_hash(lazy_1.collect())
    df_2_hash = prep_hash(lazy_2.collect())
    df_3_hash = prep_hash(lazy_3.collect())
    assert df_1_hash == df_2_hash
    assert df_1_hash != df_3_hash


class TestDeepHashSHA:
"""DeepHash with SHA Tests."""
Expand Down