@@ -744,6 +744,96 @@ def test_hash_numpy_array2_multi_dimensional_can_not_retrieve_individual_array_i
744
744
except Exception as e :
745
745
assert str (e ).strip ("'" ) == HASH_LOOKUP_ERR_MSG .format (t1 [0 ])
746
746
747
def test_pandas(self):
    """Check DeepHashPrep hashes computed for pandas DataFrames.

    Frames built from identical data must produce equal hashes, while
    frames that differ in row count, column name, cell value, column
    dtype or index labels must produce different hashes.
    """
    import pandas as pd

    def hash_of(frame):
        # The hash of the top-level object is looked up by the object itself.
        return DeepHashPrep(frame)[frame]

    # --- single integer column: equal data / extra row / renamed column ---
    base = pd.DataFrame({"a": [1]})
    base_twin = pd.DataFrame({"a": [1]})
    base_extra_row = pd.DataFrame({"a": [1, 2]})
    base_renamed = pd.DataFrame({"b": [1]})
    base_hash = hash_of(base)
    base_twin_hash = hash_of(base_twin)
    base_extra_row_hash = hash_of(base_extra_row)
    base_renamed_hash = hash_of(base_renamed)
    assert base_hash == base_twin_hash
    assert base_hash != base_extra_row_hash
    assert base_hash != base_renamed_hash

    # --- mixed column types: int, str and tuple cells ---
    mixed = pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
    mixed_twin = pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
    mixed_other_str = pd.DataFrame({'a': [1], 'b': ['one'], 'c': [(1, 2)]})
    mixed_other_tuple = pd.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 3)]})
    mixed_hash = hash_of(mixed)
    mixed_twin_hash = hash_of(mixed_twin)
    mixed_other_str_hash = hash_of(mixed_other_str)
    mixed_other_tuple_hash = hash_of(mixed_other_tuple)
    assert mixed_hash == mixed_twin_hash
    assert mixed_hash != mixed_other_str_hash
    assert mixed_hash != mixed_other_tuple_hash

    # --- same numeric value stored with different dtypes ---
    frame_u8 = pd.DataFrame({'a': np.array([1], dtype=np.uint8)})
    frame_u16 = pd.DataFrame({'a': np.array([1], dtype=np.uint16)})
    frame_f32 = pd.DataFrame({'a': np.array([1], dtype=np.float32)})
    u8_hash = hash_of(frame_u8)
    u16_hash = hash_of(frame_u16)
    f32_hash = hash_of(frame_f32)
    assert u8_hash != f32_hash
    assert u8_hash != u16_hash

    # --- identical data, different index labels ---
    indexed = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 3])
    indexed_shifted = pd.DataFrame({'a': [1, 2, 3]}, index=[1, 2, 4])
    indexed_hash = hash_of(indexed)
    indexed_shifted_hash = hash_of(indexed_shifted)
    assert indexed_hash != indexed_shifted_hash
787
+
788
def test_polars(self):
    """Check DeepHashPrep hashes computed for polars DataFrames.

    Frames built from identical data must produce equal hashes, while
    frames that differ in row count, column name, cell value, column
    dtype or set of columns must produce different hashes. Hashing a
    LazyFrame is expected to raise TypeError; the collected frames are
    hashed instead.
    """
    import polars as pl

    def hash_of(frame):
        # The hash of the top-level object is looked up by the object itself.
        return DeepHashPrep(frame)[frame]

    # --- single integer column: equal data / extra row / renamed column ---
    base = pl.DataFrame({"a": [1]})
    base_twin = pl.DataFrame({"a": [1]})
    base_extra_row = pl.DataFrame({"a": [1, 2]})
    base_renamed = pl.DataFrame({"b": [1]})
    base_hash = hash_of(base)
    base_twin_hash = hash_of(base_twin)
    base_extra_row_hash = hash_of(base_extra_row)
    base_renamed_hash = hash_of(base_renamed)
    assert base_hash == base_twin_hash
    assert base_hash != base_extra_row_hash
    assert base_hash != base_renamed_hash

    # --- mixed column types: int, str and tuple cells ---
    mixed = pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
    mixed_twin = pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 2)]})
    mixed_other_str = pl.DataFrame({'a': [1], 'b': ['one'], 'c': [(1, 2)]})
    mixed_other_tuple = pl.DataFrame({'a': [1], 'b': ['two'], 'c': [(1, 3)]})
    mixed_hash = hash_of(mixed)
    mixed_twin_hash = hash_of(mixed_twin)
    mixed_other_str_hash = hash_of(mixed_other_str)
    mixed_other_tuple_hash = hash_of(mixed_other_tuple)
    assert mixed_hash == mixed_twin_hash
    assert mixed_hash != mixed_other_str_hash
    assert mixed_hash != mixed_other_tuple_hash

    # --- same numeric value stored with different dtypes ---
    frame_u8 = pl.DataFrame({'a': np.array([1], dtype=np.uint8)})
    frame_u16 = pl.DataFrame({'a': np.array([1], dtype=np.uint16)})
    frame_f32 = pl.DataFrame({'a': np.array([1], dtype=np.float32)})
    u8_hash = hash_of(frame_u8)
    u16_hash = hash_of(frame_u16)
    f32_hash = hash_of(frame_f32)
    assert u8_hash != f32_hash
    assert u8_hash != u16_hash

    # --- lazy frames: hashing raises, collected frames hash normally ---
    lazy_a = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2]}).lazy()
    lazy_a_twin = pl.DataFrame({"foo": ["a", "b", "c"], "bar": [0, 1, 2]}).lazy()
    lazy_extra_col = pl.DataFrame(
        {"foo": ["a", "b", "c"], "bar": [0, 1, 2], "foobar": 5}
    ).lazy()
    with pytest.raises(TypeError):
        DeepHashPrep(lazy_a)[lazy_a]  # lazy dfs can not be compared
    collected_a = lazy_a.collect()
    collected_a_twin = lazy_a_twin.collect()
    collected_extra_col = lazy_extra_col.collect()
    collected_a_hash = hash_of(collected_a)
    collected_a_twin_hash = hash_of(collected_a_twin)
    collected_extra_col_hash = hash_of(collected_extra_col)
    assert collected_a_hash == collected_a_twin_hash
    assert collected_a_hash != collected_extra_col_hash
836
+
747
837
748
838
class TestDeepHashSHA :
749
839
"""DeepHash with SHA Tests."""
0 commit comments