Skip to content

Commit e3cdb6e

Browse files
author
tmnhat2001
committed
Add test_drop_duplicates for Categorical dtypes
1 parent 15fa4bd commit e3cdb6e

File tree

1 file changed

+143
-0
lines changed

1 file changed

+143
-0
lines changed

pandas/tests/test_categorical.py

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -797,6 +797,149 @@ def test_set_categories_inplace(self):
797797
cat.set_categories(['a', 'b', 'c', 'd'], inplace=True)
798798
tm.assert_index_equal(cat.categories, pd.Index(['a', 'b', 'c', 'd']))
799799

800+
@pytest.mark.parametrize(
    "input1, input2, cat_array",
    [
        (
            np.array([1, 2, 3, 3], dtype=np.int_),
            np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.int_),
            np.array([1, 2, 3, 4, 5], dtype=np.int_),
        ),
        (
            np.array([1, 2, 3, 3], dtype=np.uint),
            np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.uint),
            np.array([1, 2, 3, 4, 5], dtype=np.uint),
        ),
        (
            # np.float64 rather than the 'float_' alias, which newer NumPy
            # releases no longer accept.
            np.array([1, 2, 3, 3], dtype=np.float64),
            np.array([1, 2, 3, 5, 3, 2, 4], dtype=np.float64),
            np.array([1, 2, 3, 4, 5], dtype=np.float64),
        ),
        (
            # 'U' (unicode string) rather than the 'unicode_' alias, which
            # newer NumPy releases no longer accept.
            np.array([1, 2, 3, 3], dtype='U'),
            np.array([1, 2, 3, 5, 3, 2, 4], dtype='U'),
            np.array([1, 2, 3, 4, 5], dtype='U'),
        ),
        (
            np.array(
                [
                    '2017-01-01 10:00:00', '2017-02-01 10:00:00',
                    '2017-03-01 10:00:00', '2017-03-01 10:00:00',
                ],
                dtype='datetime64'
            ),
            np.array(
                [
                    '2017-01-01 10:00:00', '2017-02-01 10:00:00',
                    '2017-03-01 10:00:00', '2017-05-01 10:00:00',
                    '2017-03-01 10:00:00', '2017-02-01 10:00:00',
                    '2017-04-01 10:00:00',
                ],
                dtype='datetime64'
            ),
            np.array(
                [
                    '2017-01-01 10:00:00', '2017-02-01 10:00:00',
                    '2017-03-01 10:00:00', '2017-04-01 10:00:00',
                    '2017-05-01 10:00:00',
                ],
                dtype='datetime64'
            ),
        ),
        (
            # No ``unit`` argument: the strings already carry their unit,
            # and pandas rejects ``unit`` combined with string input.
            pd.to_timedelta(['1 days', '2 days', '3 days', '3 days']),
            pd.to_timedelta(['1 days', '2 days', '3 days', '5 days',
                             '3 days', '2 days', '4 days']),
            pd.timedelta_range("1 days", periods=5, freq="D"),
        ),
    ]
)
@pytest.mark.parametrize("is_ordered", [True, False])
def test_drop_duplicates_non_bool(self, input1, input2,
                                  cat_array, is_ordered):
    """Check duplicated()/drop_duplicates() on non-bool categorical Series.

    ``input1`` holds four values whose last two are equal; ``input2`` holds
    seven values with two repeated values.  Each is wrapped in a
    ``Categorical`` using ``cat_array`` as categories and ``is_ordered`` as
    the ``ordered`` flag.  For every ``keep`` mode (default, ``'last'`` and
    ``False``) the test asserts the ``duplicated()`` mask and verifies that
    ``drop_duplicates()`` -- both returning a new Series and in place --
    yields the rows not flagged by that mask.
    """
    # Per input: (keyword arguments, expected ``duplicated()`` mask).
    # The empty dict exercises the default ``keep`` behaviour.
    scenarios = [
        (input1, [
            ({}, [False, False, False, True]),
            ({'keep': 'last'}, [False, False, True, False]),
            ({'keep': False}, [False, False, True, True]),
        ]),
        (input2, [
            ({}, [False, False, False, False, True, True, False]),
            ({'keep': 'last'},
             [False, True, True, False, False, False, False]),
            ({'keep': False},
             [False, True, True, False, True, True, False]),
        ]),
    ]

    for values, checks in scenarios:
        ser = Series(Categorical(values, categories=cat_array,
                                 ordered=is_ordered))
        for kwargs, mask in checks:
            expected = Series(mask)
            deduplicated = ser[~expected]

            tm.assert_series_equal(ser.duplicated(**kwargs), expected)
            tm.assert_series_equal(ser.drop_duplicates(**kwargs),
                                   deduplicated)

            # The in-place variant must agree with the returned copy.
            sc = ser.copy()
            sc.drop_duplicates(inplace=True, **kwargs)
            tm.assert_series_equal(sc, deduplicated)
916+
917+
@pytest.mark.parametrize("is_ordered", [True, False])
def test_drop_duplicates_bool(self, is_ordered):
    """Check duplicated()/drop_duplicates() on a boolean categorical Series.

    The Series holds ``[True, False, True, False]`` with categories
    ``[True, False]``.  For each ``keep`` mode (default, ``'last'`` and
    ``False``) the ``duplicated()`` mask is asserted, and
    ``drop_duplicates()`` -- returned and in place -- must equal the rows
    not flagged by that mask.
    """
    tc = Series(Categorical([True, False, True, False],
                            categories=[True, False], ordered=is_ordered))

    # (keyword arguments, expected ``duplicated()`` mask); the empty dict
    # exercises the default ``keep`` behaviour.
    checks = [
        ({}, [False, False, True, True]),
        ({'keep': 'last'}, [True, True, False, False]),
        ({'keep': False}, [True, True, True, True]),
    ]

    for kwargs, mask in checks:
        expected = Series(mask)
        tm.assert_series_equal(tc.duplicated(**kwargs), expected)
        tm.assert_series_equal(tc.drop_duplicates(**kwargs), tc[~expected])

        sc = tc.copy()
        sc.drop_duplicates(inplace=True, **kwargs)
        tm.assert_series_equal(sc, tc[~expected])
942+
800943
def test_describe(self):
801944
# string type
802945
desc = self.factor.describe()

0 commit comments

Comments
 (0)