Skip to content

Commit 491a6cc

Browse files
committed
BUG: algos.factorize moves null values when sort=False
1 parent efb262f commit 491a6cc

File tree

5 files changed

+44
-24
lines changed

5 files changed

+44
-24
lines changed

pandas/_libs/hashtable_class_helper.pxi.in

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -658,7 +658,7 @@ cdef class {{name}}HashTable(HashTable):
658658
return_inverse=return_inverse)
659659

660660
def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
661-
object na_value=None, object mask=None):
661+
object na_value=None, object mask=None, ignore_na=True):
662662
"""
663663
Calculate unique values and labels (no sorting!)
664664

@@ -690,7 +690,7 @@ cdef class {{name}}HashTable(HashTable):
690690
"""
691691
uniques_vector = {{name}}Vector()
692692
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
693-
na_value=na_value, ignore_na=True, mask=mask,
693+
na_value=na_value, ignore_na=ignore_na, mask=mask,
694694
return_inverse=True)
695695

696696
def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
@@ -1037,7 +1037,7 @@ cdef class StringHashTable(HashTable):
10371037
return_inverse=return_inverse)
10381038

10391039
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
1040-
object na_value=None, object mask=None):
1040+
object na_value=None, object mask=None, ignore_na=True):
10411041
"""
10421042
Calculate unique values and labels (no sorting!)
10431043

@@ -1067,7 +1067,7 @@ cdef class StringHashTable(HashTable):
10671067
"""
10681068
uniques_vector = ObjectVector()
10691069
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
1070-
na_value=na_value, ignore_na=True,
1070+
na_value=na_value, ignore_na=ignore_na,
10711071
return_inverse=True)
10721072

10731073
def get_labels(self, ndarray[object] values, ObjectVector uniques,
@@ -1290,7 +1290,7 @@ cdef class PyObjectHashTable(HashTable):
12901290
return_inverse=return_inverse)
12911291

12921292
def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
1293-
object na_value=None, object mask=None):
1293+
object na_value=None, object mask=None, ignore_na=True):
12941294
"""
12951295
Calculate unique values and labels (no sorting!)
12961296

@@ -1320,7 +1320,7 @@ cdef class PyObjectHashTable(HashTable):
13201320
"""
13211321
uniques_vector = ObjectVector()
13221322
return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
1323-
na_value=na_value, ignore_na=True,
1323+
na_value=na_value, ignore_na=ignore_na,
13241324
return_inverse=True)
13251325

13261326
def get_labels(self, ndarray[object] values, ObjectVector uniques,

pandas/core/algorithms.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -502,6 +502,7 @@ def factorize_array(
502502
size_hint: int | None = None,
503503
na_value=None,
504504
mask: npt.NDArray[np.bool_] | None = None,
505+
ignore_na: bool = True,
505506
) -> tuple[npt.NDArray[np.intp], np.ndarray]:
506507
"""
507508
Factorize a numpy array to codes and uniques.
@@ -541,7 +542,11 @@ def factorize_array(
541542

542543
table = hash_klass(size_hint or len(values))
543544
uniques, codes = table.factorize(
544-
values, na_sentinel=na_sentinel, na_value=na_value, mask=mask
545+
values,
546+
na_sentinel=na_sentinel,
547+
na_value=na_value,
548+
mask=mask,
549+
ignore_na=ignore_na,
545550
)
546551

547552
# re-cast e.g. i8->dt64/td64, uint8->bool
@@ -728,25 +733,31 @@ def factorize(
728733

729734
if not isinstance(values.dtype, np.dtype):
730735
# i.e. ExtensionDtype
736+
assert dropna or sort
731737
codes, uniques = values.factorize(na_sentinel=na_sentinel)
732738
else:
733739
values = np.asarray(values) # convert DTA/TDA/MultiIndex
734740
codes, uniques = factorize_array(
735-
values, na_sentinel=na_sentinel, size_hint=size_hint
741+
values,
742+
na_sentinel=na_sentinel,
743+
size_hint=size_hint,
744+
ignore_na=dropna or sort,
736745
)
737746

738747
if sort and len(uniques) > 0:
739748
uniques, codes = safe_sort(
740749
uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
741750
)
742751

743-
code_is_na = codes == na_sentinel
744-
if not dropna and code_is_na.any():
745-
# na_value is set based on the dtype of uniques, and compat set to False is
746-
# because we do not want na_value to be 0 for integers
747-
na_value = na_value_for_dtype(uniques.dtype, compat=False)
748-
uniques = np.append(uniques, [na_value])
749-
codes = np.where(code_is_na, len(uniques) - 1, codes)
752+
# TODO: Fix
753+
if not dropna and (sort or not isinstance(values.dtype, np.dtype)):
754+
code_is_na = codes == na_sentinel
755+
if code_is_na.any():
756+
# na_value is set based on the dtype of uniques, and compat set to False is
757+
# because we do not want na_value to be 0 for integers
758+
na_value = na_value_for_dtype(uniques.dtype, compat=False)
759+
uniques = np.append(uniques, [na_value])
760+
codes = np.where(code_is_na, len(uniques) - 1, codes)
750761

751762
uniques = _reconstruct_data(uniques, original.dtype, original)
752763

pandas/core/arrays/base.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1044,7 +1044,7 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
10441044
arr, na_value = self._values_for_factorize()
10451045

10461046
codes, uniques = factorize_array(
1047-
arr, na_sentinel=na_sentinel, na_value=na_value
1047+
arr, na_sentinel=na_sentinel, na_value=na_value, ignore_na=True
10481048
)
10491049

10501050
uniques_ea = self._from_factorized(uniques, self)

pandas/tests/groupby/test_groupby_dropna.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,3 +348,12 @@ def test_groupby_nan_included():
348348
tm.assert_numpy_array_equal(result_values, expected_values)
349349
assert np.isnan(list(result.keys())[2])
350350
assert list(result.keys())[0:2] == ["g1", "g2"]
351+
352+
353+
def test_no_sort_keep_na():
354+
df = pd.DataFrame({"a": ["x", "y", None, "z"], "b": [1, 2, 3, 4]})
355+
gb = df.groupby("a", dropna=False, sort=False)
356+
result = gb.sum()
357+
expected = pd.DataFrame({"b": [1, 2, 3, 4]}, index=["x", "y", None, "z"])
358+
expected.index.names = ["a"]
359+
tm.assert_frame_equal(result, expected)

pandas/tests/test_algos.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -436,13 +436,13 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
436436
[
437437
(
438438
["a", None, "b", "a"],
439-
np.array([0, 2, 1, 0], dtype=np.dtype("intp")),
440-
np.array(["a", "b", np.nan], dtype=object),
439+
np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
440+
np.array(["a", None, "b"], dtype=object),
441441
),
442442
(
443443
["a", np.nan, "b", "a"],
444-
np.array([0, 2, 1, 0], dtype=np.dtype("intp")),
445-
np.array(["a", "b", np.nan], dtype=object),
444+
np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
445+
np.array(["a", np.nan, "b"], dtype=object),
446446
),
447447
],
448448
)
@@ -459,13 +459,13 @@ def test_object_factorize_na_sentinel_none(
459459
[
460460
(
461461
[1, None, 1, 2],
462-
np.array([0, 2, 0, 1], dtype=np.dtype("intp")),
463-
np.array([1, 2, np.nan], dtype="O"),
462+
np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
463+
np.array([1, None, 2], dtype="O"),
464464
),
465465
(
466466
[1, np.nan, 1, 2],
467-
np.array([0, 2, 0, 1], dtype=np.dtype("intp")),
468-
np.array([1, 2, np.nan], dtype=np.float64),
467+
np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
468+
np.array([1, np.nan, 2], dtype=np.float64),
469469
),
470470
],
471471
)

0 commit comments

Comments
 (0)