BUG: algos.factorize moves null values when sort=False

rhshadrach · rhshadrach · commit 491a6cc0bab5 · 2022-04-01T10:14:36.000-04:00
diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in
@@ -658,7 +658,7 @@ cdef class {{name}}HashTable(HashTable):
                             return_inverse=return_inverse)
 
     def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None, object mask=None):
+                  object na_value=None, object mask=None, ignore_na=True):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -690,7 +690,7 @@ cdef class {{name}}HashTable(HashTable):
         """
         uniques_vector = {{name}}Vector()
         return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
-                            na_value=na_value, ignore_na=True, mask=mask,
+                            na_value=na_value, ignore_na=ignore_na, mask=mask,
                             return_inverse=True)
 
     def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques,
@@ -1037,7 +1037,7 @@ cdef class StringHashTable(HashTable):
                             return_inverse=return_inverse)
 
     def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None, object mask=None):
+                  object na_value=None, object mask=None, ignore_na=True):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -1067,7 +1067,7 @@ cdef class StringHashTable(HashTable):
         """
         uniques_vector = ObjectVector()
         return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
-                            na_value=na_value, ignore_na=True,
+                            na_value=na_value, ignore_na=ignore_na,
                             return_inverse=True)
 
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
@@ -1290,7 +1290,7 @@ cdef class PyObjectHashTable(HashTable):
                             return_inverse=return_inverse)
 
     def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1,
-                  object na_value=None, object mask=None):
+                  object na_value=None, object mask=None, ignore_na=True):
         """
         Calculate unique values and labels (no sorting!)
 
@@ -1320,7 +1320,7 @@ cdef class PyObjectHashTable(HashTable):
         """
         uniques_vector = ObjectVector()
         return self._unique(values, uniques_vector, na_sentinel=na_sentinel,
-                            na_value=na_value, ignore_na=True,
+                            na_value=na_value, ignore_na=ignore_na,
                             return_inverse=True)
 
     def get_labels(self, ndarray[object] values, ObjectVector uniques,
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -502,6 +502,7 @@ def factorize_array(
     size_hint: int | None = None,
     na_value=None,
     mask: npt.NDArray[np.bool_] | None = None,
+    ignore_na: bool = True,
 ) -> tuple[npt.NDArray[np.intp], np.ndarray]:
     """
     Factorize a numpy array to codes and uniques.
@@ -541,7 +542,11 @@ def factorize_array(
 
     table = hash_klass(size_hint or len(values))
     uniques, codes = table.factorize(
-        values, na_sentinel=na_sentinel, na_value=na_value, mask=mask
+        values,
+        na_sentinel=na_sentinel,
+        na_value=na_value,
+        mask=mask,
+        ignore_na=ignore_na,
     )
 
     # re-cast e.g. i8->dt64/td64, uint8->bool
@@ -728,25 +733,31 @@ def factorize(
 
     if not isinstance(values.dtype, np.dtype):
         # i.e. ExtensionDtype
+        assert dropna or sort
         codes, uniques = values.factorize(na_sentinel=na_sentinel)
     else:
         values = np.asarray(values)  # convert DTA/TDA/MultiIndex
         codes, uniques = factorize_array(
-            values, na_sentinel=na_sentinel, size_hint=size_hint
+            values,
+            na_sentinel=na_sentinel,
+            size_hint=size_hint,
+            ignore_na=dropna or sort,
         )
 
     if sort and len(uniques) > 0:
         uniques, codes = safe_sort(
             uniques, codes, na_sentinel=na_sentinel, assume_unique=True, verify=False
         )
 
-    code_is_na = codes == na_sentinel
-    if not dropna and code_is_na.any():
-        # na_value is set based on the dtype of uniques, and compat set to False is
-        # because we do not want na_value to be 0 for integers
-        na_value = na_value_for_dtype(uniques.dtype, compat=False)
-        uniques = np.append(uniques, [na_value])
-        codes = np.where(code_is_na, len(uniques) - 1, codes)
+    # TODO: Fix; presumably NA is still appended last for extension arrays
+    if not dropna and (sort or not isinstance(values.dtype, np.dtype)):
+        code_is_na = codes == na_sentinel
+        if code_is_na.any():
+            # na_value is set based on the dtype of uniques, and compat set to False is
+            # because we do not want na_value to be 0 for integers
+            na_value = na_value_for_dtype(uniques.dtype, compat=False)
+            uniques = np.append(uniques, [na_value])
+            codes = np.where(code_is_na, len(uniques) - 1, codes)
 
     uniques = _reconstruct_data(uniques, original.dtype, original)
 
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -1044,7 +1044,7 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
         arr, na_value = self._values_for_factorize()
 
         codes, uniques = factorize_array(
-            arr, na_sentinel=na_sentinel, na_value=na_value
+            arr, na_sentinel=na_sentinel, na_value=na_value, ignore_na=True
         )
 
         uniques_ea = self._from_factorized(uniques, self)
diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py
@@ -348,3 +348,12 @@ def test_groupby_nan_included():
         tm.assert_numpy_array_equal(result_values, expected_values)
     assert np.isnan(list(result.keys())[2])
     assert list(result.keys())[0:2] == ["g1", "g2"]
+
+
+def test_no_sort_keep_na():
+    df = pd.DataFrame({"a": ["x", "y", None, "z"], "b": [1, 2, 3, 4]})
+    gb = df.groupby("a", dropna=False, sort=False)
+    result = gb.sum()
+    expected = pd.DataFrame({"b": [1, 2, 3, 4]}, index=["x", "y", None, "z"])
+    expected.index.names = ["a"]
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -436,13 +436,13 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):
         [
             (
                 ["a", None, "b", "a"],
-                np.array([0, 2, 1, 0], dtype=np.dtype("intp")),
-                np.array(["a", "b", np.nan], dtype=object),
+                np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
+                np.array(["a", None, "b"], dtype=object),
             ),
             (
                 ["a", np.nan, "b", "a"],
-                np.array([0, 2, 1, 0], dtype=np.dtype("intp")),
-                np.array(["a", "b", np.nan], dtype=object),
+                np.array([0, 1, 2, 0], dtype=np.dtype("intp")),
+                np.array(["a", np.nan, "b"], dtype=object),
             ),
         ],
     )
@@ -459,13 +459,13 @@ def test_object_factorize_na_sentinel_none(
         [
             (
                 [1, None, 1, 2],
-                np.array([0, 2, 0, 1], dtype=np.dtype("intp")),
-                np.array([1, 2, np.nan], dtype="O"),
+                np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
+                np.array([1, None, 2], dtype="O"),
             ),
             (
                 [1, np.nan, 1, 2],
-                np.array([0, 2, 0, 1], dtype=np.dtype("intp")),
-                np.array([1, 2, np.nan], dtype=np.float64),
+                np.array([0, 1, 0, 2], dtype=np.dtype("intp")),
+                np.array([1, np.nan, 2], dtype=np.float64),
             ),
         ],
     )

Original file line number	Diff line number	Diff line change
`@@ -1044,7 +1044,7 @@ def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:`
`1044`	`1044`	`arr, na_value = self._values_for_factorize()`
`1045`	`1045`
`1046`	`1046`	`codes, uniques = factorize_array(`
`1047`		`- arr, na_sentinel=na_sentinel, na_value=na_value`
	`1047`	`+ arr, na_sentinel=na_sentinel, na_value=na_value, ignore_na=True`
`1048`	`1048`	`)`
`1049`	`1049`
`1050`	`1050`	`uniques_ea = self._from_factorized(uniques, self)`
Original file line number	Diff line number	Diff line change
`@@ -436,13 +436,13 @@ def test_factorize_na_sentinel(self, sort, na_sentinel, data, uniques):`
`436`	`436`	`[`
`437`	`437`	`(`
`438`	`438`	`["a", None, "b", "a"],`
`439`		`- np.array([0, 2, 1, 0], dtype=np.dtype("intp")),`
`440`		`- np.array(["a", "b", np.nan], dtype=object),`
	`439`	`+ np.array([0, 1, 2, 0], dtype=np.dtype("intp")),`
	`440`	`+ np.array(["a", None, "b"], dtype=object),`
`441`	`441`	`),`
`442`	`442`	`(`
`443`	`443`	`["a", np.nan, "b", "a"],`
`444`		`- np.array([0, 2, 1, 0], dtype=np.dtype("intp")),`
`445`		`- np.array(["a", "b", np.nan], dtype=object),`
	`444`	`+ np.array([0, 1, 2, 0], dtype=np.dtype("intp")),`
	`445`	`+ np.array(["a", np.nan, "b"], dtype=object),`
`446`	`446`	`),`
`447`	`447`	`],`
`448`	`448`	`)`
`@@ -459,13 +459,13 @@ def test_object_factorize_na_sentinel_none(`
`459`	`459`	`[`
`460`	`460`	`(`
`461`	`461`	`[1, None, 1, 2],`
`462`		`- np.array([0, 2, 0, 1], dtype=np.dtype("intp")),`
`463`		`- np.array([1, 2, np.nan], dtype="O"),`
	`462`	`+ np.array([0, 1, 0, 2], dtype=np.dtype("intp")),`
	`463`	`+ np.array([1, None, 2], dtype="O"),`
`464`	`464`	`),`
`465`	`465`	`(`
`466`	`466`	`[1, np.nan, 1, 2],`
`467`		`- np.array([0, 2, 0, 1], dtype=np.dtype("intp")),`
`468`		`- np.array([1, 2, np.nan], dtype=np.float64),`
	`467`	`+ np.array([0, 1, 0, 2], dtype=np.dtype("intp")),`
	`468`	`+ np.array([1, np.nan, 2], dtype=np.float64),`
`469`	`469`	`),`
`470`	`470`	`],`
`471`	`471`	`)`