BUG: rank raising for arrow string dtypes (pandas-dev#55362)

phofl · phofl · commit 53b945ce645f · 2023-10-04T21:54:46.000+02:00
(cherry picked from commit 6a83910)
diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst
@@ -26,6 +26,7 @@ Bug fixes
 - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)
 - Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`)
 - Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`)
+- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`)
 - Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`)
 -
 
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -1712,7 +1712,7 @@ def __setitem__(self, key, value) -> None:
             data = pa.chunked_array([data])
         self._pa_array = data
 
-    def _rank(
+    def _rank_calc(
         self,
         *,
         axis: AxisInt = 0,
@@ -1721,9 +1721,6 @@ def _rank(
         ascending: bool = True,
         pct: bool = False,
     ):
-        """
-        See Series.rank.__doc__.
-        """
         if pa_version_under9p0 or axis != 0:
             ranked = super()._rank(
                 axis=axis,
@@ -1738,7 +1735,7 @@ def _rank(
             else:
                 pa_type = pa.uint64()
             result = pa.array(ranked, type=pa_type, from_pandas=True)
-            return type(self)(result)
+            return result
 
         data = self._pa_array.combine_chunks()
         sort_keys = "ascending" if ascending else "descending"
@@ -1777,7 +1774,29 @@ def _rank(
                 divisor = pc.count(result)
             result = pc.divide(result, divisor)
 
-        return type(self)(result)
+        return result
+
+    def _rank(
+        self,
+        *,
+        axis: AxisInt = 0,
+        method: str = "average",
+        na_option: str = "keep",
+        ascending: bool = True,
+        pct: bool = False,
+    ):
+        """
+        See Series.rank.__doc__.
+        """
+        return type(self)(
+            self._rank_calc(
+                axis=axis,
+                method=method,
+                na_option=na_option,
+                ascending=ascending,
+                pct=pct,
+            )
+        )
 
     def _quantile(self, qs: npt.NDArray[np.float64], interpolation: str) -> Self:
         """
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -15,7 +15,10 @@
     lib,
     missing as libmissing,
 )
-from pandas.compat import pa_version_under7p0
+from pandas.compat import (
+    pa_version_under7p0,
+    pa_version_under13p0,
+)
 from pandas.util._exceptions import find_stack_level
 
 from pandas.core.dtypes.common import (
@@ -48,6 +51,7 @@
 
 if TYPE_CHECKING:
     from pandas._typing import (
+        AxisInt,
         Dtype,
         Scalar,
         npt,
@@ -444,6 +448,65 @@ def _str_rstrip(self, to_strip=None):
             result = pc.utf8_rtrim(self._pa_array, characters=to_strip)
         return type(self)(result)
 
+    def _str_removeprefix(self, prefix: str):
+        if not pa_version_under13p0:
+            starts_with = pc.starts_with(self._pa_array, pattern=prefix)
+            removed = pc.utf8_slice_codeunits(self._pa_array, len(prefix))
+            result = pc.if_else(starts_with, removed, self._pa_array)
+            return type(self)(result)
+        return super()._str_removeprefix(prefix)
+
+    def _str_removesuffix(self, suffix: str):
+        ends_with = pc.ends_with(self._pa_array, pattern=suffix)
+        removed = pc.utf8_slice_codeunits(self._pa_array, 0, stop=-len(suffix))
+        result = pc.if_else(ends_with, removed, self._pa_array)
+        return type(self)(result)
+
+    def _str_count(self, pat: str, flags: int = 0):
+        if flags:
+            return super()._str_count(pat, flags)
+        result = pc.count_substring_regex(self._pa_array, pat)
+        return self._convert_int_dtype(result)
+
+    def _str_find(self, sub: str, start: int = 0, end: int | None = None):
+        if start != 0 and end is not None:
+            slices = pc.utf8_slice_codeunits(self._pa_array, start, stop=end)
+            result = pc.find_substring(slices, sub)
+            not_found = pc.equal(result, -1)
+            offset_result = pc.add(result, end - start)
+            result = pc.if_else(not_found, result, offset_result)
+        elif start == 0 and end is None:
+            slices = self._pa_array
+            result = pc.find_substring(slices, sub)
+        else:
+            return super()._str_find(sub, start, end)
+        return self._convert_int_dtype(result)
+
+    def _convert_int_dtype(self, result):
+        return Int64Dtype().__from_arrow__(result)
+
+    def _rank(
+        self,
+        *,
+        axis: AxisInt = 0,
+        method: str = "average",
+        na_option: str = "keep",
+        ascending: bool = True,
+        pct: bool = False,
+    ):
+        """
+        See Series.rank.__doc__.
+        """
+        return self._convert_int_dtype(
+            self._rank_calc(
+                axis=axis,
+                method=method,
+                na_option=na_option,
+                ascending=ascending,
+                pct=pct,
+            )
+        )
+
 
 class ArrowStringArrayNumpySemantics(ArrowStringArray):
     _storage = "pyarrow_numpy"
@@ -527,6 +590,10 @@ def _str_map(
             return lib.map_infer_mask(arr, f, mask.view("uint8"))
 
     def _convert_int_dtype(self, result):
+        if isinstance(result, pa.Array):
+            result = result.to_numpy(zero_copy_only=False)
+        else:
+            result = result.to_numpy()
         if result.dtype == np.int32:
             result = result.astype(np.int64)
         return result
diff --git a/pandas/tests/frame/methods/test_rank.py b/pandas/tests/frame/methods/test_rank.py
@@ -488,3 +488,15 @@ def test_rank_mixed_axis_zero(self, data, expected):
             df.rank()
         result = df.rank(numeric_only=True)
         tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        "dtype, exp_dtype",
+        [("string[pyarrow]", "Int64"), ("string[pyarrow_numpy]", "float64")],
+    )
+    def test_rank_string_dtype(self, dtype, exp_dtype):
+        # GH#55362
+        pytest.importorskip("pyarrow")
+        obj = Series(["foo", "foo", None, "foo"], dtype=dtype)
+        result = obj.rank(method="first")
+        expected = Series([1, 2, None, 3], dtype=exp_dtype)
+        tm.assert_series_equal(result, expected)

Original file line number	Diff line number	Diff line change
`@@ -26,6 +26,7 @@ Bug fixes`
`26`	`26`	- Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`)
`27`	`27`	- Fixed bug in :meth:`DataFrame.interpolate` raising incorrect error message (:issue:`55347`)
`28`	`28`	- Fixed bug in :meth:`Index.insert` raising when inserting ``None`` into :class:`Index` with ``dtype="string[pyarrow_numpy]"`` (:issue:`55365`)
	`29`	+- Fixed bug in :meth:`Series.rank` for ``string[pyarrow_numpy]`` dtype (:issue:`55362`)
`29`	`30`	- Silence ``Period[B]`` warnings introduced by :issue:`53446` during normal plotting activity (:issue:`55138`)
`30`	`31`	`-`
`31`	`32`