split out into separate file

MarcoGorelli · MarcoGorelli · commit 45b8da09d09f · 2025-03-08T10:39:07.000Z
diff --git a/pandas-stubs/core/indexes/base.pyi b/pandas-stubs/core/indexes/base.pyi
@@ -264,7 +264,15 @@ class Index(IndexOpsMixin[S1]):
     @property
     def str(
         self,
-    ) -> StringMethods[Self, MultiIndex, np_ndarray_bool, Index[list[str]]]: ...
+    ) -> StringMethods[
+        Self,
+        MultiIndex,
+        np_ndarray_bool,
+        Index[list[str]],
+        Index[int],
+        Index[bytes],
+        Index[str],
+    ]: ...
     def is_(self, other) -> bool: ...
     def __len__(self) -> int: ...
     def __array__(self, dtype=...) -> np.ndarray: ...
diff --git a/pandas-stubs/core/series.pyi b/pandas-stubs/core/series.pyi
@@ -1156,7 +1156,15 @@ class Series(IndexOpsMixin[S1], NDFrame):
     @property
     def str(
         self,
-    ) -> StringMethods[Self, DataFrame, Series[bool], Series[list[str]]]: ...
+    ) -> StringMethods[
+        Self,
+        DataFrame,
+        Series[bool],
+        Series[list[str]],
+        Series[int],
+        Series[bytes],
+        Series[str],
+    ]: ...
     @property
     def dt(self) -> CombinedDatetimelikeProperties: ...
     @property
diff --git a/pandas-stubs/core/strings.pyi b/pandas-stubs/core/strings.pyi
@@ -37,8 +37,14 @@ _TS = TypeVar("_TS", bound=DataFrame | MultiIndex)
 _TS2 = TypeVar("_TS2", bound=Series[list[str]] | Index[list[str]])
 # The _TM type is what is used for the result of str.match
 _TM = TypeVar("_TM", bound=Series[bool] | np_ndarray_bool)
+# The _TI type is what is used for the result of str.index / str.find
+_TI = TypeVar("_TI", bound=Series[int] | Index[int])
+# The _TE type is what is used for the result of str.encode
+_TE = TypeVar("_TE", bound=Series[bytes] | Index[bytes])
+# The _TD type is what is used for the result of str.decode / str.join
+_TD = TypeVar("_TD", bound=Series[str] | Index[str])
 
-class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]):
+class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2, _TI, _TE, _TD]):
     def __init__(self, data: T) -> None: ...
     def __getitem__(self, key: slice | int) -> T: ...
     def __iter__(self) -> T: ...
@@ -113,15 +119,15 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]):
     @overload
     def rpartition(self, *, expand: Literal[False]) -> pd.Series[type[object]]: ...
     def get(self, i: int) -> T: ...
-    def join(self, sep: str) -> T: ...
+    def join(self, sep: str) -> _TD: ...
     def contains(
         self,
         pat: str | re.Pattern[str],
         case: bool = ...,
         flags: int = ...,
         na: Scalar | NaTType | None = ...,
         regex: bool = ...,
-    ) -> Series[bool]: ...
+    ) -> _TM: ...
     def match(
         self, pat: str, case: bool = ..., flags: int = ..., na: Any = ...
     ) -> _TM: ...
@@ -151,8 +157,8 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]):
     def slice_replace(
         self, start: int | None = ..., stop: int | None = ..., repl: str | None = ...
     ) -> T: ...
-    def decode(self, encoding: str, errors: str = ...) -> Series[str]: ...
-    def encode(self, encoding: str, errors: str = ...) -> Series[bytes]: ...
+    def decode(self, encoding: str, errors: str = ...) -> _TD: ...
+    def encode(self, encoding: str, errors: str = ...) -> _TE: ...
     def strip(self, to_strip: str | None = ...) -> T: ...
     def lstrip(self, to_strip: str | None = ...) -> T: ...
     def rstrip(self, to_strip: str | None = ...) -> T: ...
@@ -167,9 +173,9 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]):
     ) -> T: ...
     def get_dummies(self, sep: str = ...) -> pd.DataFrame: ...
     def translate(self, table: dict[int, int | str | None] | None) -> T: ...
-    def count(self, pat: str, flags: int = ...) -> Series[int]: ...
-    def startswith(self, pat: str | tuple[str, ...], na: Any = ...) -> Series[bool]: ...
-    def endswith(self, pat: str | tuple[str, ...], na: Any = ...) -> Series[bool]: ...
+    def count(self, pat: str, flags: int = ...) -> _TI: ...
+    def startswith(self, pat: str | tuple[str, ...], na: Any = ...) -> _TM: ...
+    def endswith(self, pat: str | tuple[str, ...], na: Any = ...) -> _TM: ...
     def findall(self, pat: str, flags: int = ...) -> _TS2: ...
     @overload
     def extract(
@@ -184,37 +190,29 @@ class StringMethods(NoNewAttributesMixin, Generic[T, _TS, _TM, _TS2]):
         self, pat: str, flags: int = ..., *, expand: Literal[False]
     ) -> Series[type[object]]: ...
     def extractall(self, pat: str, flags: int = ...) -> pd.DataFrame: ...
-    def find(
-        self, sub: str, start: int = ..., end: int | None = ...
-    ) -> Series[int]: ...
-    def rfind(
-        self, sub: str, start: int = ..., end: int | None = ...
-    ) -> Series[int]: ...
+    def find(self, sub: str, start: int = ..., end: int | None = ...) -> _TI: ...
+    def rfind(self, sub: str, start: int = ..., end: int | None = ...) -> _TI: ...
     def normalize(self, form: Literal["NFC", "NFKC", "NFD", "NFKD"]) -> T: ...
-    def index(
-        self, sub: str, start: int = ..., end: int | None = ...
-    ) -> Series[int]: ...
-    def rindex(
-        self, sub: str, start: int = ..., end: int | None = ...
-    ) -> Series[int]: ...
-    def len(self) -> Series[int]: ...
+    def index(self, sub: str, start: int = ..., end: int | None = ...) -> _TI: ...
+    def rindex(self, sub: str, start: int = ..., end: int | None = ...) -> _TI: ...
+    def len(self) -> _TI: ...
     def lower(self) -> T: ...
     def upper(self) -> T: ...
     def title(self) -> T: ...
     def capitalize(self) -> T: ...
     def swapcase(self) -> T: ...
     def casefold(self) -> T: ...
-    def isalnum(self) -> Series[bool]: ...
-    def isalpha(self) -> Series[bool]: ...
-    def isdigit(self) -> Series[bool]: ...
-    def isspace(self) -> Series[bool]: ...
-    def islower(self) -> Series[bool]: ...
-    def isupper(self) -> Series[bool]: ...
-    def istitle(self) -> Series[bool]: ...
-    def isnumeric(self) -> Series[bool]: ...
-    def isdecimal(self) -> Series[bool]: ...
+    def isalnum(self) -> _TM: ...
+    def isalpha(self) -> _TM: ...
+    def isdigit(self) -> _TM: ...
+    def isspace(self) -> _TM: ...
+    def islower(self) -> _TM: ...
+    def isupper(self) -> _TM: ...
+    def istitle(self) -> _TM: ...
+    def isnumeric(self) -> _TM: ...
+    def isdecimal(self) -> _TM: ...
     def fullmatch(
         self, pat: str, case: bool = ..., flags: int = ..., na: Any = ...
-    ) -> Series[bool]: ...
+    ) -> _TM: ...
     def removeprefix(self, prefix: str) -> T: ...
     def removesuffix(self, suffix: str) -> T: ...
diff --git a/test b/test
@@ -0,0 +1,2 @@
+    test
+ind  abc
diff --git a/tests/test_string_accessors.py b/tests/test_string_accessors.py
@@ -0,0 +1,142 @@
+import functools
+import re
+from typing import Any
+
+import numpy as np
+import pandas as pd
+import pytest
+from typing_extensions import assert_type
+
+from tests import check
+
+
+@pytest.mark.parametrize("constructor", ["series", "index"])
+@pytest.mark.parametrize(
+    ("method", "kwargs"),
+    [
+        ("capitalize", {}),
+    ],
+)
+def test_string_accessors_type_preserving_series(
+    constructor: Any, method: str, kwargs: Any
+) -> None:
+    data = ["applep", "bananap", "Cherryp", "DATEp", "eGGpLANTp", "123p", "23.45p"]
+    s = pd.Series(data)
+    _check = functools.partial(check, klass=pd.Series, dtype=str)
+    _check(assert_type(s.str.capitalize(), "pd.Series[str]"))
+    _check(assert_type(s.str.casefold(), "pd.Series[str]"))
+    check(assert_type(s.str.cat(sep="X"), str), str)
+    _check(assert_type(s.str.center(10), "pd.Series[str]"))
+    _check(assert_type(s.str.get(2), "pd.Series[str]"))
+    _check(assert_type(s.str.ljust(80), "pd.Series[str]"))
+    _check(assert_type(s.str.lower(), "pd.Series[str]"))
+    _check(assert_type(s.str.lstrip("a"), "pd.Series[str]"))
+    _check(assert_type(s.str.normalize("NFD"), "pd.Series[str]"))
+    _check(assert_type(s.str.pad(80, "right"), "pd.Series[str]"))
+    _check(assert_type(s.str.removeprefix("a"), "pd.Series[str]"))
+    _check(assert_type(s.str.removesuffix("e"), "pd.Series[str]"))
+    _check(assert_type(s.str.repeat(2), "pd.Series[str]"))
+    _check(assert_type(s.str.replace("a", "X"), "pd.Series[str]"))
+    _check(assert_type(s.str.rjust(80), "pd.Series[str]"))
+    _check(assert_type(s.str.rstrip(), "pd.Series[str]"))
+    _check(assert_type(s.str.slice(0, 4, 2), "pd.Series[str]"))
+    _check(assert_type(s.str.slice_replace(0, 2, "XX"), "pd.Series[str]"))
+    _check(assert_type(s.str.strip(), "pd.Series[str]"))
+    _check(assert_type(s.str.swapcase(), "pd.Series[str]"))
+    _check(assert_type(s.str.title(), "pd.Series[str]"))
+    _check(
+        assert_type(s.str.translate({241: "n"}), "pd.Series[str]"),
+    )
+    _check(assert_type(s.str.upper(), "pd.Series[str]"))
+    _check(assert_type(s.str.wrap(80), "pd.Series[str]"))
+    _check(assert_type(s.str.zfill(10), "pd.Series[str]"))
+
+
+def test_string_accessors_type_boolean():
+    s = pd.Series(
+        ["applep", "bananap", "Cherryp", "DATEp", "eGGpLANTp", "123p", "23.45p"]
+    )
+    check(assert_type(s.str.startswith("a"), "pd.Series[bool]"), pd.Series, np.bool_)
+    check(
+        assert_type(s.str.startswith(("a", "b")), "pd.Series[bool]"),
+        pd.Series,
+        np.bool_,
+    )
+    check(assert_type(s.str.contains("a"), "pd.Series[bool]"), pd.Series, np.bool_)
+    check(
+        assert_type(s.str.contains(re.compile(r"a")), "pd.Series[bool]"),
+        pd.Series,
+        np.bool_,
+    )
+    check(assert_type(s.str.endswith("e"), "pd.Series[bool]"), pd.Series, np.bool_)
+    check(
+        assert_type(s.str.endswith(("e", "f")), "pd.Series[bool]"), pd.Series, np.bool_
+    )
+    check(assert_type(s.str.fullmatch("apple"), "pd.Series[bool]"), pd.Series, np.bool_)
+    check(assert_type(s.str.isalnum(), "pd.Series[bool]"), pd.Series, np.bool_)
+    check(assert_type(s.str.isalpha(), "pd.Series[bool]"), pd.Series, np.bool_)
+    check(assert_type(s.str.isdecimal(), "pd.Series[bool]"), pd.Series, np.bool_)
+    check(assert_type(s.str.isdigit(), "pd.Series[bool]"), pd.Series, np.bool_)
+    check(assert_type(s.str.isnumeric(), "pd.Series[bool]"), pd.Series, np.bool_)
+    check(assert_type(s.str.islower(), "pd.Series[bool]"), pd.Series, np.bool_)
+    check(assert_type(s.str.isspace(), "pd.Series[bool]"), pd.Series, np.bool_)
+    check(assert_type(s.str.istitle(), "pd.Series[bool]"), pd.Series, np.bool_)
+    check(assert_type(s.str.isupper(), "pd.Series[bool]"), pd.Series, np.bool_)
+    check(assert_type(s.str.match("pp"), "pd.Series[bool]"), pd.Series, np.bool_)
+
+
+def test_string_accessors_type_integer():
+    s = pd.Series(
+        ["applep", "bananap", "Cherryp", "DATEp", "eGGpLANTp", "123p", "23.45p"]
+    )
+    check(assert_type(s.str.find("p"), "pd.Series[int]"), pd.Series, np.int64)
+    check(assert_type(s.str.index("p"), "pd.Series[int]"), pd.Series, np.int64)
+    check(assert_type(s.str.rfind("e"), "pd.Series[int]"), pd.Series, np.int64)
+    check(assert_type(s.str.rindex("p"), "pd.Series[int]"), pd.Series, np.int64)
+    check(assert_type(s.str.count("pp"), "pd.Series[int]"), pd.Series, np.integer)
+    check(assert_type(s.str.len(), "pd.Series[int]"), pd.Series, np.integer)
+
+
+def test_string_accessors_encode_decode():
+    s_str = pd.Series(["a1", "b2", "c3"])
+    s_bytes = pd.Series([b"a1", b"b2", b"c3"])
+    s2 = pd.Series([["apple", "banana"], ["cherry", "date"], [1, "eggplant"]])
+    check(
+        assert_type(s_bytes.str.decode("utf-8"), "pd.Series[str]"),
+        pd.Series,  # runtime class, as in the other check() calls (not the annotation string)
+        str,
+    )
+    check(
+        assert_type(s_str.str.encode("latin-1"), "pd.Series[bytes]"), pd.Series, bytes
+    )
+    check(assert_type(s2.str.join("-"), "pd.Series[str]"), pd.Series, str)
+
+
+def test_string_accessors_list():
+    s = pd.Series(
+        ["applep", "bananap", "Cherryp", "DATEp", "eGGpLANTp", "123p", "23.45p"]
+    )
+    check(assert_type(s.str.findall("pp"), "pd.Series[list[str]]"), pd.Series, list)
+    check(assert_type(s.str.split("a"), "pd.Series[list[str]]"), pd.Series, list)
+    # GH 194
+    check(
+        assert_type(s.str.split("a", expand=False), "pd.Series[list[str]]"),
+        pd.Series,
+        list,
+    )
+    check(assert_type(s.str.rsplit("a"), "pd.Series[list[str]]"), pd.Series, list)
+    check(
+        assert_type(s.str.rsplit("a", expand=False), "pd.Series[list[str]]"),
+        pd.Series,
+        list,
+    )
+
+
+# def test_string_accessors_expanding():
+#     check(assert_type(s3.str.extract(r"([ab])?(\d)"), pd.DataFrame), pd.DataFrame)
+#     check(assert_type(s3.str.extractall(r"([ab])?(\d)"), pd.DataFrame), pd.DataFrame)
+#     check(assert_type(s.str.get_dummies(), pd.DataFrame), pd.DataFrame)
+#     check(assert_type(s.str.partition("p"), pd.DataFrame), pd.DataFrame)
+#     check(assert_type(s.str.rpartition("p"), pd.DataFrame), pd.DataFrame)
+#     check(assert_type(s.str.rsplit("a", expand=True), pd.DataFrame), pd.DataFrame)
+#     check(assert_type(s.str.split("a", expand=True), pd.DataFrame), pd.DataFrame)