sthagen · sthagen · Apr 15, 2021 · Apr 15, 2021 · Apr 15, 2021 · Apr 15, 2021
diff --git a/doc/source/whatsnew/v0.11.0.rst b/doc/source/whatsnew/v0.11.0.rst
@@ -306,6 +306,7 @@ Astype conversion on ``datetime64[ns]`` to ``object``, implicitly converts ``NaT
 
 .. ipython:: python
 
+   import datetime
    s = pd.Series([datetime.datetime(2001, 1, 2, 0, 0) for i in range(3)])
    s.dtype
    s[1] = np.nan

diff --git a/pandas/conftest.py b/pandas/conftest.py
@@ -1144,6 +1144,8 @@ def nullable_string_dtype(request):
     * 'string'
     * 'arrow_string'
     """
+    from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
+
     return request.param
 
 

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
@@ -24,6 +24,10 @@
 from pandas.util._validators import validate_fillna_kwargs
 
 from pandas.core.dtypes.base import ExtensionDtype
+from pandas.core.dtypes.common import (
+    is_object_dtype,
+    is_string_dtype,
+)
 from pandas.core.dtypes.dtypes import register_extension_dtype
 from pandas.core.dtypes.missing import isna
 
@@ -41,6 +45,7 @@
     check_array_indexer,
     validate_indices,
 )
+from pandas.core.strings.object_array import ObjectStringArrayMixin
 
 try:
     import pyarrow as pa
@@ -149,7 +154,12 @@ def __eq__(self, other) -> bool:
             return False
 
 
-class ArrowStringArray(OpsMixin, ExtensionArray):
+# TODO: Inherit directly from BaseStringArrayMethods. Currently we inherit from
+# ObjectStringArrayMixin because we want to have the object-dtype based methods as
+# a fallback for the ones that pyarrow doesn't yet support.
+
+
+class ArrowStringArray(OpsMixin, ExtensionArray, ObjectStringArrayMixin):
     """
     Extension array for string data in a ``pyarrow.ChunkedArray``.
 
@@ -676,3 +686,71 @@ def value_counts(self, dropna: bool = True) -> Series:
             raise NotImplementedError("yo")
 
         return Series(counts, index=index).astype("Int64")
+
+    # ------------------------------------------------------------------------
+    # String methods interface
+
+    _str_na_value = ArrowStringDtype.na_value
+
+    def _str_map(self, f, na_value=None, dtype: Dtype | None = None):
+        # TODO: de-duplicate with StringArray method. This method is more or less
+        # copy and paste.
+
+        from pandas.arrays import (
+            BooleanArray,
+            IntegerArray,
+        )
+
+        if dtype is None:
+            dtype = self.dtype
+        if na_value is None:
+            na_value = self.dtype.na_value
+
+        mask = isna(self)
+        arr = np.asarray(self)
+
+        if is_integer_dtype(dtype) or is_bool_dtype(dtype):
+            constructor: type[IntegerArray] | type[BooleanArray]
+            if is_integer_dtype(dtype):
+                constructor = IntegerArray
+            else:
+                constructor = BooleanArray
+
+            na_value_is_na = isna(na_value)
+            if na_value_is_na:
+                na_value = 1
+            result = lib.map_infer_mask(
+                arr,
+                f,
+                mask.view("uint8"),
+                convert=False,
+                na_value=na_value,
+                # error: Value of type variable "_DTypeScalar" of "dtype" cannot be
+                # "object"
+                # error: Argument 1 to "dtype" has incompatible type
+                # "Union[ExtensionDtype, str, dtype[Any], Type[object]]"; expected
+                # "Type[object]"
+                dtype=np.dtype(dtype),  # type: ignore[type-var,arg-type]
+            )
+
+            if not na_value_is_na:
+                mask[:] = False
+
+            # error: Argument 1 to "IntegerArray" has incompatible type
+            # "Union[ExtensionArray, ndarray]"; expected "ndarray"
+            # error: Argument 1 to "BooleanArray" has incompatible type
+            # "Union[ExtensionArray, ndarray]"; expected "ndarray"
+            return constructor(result, mask)  # type: ignore[arg-type]
+
+        elif is_string_dtype(dtype) and not is_object_dtype(dtype):
+            # i.e. StringDtype
+            result = lib.map_infer_mask(
+                arr, f, mask.view("uint8"), convert=False, na_value=na_value
+            )
+            return self._from_sequence(result)
+        else:
+            # This is when the result type is object. We reach this when
+            # -> We know the result type is truly object (e.g. .encode returns bytes
+            #    or .findall returns a list).
+            # -> We don't know the result type. E.g. `.get` can return anything.
+            return lib.map_infer_mask(arr, f, mask.view("uint8"))
diff --git a/pandas/core/strings/__init__.py b/pandas/core/strings/__init__.py
@@ -25,6 +25,7 @@
 #     - StringArray
 #     - PandasArray
 #     - Categorical
+#     - ArrowStringArray
 
 from pandas.core.strings.accessor import StringMethods
 from pandas.core.strings.base import BaseStringArrayMethods

diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py
@@ -154,10 +154,11 @@ class StringMethods(NoNewAttributesMixin):
 
     def __init__(self, data):
         from pandas.core.arrays.string_ import StringDtype
+        from pandas.core.arrays.string_arrow import ArrowStringDtype
 
         self._inferred_dtype = self._validate(data)
         self._is_categorical = is_categorical_dtype(data.dtype)
-        self._is_string = isinstance(data.dtype, StringDtype)
+        self._is_string = isinstance(data.dtype, (StringDtype, ArrowStringDtype))
         self._data = data
 
         self._index = self._name = None
@@ -316,7 +317,7 @@ def cons_row(x):
             # This is a mess.
             dtype: Optional[str]
             if self._is_string and returns_string:
-                dtype = "string"
+                dtype = self._orig.dtype
             else:
                 dtype = None
 

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -91,17 +91,11 @@ def test_setitem_with_scalar_string(dtype):
 @pytest.mark.parametrize(
     "input, method",
     [
-        (["a", "b", "c"], operator.methodcaller("capitalize")),
         (["a", "b", "c"], operator.methodcaller("capitalize")),
         (["a b", "a bc. de"], operator.methodcaller("capitalize")),
     ],
 )
-def test_string_methods(input, method, dtype, request):
-    if dtype == "arrow_string":
-        reason = "AttributeError: 'ArrowStringDtype' object has no attribute 'base'"
-        mark = pytest.mark.xfail(reason=reason)
-        request.node.add_marker(mark)
-
+def test_string_methods(input, method, dtype):
     a = pd.Series(input, dtype=dtype)
     b = pd.Series(input, dtype="object")
     result = method(a.str)

diff --git a/pandas/tests/frame/methods/test_astype.py b/pandas/tests/frame/methods/test_astype.py
@@ -3,6 +3,8 @@
 import numpy as np
 import pytest
 
+import pandas.util._test_decorators as td
+
 import pandas as pd
 from pandas import (
     Categorical,
@@ -564,17 +566,25 @@ def test_astype_empty_dtype_dict(self):
         assert result is not df
 
     @pytest.mark.parametrize(
-        "df",
+        "data, dtype",
         [
-            DataFrame(Series(["x", "y", "z"], dtype="string")),
-            DataFrame(Series(["x", "y", "z"], dtype="category")),
-            DataFrame(Series(3 * [Timestamp("2020-01-01", tz="UTC")])),
-            DataFrame(Series(3 * [Interval(0, 1)])),
+            (["x", "y", "z"], "string"),
+            pytest.param(
+                ["x", "y", "z"],
+                "arrow_string",
+                marks=td.skip_if_no("pyarrow", min_version="1.0.0"),
+            ),
+            (["x", "y", "z"], "category"),
+            (3 * [Timestamp("2020-01-01", tz="UTC")], None),
+            (3 * [Interval(0, 1)], None),
         ],
     )
     @pytest.mark.parametrize("errors", ["raise", "ignore"])
-    def test_astype_ignores_errors_for_extension_dtypes(self, df, errors):
+    def test_astype_ignores_errors_for_extension_dtypes(self, data, dtype, errors):
         # https://github.com/pandas-dev/pandas/issues/35471
+        from pandas.core.arrays.string_arrow import ArrowStringDtype  # noqa: F401
+
+        df = DataFrame(Series(data, dtype=dtype))
         if errors == "ignore":
             expected = df
             result = df.astype(float, errors=errors)

diff --git a/pandas/tests/frame/methods/test_select_dtypes.py b/pandas/tests/frame/methods/test_select_dtypes.py
@@ -391,7 +391,6 @@ def test_select_dtypes_typecodes(self):
         (
             (np.array([1, 2], dtype=np.int32), True),
             (pd.array([1, 2], dtype="Int32"), True),
-            (pd.array(["a", "b"], dtype="string"), False),
             (DummyArray([1, 2], dtype=DummyDtype(numeric=True)), True),
             (DummyArray([1, 2], dtype=DummyDtype(numeric=False)), False),
         ),
@@ -402,3 +401,9 @@ def test_select_dtypes_numeric(self, arr, expected):
         df = DataFrame(arr)
         is_selected = df.select_dtypes(np.number).shape == df.shape
         assert is_selected == expected
+
+    def test_select_dtypes_numeric_nullable_string(self, nullable_string_dtype):
+        arr = pd.array(["a", "b"], dtype=nullable_string_dtype)
+        df = DataFrame(arr)
+        is_selected = df.select_dtypes(np.number).shape == df.shape
+        assert not is_selected
diff --git a/pandas/tests/indexing/test_check_indexer.py b/pandas/tests/indexing/test_check_indexer.py
@@ -78,7 +78,6 @@ def test_int_raise_missing_values(indexer):
         np.array([1.0, 2.0], dtype="float64"),
         np.array([True, False], dtype=object),
         pd.Index([True, False], dtype=object),
-        pd.array(["a", "b"], dtype="string"),
     ],
 )
 def test_raise_invalid_array_dtypes(indexer):
@@ -89,6 +88,15 @@ def test_raise_invalid_array_dtypes(indexer):
         check_array_indexer(arr, indexer)
 
 
+def test_raise_nullable_string_dtype(nullable_string_dtype):
+    indexer = pd.array(["a", "b"], dtype=nullable_string_dtype)
+    arr = np.array([1, 2, 3])
+
+    msg = "arrays used as indices must be of integer or boolean type"
+    with pytest.raises(IndexError, match=msg):
+        check_array_indexer(arr, indexer)
+
+
 @pytest.mark.parametrize("indexer", [None, Ellipsis, slice(0, 3), (None,)])
 def test_pass_through_non_array_likes(indexer):
     arr = np.array([1, 2, 3])

diff --git a/pandas/tests/strings/test_string_array.py b/pandas/tests/strings/test_string_array.py
@@ -11,14 +11,22 @@
 )
 
 
-def test_string_array(any_string_method):
+def test_string_array(nullable_string_dtype, any_string_method, request):
     method_name, args, kwargs = any_string_method
     if method_name == "decode":
         pytest.skip("decode requires bytes.")
 
+    if nullable_string_dtype == "arrow_string" and method_name in {
+        "extract",
+        "extractall",
+    }:
+        reason = "extract/extractall does not yet dispatch to array"
+        mark = pytest.mark.xfail(reason=reason)
+        request.node.add_marker(mark)
+
     data = ["a", "bb", np.nan, "ccc"]
     a = Series(data, dtype=object)
-    b = Series(data, dtype="string")
+    b = Series(data, dtype=nullable_string_dtype)
 
     expected = getattr(a.str, method_name)(*args, **kwargs)
     result = getattr(b.str, method_name)(*args, **kwargs)
@@ -27,7 +35,7 @@ def test_string_array(any_string_method):
         if expected.dtype == "object" and lib.is_string_array(
             expected.dropna().values,
         ):
-            assert result.dtype == "string"
+            assert result.dtype == nullable_string_dtype
             result = result.astype(object)
 
         elif expected.dtype == "object" and lib.is_bool_array(
@@ -46,7 +54,7 @@ def test_string_array(any_string_method):
 
     elif isinstance(expected, DataFrame):
         columns = expected.select_dtypes(include="object").columns
-        assert all(result[columns].dtypes == "string")
+        assert all(result[columns].dtypes == nullable_string_dtype)
         result[columns] = result[columns].astype(object)
     tm.assert_equal(result, expected)
 
@@ -60,8 +68,8 @@ def test_string_array(any_string_method):
         ("rindex", [2, None]),
     ],
 )
-def test_string_array_numeric_integer_array(method, expected):
-    s = Series(["aba", None], dtype="string")
+def test_string_array_numeric_integer_array(nullable_string_dtype, method, expected):
+    s = Series(["aba", None], dtype=nullable_string_dtype)
     result = getattr(s.str, method)("a")
     expected = Series(expected, dtype="Int64")
     tm.assert_series_equal(result, expected)
@@ -73,33 +81,39 @@ def test_string_array_numeric_integer_array(method, expected):
         ("isdigit", [False, None, True]),
         ("isalpha", [True, None, False]),
         ("isalnum", [True, None, True]),
-        ("isdigit", [False, None, True]),
+        ("isnumeric", [False, None, True]),
     ],
 )
-def test_string_array_boolean_array(method, expected):
-    s = Series(["a", None, "1"], dtype="string")
+def test_string_array_boolean_array(nullable_string_dtype, method, expected):
+    s = Series(["a", None, "1"], dtype=nullable_string_dtype)
     result = getattr(s.str, method)()
     expected = Series(expected, dtype="boolean")
     tm.assert_series_equal(result, expected)
 
 
-def test_string_array_extract():
+def test_string_array_extract(nullable_string_dtype, request):
     # https://github.com/pandas-dev/pandas/issues/30969
     # Only expand=False & multiple groups was failing
-    a = Series(["a1", "b2", "cc"], dtype="string")
+
+    if nullable_string_dtype == "arrow_string":
+        reason = "extract does not yet dispatch to array"
+        mark = pytest.mark.xfail(reason=reason)
+        request.node.add_marker(mark)
+
+    a = Series(["a1", "b2", "cc"], dtype=nullable_string_dtype)
     b = Series(["a1", "b2", "cc"], dtype="object")
     pat = r"(\w)(\d)"
 
     result = a.str.extract(pat, expand=False)
     expected = b.str.extract(pat, expand=False)
-    assert all(result.dtypes == "string")
+    assert all(result.dtypes == nullable_string_dtype)
 
     result = result.astype(object)
     tm.assert_equal(result, expected)
 
 
-def test_str_get_stringarray_multiple_nans():
-    s = Series(pd.array(["a", "ab", pd.NA, "abc"]))
+def test_str_get_stringarray_multiple_nans(nullable_string_dtype):
+    s = Series(pd.array(["a", "ab", pd.NA, "abc"], dtype=nullable_string_dtype))
     result = s.str.get(2)
-    expected = Series(pd.array([pd.NA, pd.NA, pd.NA, "c"]))
+    expected = Series(pd.array([pd.NA, pd.NA, pd.NA, "c"], dtype=nullable_string_dtype))
     tm.assert_series_equal(result, expected)