-
-
Notifications
You must be signed in to change notification settings - Fork 18.6k
REF/PERF: ArrowStringArray.__setitem__ #46400
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from 5 commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
e379a22
ArrowStringArray.__setitem__
lukemanley e21c4ff
Merge remote-tracking branch 'upstream/main' into arrowstringarray-se…
lukemanley 0e35f6a
fixes
lukemanley f292054
whatsnew
lukemanley 773f375
fix test
lukemanley f44bcbb
refactor
lukemanley 76a25a9
fix docstring
lukemanley File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,7 +6,6 @@ | |
TYPE_CHECKING, | ||
Any, | ||
Union, | ||
cast, | ||
overload, | ||
) | ||
|
||
|
@@ -31,6 +30,7 @@ | |
pa_version_under2p0, | ||
pa_version_under3p0, | ||
pa_version_under4p0, | ||
pa_version_under5p0, | ||
) | ||
from pandas.util._decorators import doc | ||
|
||
|
@@ -40,6 +40,7 @@ | |
is_dtype_equal, | ||
is_integer, | ||
is_integer_dtype, | ||
is_list_like, | ||
is_object_dtype, | ||
is_scalar, | ||
is_string_dtype, | ||
|
@@ -363,48 +364,139 @@ def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None: | |
""" | ||
key = check_array_indexer(self, key) | ||
|
||
if is_integer(key): | ||
key = cast(int, key) | ||
if is_list_like(key): | ||
key = np.asarray(key) | ||
if len(key) == 1: | ||
key = key[0] | ||
|
||
if not is_scalar(value): | ||
raise ValueError("Must pass scalars with scalar indexer") | ||
elif isna(value): | ||
value_is_scalar = is_scalar(value) | ||
|
||
# NA -> None | ||
if value_is_scalar: | ||
if isna(value): | ||
value = None | ||
elif not isinstance(value, str): | ||
raise ValueError("Scalar must be NA or str") | ||
|
||
# Slice data and insert in-between | ||
new_data = [ | ||
*self._data[0:key].chunks, | ||
pa.array([value], type=pa.string()), | ||
*self._data[(key + 1) :].chunks, | ||
] | ||
self._data = pa.chunked_array(new_data) | ||
else: | ||
# Convert to integer indices and iteratively assign. | ||
# TODO: Make a faster variant of this in Arrow upstream. | ||
# This is probably extremely slow. | ||
|
||
# Convert all possible input key types to an array of integers | ||
if isinstance(key, slice): | ||
key_array = np.array(range(len(self))[key]) | ||
elif is_bool_dtype(key): | ||
# TODO(ARROW-9430): Directly support setitem(booleans) | ||
key_array = np.argwhere(key).flatten() | ||
else: | ||
# TODO(ARROW-9431): Directly support setitem(integers) | ||
key_array = np.asanyarray(key) | ||
value = np.asarray(value, dtype=object) | ||
for i, v in enumerate(value): | ||
if isna(v): | ||
value[i] = None | ||
elif not isinstance(v, str): | ||
raise ValueError("Scalar must be NA or str") | ||
|
||
# reorder values to align with the mask positions | ||
if is_bool_dtype(key): | ||
pass | ||
elif isinstance(key, slice): | ||
if not value_is_scalar and key.step is not None and key.step < 0: | ||
value = value[::-1] | ||
else: | ||
if not value_is_scalar: | ||
if is_scalar(key): | ||
raise ValueError("Length of indexer and values mismatch") | ||
key = np.asarray(key) | ||
if len(key) != len(value): | ||
raise ValueError("Length of indexer and values mismatch") | ||
|
||
if np.any(key < -len(self)): | ||
min_key = np.asarray(key).min() | ||
raise IndexError( | ||
f"index {min_key} is out of bounds for array of length {len(self)}" | ||
) | ||
if np.any(key >= len(self)): | ||
max_key = np.asarray(key).max() | ||
raise IndexError( | ||
f"index {max_key} is out of bounds for array of length {len(self)}" | ||
) | ||
|
||
if is_scalar(value): | ||
value = np.broadcast_to(value, len(key_array)) | ||
# convert negative indices to positive before sorting | ||
if is_integer(key): | ||
if key < 0: | ||
key += len(self) | ||
else: | ||
value = np.asarray(value) | ||
key = np.asarray(key) | ||
key[key < 0] += len(self) | ||
if not value_is_scalar: | ||
value = value[np.argsort(key)] | ||
|
||
# fast path | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same thing here — would do something along the lines of
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Done |
||
if is_integer(key) and value_is_scalar and self._data.num_chunks == 1: | ||
idx = int(key) # type: ignore[arg-type] | ||
chunk = pa.concat_arrays( | ||
[ | ||
self._data.chunks[0][:idx], | ||
pa.array([value], type=pa.string()), | ||
self._data.chunks[0][idx + 1 :], | ||
] | ||
) | ||
self._data = pa.chunked_array([chunk]) | ||
return | ||
|
||
if len(key_array) != len(value): | ||
# create mask for positions to set | ||
mask: npt.NDArray[np.bool_] | ||
if is_bool_dtype(key): | ||
mask = key # type: ignore[assignment] | ||
else: | ||
mask = np.zeros(len(self), dtype=np.bool_) | ||
mask[key] = True | ||
|
||
if not value_is_scalar: | ||
if len(value) != np.sum(mask): | ||
raise ValueError("Length of indexer and values mismatch") | ||
|
||
for k, v in zip(key_array, value): | ||
self[k] = v | ||
indices = mask.nonzero()[0] | ||
|
||
# loop through the array chunks and set the new values while | ||
# leaving the chunking layout unchanged | ||
start = stop = 0 | ||
new_data = [] | ||
|
||
for chunk in self._data.iterchunks(): | ||
start, stop = stop, stop + len(chunk) | ||
|
||
if len(indices) == 0 or indices[0] >= stop: | ||
new_data.append(chunk) | ||
continue | ||
|
||
n = int(np.searchsorted(indices, stop, side="left")) | ||
c_indices, indices = indices[:n], indices[n:] | ||
|
||
if value_is_scalar: | ||
c_value = value | ||
else: | ||
c_value, value = value[:n], value[n:] | ||
|
||
if n == 1: | ||
# fast path | ||
idx = c_indices[0] - start | ||
v = [c_value] if value_is_scalar else c_value | ||
chunk = pa.concat_arrays( | ||
[ | ||
chunk[:idx], | ||
pa.array(v, type=pa.string()), | ||
chunk[idx + 1 :], | ||
] | ||
) | ||
|
||
elif n > 0: | ||
submask = mask[start:stop] | ||
if not pa_version_under5p0: | ||
if c_value is None or isna(np.array(c_value)).all(): | ||
chunk = pc.if_else(submask, None, chunk) | ||
else: | ||
chunk = pc.replace_with_mask(chunk, submask, c_value) | ||
else: | ||
# The pyarrow compute functions were added in | ||
# version 5.0. For prior versions we implement | ||
# our own by converting to numpy and back. | ||
chunk = chunk.to_numpy(zero_copy_only=False) | ||
chunk[submask] = c_value | ||
chunk = pa.array(chunk, type=pa.string()) | ||
|
||
new_data.append(chunk) | ||
|
||
self._data = pa.chunked_array(new_data) | ||
|
||
def take( | ||
self, | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
i would create a helper method like
_validate_key()
to encapsulate all of this (ok on this class for now, but we likely want to push this to the ArrowExtensionArray (or maybe we need an ArrowIndexingMixin or similar); that can be later (or here if convenient). There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I refactored pretty extensively, this logic is now self contained