Skip to content

Commit 75c69b1

Browse files
committed
Merge remote-tracking branch 'upstream/2.3.x' into remove-read_json-futurewarning
2 parents b0315f3 + 112c2e9 commit 75c69b1

Some content is hidden

Large commits have some content hidden by default. Use the search box below for content that may be hidden.

58 files changed

+444
-436
lines changed

doc/source/whatsnew/v2.3.0.rst

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,10 +107,10 @@ Conversion
107107
Strings
108108
^^^^^^^
109109
- Bug in :meth:`Series.rank` for :class:`StringDtype` with ``storage="pyarrow"`` incorrectly returning integer results in case of ``method="average"`` and raising an error if it would truncate results (:issue:`59768`)
110+
- Bug in :meth:`Series.replace` with :class:`StringDtype` when replacing with a non-string value was not upcasting to ``object`` dtype (:issue:`60282`)
110111
- Bug in :meth:`Series.str.replace` when ``n < 0`` for :class:`StringDtype` with ``storage="pyarrow"`` (:issue:`59628`)
111112
- Bug in ``ser.str.slice`` with negative ``step`` with :class:`ArrowDtype` and :class:`StringDtype` with ``storage="pyarrow"`` giving incorrect results (:issue:`59710`)
112113
- Bug in the ``center`` method on :class:`Series` and :class:`Index` object ``str`` accessors with pyarrow-backed dtype not matching the python behavior in corner cases with an odd number of fill characters (:issue:`54792`)
113-
-
114114

115115
Interval
116116
^^^^^^^^
@@ -119,7 +119,7 @@ Interval
119119

120120
Indexing
121121
^^^^^^^^
122-
-
122+
- Fixed bug in :meth:`Index.get_indexer` round-tripping through string dtype when ``infer_string`` is enabled (:issue:`55834`)
123123
-
124124

125125
Missing

pandas/_libs/lib.pyx

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2741,7 +2741,13 @@ def maybe_convert_objects(ndarray[object] objects,
27412741
seen.object_ = True
27422742

27432743
elif seen.str_:
2744-
if using_string_dtype() and is_string_array(objects, skipna=True):
2744+
if convert_to_nullable_dtype and is_string_array(objects, skipna=True):
2745+
from pandas.core.arrays.string_ import StringDtype
2746+
2747+
dtype = StringDtype()
2748+
return dtype.construct_array_type()._from_sequence(objects, dtype=dtype)
2749+
2750+
elif using_string_dtype() and is_string_array(objects, skipna=True):
27452751
from pandas.core.arrays.string_ import StringDtype
27462752

27472753
dtype = StringDtype(na_value=np.nan)

pandas/core/arrays/arrow/array.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1633,7 +1633,11 @@ def _accumulate(
16331633
else:
16341634
data_to_accum = data_to_accum.cast(pa.int64())
16351635

1636-
result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
1636+
try:
1637+
result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
1638+
except pa.ArrowNotImplementedError as err:
1639+
msg = f"operation '{name}' not supported for dtype '{self.dtype}'"
1640+
raise TypeError(msg) from err
16371641

16381642
if convert_to_int:
16391643
result = result.cast(pa_dtype)

pandas/core/arrays/string_.py

Lines changed: 25 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -726,20 +726,9 @@ def _values_for_factorize(self) -> tuple[np.ndarray, libmissing.NAType | float]:
726726

727727
return arr, self.dtype.na_value
728728

729-
def __setitem__(self, key, value) -> None:
730-
value = extract_array(value, extract_numpy=True)
731-
if isinstance(value, type(self)):
732-
# extract_array doesn't extract NumpyExtensionArray subclasses
733-
value = value._ndarray
734-
735-
key = check_array_indexer(self, key)
736-
scalar_key = lib.is_scalar(key)
737-
scalar_value = lib.is_scalar(value)
738-
if scalar_key and not scalar_value:
739-
raise ValueError("setting an array element with a sequence.")
740-
741-
# validate new items
742-
if scalar_value:
729+
def _maybe_convert_setitem_value(self, value):
730+
"""Maybe convert value to be pyarrow compatible."""
731+
if lib.is_scalar(value):
743732
if isna(value):
744733
value = self.dtype.na_value
745734
elif not isinstance(value, str):
@@ -749,8 +738,11 @@ def __setitem__(self, key, value) -> None:
749738
"instead."
750739
)
751740
else:
741+
value = extract_array(value, extract_numpy=True)
752742
if not is_array_like(value):
753743
value = np.asarray(value, dtype=object)
744+
elif isinstance(value.dtype, type(self.dtype)):
745+
return value
754746
else:
755747
# cast categories and friends to arrays to see if values are
756748
# compatible, compatibility with arrow backed strings
@@ -760,11 +752,26 @@ def __setitem__(self, key, value) -> None:
760752
"Invalid value for dtype 'str'. Value should be a "
761753
"string or missing value (or array of those)."
762754
)
755+
return value
763756

764-
mask = isna(value)
765-
if mask.any():
766-
value = value.copy()
767-
value[isna(value)] = self.dtype.na_value
757+
def __setitem__(self, key, value) -> None:
758+
value = self._maybe_convert_setitem_value(value)
759+
760+
key = check_array_indexer(self, key)
761+
scalar_key = lib.is_scalar(key)
762+
scalar_value = lib.is_scalar(value)
763+
if scalar_key and not scalar_value:
764+
raise ValueError("setting an array element with a sequence.")
765+
766+
if not scalar_value:
767+
if value.dtype == self.dtype:
768+
value = value._ndarray
769+
else:
770+
value = np.asarray(value)
771+
mask = isna(value)
772+
if mask.any():
773+
value = value.copy()
774+
value[isna(value)] = self.dtype.na_value
768775

769776
super().__setitem__(key, value)
770777

pandas/core/dtypes/cast.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1163,6 +1163,7 @@ def convert_dtypes(
11631163

11641164
def maybe_infer_to_datetimelike(
11651165
value: npt.NDArray[np.object_],
1166+
convert_to_nullable_dtype: bool = False,
11661167
) -> np.ndarray | DatetimeArray | TimedeltaArray | PeriodArray | IntervalArray:
11671168
"""
11681169
we might have a array (or single object) that is datetime like,
@@ -1200,6 +1201,7 @@ def maybe_infer_to_datetimelike(
12001201
# numpy would have done it for us.
12011202
convert_numeric=False,
12021203
convert_non_numeric=True,
1204+
convert_to_nullable_dtype=convert_to_nullable_dtype,
12031205
dtype_if_all_nat=np.dtype("M8[ns]"),
12041206
)
12051207

@@ -1754,6 +1756,13 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool:
17541756
except (ValueError, TypeError):
17551757
return False
17561758

1759+
if dtype == "string":
1760+
try:
1761+
arr._maybe_convert_setitem_value(element) # type: ignore[union-attr]
1762+
return True
1763+
except (ValueError, TypeError):
1764+
return False
1765+
17571766
# This is technically incorrect, but maintains the behavior of
17581767
# ExtensionBlock._can_hold_element
17591768
return True

pandas/core/indexes/base.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6695,7 +6695,16 @@ def _maybe_cast_listlike_indexer(self, target) -> Index:
66956695
"""
66966696
Analogue to maybe_cast_indexer for get_indexer instead of get_loc.
66976697
"""
6698-
return ensure_index(target)
6698+
target_index = ensure_index(target)
6699+
if (
6700+
not hasattr(target, "dtype")
6701+
and self.dtype == object
6702+
and target_index.dtype == "string"
6703+
):
6704+
# If we started with a list-like, avoid inference to string dtype if self
6705+
# is object dtype (coercing to string dtype will alter the missing values)
6706+
target_index = Index(target, dtype=self.dtype)
6707+
return target_index
66996708

67006709
@final
67016710
def _validate_indexer(

pandas/core/interchange/from_dataframe.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
import numpy as np
88

9+
from pandas._config import using_string_dtype
10+
911
from pandas.compat._optional import import_optional_dependency
1012
from pandas.errors import SettingWithCopyError
1113

@@ -124,8 +126,6 @@ def protocol_df_chunk_to_pandas(df: DataFrameXchg) -> pd.DataFrame:
124126
-------
125127
pd.DataFrame
126128
"""
127-
# We need a dict of columns here, with each column being a NumPy array (at
128-
# least for now, deal with non-NumPy dtypes later).
129129
columns: dict[str, Any] = {}
130130
buffers = [] # hold on to buffers, keeps memory alive
131131
for name in df.column_names():
@@ -324,8 +324,12 @@ def string_column_to_ndarray(col: Column) -> tuple[np.ndarray, Any]:
324324
# Add to our list of strings
325325
str_list[i] = string
326326

327-
# Convert the string list to a NumPy array
328-
return np.asarray(str_list, dtype="object"), buffers
327+
if using_string_dtype():
328+
res = pd.Series(str_list, dtype="str")
329+
else:
330+
res = np.asarray(str_list, dtype="object") # type: ignore[assignment]
331+
332+
return res, buffers # type: ignore[return-value]
329333

330334

331335
def parse_datetime_format_str(format_str, data) -> pd.Series | np.ndarray:

pandas/core/internals/blocks.py

Lines changed: 35 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,7 @@
8484
ABCNumpyExtensionArray,
8585
ABCSeries,
8686
)
87+
from pandas.core.dtypes.inference import is_re
8788
from pandas.core.dtypes.missing import (
8889
is_valid_na_for_dtype,
8990
isna,
@@ -115,6 +116,7 @@
115116
PeriodArray,
116117
TimedeltaArray,
117118
)
119+
from pandas.core.arrays.string_ import StringDtype
118120
from pandas.core.base import PandasObject
119121
import pandas.core.common as com
120122
from pandas.core.computation import expressions
@@ -476,7 +478,9 @@ def split_and_operate(self, func, *args, **kwargs) -> list[Block]:
476478
# Up/Down-casting
477479

478480
@final
479-
def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block:
481+
def coerce_to_target_dtype(
482+
self, other, warn_on_upcast: bool = False, using_cow: bool = False
483+
) -> Block:
480484
"""
481485
coerce the current block to a dtype compat for other
482486
we will return a block, possibly object, and not raise
@@ -528,7 +532,14 @@ def coerce_to_target_dtype(self, other, warn_on_upcast: bool = False) -> Block:
528532
f"{self.values.dtype}. Please report a bug at "
529533
"https://github.com/pandas-dev/pandas/issues."
530534
)
531-
return self.astype(new_dtype, copy=False)
535+
copy = False
536+
if (
537+
not using_cow
538+
and isinstance(self.dtype, StringDtype)
539+
and self.dtype.storage == "python"
540+
):
541+
copy = True
542+
return self.astype(new_dtype, copy=copy, using_cow=using_cow)
532543

533544
@final
534545
def _maybe_downcast(
@@ -879,7 +890,7 @@ def replace(
879890
else:
880891
return [self] if inplace else [self.copy()]
881892

882-
elif self._can_hold_element(value):
893+
elif self._can_hold_element(value) or (self.dtype == "string" and is_re(value)):
883894
# TODO(CoW): Maybe split here as well into columns where mask has True
884895
# and rest?
885896
blk = self._maybe_copy(using_cow, inplace)
@@ -926,12 +937,13 @@ def replace(
926937
if value is None or value is NA:
927938
blk = self.astype(np.dtype(object))
928939
else:
929-
blk = self.coerce_to_target_dtype(value)
940+
blk = self.coerce_to_target_dtype(value, using_cow=using_cow)
930941
return blk.replace(
931942
to_replace=to_replace,
932943
value=value,
933944
inplace=True,
934945
mask=mask,
946+
using_cow=using_cow,
935947
)
936948

937949
else:
@@ -980,16 +992,26 @@ def _replace_regex(
980992
-------
981993
List[Block]
982994
"""
983-
if not self._can_hold_element(to_replace):
995+
if not is_re(to_replace) and not self._can_hold_element(to_replace):
984996
# i.e. only if self.is_object is True, but could in principle include a
985997
# String ExtensionBlock
986998
if using_cow:
987999
return [self.copy(deep=False)]
9881000
return [self] if inplace else [self.copy()]
9891001

990-
rx = re.compile(to_replace)
1002+
if is_re(to_replace) and self.dtype not in [object, "string"]:
1003+
# only object or string dtype can hold strings, and a regex object
1004+
# will only match strings
1005+
return [self.copy(deep=False)]
9911006

992-
block = self._maybe_copy(using_cow, inplace)
1007+
if not (
1008+
self._can_hold_element(value) or (self.dtype == "string" and is_re(value))
1009+
):
1010+
block = self.astype(np.dtype(object))
1011+
else:
1012+
block = self._maybe_copy(using_cow, inplace)
1013+
1014+
rx = re.compile(to_replace)
9931015

9941016
replace_regex(block.values, rx, value, mask)
9951017

@@ -1048,7 +1070,9 @@ def replace_list(
10481070

10491071
# Exclude anything that we know we won't contain
10501072
pairs = [
1051-
(x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)
1073+
(x, y)
1074+
for x, y in zip(src_list, dest_list)
1075+
if (self._can_hold_element(x) or (self.dtype == "string" and is_re(x)))
10521076
]
10531077
if not len(pairs):
10541078
if using_cow:
@@ -1686,7 +1710,7 @@ def fillna(
16861710
return nbs
16871711

16881712
if limit is not None:
1689-
mask[mask.cumsum(self.ndim - 1) > limit] = False
1713+
mask[mask.cumsum(self.values.ndim - 1) > limit] = False
16901714

16911715
if inplace:
16921716
nbs = self.putmask(
@@ -2112,7 +2136,7 @@ def where(
21122136
res_values = arr._where(cond, other).T
21132137
except (ValueError, TypeError):
21142138
if self.ndim == 1 or self.shape[0] == 1:
2115-
if isinstance(self.dtype, IntervalDtype):
2139+
if isinstance(self.dtype, (IntervalDtype, StringDtype)):
21162140
# TestSetitemFloatIntervalWithIntIntervalValues
21172141
blk = self.coerce_to_target_dtype(orig_other)
21182142
nbs = blk.where(orig_other, orig_cond, using_cow=using_cow)
@@ -2314,7 +2338,7 @@ def fillna(
23142338
using_cow: bool = False,
23152339
already_warned=None,
23162340
) -> list[Block]:
2317-
if isinstance(self.dtype, IntervalDtype):
2341+
if isinstance(self.dtype, (IntervalDtype, StringDtype)):
23182342
# Block.fillna handles coercion (test_fillna_interval)
23192343
return super().fillna(
23202344
value=value,

pandas/core/internals/construction.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1042,8 +1042,9 @@ def convert(arr):
10421042
if dtype is None:
10431043
if arr.dtype == np.dtype("O"):
10441044
# i.e. maybe_convert_objects didn't convert
1045-
arr = maybe_infer_to_datetimelike(arr)
1046-
if dtype_backend != "numpy" and arr.dtype == np.dtype("O"):
1045+
convert_to_nullable_dtype = dtype_backend != "numpy"
1046+
arr = maybe_infer_to_datetimelike(arr, convert_to_nullable_dtype)
1047+
if convert_to_nullable_dtype and arr.dtype == np.dtype("O"):
10471048
new_dtype = StringDtype()
10481049
arr_cls = new_dtype.construct_array_type()
10491050
arr = arr_cls._from_sequence(arr, dtype=new_dtype)

0 commit comments

Comments
 (0)