Skip to content

Commit 758711a

Browse files
committed
Merge remote-tracking branch 'upstream/main' into ref/memoryviews
2 parents 4c82b17 + 0ec5f26 commit 758711a

31 files changed

+185
-117
lines changed

doc/source/whatsnew/v2.3.0.rst

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,16 @@ These are bug fixes that might have notable behavior changes.
5353
notable_bug_fix1
5454
^^^^^^^^^^^^^^^^
5555

56+
.. _whatsnew_230.api_changes:
57+
58+
API changes
59+
~~~~~~~~~~~
60+
61+
- When enabling the ``future.infer_string`` option: Index set operations (like
62+
union or intersection) will now ignore the dtype of an empty ``RangeIndex`` or
63+
empty ``Index`` with object dtype when determining the dtype of the resulting
64+
Index (:issue:`60797`)
65+
5666
.. ---------------------------------------------------------------------------
5767
.. _whatsnew_230.deprecations:
5868

doc/source/whatsnew/v3.0.0.rst

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -361,6 +361,9 @@ Other API changes
361361
- pickle and HDF (``.h5``) files created with Python 2 are no longer explicitly supported (:issue:`57387`)
362362
- pickled objects from pandas version less than ``1.0.0`` are no longer supported (:issue:`57155`)
363363
- when comparing the indexes in :func:`testing.assert_series_equal`, ``check_exact`` defaults to ``True`` if an :class:`Index` is of integer dtype. (:issue:`57386`)
364+
- Index set operations (like union or intersection) will now ignore the dtype of
365+
an empty ``RangeIndex`` or empty ``Index`` with object dtype when determining
366+
the dtype of the resulting Index (:issue:`60797`)
364367

365368
.. ---------------------------------------------------------------------------
366369
.. _whatsnew_300.deprecations:

pandas/core/indexes/base.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,10 @@
1919

2020
import numpy as np
2121

22-
from pandas._config import get_option
22+
from pandas._config import (
23+
get_option,
24+
using_string_dtype,
25+
)
2326

2427
from pandas._libs import (
2528
NaT,
@@ -6235,6 +6238,24 @@ def _find_common_type_compat(self, target) -> DtypeObj:
62356238
"""
62366239
target_dtype, _ = infer_dtype_from(target)
62376240

6241+
if using_string_dtype():
6242+
# special case: if left or right is a zero-length RangeIndex or
6243+
# Index[object], those can be created by the default empty constructors
6244+
# -> for that case ignore this dtype and always return the other
6245+
# (https://github.com/pandas-dev/pandas/pull/60797)
6246+
from pandas.core.indexes.range import RangeIndex
6247+
6248+
if len(self) == 0 and (
6249+
isinstance(self, RangeIndex) or self.dtype == np.object_
6250+
):
6251+
return target_dtype
6252+
if (
6253+
isinstance(target, Index)
6254+
and len(target) == 0
6255+
and (isinstance(target, RangeIndex) or target_dtype == np.object_)
6256+
):
6257+
return self.dtype
6258+
62386259
# special case: if one dtype is uint64 and the other a signed int, return object
62396260
# See https://github.com/pandas-dev/pandas/issues/26778 for discussion
62406261
# Now it's:
@@ -6888,6 +6909,14 @@ def insert(self, loc: int, item) -> Index:
68886909

68896910
arr = self._values
68906911

6912+
if using_string_dtype() and len(self) == 0 and self.dtype == np.object_:
6913+
# special case: if we are an empty object-dtype Index, also
6914+
# take into account the inserted item for the resulting dtype
6915+
# (https://github.com/pandas-dev/pandas/pull/60797)
6916+
dtype = self._find_common_type_compat(item)
6917+
if dtype != self.dtype:
6918+
return self.astype(dtype).insert(loc, item)
6919+
68916920
try:
68926921
if isinstance(arr, ExtensionArray):
68936922
res_values = arr.insert(loc, item)

pandas/io/pytables.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4159,6 +4159,8 @@ def _create_axes(
41594159
ordered = data_converted.ordered
41604160
meta = "category"
41614161
metadata = np.asarray(data_converted.categories).ravel()
4162+
elif isinstance(blk.dtype, StringDtype):
4163+
meta = str(blk.dtype)
41624164

41634165
data, dtype_name = _get_data_and_dtype_name(data_converted)
41644166

@@ -4419,7 +4421,8 @@ def read_column(
44194421
errors=self.errors,
44204422
)
44214423
cvs = col_values[1]
4422-
return Series(cvs, name=column, copy=False)
4424+
dtype = getattr(self.table.attrs, f"{column}_meta", None)
4425+
return Series(cvs, name=column, copy=False, dtype=dtype)
44234426

44244427
raise KeyError(f"column [{column}] not found in the table")
44254428

@@ -4769,8 +4772,18 @@ def read(
47694772
df = DataFrame._from_arrays([values], columns=cols_, index=index_)
47704773
if not (using_string_dtype() and values.dtype.kind == "O"):
47714774
assert (df.dtypes == values.dtype).all(), (df.dtypes, values.dtype)
4775+
4776+
# If str / string dtype is stored in meta, use that.
4777+
converted = False
4778+
for column in cols_:
4779+
dtype = getattr(self.table.attrs, f"{column}_meta", None)
4780+
if dtype in ["str", "string"]:
4781+
df[column] = df[column].astype(dtype)
4782+
converted = True
4783+
# Otherwise try inference.
47724784
if (
4773-
using_string_dtype()
4785+
not converted
4786+
and using_string_dtype()
47744787
and isinstance(values, np.ndarray)
47754788
and is_string_array(
47764789
values,

pandas/tests/dtypes/test_concat.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def test_concat_periodarray_2d():
4747
_concat.concat_compat([arr[:2], arr[2:]], axis=1)
4848

4949

50-
def test_concat_series_between_empty_and_tzaware_series():
50+
def test_concat_series_between_empty_and_tzaware_series(using_infer_string):
5151
tzaware_time = pd.Timestamp("2020-01-01T00:00:00+00:00")
5252
ser1 = Series(index=[tzaware_time], data=0, dtype=float)
5353
ser2 = Series(dtype=float)
@@ -57,7 +57,9 @@ def test_concat_series_between_empty_and_tzaware_series():
5757
data=[
5858
(0.0, None),
5959
],
60-
index=pd.Index([tzaware_time], dtype=object),
60+
index=[tzaware_time]
61+
if using_infer_string
62+
else pd.Index([tzaware_time], dtype=object),
6163
columns=[0, 1],
6264
dtype=float,
6365
)

pandas/tests/frame/constructors/test_from_dict.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,6 @@
33
import numpy as np
44
import pytest
55

6-
from pandas._config import using_string_dtype
7-
86
from pandas import (
97
DataFrame,
108
Index,
@@ -44,7 +42,6 @@ def test_constructor_single_row(self):
4442
)
4543
tm.assert_frame_equal(result, expected)
4644

47-
@pytest.mark.xfail(using_string_dtype(), reason="columns inferring logic broken")
4845
def test_constructor_list_of_series(self):
4946
data = [
5047
OrderedDict([["a", 1.5], ["b", 3.0], ["c", 4.0]]),

pandas/tests/frame/indexing/test_coercion.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -88,12 +88,7 @@ def test_26395(indexer_al):
8888
df["D"] = 0
8989

9090
indexer_al(df)["C", "D"] = 2
91-
expected = DataFrame(
92-
{"D": [0, 0, 2]},
93-
index=["A", "B", "C"],
94-
columns=pd.Index(["D"], dtype=object),
95-
dtype=np.int64,
96-
)
91+
expected = DataFrame({"D": [0, 0, 2]}, index=["A", "B", "C"], dtype=np.int64)
9792
tm.assert_frame_equal(df, expected)
9893

9994
with pytest.raises(TypeError, match="Invalid value"):

pandas/tests/frame/indexing/test_indexing.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1138,7 +1138,7 @@ def test_loc_setitem_datetimelike_with_inference(self):
11381138
result = df.dtypes
11391139
expected = Series(
11401140
[np.dtype("timedelta64[ns]")] * 6 + [np.dtype("datetime64[ns]")] * 2,
1141-
index=Index(list("ABCDEFGH"), dtype=object),
1141+
index=list("ABCDEFGH"),
11421142
)
11431143
tm.assert_series_equal(result, expected)
11441144

pandas/tests/frame/indexing/test_insert.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,7 @@ def test_insert_with_columns_dups(self):
6868
df.insert(0, "A", ["d", "e", "f"], allow_duplicates=True)
6969
df.insert(0, "A", ["a", "b", "c"], allow_duplicates=True)
7070
exp = DataFrame(
71-
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]],
72-
columns=Index(["A", "A", "A"], dtype=object),
71+
[["a", "d", "g"], ["b", "e", "h"], ["c", "f", "i"]], columns=["A", "A", "A"]
7372
)
7473
tm.assert_frame_equal(df, exp)
7574

pandas/tests/frame/indexing/test_setitem.py

Lines changed: 22 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -144,18 +144,32 @@ def test_setitem_different_dtype(self):
144144
)
145145
tm.assert_series_equal(result, expected)
146146

147-
def test_setitem_empty_columns(self):
148-
# GH 13522
147+
def test_setitem_overwrite_index(self):
148+
# GH 13522 - assign the index as a column and then overwrite the values
149+
# -> should not affect the index
149150
df = DataFrame(index=["A", "B", "C"])
150151
df["X"] = df.index
151152
df["X"] = ["x", "y", "z"]
152153
exp = DataFrame(
153-
data={"X": ["x", "y", "z"]},
154-
index=["A", "B", "C"],
155-
columns=Index(["X"], dtype=object),
154+
data={"X": ["x", "y", "z"]}, index=["A", "B", "C"], columns=["X"]
156155
)
157156
tm.assert_frame_equal(df, exp)
158157

158+
def test_setitem_empty_columns(self):
159+
# Starting from an empty DataFrame and setting a column should result
160+
# in a default string dtype for the columns' Index
161+
# https://github.com/pandas-dev/pandas/issues/60338
162+
163+
df = DataFrame()
164+
df["foo"] = [1, 2, 3]
165+
expected = DataFrame({"foo": [1, 2, 3]})
166+
tm.assert_frame_equal(df, expected)
167+
168+
df = DataFrame(columns=Index([]))
169+
df["foo"] = [1, 2, 3]
170+
expected = DataFrame({"foo": [1, 2, 3]})
171+
tm.assert_frame_equal(df, expected)
172+
159173
def test_setitem_dt64_index_empty_columns(self):
160174
rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
161175
df = DataFrame(index=np.arange(len(rng)))
@@ -169,9 +183,7 @@ def test_setitem_timestamp_empty_columns(self):
169183
df["now"] = Timestamp("20130101", tz="UTC")
170184

171185
expected = DataFrame(
172-
[[Timestamp("20130101", tz="UTC")]] * 3,
173-
index=range(3),
174-
columns=Index(["now"], dtype=object),
186+
[[Timestamp("20130101", tz="UTC")]] * 3, index=range(3), columns=["now"]
175187
)
176188
tm.assert_frame_equal(df, expected)
177189

@@ -210,7 +222,7 @@ def test_setitem_period_preserves_dtype(self):
210222
result = DataFrame([])
211223
result["a"] = data
212224

213-
expected = DataFrame({"a": data}, columns=Index(["a"], dtype=object))
225+
expected = DataFrame({"a": data}, columns=["a"])
214226

215227
tm.assert_frame_equal(result, expected)
216228

@@ -930,7 +942,7 @@ def test_setitem_scalars_no_index(self):
930942
# GH#16823 / GH#17894
931943
df = DataFrame()
932944
df["foo"] = 1
933-
expected = DataFrame(columns=Index(["foo"], dtype=object)).astype(np.int64)
945+
expected = DataFrame(columns=["foo"]).astype(np.int64)
934946
tm.assert_frame_equal(df, expected)
935947

936948
def test_setitem_newcol_tuple_key(self, float_frame):

pandas/tests/frame/methods/test_dropna.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -182,12 +182,9 @@ def test_dropna_multiple_axes(self):
182182
with pytest.raises(TypeError, match="supplying multiple axes"):
183183
inp.dropna(how="all", axis=(0, 1), inplace=True)
184184

185-
def test_dropna_tz_aware_datetime(self, using_infer_string):
185+
def test_dropna_tz_aware_datetime(self):
186186
# GH13407
187-
188187
df = DataFrame()
189-
if using_infer_string:
190-
df.columns = df.columns.astype("str")
191188
dt1 = datetime.datetime(2015, 1, 1, tzinfo=dateutil.tz.tzutc())
192189
dt2 = datetime.datetime(2015, 2, 2, tzinfo=dateutil.tz.tzutc())
193190
df["Time"] = [dt1]

pandas/tests/frame/methods/test_reset_index.py

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
import numpy as np
55
import pytest
66

7-
from pandas._config import using_string_dtype
8-
97
from pandas.core.dtypes.common import (
108
is_float_dtype,
119
is_integer_dtype,
@@ -644,7 +642,6 @@ def test_rest_index_multiindex_categorical_with_missing_values(self, codes):
644642
tm.assert_frame_equal(res, expected)
645643

646644

647-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string) - GH#60338")
648645
@pytest.mark.parametrize(
649646
"array, dtype",
650647
[
@@ -781,3 +778,34 @@ def test_reset_index_false_index_name():
781778
result_frame.reset_index()
782779
expected_frame = DataFrame(range(5, 10), RangeIndex(range(5), name=False))
783780
tm.assert_frame_equal(result_frame, expected_frame)
781+
782+
783+
@pytest.mark.parametrize("columns", [None, Index([])])
784+
def test_reset_index_with_empty_frame(columns):
785+
# Currently empty DataFrame has RangeIndex or object dtype Index, but when
786+
# resetting the index we still want to end up with the default string dtype
787+
# https://github.com/pandas-dev/pandas/issues/60338
788+
789+
index = Index([], name="foo")
790+
df = DataFrame(index=index, columns=columns)
791+
result = df.reset_index()
792+
expected = DataFrame(columns=["foo"])
793+
tm.assert_frame_equal(result, expected)
794+
795+
index = Index([1, 2, 3], name="foo")
796+
df = DataFrame(index=index, columns=columns)
797+
result = df.reset_index()
798+
expected = DataFrame({"foo": [1, 2, 3]})
799+
tm.assert_frame_equal(result, expected)
800+
801+
index = MultiIndex.from_tuples([], names=["foo", "bar"])
802+
df = DataFrame(index=index, columns=columns)
803+
result = df.reset_index()
804+
expected = DataFrame(columns=["foo", "bar"])
805+
tm.assert_frame_equal(result, expected)
806+
807+
index = MultiIndex.from_tuples([(1, 2), (2, 3)], names=["foo", "bar"])
808+
df = DataFrame(index=index, columns=columns)
809+
result = df.reset_index()
810+
expected = DataFrame({"foo": [1, 2], "bar": [2, 3]})
811+
tm.assert_frame_equal(result, expected)

pandas/tests/frame/test_constructors.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,8 +21,6 @@
2121
from numpy.ma import mrecords
2222
import pytest
2323

24-
from pandas._config import using_string_dtype
25-
2624
from pandas._libs import lib
2725
from pandas.compat.numpy import np_version_gt2
2826
from pandas.errors import IntCastingNaNError
@@ -1974,7 +1972,6 @@ def test_constructor_with_datetimes4(self):
19741972
df = DataFrame({"value": dr})
19751973
assert str(df.iat[0, 0].tz) == "US/Eastern"
19761974

1977-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
19781975
def test_constructor_with_datetimes5(self):
19791976
# GH 7822
19801977
# preserve an index with a tz on dict construction

pandas/tests/frame/test_query_eval.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -791,7 +791,6 @@ def test_check_tz_aware_index_query(self, tz_aware_fixture):
791791
tm.assert_frame_equal(result, expected)
792792

793793
expected = DataFrame(df_index)
794-
expected.columns = expected.columns.astype(object)
795794
result = df.reset_index().query('"2018-01-03 00:00:00+00" < time')
796795
tm.assert_frame_equal(result, expected)
797796

pandas/tests/groupby/test_groupby.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1278,7 +1278,7 @@ def test_groupby_2d_malformed():
12781278
d["label"] = ["l1", "l2"]
12791279
tmp = d.groupby(["group"]).mean(numeric_only=True)
12801280
res_values = np.array([[0.0, 1.0], [0.0, 1.0]])
1281-
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"], dtype=object))
1281+
tm.assert_index_equal(tmp.columns, Index(["zeros", "ones"]))
12821282
tm.assert_numpy_array_equal(tmp.values, res_values)
12831283

12841284

pandas/tests/indexes/base_class/test_reshape.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def test_insert(self):
3434

3535
# test empty
3636
null_index = Index([])
37-
tm.assert_index_equal(Index(["a"], dtype=object), null_index.insert(0, "a"))
37+
tm.assert_index_equal(Index(["a"]), null_index.insert(0, "a"))
3838

3939
def test_insert_missing(self, nulls_fixture, using_infer_string):
4040
# GH#22295

pandas/tests/indexes/base_class/test_setops.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -240,7 +240,6 @@ def test_tuple_union_bug(self, method, expected, sort):
240240
def test_union_name_preservation(
241241
self, first_list, second_list, first_name, second_name, expected_name, sort
242242
):
243-
expected_dtype = object if not first_list or not second_list else "str"
244243
first = Index(first_list, name=first_name)
245244
second = Index(second_list, name=second_name)
246245
union = first.union(second, sort=sort)
@@ -251,7 +250,7 @@ def test_union_name_preservation(
251250
expected = Index(sorted(vals), name=expected_name)
252251
tm.assert_index_equal(union, expected)
253252
else:
254-
expected = Index(vals, name=expected_name, dtype=expected_dtype)
253+
expected = Index(vals, name=expected_name)
255254
tm.assert_index_equal(union.sort_values(), expected.sort_values())
256255

257256
@pytest.mark.parametrize(

0 commit comments

Comments
 (0)