Skip to content

Commit df965f3

Browse files
authored
Merge branch 'pandas-dev:main' into main
2 parents eefc8a4 + 3f310c4 commit df965f3

File tree

31 files changed

+625
-480
lines changed

31 files changed

+625
-480
lines changed

.github/workflows/python-dev.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ jobs:
5454
os: [ubuntu-latest, macOS-latest, windows-latest]
5555

5656
name: actions-311-dev
57-
timeout-minutes: 80
57+
timeout-minutes: 120
5858

5959
concurrency:
6060
#https://github.community/t/concurrecy-not-work-for-push/183068/7
@@ -75,7 +75,7 @@ jobs:
7575
run: |
7676
python --version
7777
python -m pip install --upgrade pip setuptools wheel
78-
python -m pip install git+https://github.com/numpy/numpy.git
78+
python -m pip install -i https://pypi.anaconda.org/scipy-wheels-nightly/simple numpy
7979
python -m pip install git+https://github.com/nedbat/coveragepy.git
8080
python -m pip install python-dateutil pytz cython hypothesis==6.52.1 pytest>=6.2.5 pytest-xdist pytest-cov pytest-asyncio>=0.17
8181
python -m pip list
@@ -84,7 +84,7 @@ jobs:
8484
- name: Build Pandas
8585
run: |
8686
python setup.py build_ext -q -j1
87-
python -m pip install -e . --no-build-isolation --no-use-pep517
87+
python -m pip install -e . --no-build-isolation --no-use-pep517 --no-index
8888
8989
- name: Build Version
9090
run: |

README.md

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,8 +128,6 @@ or for installing in [development mode](https://pip.pypa.io/en/latest/cli/pip_in
128128
python -m pip install -e . --no-build-isolation --no-use-pep517
129129
```
130130

131-
If you have `make`, you can also use `make develop` to run the same command.
132-
133131
or alternatively
134132

135133
```sh

doc/source/whatsnew/v1.6.0.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ Other API changes
119119
- Passing ``nanoseconds`` greater than 999 or less than 0 in :class:`Timestamp` now raises a ``ValueError`` (:issue:`48538`, :issue:`48255`)
120120
- :func:`read_csv`: specifying an incorrect number of columns with ``index_col`` now raises ``ParserError`` instead of ``IndexError`` when using the c parser.
121121
- :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting datetime64 data to any of "datetime64[s]", "datetime64[ms]", "datetime64[us]" will return an object with the given resolution instead of coercing back to "datetime64[ns]" (:issue:`48928`)
122+
- :meth:`DataFrame.astype`, :meth:`Series.astype`, and :meth:`DatetimeIndex.astype` casting timedelta64 data to any of "timedelta64[s]", "timedelta64[ms]", "timedelta64[us]" will return an object with the given resolution instead of coercing to "float64" dtype (:issue:`48963`)
122123
-
123124

124125
.. ---------------------------------------------------------------------------
@@ -153,6 +154,7 @@ Performance improvements
153154
- Performance improvement in ``var`` for nullable dtypes (:issue:`48379`).
154155
- Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
155156
- Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
157+
- Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`)
156158

157159
.. ---------------------------------------------------------------------------
158160
.. _whatsnew_160.bug_fixes:

pandas/_libs/tslibs/timedeltas.pyx

Lines changed: 15 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -341,8 +341,9 @@ cdef convert_to_timedelta64(object ts, str unit):
341341
elif isinstance(ts, _Timedelta):
342342
# already in the proper format
343343
if ts._reso != NPY_FR_ns:
344-
raise NotImplementedError
345-
ts = np.timedelta64(ts.value, "ns")
344+
ts = ts._as_unit("ns").asm8
345+
else:
346+
ts = np.timedelta64(ts.value, "ns")
346347
elif is_timedelta64_object(ts):
347348
ts = ensure_td64ns(ts)
348349
elif is_integer_object(ts):
@@ -1706,7 +1707,13 @@ class Timedelta(_Timedelta):
17061707
value = parse_timedelta_string(value)
17071708
value = np.timedelta64(value)
17081709
elif PyDelta_Check(value):
1709-
value = convert_to_timedelta64(value, 'ns')
1710+
# pytimedelta object -> microsecond resolution
1711+
new_value = delta_to_nanoseconds(
1712+
value, reso=NPY_DATETIMEUNIT.NPY_FR_us
1713+
)
1714+
return cls._from_value_and_reso(
1715+
new_value, reso=NPY_DATETIMEUNIT.NPY_FR_us
1716+
)
17101717
elif is_timedelta64_object(value):
17111718
# Retain the resolution if possible, otherwise cast to the nearest
17121719
# supported resolution.
@@ -1720,7 +1727,7 @@ class Timedelta(_Timedelta):
17201727
if reso != NPY_DATETIMEUNIT.NPY_FR_GENERIC:
17211728
try:
17221729
new_value = convert_reso(
1723-
get_timedelta64_value(value),
1730+
new_value,
17241731
reso,
17251732
new_reso,
17261733
round_ok=True,
@@ -1730,7 +1737,10 @@ class Timedelta(_Timedelta):
17301737
return cls._from_value_and_reso(new_value, reso=new_reso)
17311738

17321739
elif is_tick_object(value):
1733-
value = np.timedelta64(value.nanos, 'ns')
1740+
new_reso = get_supported_reso(value._reso)
1741+
new_value = delta_to_nanoseconds(value, reso=new_reso)
1742+
return cls._from_value_and_reso(new_value, reso=new_reso)
1743+
17341744
elif is_integer_object(value) or is_float_object(value):
17351745
# unit=None is de-facto 'ns'
17361746
unit = parse_timedelta_unit(unit)

pandas/core/arrays/datetimelike.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1275,7 +1275,8 @@ def _add_timedeltalike_scalar(self, other):
12751275

12761276
# PeriodArray overrides, so we only get here with DTA/TDA
12771277
self = cast("DatetimeArray | TimedeltaArray", self)
1278-
other = Timedelta(other)._as_unit(self._unit)
1278+
other = Timedelta(other)
1279+
self, other = self._ensure_matching_resos(other)
12791280
return self._add_timedeltalike(other)
12801281

12811282
def _add_timedelta_arraylike(self, other: TimedeltaArray):

pandas/core/arrays/masked.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,10 @@
1717
lib,
1818
missing as libmissing,
1919
)
20+
from pandas._libs.tslibs import (
21+
get_unit_from_dtype,
22+
is_supported_unit,
23+
)
2024
from pandas._typing import (
2125
ArrayLike,
2226
AstypeArg,
@@ -750,12 +754,16 @@ def _maybe_mask_result(self, result, mask):
750754

751755
return BooleanArray(result, mask, copy=False)
752756

753-
elif result.dtype == "timedelta64[ns]":
757+
elif (
758+
isinstance(result.dtype, np.dtype)
759+
and result.dtype.kind == "m"
760+
and is_supported_unit(get_unit_from_dtype(result.dtype))
761+
):
754762
# e.g. test_numeric_arr_mul_tdscalar_numexpr_path
755763
from pandas.core.arrays import TimedeltaArray
756764

757765
if not isinstance(result, TimedeltaArray):
758-
result = TimedeltaArray._simple_new(result)
766+
result = TimedeltaArray._simple_new(result, dtype=result.dtype)
759767

760768
result[mask] = result.dtype.type("NaT")
761769
return result

pandas/core/arrays/timedeltas.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,9 @@
2020
Tick,
2121
Timedelta,
2222
astype_overflowsafe,
23+
get_unit_from_dtype,
2324
iNaT,
25+
is_supported_unit,
2426
periods_per_second,
2527
to_offset,
2628
)
@@ -257,10 +259,10 @@ def _generate_range(cls, start, end, periods, freq, closed=None):
257259
)
258260

259261
if start is not None:
260-
start = Timedelta(start)
262+
start = Timedelta(start)._as_unit("ns")
261263

262264
if end is not None:
263-
end = Timedelta(end)
265+
end = Timedelta(end)._as_unit("ns")
264266

265267
left_closed, right_closed = validate_endpoints(closed)
266268

@@ -308,6 +310,18 @@ def astype(self, dtype, copy: bool = True):
308310
dtype = pandas_dtype(dtype)
309311

310312
if dtype.kind == "m":
313+
if dtype == self.dtype:
314+
if copy:
315+
return self.copy()
316+
return self
317+
318+
if is_supported_unit(get_unit_from_dtype(dtype)):
319+
# unit conversion e.g. timedelta64[s]
320+
res_values = astype_overflowsafe(self._ndarray, dtype, copy=False)
321+
return type(self)._simple_new(
322+
res_values, dtype=res_values.dtype, freq=self.freq
323+
)
324+
311325
return astype_td64_unit_conversion(self._ndarray, dtype, copy=copy)
312326

313327
return dtl.DatetimeLikeArrayMixin.astype(self, dtype, copy=copy)

pandas/core/construction.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,6 @@
4545
maybe_convert_platform,
4646
maybe_infer_to_datetimelike,
4747
maybe_upcast,
48-
sanitize_to_nanoseconds,
4948
)
5049
from pandas.core.dtypes.common import (
5150
is_datetime64_ns_dtype,
@@ -782,7 +781,9 @@ def _try_cast(
782781
if is_ndarray:
783782
arr = cast(np.ndarray, arr)
784783
if arr.dtype != object:
785-
return sanitize_to_nanoseconds(arr, copy=copy)
784+
if copy:
785+
return arr.copy()
786+
return arr
786787

787788
out = maybe_infer_to_datetimelike(arr)
788789
if out is arr and copy:

pandas/core/dtypes/astype.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,14 @@ def astype_nansafe(
136136
return arr.view(dtype)
137137

138138
elif dtype.kind == "m":
139+
# TODO(2.0): change to use the same logic as TDA.astype, i.e.
140+
# giving the requested dtype for supported units (s, ms, us, ns)
141+
# and doing the old convert-to-float behavior otherwise.
142+
if is_supported_unit(get_unit_from_dtype(arr.dtype)):
143+
from pandas.core.construction import ensure_wrapped_if_datetimelike
144+
145+
arr = ensure_wrapped_if_datetimelike(arr)
146+
return arr.astype(dtype, copy=copy)
139147
return astype_td64_unit_conversion(arr, dtype, copy=copy)
140148

141149
raise TypeError(f"cannot astype a timedelta from [{arr.dtype}] to [{dtype}]")

pandas/core/dtypes/cast.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1423,12 +1423,7 @@ def maybe_cast_to_datetime(
14231423
return astype_nansafe(value, dtype) # type: ignore[arg-type]
14241424

14251425
elif isinstance(value, np.ndarray):
1426-
if value.dtype.kind in ["M", "m"]:
1427-
# catch a datetime/timedelta that is not of ns variety
1428-
# and no coercion specified
1429-
value = sanitize_to_nanoseconds(value)
1430-
1431-
elif value.dtype == _dtype_obj:
1426+
if value.dtype == _dtype_obj:
14321427
value = maybe_infer_to_datetimelike(value)
14331428

14341429
elif isinstance(value, list):

pandas/core/generic.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5878,7 +5878,7 @@ def pipe(
58785878
58795879
If you have a function that takes the data as (say) the second
58805880
argument, pass a tuple indicating which keyword expects the
5881-
data. For example, suppose ``f`` takes its data as ``arg2``:
5881+
data. For example, suppose ``func`` takes its data as ``arg2``:
58825882
58835883
>>> (df.pipe(h)
58845884
... .pipe(g, arg1=a)

pandas/core/groupby/categorical.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -75,21 +75,21 @@ def recode_for_groupby(
7575
return c, None
7676

7777
# sort=False should order groups in as-encountered order (GH-8868)
78-
cat = c.unique()
7978

80-
# See GH-38140 for block below
81-
# exclude nan from indexer for categories
82-
take_codes = cat.codes[cat.codes != -1]
83-
if cat.ordered:
84-
take_codes = np.sort(take_codes)
85-
cat = cat.set_categories(cat.categories.take(take_codes))
86-
87-
# But for groupby to work, all categories should be present,
88-
# including those missing from the data (GH-13179), which .unique()
89-
# above dropped
90-
cat = cat.add_categories(c.categories[~c.categories.isin(cat.categories)])
91-
92-
return c.reorder_categories(cat.categories), None
79+
# xref GH:46909: Re-ordering codes faster than using (set|add|reorder)_categories
80+
all_codes = np.arange(c.categories.nunique(), dtype=np.int8)
81+
# GH 38140: exclude nan from indexer for categories
82+
unique_notnan_codes = unique1d(c.codes[c.codes != -1])
83+
if c.ordered:
84+
unique_notnan_codes = np.sort(unique_notnan_codes)
85+
if len(all_codes) > len(unique_notnan_codes):
86+
# GH 13179: All categories need to be present, even if missing from the data
87+
missing_codes = np.setdiff1d(all_codes, unique_notnan_codes, assume_unique=True)
88+
take_codes = np.concatenate((unique_notnan_codes, missing_codes))
89+
else:
90+
take_codes = unique_notnan_codes
91+
92+
return Categorical(c, c.unique().categories.take(take_codes)), None
9393

9494

9595
def recode_from_groupby(

pandas/core/groupby/grouper.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
from pandas.util._decorators import cache_readonly
2727
from pandas.util._exceptions import find_stack_level
2828

29-
from pandas.core.dtypes.cast import sanitize_to_nanoseconds
3029
from pandas.core.dtypes.common import (
3130
is_categorical_dtype,
3231
is_list_like,
@@ -558,9 +557,12 @@ def __init__(
558557
raise AssertionError(errmsg)
559558

560559
if isinstance(self.grouping_vector, np.ndarray):
561-
# if we have a date/time-like grouper, make sure that we have
562-
# Timestamps like
563-
self.grouping_vector = sanitize_to_nanoseconds(self.grouping_vector)
560+
if self.grouping_vector.dtype.kind in ["m", "M"]:
561+
# if we have a date/time-like grouper, make sure that we have
562+
# Timestamps like
563+
# TODO 2022-10-08 we only have one test that gets here and
564+
# values are already in nanoseconds in that case.
565+
self.grouping_vector = Series(self.grouping_vector).to_numpy()
564566

565567
def __repr__(self) -> str:
566568
return f"Grouping({self.name})"

pandas/tests/arithmetic/test_numeric.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
from __future__ import annotations
55

66
from collections import abc
7+
from datetime import timedelta
78
from decimal import Decimal
89
import operator
910
from typing import Any
@@ -27,6 +28,7 @@
2728
Int64Index,
2829
UInt64Index,
2930
)
31+
from pandas.core.arrays import TimedeltaArray
3032
from pandas.core.computation import expressions as expr
3133
from pandas.tests.arithmetic.common import (
3234
assert_invalid_addsub_type,
@@ -209,6 +211,11 @@ def test_numeric_arr_mul_tdscalar(self, scalar_td, numeric_idx, box_with_array):
209211
tda = expected._data
210212
dtype = scalar_td.dtype
211213
expected = type(tda)._simple_new(tda._ndarray.astype(dtype), dtype=dtype)
214+
elif type(scalar_td) is timedelta and box not in [Index, Series]:
215+
# TODO(2.0): once TDA.astype converts to m8, just do expected.astype
216+
tda = expected._data
217+
dtype = np.dtype("m8[us]")
218+
expected = type(tda)._simple_new(tda._ndarray.astype(dtype), dtype=dtype)
212219

213220
index = tm.box_expected(index, box)
214221
expected = tm.box_expected(expected, box)
@@ -240,6 +247,13 @@ def test_numeric_arr_mul_tdscalar_numexpr_path(
240247
obj = tm.box_expected(arr, box, transpose=False)
241248

242249
expected = arr_i8.view("timedelta64[D]").astype("timedelta64[ns]")
250+
if type(scalar_td) is timedelta and box is array:
251+
# TODO(2.0): this shouldn't depend on 'box'
252+
expected = expected.astype("timedelta64[us]")
253+
# TODO(2.0): won't be necessary to construct TimedeltaArray
254+
# explicitly.
255+
expected = TimedeltaArray._simple_new(expected, dtype=expected.dtype)
256+
243257
expected = tm.box_expected(expected, box, transpose=False)
244258

245259
result = obj * scalar_td
@@ -262,6 +276,11 @@ def test_numeric_arr_rdiv_tdscalar(self, three_days, numeric_idx, box_with_array
262276
# i.e. resolution is lower -> use lowest supported resolution
263277
dtype = np.dtype("m8[s]")
264278
expected = type(tda)._simple_new(tda._ndarray.astype(dtype), dtype=dtype)
279+
elif type(three_days) is timedelta and box not in [Index, Series]:
280+
# TODO(2.0): just use expected.astype
281+
tda = expected._data
282+
dtype = np.dtype("m8[us]")
283+
expected = type(tda)._simple_new(tda._ndarray.astype(dtype), dtype=dtype)
265284

266285
index = tm.box_expected(index, box)
267286
expected = tm.box_expected(expected, box)

pandas/tests/dtypes/cast/test_promote.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -480,6 +480,12 @@ def test_maybe_promote_any_with_timedelta64(
480480
"Timedelta scalar"
481481
)
482482
request.node.add_marker(mark)
483+
elif type(fill_value) is datetime.timedelta:
484+
mark = pytest.mark.xfail(
485+
reason="maybe_promote not yet updated to handle non-nano "
486+
"Timedelta scalar"
487+
)
488+
request.node.add_marker(mark)
483489
else:
484490
expected_dtype = np.dtype(object)
485491
exp_val_for_scalar = fill_value

pandas/tests/frame/methods/test_astype.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -480,12 +480,19 @@ def test_astype_to_timedelta_unit_ns(self, unit):
480480
@pytest.mark.parametrize("unit", ["us", "ms", "s", "h", "m", "D"])
481481
def test_astype_to_timedelta_unit(self, unit):
482482
# coerce to float
483-
# GH#19223
483+
# GH#19223 until 2.0 used to coerce to float
484484
dtype = f"m8[{unit}]"
485485
arr = np.array([[1, 2, 3]], dtype=dtype)
486486
df = DataFrame(arr)
487487
result = df.astype(dtype)
488-
expected = DataFrame(df.values.astype(dtype).astype(float))
488+
489+
if unit in ["m", "h", "D"]:
490+
# We don't support these, so we use the old logic to convert to float
491+
expected = DataFrame(df.values.astype(dtype).astype(float))
492+
else:
493+
tda = pd.core.arrays.TimedeltaArray._simple_new(arr, dtype=arr.dtype)
494+
expected = DataFrame(tda)
495+
assert (expected.dtypes == dtype).all()
489496

490497
tm.assert_frame_equal(result, expected)
491498

0 commit comments

Comments
 (0)