Skip to content

ENH: Allow dt accessor when using ArrowDtype with datetime types #50954

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 19 commits into from
Feb 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -263,6 +263,7 @@ Alternatively, copy on write can be enabled locally through:

Other enhancements
^^^^^^^^^^^^^^^^^^
- Added support for ``dt`` accessor methods when using :class:`ArrowDtype` with a ``pyarrow.timestamp`` type (:issue:`50954`)
- :func:`read_sas` now supports using ``encoding='infer'`` to correctly read and use the encoding specified by the sas file. (:issue:`48048`)
- :meth:`.DataFrameGroupBy.quantile`, :meth:`.SeriesGroupBy.quantile` and :meth:`.DataFrameGroupBy.std` now preserve nullable dtypes instead of casting to numpy dtypes (:issue:`37493`)
- :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`)
Expand Down
53 changes: 47 additions & 6 deletions pandas/core/accessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,10 @@
"""
from __future__ import annotations

from typing import final
from typing import (
Callable,
final,
)
import warnings

from pandas.util._decorators import doc
Expand Down Expand Up @@ -59,7 +62,13 @@ def _delegate_method(self, name, *args, **kwargs):

@classmethod
def _add_delegate_accessors(
cls, delegate, accessors, typ: str, overwrite: bool = False
cls,
delegate,
accessors: list[str],
typ: str,
overwrite: bool = False,
accessor_mapping: Callable[[str], str] = lambda x: x,
raise_on_missing: bool = True,
) -> None:
"""
Add accessors to cls from the delegate class.
Expand All @@ -75,6 +84,11 @@ def _add_delegate_accessors(
typ : {'property', 'method'}
overwrite : bool, default False
Overwrite the method/property in the target class if it exists.
accessor_mapping: Callable, default lambda x: x
Callable to map the delegate's function to the cls' function.
raise_on_missing: bool, default True
Raise if an accessor does not exist on delegate.
False skips the missing accessor.
"""

def _create_delegator_property(name):
Expand All @@ -88,20 +102,28 @@ def _setter(self, new_values):
_setter.__name__ = name

return property(
fget=_getter, fset=_setter, doc=getattr(delegate, name).__doc__
fget=_getter,
fset=_setter,
doc=getattr(delegate, accessor_mapping(name)).__doc__,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we do this once near the top?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll put this into my next Assorted Cleanups branch.

)

def _create_delegator_method(name):
    # Build a plain method that forwards to ``self._delegate_method`` under
    # the delegated name, copying the delegate's docstring across.
    def delegated(self, *args, **kwargs):
        return self._delegate_method(name, *args, **kwargs)

    delegated.__name__ = name
    delegated.__doc__ = getattr(delegate, accessor_mapping(name)).__doc__
    return delegated

for name in accessors:

if (
not raise_on_missing
and getattr(delegate, accessor_mapping(name), None) is None
):
continue

if typ == "property":
f = _create_delegator_property(name)
else:
Expand All @@ -112,7 +134,14 @@ def f(self, *args, **kwargs):
setattr(cls, name, f)


def delegate_names(delegate, accessors, typ: str, overwrite: bool = False):
def delegate_names(
delegate,
accessors: list[str],
typ: str,
overwrite: bool = False,
accessor_mapping: Callable[[str], str] = lambda x: x,
raise_on_missing: bool = True,
):
"""
Add delegated names to a class using a class decorator. This provides
an alternative usage to directly calling `_add_delegate_accessors`
Expand All @@ -127,6 +156,11 @@ def delegate_names(delegate, accessors, typ: str, overwrite: bool = False):
typ : {'property', 'method'}
overwrite : bool, default False
Overwrite the method/property in the target class if it exists.
accessor_mapping: Callable, default lambda x: x
Callable to map the delegate's function to the cls' function.
raise_on_missing: bool, default True
Raise if an accessor does not exist on delegate.
False skips the missing accessor.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do you expect to reuse this machinery again? if not, could the pyarrow-dt accessor be implemented standalone to avoid making this more complicated?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am hoping to use this for the pyarrow duration types as well.

I think a standalone implementation would require a further refactoring of this dt accessor. Looks like currently datetime and timedelta dispatch to the Index while we probably need something that dispatches to the array


Returns
-------
Expand All @@ -141,7 +175,14 @@ class CategoricalAccessor(PandasDelegate):
"""

def add_delegate_accessors(cls):
    """Decorator body: install the delegated accessors onto *cls*, return it."""
    cls._add_delegate_accessors(
        delegate,
        accessors,
        typ,
        accessor_mapping=accessor_mapping,
        raise_on_missing=raise_on_missing,
        overwrite=overwrite,
    )
    return cls

return add_delegate_accessors
Expand Down
155 changes: 155 additions & 0 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
Scalar,
SortKind,
TakeIndexer,
TimeAmbiguous,
TimeNonexistent,
npt,
)
from pandas.compat import (
Expand Down Expand Up @@ -53,6 +55,8 @@
validate_indices,
)

from pandas.tseries.frequencies import to_offset

if not pa_version_under7p0:
import pyarrow as pa
import pyarrow.compute as pc
Expand Down Expand Up @@ -1413,3 +1417,154 @@ def _replace_with_mask(
result = np.array(values, dtype=object)
result[mask] = replacements
return pa.array(result, type=values.type, from_pandas=True)

@property
def _dt_day(self):
    """Day component, computed with ``pyarrow.compute.day``."""
    result = pc.day(self._data)
    return type(self)(result)

@property
def _dt_day_of_week(self):
    """Day-of-week component, computed with ``pyarrow.compute.day_of_week``."""
    result = pc.day_of_week(self._data)
    return type(self)(result)

# Aliases matching the spellings used by the pandas datetime accessor.
_dt_dayofweek = _dt_day_of_week
_dt_weekday = _dt_day_of_week

@property
def _dt_day_of_year(self):
    """Day-of-year component, computed with ``pyarrow.compute.day_of_year``."""
    result = pc.day_of_year(self._data)
    return type(self)(result)

# Alias matching the spelling used by the pandas datetime accessor.
_dt_dayofyear = _dt_day_of_year

@property
def _dt_hour(self):
    """Hour component, computed with ``pyarrow.compute.hour``."""
    result = pc.hour(self._data)
    return type(self)(result)

def _dt_isocalendar(self):
    """ISO calendar fields, computed with ``pyarrow.compute.iso_calendar``."""
    result = pc.iso_calendar(self._data)
    return type(self)(result)

@property
def _dt_is_leap_year(self):
    """Leap-year flags, computed with ``pyarrow.compute.is_leap_year``."""
    result = pc.is_leap_year(self._data)
    return type(self)(result)

@property
def _dt_microsecond(self):
    """Microsecond component, computed with ``pyarrow.compute.microsecond``."""
    result = pc.microsecond(self._data)
    return type(self)(result)

@property
def _dt_minute(self):
    """Minute component, computed with ``pyarrow.compute.minute``."""
    result = pc.minute(self._data)
    return type(self)(result)

@property
def _dt_month(self):
    """Month component, computed with ``pyarrow.compute.month``."""
    result = pc.month(self._data)
    return type(self)(result)

@property
def _dt_nanosecond(self):
    """Nanosecond component, computed with ``pyarrow.compute.nanosecond``."""
    result = pc.nanosecond(self._data)
    return type(self)(result)

@property
def _dt_quarter(self):
    """Quarter component, computed with ``pyarrow.compute.quarter``."""
    result = pc.quarter(self._data)
    return type(self)(result)

@property
def _dt_second(self):
    """Second component, computed with ``pyarrow.compute.second``."""
    result = pc.second(self._data)
    return type(self)(result)

@property
def _dt_date(self):
    """Date component: the underlying data cast to ``pa.date64()``."""
    as_dates = self._data.cast(pa.date64())
    return type(self)(as_dates)

@property
def _dt_time(self):
    """Time-of-day component as a ``pa.time64`` array.

    ``pa.time64`` only accepts "us" and "ns" units, so any other source
    unit is widened to "ns" before casting.
    """
    unit = self.dtype.pyarrow_dtype.unit
    if unit not in {"us", "ns"}:
        unit = "ns"
    return type(self)(self._data.cast(pa.time64(unit)))

@property
def _dt_tz(self):
    """Timezone carried by the underlying pyarrow timestamp type."""
    pa_dtype = self.dtype.pyarrow_dtype
    return pa_dtype.tz

def _dt_strftime(self, format: str):
    """Format each value as a string via ``pyarrow.compute.strftime``."""
    formatted = pc.strftime(self._data, format=format)
    return type(self)(formatted)

def _round_temporally(
    self,
    method: Literal["ceil", "floor", "round"],
    freq,
    ambiguous: TimeAmbiguous = "raise",
    nonexistent: TimeNonexistent = "raise",
):
    """Shared implementation behind ``_dt_ceil``/``_dt_floor``/``_dt_round``.

    Maps a pandas offset alias onto a pyarrow temporal unit and dispatches
    to the matching ``pyarrow.compute.{method}_temporal`` function.

    Raises
    ------
    NotImplementedError
        If ``ambiguous`` or ``nonexistent`` is anything other than "raise".
    ValueError
        If ``freq`` is missing or has no pyarrow-supported unit.
    """
    if ambiguous != "raise":
        raise NotImplementedError("ambiguous is not supported.")
    if nonexistent != "raise":
        raise NotImplementedError("nonexistent is not supported.")
    offset = to_offset(freq)
    if offset is None:
        raise ValueError(f"Must specify a valid frequency: {freq}")
    # pandas offset-alias prefix -> pyarrow temporal unit name.
    pa_supported_unit = {
        "A": "year",
        "AS": "year",
        "Q": "quarter",
        "QS": "quarter",
        "M": "month",
        "MS": "month",
        "W": "week",
        "D": "day",
        "H": "hour",
        "T": "minute",
        "S": "second",
        "L": "millisecond",
        "U": "microsecond",
        "N": "nanosecond",
    }
    unit = pa_supported_unit.get(offset._prefix)
    if unit is None:
        raise ValueError(f"{freq=} is not supported")
    rounder = getattr(pc, f"{method}_temporal")
    return type(self)(rounder(self._data, multiple=offset.n, unit=unit))

def _dt_ceil(
    self,
    freq,
    ambiguous: TimeAmbiguous = "raise",
    nonexistent: TimeNonexistent = "raise",
):
    """Ceil the timestamps to ``freq`` (delegates to ``_round_temporally``)."""
    return self._round_temporally(
        "ceil", freq, ambiguous=ambiguous, nonexistent=nonexistent
    )

def _dt_floor(
    self,
    freq,
    ambiguous: TimeAmbiguous = "raise",
    nonexistent: TimeNonexistent = "raise",
):
    """Floor the timestamps to ``freq`` (delegates to ``_round_temporally``)."""
    return self._round_temporally(
        "floor", freq, ambiguous=ambiguous, nonexistent=nonexistent
    )

def _dt_round(
    self,
    freq,
    ambiguous: TimeAmbiguous = "raise",
    nonexistent: TimeNonexistent = "raise",
):
    """Round the timestamps to ``freq`` (delegates to ``_round_temporally``)."""
    return self._round_temporally(
        "round", freq, ambiguous=ambiguous, nonexistent=nonexistent
    )

def _dt_tz_localize(
    self,
    tz,
    ambiguous: TimeAmbiguous = "raise",
    nonexistent: TimeNonexistent = "raise",
):
    """Cast to a timestamp type with ``tz`` attached, or tz-naive if None.

    Raises
    ------
    NotImplementedError
        If ``ambiguous`` or ``nonexistent`` is anything other than "raise".
    """
    if ambiguous != "raise":
        raise NotImplementedError(f"{ambiguous=} is not supported")
    if nonexistent != "raise":
        raise NotImplementedError(f"{nonexistent=} is not supported")
    unit = self.dtype.pyarrow_dtype.unit
    if tz is None:
        target = pa.timestamp(unit)
    else:
        target = pa.timestamp(unit, str(tz))
    return type(self)(self._data.cast(target))
3 changes: 3 additions & 0 deletions pandas/core/arrays/arrow/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,9 @@ def numpy_dtype(self) -> np.dtype:

@cache_readonly
def kind(self) -> str:
    """Single-character dtype kind code.

    Timestamp-typed ArrowDtypes report "M" to mirror DatetimeTZDtype;
    everything else defers to the numpy-equivalent dtype's kind.
    """
    if pa.types.is_timestamp(self.pyarrow_dtype):
        return "M"
    return self.numpy_dtype.kind

@cache_readonly
Expand Down
Loading