Skip to content

ENH: Add more dt property/method support for ArrowDtype(timestamp) #52503

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 21 commits into from
Apr 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ Other enhancements
- Added to the escape mode "latex-math" preserving without escaping all characters between "\(" and "\)" in formatter (:issue:`51903`)
- Adding ``engine_kwargs`` parameter to :meth:`DataFrame.read_excel` (:issue:`52214`)
- Classes that are useful for type-hinting have been added to the public API in the new submodule ``pandas.api.typing`` (:issue:`48577`)
- Implemented :attr:`Series.dt.is_month_start`, :attr:`Series.dt.is_month_end`, :attr:`Series.dt.is_year_start`, :attr:`Series.dt.is_year_end`, :attr:`Series.dt.is_quarter_start`, :attr:`Series.dt.is_quarter_end`, :attr:`Series.dt.days_in_month`, :attr:`Series.dt.unit`, :meth:`Series.dt.normalize`, :meth:`Series.dt.day_name`, :meth:`Series.dt.month_name`, :meth:`Series.dt.tz_convert` for :class:`ArrowDtype` with ``pyarrow.timestamp`` (:issue:`52388`, :issue:`51718`)
- Implemented ``__from_arrow__`` on :class:`DatetimeTZDtype`. (:issue:`52201`)
- Implemented ``__pandas_priority__`` to allow custom types to take precedence over :class:`DataFrame`, :class:`Series`, :class:`Index`, or :class:`ExtensionArray` for arithmetic operations, :ref:`see the developer guide <extending.pandas_priority>` (:issue:`48347`)
- Improve error message when having incompatible columns using :meth:`DataFrame.merge` (:issue:`51861`)
Expand Down
88 changes: 88 additions & 0 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -2089,6 +2089,68 @@ def _dt_isocalendar(self):
def _dt_is_leap_year(self):
return type(self)(pc.is_leap_year(self._pa_array))

@property
def _dt_is_month_start(self):
    """Boolean array that is true where the day-of-month component equals 1."""
    day_of_month = pc.day(self._pa_array)
    on_first_day = pc.equal(day_of_month, 1)
    return type(self)(on_first_day)

@property
def _dt_is_month_end(self):
    """
    Boolean array flagging elements treated as the last day of a month.

    An element is flagged when ``days_between`` reports exactly one day from
    the day-floor of the value to its month-ceiling.
    """
    day_floor = pc.floor_temporal(self._pa_array, unit="day")
    month_ceiling = pc.ceil_temporal(self._pa_array, unit="month")
    gap_in_days = pc.days_between(day_floor, month_ceiling)
    return type(self)(pc.equal(gap_in_days, 1))

@property
def _dt_is_year_start(self):
    """Boolean array that is true where month == 1 and day == 1."""
    in_january = pc.equal(pc.month(self._pa_array), 1)
    on_first_day = pc.equal(pc.day(self._pa_array), 1)
    return type(self)(pc.and_(in_january, on_first_day))

@property
def _dt_is_year_end(self):
    """Boolean array that is true where month == 12 and day == 31."""
    in_december = pc.equal(pc.month(self._pa_array), 12)
    on_thirty_first = pc.equal(pc.day(self._pa_array), 31)
    return type(self)(pc.and_(in_december, on_thirty_first))

@property
def _dt_is_quarter_start(self):
    """
    Boolean array flagging elements treated as the first day of a quarter.

    An element is flagged when flooring it to the quarter and flooring it to
    the day produce the same value.
    """
    quarter_floor = pc.floor_temporal(self._pa_array, unit="quarter")
    day_floor = pc.floor_temporal(self._pa_array, unit="day")
    return type(self)(pc.equal(quarter_floor, day_floor))

@property
def _dt_is_quarter_end(self):
    """
    Boolean array flagging elements treated as the last day of a quarter.

    An element is flagged when ``days_between`` reports exactly one day from
    the day-floor of the value to its quarter-ceiling.
    """
    day_floor = pc.floor_temporal(self._pa_array, unit="day")
    quarter_ceiling = pc.ceil_temporal(self._pa_array, unit="quarter")
    gap_in_days = pc.days_between(day_floor, quarter_ceiling)
    return type(self)(pc.equal(gap_in_days, 1))

@property
def _dt_days_in_month(self):
    """
    Day count between the month-floor and the month-ceiling of each element.
    """
    month_floor = pc.floor_temporal(self._pa_array, unit="month")
    month_ceiling = pc.ceil_temporal(self._pa_array, unit="month")
    return type(self)(pc.days_between(month_floor, month_ceiling))

# Alternate spelling bound to the very same property object.
_dt_daysinmonth = _dt_days_in_month

@property
def _dt_microsecond(self):
    """Result of ``pyarrow.compute.microsecond`` wrapped in this array type."""
    microseconds = pc.microsecond(self._pa_array)
    return type(self)(microseconds)
Expand Down Expand Up @@ -2130,6 +2192,13 @@ def _dt_time(self):
def _dt_tz(self):
return self.dtype.pyarrow_dtype.tz

@property
def _dt_unit(self):
    """Return the ``unit`` attribute of this array's pyarrow dtype."""
    arrow_type = self.dtype.pyarrow_dtype
    return arrow_type.unit

def _dt_normalize(self):
    """Return a new array with every element floored to a multiple of 1 day."""
    floored_to_day = pc.floor_temporal(self._pa_array, 1, "day")
    return type(self)(floored_to_day)

def _dt_strftime(self, format: str):
    """
    Render each element through ``pyarrow.compute.strftime``.

    Parameters
    ----------
    format : str
        Format pattern forwarded unchanged to pyarrow.
    """
    rendered = pc.strftime(self._pa_array, format=format)
    return type(self)(rendered)

Expand Down Expand Up @@ -2194,6 +2263,16 @@ def _dt_round(
):
return self._round_temporally("round", freq, ambiguous, nonexistent)

def _dt_day_name(self, locale: str | None = None):
    """
    Format each element with the ``%A`` strftime directive.

    Parameters
    ----------
    locale : str or None, default None
        Locale handed to ``pyarrow.compute.strftime``; ``None`` is replaced
        by ``"C"``.
    """
    effective_locale = "C" if locale is None else locale
    names = pc.strftime(self._pa_array, format="%A", locale=effective_locale)
    return type(self)(names)

def _dt_month_name(self, locale: str | None = None):
    """
    Format each element with the ``%B`` strftime directive.

    Parameters
    ----------
    locale : str or None, default None
        Locale handed to ``pyarrow.compute.strftime``; ``None`` is replaced
        by ``"C"``.
    """
    effective_locale = "C" if locale is None else locale
    names = pc.strftime(self._pa_array, format="%B", locale=effective_locale)
    return type(self)(names)

def _dt_to_pydatetime(self):
data = self._pa_array.to_pylist()
if self._dtype.pyarrow_dtype.unit == "ns":
Expand Down Expand Up @@ -2224,3 +2303,12 @@ def _dt_tz_localize(
self._pa_array, str(tz), ambiguous=ambiguous, nonexistent=nonexistent_pa
)
return type(self)(result)

def _dt_tz_convert(self, tz):
    """
    Cast the array to a timestamp type with the same unit and timezone ``tz``.

    Raises
    ------
    TypeError
        If the current pyarrow dtype has no timezone (``tz is None``).
    """
    source_type = self.dtype.pyarrow_dtype
    if source_type.tz is None:
        raise TypeError(
            "Cannot convert tz-naive timestamps, use tz_localize to localize"
        )
    # Only the timezone changes; the unit of the source type is kept.
    target_type = pa.timestamp(source_type.unit, tz)
    return type(self)(self._pa_array.cast(target_type))
155 changes: 155 additions & 0 deletions pandas/tests/extension/test_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -2248,12 +2248,110 @@ def test_dt_properties(prop, expected):
tm.assert_series_equal(result, expected)


def test_dt_is_month_start_end():
    """Check ``dt.is_month_start`` and ``dt.is_month_end`` on pyarrow timestamps."""
    stamps = [
        datetime(year=2023, month=12, day=2, hour=3),
        datetime(year=2023, month=1, day=1, hour=3),
        datetime(year=2023, month=3, day=31, hour=3),
        None,
    ]
    ser = pd.Series(stamps, dtype=ArrowDtype(pa.timestamp("us")))

    starts = ser.dt.is_month_start
    expected_starts = pd.Series(
        [False, True, False, None], dtype=ArrowDtype(pa.bool_())
    )
    tm.assert_series_equal(starts, expected_starts)

    ends = ser.dt.is_month_end
    expected_ends = pd.Series(
        [False, False, True, None], dtype=ArrowDtype(pa.bool_())
    )
    tm.assert_series_equal(ends, expected_ends)


def test_dt_is_year_start_end():
    """Check ``dt.is_year_start`` and ``dt.is_year_end`` on pyarrow timestamps."""
    stamps = [
        datetime(year=2023, month=12, day=31, hour=3),
        datetime(year=2023, month=1, day=1, hour=3),
        datetime(year=2023, month=3, day=31, hour=3),
        None,
    ]
    ser = pd.Series(stamps, dtype=ArrowDtype(pa.timestamp("us")))

    starts = ser.dt.is_year_start
    expected_starts = pd.Series(
        [False, True, False, None], dtype=ArrowDtype(pa.bool_())
    )
    tm.assert_series_equal(starts, expected_starts)

    ends = ser.dt.is_year_end
    expected_ends = pd.Series(
        [True, False, False, None], dtype=ArrowDtype(pa.bool_())
    )
    tm.assert_series_equal(ends, expected_ends)


def test_dt_is_quarter_start_end():
    """Check ``dt.is_quarter_start`` and ``dt.is_quarter_end`` on pyarrow timestamps."""
    stamps = [
        datetime(year=2023, month=11, day=30, hour=3),
        datetime(year=2023, month=1, day=1, hour=3),
        datetime(year=2023, month=3, day=31, hour=3),
        None,
    ]
    ser = pd.Series(stamps, dtype=ArrowDtype(pa.timestamp("us")))

    starts = ser.dt.is_quarter_start
    expected_starts = pd.Series(
        [False, True, False, None], dtype=ArrowDtype(pa.bool_())
    )
    tm.assert_series_equal(starts, expected_starts)

    ends = ser.dt.is_quarter_end
    expected_ends = pd.Series(
        [False, False, True, None], dtype=ArrowDtype(pa.bool_())
    )
    tm.assert_series_equal(ends, expected_ends)


@pytest.mark.parametrize("method", ["days_in_month", "daysinmonth"])
def test_dt_days_in_month(method):
    """Both spellings of the accessor report 31, 30 and 28 days; null stays null."""
    stamps = [
        datetime(year=2023, month=3, day=30, hour=3),
        datetime(year=2023, month=4, day=1, hour=3),
        datetime(year=2023, month=2, day=3, hour=3),
        None,
    ]
    ser = pd.Series(stamps, dtype=ArrowDtype(pa.timestamp("us")))

    day_counts = getattr(ser.dt, method)

    expected = pd.Series([31, 30, 28, None], dtype=ArrowDtype(pa.int64()))
    tm.assert_series_equal(day_counts, expected)


def test_dt_normalize():
    """``dt.normalize()`` drops the time-of-day part and keeps nulls."""
    micro_ts = ArrowDtype(pa.timestamp("us"))
    raw = [
        datetime(year=2023, month=3, day=30),
        datetime(year=2023, month=4, day=1, hour=3),
        datetime(year=2023, month=2, day=3, hour=23, minute=59, second=59),
        None,
    ]
    midnights = [
        datetime(year=2023, month=3, day=30),
        datetime(year=2023, month=4, day=1),
        datetime(year=2023, month=2, day=3),
        None,
    ]
    ser = pd.Series(raw, dtype=micro_ts)

    normalized = ser.dt.normalize()

    expected = pd.Series(midnights, dtype=micro_ts)
    tm.assert_series_equal(normalized, expected)


@pytest.mark.parametrize("unit", ["us", "ns"])
def test_dt_time_preserve_unit(unit):
ser = pd.Series(
[datetime(year=2023, month=1, day=2, hour=3), None],
dtype=ArrowDtype(pa.timestamp(unit)),
)
assert ser.dt.unit == unit

result = ser.dt.time
expected = pd.Series(
ArrowExtensionArray(pa.array([time(3, 0), None], type=pa.time64(unit)))
Expand Down Expand Up @@ -2285,6 +2383,27 @@ def test_dt_isocalendar():
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "method, exp",
    [
        ["day_name", "Sunday"],
        ["month_name", "January"],
    ],
)
def test_dt_day_month_name(method, exp, request):
    """``dt.day_name()`` / ``dt.month_name()`` for 2023-01-01 with a null entry."""
    # GH 52388
    on_windows_ci = is_platform_windows() and is_ci_environment()
    if on_windows_ci:
        xfail_marker = pytest.mark.xfail(
            raises=pa.ArrowInvalid,
            reason=(
                "TODO: Set ARROW_TIMEZONE_DATABASE environment variable "
                "on CI to path to the tzdata for pyarrow."
            ),
        )
        request.node.add_marker(xfail_marker)

    milli_ts = ArrowDtype(pa.timestamp("ms"))
    ser = pd.Series([datetime(2023, 1, 1), None], dtype=milli_ts)

    name_accessor = getattr(ser.dt, method)
    names = name_accessor()

    expected = pd.Series([exp, None], dtype=ArrowDtype(pa.string()))
    tm.assert_series_equal(names, expected)


def test_dt_strftime(request):
if is_platform_windows() and is_ci_environment():
request.node.add_marker(
Expand Down Expand Up @@ -2445,6 +2564,42 @@ def test_dt_tz_localize_nonexistent(nonexistent, exp_date, request):
tm.assert_series_equal(result, expected)


def test_dt_tz_convert_not_tz_raises():
    """``dt.tz_convert`` on a tz-naive pyarrow timestamp Series raises TypeError."""
    naive_ns = ArrowDtype(pa.timestamp("ns"))
    values = [datetime(year=2023, month=1, day=2, hour=3), None]
    ser = pd.Series(values, dtype=naive_ns)

    with pytest.raises(TypeError, match="Cannot convert tz-naive timestamps"):
        ser.dt.tz_convert("UTC")


def test_dt_tz_convert_none():
    """``dt.tz_convert(None)`` yields a Series with a tz-naive timestamp dtype."""
    values = [datetime(year=2023, month=1, day=2, hour=3), None]
    aware = pd.Series(values, dtype=ArrowDtype(pa.timestamp("ns", "US/Pacific")))

    converted = aware.dt.tz_convert(None)

    expected = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None],
        dtype=ArrowDtype(pa.timestamp("ns")),
    )
    tm.assert_series_equal(converted, expected)


@pytest.mark.parametrize("unit", ["us", "ns"])
def test_dt_tz_convert(unit):
    """``dt.tz_convert`` switches the dtype's timezone and keeps its unit."""
    pacific = ArrowDtype(pa.timestamp(unit, "US/Pacific"))
    eastern = ArrowDtype(pa.timestamp(unit, "US/Eastern"))
    ser = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None], dtype=pacific
    )

    converted = ser.dt.tz_convert("US/Eastern")

    expected = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None], dtype=eastern
    )
    tm.assert_series_equal(converted, expected)


@pytest.mark.parametrize("skipna", [True, False])
def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna):
# GH51624
Expand Down