Skip to content

Commit 367d24f

Browse files
authored
ENH: Add more dt property/method support for ArrowDtype(timestamp) (#52503)
* Add more properties & attributes * Add issue number * Add xfails * Simplify days_in_month * Add tz_convert * Undo quarter * Add another issue * simplify is_quarter * undo test * simplify * fix is_quarter_end * Address is_month_end * Remove unused
1 parent c3f0aac commit 367d24f

File tree

3 files changed

+244
-0
lines changed

3 files changed

+244
-0
lines changed

doc/source/whatsnew/v2.1.0.rst

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@ Other enhancements
8787
- Added to the escape mode "latex-math" preserving without escaping all characters between "\(" and "\)" in formatter (:issue:`51903`)
8888
- Adding ``engine_kwargs`` parameter to :meth:`DataFrame.read_excel` (:issue:`52214`)
8989
- Classes that are useful for type-hinting have been added to the public API in the new submodule ``pandas.api.typing`` (:issue:`48577`)
90+
- Implemented :attr:`Series.dt.is_month_start`, :attr:`Series.dt.is_month_end`, :attr:`Series.dt.is_year_start`, :attr:`Series.dt.is_year_end`, :attr:`Series.dt.is_quarter_start`, :attr:`Series.dt.is_quarter_end`, :attr:`Series.dt.days_in_month`, :attr:`Series.dt.unit`, :meth:`Series.dt.normalize`, :meth:`Series.dt.day_name`, :meth:`Series.dt.month_name`, :meth:`Series.dt.tz_convert` for :class:`ArrowDtype` with ``pyarrow.timestamp`` (:issue:`52388`, :issue:`51718`)
9091
- Implemented ``__from_arrow__`` on :class:`DatetimeTZDtype`. (:issue:`52201`)
9192
- Implemented ``__pandas_priority__`` to allow custom types to take precedence over :class:`DataFrame`, :class:`Series`, :class:`Index`, or :class:`ExtensionArray` for arithmetic operations, :ref:`see the developer guide <extending.pandas_priority>` (:issue:`48347`)
9293
- Improve error message when having incompatible columns using :meth:`DataFrame.merge` (:issue:`51861`)

pandas/core/arrays/arrow/array.py

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2102,6 +2102,68 @@ def _dt_isocalendar(self):
21022102
def _dt_is_leap_year(self):
21032103
return type(self)(pc.is_leap_year(self._pa_array))
21042104

2105+
@property
def _dt_is_month_start(self):
    """Flag each timestamp whose day-of-month component equals 1."""
    day_of_month = pc.day(self._pa_array)
    return type(self)(pc.equal(day_of_month, 1))
2108+
2109+
@property
def _dt_is_month_end(self):
    """
    Flag each timestamp that lies on the last day of its month.

    A value qualifies when its date (floored to "day") is exactly one day
    before the boundary obtained by ceiling the value to "month".
    """
    start_of_day = pc.floor_temporal(self._pa_array, unit="day")
    month_boundary = pc.ceil_temporal(self._pa_array, unit="month")
    days_remaining = pc.days_between(start_of_day, month_boundary)
    return type(self)(pc.equal(days_remaining, 1))
2119+
2120+
@property
def _dt_is_year_start(self):
    """Flag each timestamp whose month is 1 and whose day-of-month is 1."""
    in_first_month = pc.equal(pc.month(self._pa_array), 1)
    on_first_day = pc.equal(pc.day(self._pa_array), 1)
    return type(self)(pc.and_(in_first_month, on_first_day))
2128+
2129+
@property
def _dt_is_year_end(self):
    """Flag each timestamp whose month is 12 and whose day-of-month is 31."""
    in_last_month = pc.equal(pc.month(self._pa_array), 12)
    on_day_31 = pc.equal(pc.day(self._pa_array), 31)
    return type(self)(pc.and_(in_last_month, on_day_31))
2137+
2138+
@property
def _dt_is_quarter_start(self):
    """
    Flag each timestamp that lies on the first day of its quarter.

    True where flooring to "quarter" and flooring to "day" yield the same
    value.
    """
    quarter_floor = pc.floor_temporal(self._pa_array, unit="quarter")
    day_floor = pc.floor_temporal(self._pa_array, unit="day")
    return type(self)(pc.equal(quarter_floor, day_floor))
2145+
2146+
@property
def _dt_is_quarter_end(self):
    """
    Flag each timestamp that lies on the last day of its quarter.

    A value qualifies when its date (floored to "day") is exactly one day
    before the boundary obtained by ceiling the value to "quarter".
    """
    start_of_day = pc.floor_temporal(self._pa_array, unit="day")
    quarter_boundary = pc.ceil_temporal(self._pa_array, unit="quarter")
    days_remaining = pc.days_between(start_of_day, quarter_boundary)
    return type(self)(pc.equal(days_remaining, 1))
2156+
2157+
@property
def _dt_days_in_month(self):
    """
    Number of days in the calendar month of each timestamp.

    Computed as the day distance between the start of the month containing
    the timestamp and the start of the following month.
    """
    month_start = pc.floor_temporal(self._pa_array, unit="month")
    # By default ceil_temporal returns its input unchanged when the input
    # already sits exactly on a month boundary (e.g. 2023-01-01 00:00:00),
    # which would make floor == ceil and report a 0-day month.  Requiring a
    # strictly greater result always lands on the start of the next month.
    next_month_start = pc.ceil_temporal(
        self._pa_array, unit="month", ceil_is_strictly_greater=True
    )
    return type(self)(pc.days_between(month_start, next_month_start))
2164+
2165+
_dt_daysinmonth = _dt_days_in_month
2166+
21052167
@property
def _dt_microsecond(self):
    """Wrap the result of ``pc.microsecond`` applied to the underlying array."""
    microseconds = pc.microsecond(self._pa_array)
    return type(self)(microseconds)
@@ -2143,6 +2205,13 @@ def _dt_time(self):
21432205
def _dt_tz(self):
21442206
return self.dtype.pyarrow_dtype.tz
21452207

2208+
@property
def _dt_unit(self):
    """Return the ``unit`` attribute of this array's pyarrow dtype."""
    arrow_type = self.dtype.pyarrow_dtype
    return arrow_type.unit
2211+
2212+
def _dt_normalize(self):
    """Floor every timestamp to a multiple of one day and wrap the result."""
    floored_to_day = pc.floor_temporal(self._pa_array, 1, "day")
    return type(self)(floored_to_day)
2214+
21462215
def _dt_strftime(self, format: str):
    """Format each value with ``pc.strftime`` using the given format string."""
    formatted = pc.strftime(self._pa_array, format=format)
    return type(self)(formatted)
21482217

@@ -2207,6 +2276,16 @@ def _dt_round(
22072276
):
22082277
return self._round_temporally("round", freq, ambiguous, nonexistent)
22092278

2279+
def _dt_day_name(self, locale: str | None = None):
    """
    Format each timestamp with ``"%A"`` via ``pc.strftime``.

    When ``locale`` is None, the ``"C"`` locale is used.
    """
    effective_locale = "C" if locale is None else locale
    names = pc.strftime(self._pa_array, format="%A", locale=effective_locale)
    return type(self)(names)
2283+
2284+
def _dt_month_name(self, locale: str | None = None):
    """
    Format each timestamp with ``"%B"`` via ``pc.strftime``.

    When ``locale`` is None, the ``"C"`` locale is used.
    """
    effective_locale = "C" if locale is None else locale
    names = pc.strftime(self._pa_array, format="%B", locale=effective_locale)
    return type(self)(names)
2288+
22102289
def _dt_to_pydatetime(self):
22112290
data = self._pa_array.to_pylist()
22122291
if self._dtype.pyarrow_dtype.unit == "ns":
@@ -2237,3 +2316,12 @@ def _dt_tz_localize(
22372316
self._pa_array, str(tz), ambiguous=ambiguous, nonexistent=nonexistent_pa
22382317
)
22392318
return type(self)(result)
2319+
2320+
def _dt_tz_convert(self, tz):
    """
    Cast tz-aware timestamps to a timestamp type carrying timezone ``tz``.

    The array's current unit is kept for the target type.

    Raises
    ------
    TypeError
        If the array's pyarrow dtype has no timezone.
    """
    arrow_type = self.dtype.pyarrow_dtype
    if arrow_type.tz is None:
        raise TypeError(
            "Cannot convert tz-naive timestamps, use tz_localize to localize"
        )
    target_type = pa.timestamp(arrow_type.unit, tz)
    return type(self)(self._pa_array.cast(target_type))

pandas/tests/extension/test_arrow.py

Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2275,12 +2275,110 @@ def test_dt_properties(prop, expected):
22752275
tm.assert_series_equal(result, expected)
22762276

22772277

2278+
def test_dt_is_month_start_end():
    """Check ``dt.is_month_start`` and ``dt.is_month_end`` on Arrow timestamps."""
    timestamps = [
        datetime(year=2023, month=12, day=2, hour=3),
        datetime(year=2023, month=1, day=1, hour=3),
        datetime(year=2023, month=3, day=31, hour=3),
        None,
    ]
    ser = pd.Series(timestamps, dtype=ArrowDtype(pa.timestamp("us")))

    # Only the 1 January row is a month start; the null stays null.
    starts = ser.dt.is_month_start
    expected_starts = pd.Series(
        [False, True, False, None], dtype=ArrowDtype(pa.bool_())
    )
    tm.assert_series_equal(starts, expected_starts)

    # Only the 31 March row is a month end; the null stays null.
    ends = ser.dt.is_month_end
    expected_ends = pd.Series(
        [False, False, True, None], dtype=ArrowDtype(pa.bool_())
    )
    tm.assert_series_equal(ends, expected_ends)
2295+
2296+
2297+
def test_dt_is_year_start_end():
    """Check ``dt.is_year_start`` and ``dt.is_year_end`` on Arrow timestamps."""
    timestamps = [
        datetime(year=2023, month=12, day=31, hour=3),
        datetime(year=2023, month=1, day=1, hour=3),
        datetime(year=2023, month=3, day=31, hour=3),
        None,
    ]
    ser = pd.Series(timestamps, dtype=ArrowDtype(pa.timestamp("us")))

    # Only the 1 January row is a year start; the null stays null.
    starts = ser.dt.is_year_start
    expected_starts = pd.Series(
        [False, True, False, None], dtype=ArrowDtype(pa.bool_())
    )
    tm.assert_series_equal(starts, expected_starts)

    # Only the 31 December row is a year end; the null stays null.
    ends = ser.dt.is_year_end
    expected_ends = pd.Series(
        [True, False, False, None], dtype=ArrowDtype(pa.bool_())
    )
    tm.assert_series_equal(ends, expected_ends)
2314+
2315+
2316+
def test_dt_is_quarter_start_end():
    """Check ``dt.is_quarter_start`` and ``dt.is_quarter_end`` on Arrow timestamps."""
    timestamps = [
        datetime(year=2023, month=11, day=30, hour=3),
        datetime(year=2023, month=1, day=1, hour=3),
        datetime(year=2023, month=3, day=31, hour=3),
        None,
    ]
    ser = pd.Series(timestamps, dtype=ArrowDtype(pa.timestamp("us")))

    # Only the 1 January row is a quarter start; the null stays null.
    starts = ser.dt.is_quarter_start
    expected_starts = pd.Series(
        [False, True, False, None], dtype=ArrowDtype(pa.bool_())
    )
    tm.assert_series_equal(starts, expected_starts)

    # Only the 31 March row is a quarter end; the null stays null.
    ends = ser.dt.is_quarter_end
    expected_ends = pd.Series(
        [False, False, True, None], dtype=ArrowDtype(pa.bool_())
    )
    tm.assert_series_equal(ends, expected_ends)
2333+
2334+
2335+
@pytest.mark.parametrize("method", ["days_in_month", "daysinmonth"])
def test_dt_days_in_month(method):
    """Both accessor spellings report the month length as an Arrow int64 Series."""
    timestamps = [
        datetime(year=2023, month=3, day=30, hour=3),
        datetime(year=2023, month=4, day=1, hour=3),
        datetime(year=2023, month=2, day=3, hour=3),
        None,
    ]
    ser = pd.Series(timestamps, dtype=ArrowDtype(pa.timestamp("us")))

    month_lengths = getattr(ser.dt, method)

    # March, April and February 2023 respectively; the null stays null.
    expected = pd.Series([31, 30, 28, None], dtype=ArrowDtype(pa.int64()))
    tm.assert_series_equal(month_lengths, expected)
2349+
2350+
2351+
def test_dt_normalize():
    """``dt.normalize()`` drops the time-of-day part and keeps the dtype."""
    timestamp_dtype = ArrowDtype(pa.timestamp("us"))
    ser = pd.Series(
        [
            datetime(year=2023, month=3, day=30),
            datetime(year=2023, month=4, day=1, hour=3),
            datetime(year=2023, month=2, day=3, hour=23, minute=59, second=59),
            None,
        ],
        dtype=timestamp_dtype,
    )

    normalized = ser.dt.normalize()

    # Same dates at midnight; the null stays null.
    midnight_values = [
        datetime(year=2023, month=3, day=30),
        datetime(year=2023, month=4, day=1),
        datetime(year=2023, month=2, day=3),
        None,
    ]
    expected = pd.Series(midnight_values, dtype=ArrowDtype(pa.timestamp("us")))
    tm.assert_series_equal(normalized, expected)
2372+
2373+
22782374
@pytest.mark.parametrize("unit", ["us", "ns"])
22792375
def test_dt_time_preserve_unit(unit):
22802376
ser = pd.Series(
22812377
[datetime(year=2023, month=1, day=2, hour=3), None],
22822378
dtype=ArrowDtype(pa.timestamp(unit)),
22832379
)
2380+
assert ser.dt.unit == unit
2381+
22842382
result = ser.dt.time
22852383
expected = pd.Series(
22862384
ArrowExtensionArray(pa.array([time(3, 0), None], type=pa.time64(unit)))
@@ -2312,6 +2410,27 @@ def test_dt_isocalendar():
23122410
tm.assert_frame_equal(result, expected)
23132411

23142412

2413+
@pytest.mark.parametrize(
    "method, exp", [["day_name", "Sunday"], ["month_name", "January"]]
)
def test_dt_day_month_name(method, exp, request):
    """``dt.day_name()`` / ``dt.month_name()`` return Arrow string Series."""
    # GH 52388
    if is_platform_windows() and is_ci_environment():
        # Expected to fail with ArrowInvalid on Windows CI; see the reason text.
        windows_ci_xfail = pytest.mark.xfail(
            raises=pa.ArrowInvalid,
            reason=(
                "TODO: Set ARROW_TIMEZONE_DATABASE environment variable "
                "on CI to path to the tzdata for pyarrow."
            ),
        )
        request.node.add_marker(windows_ci_xfail)

    ser = pd.Series([datetime(2023, 1, 1), None], dtype=ArrowDtype(pa.timestamp("ms")))
    names = getattr(ser.dt, method)()

    expected = pd.Series([exp, None], dtype=ArrowDtype(pa.string()))
    tm.assert_series_equal(names, expected)
2432+
2433+
23152434
def test_dt_strftime(request):
23162435
if is_platform_windows() and is_ci_environment():
23172436
request.node.add_marker(
@@ -2472,6 +2591,42 @@ def test_dt_tz_localize_nonexistent(nonexistent, exp_date, request):
24722591
tm.assert_series_equal(result, expected)
24732592

24742593

2594+
def test_dt_tz_convert_not_tz_raises():
    """``dt.tz_convert`` on a tz-naive Arrow timestamp Series raises TypeError."""
    naive_values = [datetime(year=2023, month=1, day=2, hour=3), None]
    ser = pd.Series(naive_values, dtype=ArrowDtype(pa.timestamp("ns")))

    with pytest.raises(TypeError, match="Cannot convert tz-naive timestamps"):
        ser.dt.tz_convert("UTC")
2601+
2602+
2603+
def test_dt_tz_convert_none():
    """``dt.tz_convert(None)`` yields a Series with a tz-naive timestamp dtype."""
    values = [datetime(year=2023, month=1, day=2, hour=3), None]
    ser = pd.Series(values, dtype=ArrowDtype(pa.timestamp("ns", "US/Pacific")))

    converted = ser.dt.tz_convert(None)

    # Expected Series: the same input values with the timezone-less "ns" dtype.
    expected = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None],
        dtype=ArrowDtype(pa.timestamp("ns")),
    )
    tm.assert_series_equal(converted, expected)
2614+
2615+
2616+
@pytest.mark.parametrize("unit", ["us", "ns"])
def test_dt_tz_convert(unit):
    """``dt.tz_convert`` swaps the dtype's timezone and keeps the unit."""
    values = [datetime(year=2023, month=1, day=2, hour=3), None]
    ser = pd.Series(values, dtype=ArrowDtype(pa.timestamp(unit, "US/Pacific")))

    converted = ser.dt.tz_convert("US/Eastern")

    # Expected Series: the same input values with the target-zone dtype.
    expected = pd.Series(
        [datetime(year=2023, month=1, day=2, hour=3), None],
        dtype=ArrowDtype(pa.timestamp(unit, "US/Eastern")),
    )
    tm.assert_series_equal(converted, expected)
2628+
2629+
24752630
@pytest.mark.parametrize("skipna", [True, False])
24762631
def test_boolean_reduce_series_all_null(all_boolean_reductions, skipna):
24772632
# GH51624

0 commit comments

Comments
 (0)