Skip to content

Commit 2421931

Browse files
authored
PERF: array_strptime (#55898)
* PERF: array_strptime avoid object path
* creso fixup
* Fixup leftover assertion
* object instead of raise
* post-merge fixup
* post-merge fixup
1 parent b8d0471 commit 2421931

File tree

2 files changed

+238
-77
lines changed

2 files changed

+238
-77
lines changed

pandas/_libs/tslibs/strptime.pyx

Lines changed: 227 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,18 @@ from numpy cimport (
4747
)
4848

4949
from pandas._libs.missing cimport checknull_with_nat_and_na
50-
from pandas._libs.tslibs.conversion cimport get_datetime64_nanos
50+
from pandas._libs.tslibs.conversion cimport (
51+
get_datetime64_nanos,
52+
parse_pydatetime,
53+
)
5154
from pandas._libs.tslibs.dtypes cimport (
5255
get_supported_reso,
5356
npy_unit_to_abbrev,
5457
npy_unit_to_attrname,
5558
)
5659
from pandas._libs.tslibs.nattype cimport (
5760
NPY_NAT,
61+
c_NaT as NaT,
5862
c_nat_strings as nat_strings,
5963
)
6064
from pandas._libs.tslibs.np_datetime cimport (
@@ -65,7 +69,6 @@ from pandas._libs.tslibs.np_datetime cimport (
6569
npy_datetimestruct,
6670
npy_datetimestruct_to_datetime,
6771
pydate_to_dt64,
68-
pydatetime_to_dt64,
6972
string_to_dts,
7073
)
7174

@@ -82,6 +85,8 @@ from pandas._libs.util cimport (
8285

8386
from pandas._libs.tslibs.timestamps import Timestamp
8487

88+
from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single
89+
8590
cnp.import_array()
8691

8792

@@ -314,11 +319,13 @@ def array_strptime(
314319
Py_ssize_t i, n = len(values)
315320
npy_datetimestruct dts
316321
int64_t[::1] iresult
317-
object[::1] result_timezone
318322
object val, tz
323+
bint seen_datetime_offset = False
319324
bint is_raise = errors=="raise"
320325
bint is_ignore = errors=="ignore"
321326
bint is_coerce = errors=="coerce"
327+
bint is_same_offsets
328+
set out_tzoffset_vals = set()
322329
tzinfo tz_out = None
323330
bint iso_format = format_is_iso(fmt)
324331
NPY_DATETIMEUNIT out_bestunit, item_reso
@@ -338,7 +345,6 @@ def array_strptime(
338345
abbrev = npy_unit_to_abbrev(creso)
339346
result = np.empty(n, dtype=f"M8[{abbrev}]")
340347
iresult = result.view("i8")
341-
result_timezone = np.empty(n, dtype="object")
342348

343349
dts.us = dts.ps = dts.as = 0
344350

@@ -361,23 +367,18 @@ def array_strptime(
361367
if infer_reso:
362368
creso = state.creso
363369
tz_out = state.process_datetime(val, tz_out, utc)
364-
if isinstance(val, _Timestamp):
365-
val = (<_Timestamp>val)._as_creso(creso)
366-
iresult[i] = val.tz_localize(None)._value
367-
else:
368-
iresult[i] = pydatetime_to_dt64(
369-
val.replace(tzinfo=None), &dts, reso=creso
370-
)
371-
result_timezone[i] = val.tzinfo
370+
iresult[i] = parse_pydatetime(val, &dts, state.creso)
372371
continue
373372
elif PyDate_Check(val):
373+
state.found_other = True
374374
item_reso = NPY_DATETIMEUNIT.NPY_FR_s
375375
state.update_creso(item_reso)
376376
if infer_reso:
377377
creso = state.creso
378378
iresult[i] = pydate_to_dt64(val, &dts, reso=creso)
379379
continue
380380
elif cnp.is_datetime64_object(val):
381+
state.found_other = True
381382
item_reso = get_supported_reso(get_datetime64_unit(val))
382383
state.update_creso(item_reso)
383384
if infer_reso:
@@ -418,13 +419,17 @@ def array_strptime(
418419
f"Out of bounds {attrname} timestamp: {val}"
419420
) from err
420421
if out_local == 1:
421-
# Store the out_tzoffset in seconds
422-
# since we store the total_seconds of
423-
# dateutil.tz.tzoffset objects
422+
nsecs = out_tzoffset * 60
423+
out_tzoffset_vals.add(nsecs)
424+
seen_datetime_offset = True
424425
tz = timezone(timedelta(minutes=out_tzoffset))
425-
result_timezone[i] = tz
426-
out_local = 0
427-
out_tzoffset = 0
426+
value = tz_localize_to_utc_single(
427+
value, tz, ambiguous="raise", nonexistent=None, creso=creso
428+
)
429+
else:
430+
tz = None
431+
out_tzoffset_vals.add("naive")
432+
state.found_naive_str = True
428433
iresult[i] = value
429434
continue
430435

@@ -450,14 +455,34 @@ def array_strptime(
450455
state.update_creso(item_reso)
451456
if infer_reso:
452457
creso = state.creso
458+
453459
try:
454460
iresult[i] = npy_datetimestruct_to_datetime(creso, &dts)
455461
except OverflowError as err:
456462
attrname = npy_unit_to_attrname[creso]
457463
raise OutOfBoundsDatetime(
458464
f"Out of bounds {attrname} timestamp: {val}"
459465
) from err
460-
result_timezone[i] = tz
466+
467+
if tz is not None:
468+
ival = iresult[i]
469+
iresult[i] = tz_localize_to_utc_single(
470+
ival, tz, ambiguous="raise", nonexistent=None, creso=creso
471+
)
472+
nsecs = (ival - iresult[i])
473+
if creso == NPY_FR_ns:
474+
nsecs = nsecs // 10**9
475+
elif creso == NPY_DATETIMEUNIT.NPY_FR_us:
476+
nsecs = nsecs // 10**6
477+
elif creso == NPY_DATETIMEUNIT.NPY_FR_ms:
478+
nsecs = nsecs // 10**3
479+
480+
out_tzoffset_vals.add(nsecs)
481+
seen_datetime_offset = True
482+
else:
483+
state.found_naive_str = True
484+
tz = None
485+
out_tzoffset_vals.add("naive")
461486

462487
except (ValueError, OutOfBoundsDatetime) as ex:
463488
ex.args = (
@@ -474,7 +499,37 @@ def array_strptime(
474499
continue
475500
elif is_raise:
476501
raise
477-
return values, []
502+
return values, None
503+
504+
if seen_datetime_offset and not utc:
505+
is_same_offsets = len(out_tzoffset_vals) == 1
506+
if not is_same_offsets or (state.found_naive or state.found_other):
507+
result2 = _array_strptime_object_fallback(
508+
values, fmt=fmt, exact=exact, errors=errors, utc=utc
509+
)
510+
return result2, None
511+
elif tz_out is not None:
512+
# GH#55693
513+
tz_offset = out_tzoffset_vals.pop()
514+
tz_out2 = timezone(timedelta(seconds=tz_offset))
515+
if not tz_compare(tz_out, tz_out2):
516+
# e.g. test_to_datetime_mixed_offsets_with_utc_false_deprecated
517+
result2 = _array_strptime_object_fallback(
518+
values, fmt=fmt, exact=exact, errors=errors, utc=utc
519+
)
520+
return result2, None
521+
# e.g. test_guess_datetime_format_with_parseable_formats
522+
else:
523+
# e.g. test_to_datetime_iso8601_with_timezone_valid
524+
tz_offset = out_tzoffset_vals.pop()
525+
tz_out = timezone(timedelta(seconds=tz_offset))
526+
elif not utc:
527+
if tz_out and (state.found_other or state.found_naive_str):
528+
# found_other indicates a tz-naive int, float, dt64, or date
529+
result2 = _array_strptime_object_fallback(
530+
values, fmt=fmt, exact=exact, errors=errors, utc=utc
531+
)
532+
return result2, None
478533

479534
if infer_reso:
480535
if state.creso_ever_changed:
@@ -488,7 +543,6 @@ def array_strptime(
488543
utc=utc,
489544
creso=state.creso,
490545
)
491-
492546
elif state.creso == NPY_DATETIMEUNIT.NPY_FR_GENERIC:
493547
# i.e. we never encountered anything non-NaT, default to "s". This
494548
# ensures that insert and concat-like operations with NaT
@@ -499,7 +553,7 @@ def array_strptime(
499553
# a second pass.
500554
abbrev = npy_unit_to_abbrev(state.creso)
501555
result = iresult.base.view(f"M8[{abbrev}]")
502-
return result, result_timezone.base
556+
return result, tz_out
503557

504558

505559
cdef tzinfo _parse_with_format(
@@ -737,6 +791,157 @@ cdef tzinfo _parse_with_format(
737791
return tz
738792

739793

794+
def _array_strptime_object_fallback(
795+
ndarray[object] values,
796+
str fmt,
797+
bint exact=True,
798+
errors="raise",
799+
bint utc=False,
800+
):
"""
Parse ``values`` one element at a time into an object-dtype array.

Each successfully parsed element is stored as its own ``Timestamp``
(or ``NaT``), so elements may carry different timezones and resolutions.
In this file it is called from ``array_strptime`` when ``utc`` is False
and the parsed inputs cannot share a single timezone.

Parameters
----------
values : ndarray[object]
    Items to parse. Strings that are empty or in ``nat_strings``, null
    values, and int/float values that are NaN or equal to ``NPY_NAT``
    become ``NaT``. datetime, date and datetime64 objects are wrapped
    with ``Timestamp(val)``. Anything else is converted with ``str`` and
    parsed.
fmt : str
    Format string. ``"ISO8601"`` and formats for which ``format_is_iso``
    is true are first tried with ``string_to_dts``; otherwise (or if that
    fails for a non-"ISO8601" format) ``_parse_with_format`` is used.
exact : bool, default True
    Forwarded to ``string_to_dts`` (ISO-like formats) and to
    ``_parse_with_format``.
errors : {"raise", "coerce", "ignore"}, default "raise"
    What to do when an element raises ``ValueError`` or
    ``OutOfBoundsDatetime``: re-raise with the position appended to the
    message, store ``NaT``, or return ``values`` unchanged.
utc : bool, default False
    Forwarded to ``parse_today_now``.

Returns
-------
ndarray[object]
    The array of parsed ``Timestamp``/``NaT`` objects, or the input
    ``values`` itself when ``errors="ignore"`` and an element fails.

Warns
-----
FutureWarning
    Always emitted before the parsed result is returned (i.e. not on the
    ``errors="ignore"`` early return and not when an error is raised).
"""
801+
802+
cdef:
803+
Py_ssize_t i, n = len(values)
804+
npy_datetimestruct dts
805+
int64_t iresult
806+
object val
807+
tzinfo tz
808+
bint is_raise = errors=="raise"
809+
bint is_ignore = errors=="ignore"
810+
bint is_coerce = errors=="coerce"
811+
bint iso_format = format_is_iso(fmt)
812+
NPY_DATETIMEUNIT creso, out_bestunit, item_reso
813+
int out_local = 0, out_tzoffset = 0
814+
bint string_to_dts_succeeded = 0
815+
816+
# `errors` must be one of the three supported modes.
assert is_raise or is_ignore or is_coerce
817+
818+
item_reso = NPY_DATETIMEUNIT.NPY_FR_GENERIC
819+
format_regex, locale_time = _get_format_regex(fmt)
820+
821+
# Object dtype: every slot holds an independent Timestamp/NaT.
result = np.empty(n, dtype=object)
822+
823+
dts.us = dts.ps = dts.as = 0
824+
825+
for i in range(n):
826+
val = values[i]
827+
try:
828+
# Null-likes become NaT; datetime-likes are wrapped directly in
# Timestamp. Non-empty strings fall through to the parsing code below.
if isinstance(val, str):
829+
if len(val) == 0 or val in nat_strings:
830+
result[i] = NaT
831+
continue
832+
elif checknull_with_nat_and_na(val):
833+
result[i] = NaT
834+
continue
835+
elif PyDateTime_Check(val):
836+
result[i] = Timestamp(val)
837+
continue
838+
elif PyDate_Check(val):
839+
result[i] = Timestamp(val)
840+
continue
841+
elif cnp.is_datetime64_object(val):
842+
result[i] = Timestamp(val)
843+
continue
844+
elif (
845+
(is_integer_object(val) or is_float_object(val))
846+
and (val != val or val == NPY_NAT)
847+
):
848+
result[i] = NaT
849+
continue
850+
else:
851+
# Any other object is stringified and parsed like a string.
val = str(val)
852+
853+
# string_to_dts returns a falsy value on success, hence the `not`.
if fmt == "ISO8601":
854+
string_to_dts_succeeded = not string_to_dts(
855+
val, &dts, &out_bestunit, &out_local,
856+
&out_tzoffset, False, None, False
857+
)
858+
elif iso_format:
859+
string_to_dts_succeeded = not string_to_dts(
860+
val, &dts, &out_bestunit, &out_local,
861+
&out_tzoffset, False, fmt, exact
862+
)
863+
if string_to_dts_succeeded:
864+
# No error reported by string_to_dts, pick back up
865+
# where we left off
866+
creso = get_supported_reso(out_bestunit)
867+
try:
868+
value = npy_datetimestruct_to_datetime(creso, &dts)
869+
except OverflowError as err:
870+
raise OutOfBoundsDatetime(
871+
f"Out of bounds nanosecond timestamp: {val}"
872+
) from err
873+
# out_local == 1: an offset was parsed; out_tzoffset is used as minutes.
# The wall-time value is converted to UTC before building the Timestamp.
if out_local == 1:
874+
tz = timezone(timedelta(minutes=out_tzoffset))
875+
value = tz_localize_to_utc_single(
876+
value, tz, ambiguous="raise", nonexistent=None, creso=creso
877+
)
878+
else:
879+
tz = None
880+
ts = Timestamp._from_value_and_reso(value, creso, tz)
881+
result[i] = ts
882+
continue
883+
884+
# `iresult` only serves as the out-parameter here; the stored value is
# Timestamp(val), not the integer written by parse_today_now.
if parse_today_now(val, &iresult, utc, NPY_FR_ns):
885+
result[i] = Timestamp(val)
886+
continue
887+
888+
# Some ISO formats can't be parsed by string_to_dts
889+
# For example, 6-digit YYYYMD. So, if there's an error, and a format
890+
# was specified, then try the string-matching code below. If the format
891+
# specified was 'ISO8601', then we need to error, because
892+
# only string_to_dts handles mixed ISO8601 formats.
893+
if not string_to_dts_succeeded and fmt == "ISO8601":
894+
raise ValueError(f"Time data {val} is not ISO8601 format")
895+
896+
# Regex-based parse; fills `dts` and `item_reso` and returns the parsed
# tzinfo (or None).
tz = _parse_with_format(
897+
val, fmt, exact, format_regex, locale_time, &dts, &item_reso
898+
)
899+
try:
900+
iresult = npy_datetimestruct_to_datetime(item_reso, &dts)
901+
except OverflowError as err:
902+
raise OutOfBoundsDatetime(
903+
f"Out of bounds nanosecond timestamp: {val}"
904+
) from err
905+
if tz is not None:
906+
iresult = tz_localize_to_utc_single(
907+
iresult, tz, ambiguous="raise", nonexistent=None, creso=item_reso
908+
)
909+
ts = Timestamp._from_value_and_reso(iresult, item_reso, tz)
910+
result[i] = ts
911+
912+
except (ValueError, OutOfBoundsDatetime) as ex:
913+
# Append the failing position and usage hints to the message before
# deciding how to handle the error.
ex.args = (
914+
f"{str(ex)}, at position {i}. You might want to try:\n"
915+
" - passing `format` if your strings have a consistent format;\n"
916+
" - passing `format='ISO8601'` if your strings are "
917+
"all ISO8601 but not necessarily in exactly the same format;\n"
918+
" - passing `format='mixed'`, and the format will be "
919+
"inferred for each element individually. "
920+
"You might want to use `dayfirst` alongside this.",
921+
)
922+
if is_coerce:
923+
result[i] = NaT
924+
continue
925+
elif is_raise:
926+
raise
927+
# errors="ignore": return the caller's input unchanged (no warning issued).
return values
928+
929+
# Reached only when the loop finished without raising or returning early.
import warnings
930+
931+
from pandas.util._exceptions import find_stack_level
932+
warnings.warn(
933+
"In a future version of pandas, parsing datetimes with mixed time "
934+
"zones will raise an error unless `utc=True`. Please specify `utc=True` "
935+
"to opt in to the new behaviour and silence this warning. "
936+
"To create a `Series` with mixed offsets and `object` dtype, "
937+
"please use `apply` and `datetime.datetime.strptime`",
938+
FutureWarning,
939+
stacklevel=find_stack_level(),
940+
)
941+
942+
return result
943+
944+
740945
class TimeRE(_TimeRE):
741946
"""
742947
Handle conversion from format directives to regexes.

0 commit comments

Comments
 (0)