Skip to content

Commit dd82aa0

Browse files
committed
Refactor non-UTF-8 sanitization
This increases performance for valid non-UTF-8 strings by avoiding an error condition, and minimizes the impact on the rest of the algorithm.
1 parent 85a99ca commit dd82aa0

File tree

1 file changed

+47
-27
lines changed

1 file changed

+47
-27
lines changed

Modules/_datetimemodule.c

Lines changed: 47 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -4852,6 +4852,40 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw)
48524852
return result;
48534853
}
48544854

4855+
/* Normalize the date/time separator of an isoformat string.
 *
 * `fromisoformat` allows surrogate characters in exactly one position,
 * the separator (index 10); to allow datetime_fromisoformat to make the
 * simplifying assumption that all valid strings can be encoded in UTF-8,
 * this function replaces a surrogate separator with `T`.
 *
 * Parameters:
 *   dtstr        - the string to sanitize.
 *   needs_decref - out parameter; set to 1 when the returned object is a
 *                  newly created string that the caller must Py_DECREF,
 *                  and to 0 otherwise.
 *
 * Returns:
 *   - `dtstr` itself (no new reference, *needs_decref == 0) when the string
 *     has no character at index 10 or that character is not a surrogate;
 *   - a new string (*needs_decref == 1) with the separator replaced by `T`;
 *   - NULL (*needs_decref == 0) if any of the underlying calls fails.
 */
static PyObject *
_sanitize_isoformat_str(PyObject *dtstr, int *needs_decref)
{
    *needs_decref = 0;

    Py_ssize_t len = PyUnicode_GetLength(dtstr);
    if (len < 0) {
        // PyUnicode_GetLength failed; do not treat -1 as "short string"
        // and hand dtstr back as if nothing had gone wrong.
        return NULL;
    }

    if (len <= 10 ||
        !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
        // Nothing to replace: return the caller's own object unchanged.
        return dtstr;
    }

    // Rebuild the string as <first 10 chars> + "T" + <everything after the
    // separator>, dropping the surrogate at index 10.
    PyObject *left = PyUnicode_Substring(dtstr, 0, 10);
    if (left == NULL) {
        return NULL;
    }

    PyObject *right = PyUnicode_Substring(dtstr, 11, len);
    if (right == NULL) {
        Py_DECREF(left);
        return NULL;
    }

    PyObject *str_out = PyUnicode_FromFormat("%UT%U", left, right);
    Py_DECREF(left);
    Py_DECREF(right);
    if (str_out == NULL) {
        return NULL;
    }

    // The caller now owns a new reference and must release it.
    *needs_decref = 1;
    return str_out;
}
4888+
48554889
static PyObject *
48564890
datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
48574891
assert(dtstr != NULL);
@@ -4861,34 +4895,20 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
48614895
return NULL;
48624896
}
48634897

4864-
const PyObject * dtstr_bytes = NULL;
4865-
unsigned char bytes_needs_decref = 0;
4898+
int needs_decref = 0;
4899+
dtstr = _sanitize_isoformat_str(dtstr, &needs_decref);
4900+
if (dtstr == NULL) {
4901+
goto error;
4902+
}
48664903

48674904
Py_ssize_t len;
48684905
const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
48694906

48704907
if (dt_ptr == NULL) {
4871-
len = PyUnicode_GET_LENGTH(dtstr);
4872-
if (len == 10) {
4873-
goto invalid_string_error;
4874-
}
4875-
PyErr_Clear();
4876-
4877-
// If the datetime string cannot be encoded as UTF8 because the
4878-
// separator character is an invalid character, this could still
4879-
// be a valid isoformat, so we decode it and ignore.
4880-
dtstr_bytes = PyUnicode_AsEncodedString(dtstr, "ascii", "replace");
4881-
if (dtstr_bytes == NULL) {
4882-
goto finally;
4883-
}
4884-
bytes_needs_decref = 1;
4885-
dt_ptr = PyBytes_AS_STRING(dtstr_bytes);
4886-
if (dt_ptr == NULL) {
4887-
goto finally;
4888-
}
4908+
goto invalid_string_error;
48894909
}
48904910

4891-
const char * p = dt_ptr;
4911+
const char *p = dt_ptr;
48924912

48934913
int year = 0, month = 0, day = 0;
48944914
int hour = 0, minute = 0, second = 0, microsecond = 0;
@@ -4926,24 +4946,24 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
49264946

49274947
PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec);
49284948
if (tzinfo == NULL) {
4929-
goto finally;
4949+
goto error;
49304950
}
49314951

49324952
PyObject *dt = new_datetime_subclass_ex(year, month, day, hour, minute,
49334953
second, microsecond, tzinfo, cls);
49344954

49354955
Py_DECREF(tzinfo);
4936-
if (bytes_needs_decref) {
4937-
Py_DECREF(dtstr_bytes);
4956+
if (needs_decref) {
4957+
Py_DECREF(dtstr);
49384958
}
49394959
return dt;
49404960

49414961
invalid_string_error:
49424962
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr);
49434963

4944-
finally:
4945-
if (bytes_needs_decref) {
4946-
Py_DECREF(dtstr_bytes);
4964+
error:
4965+
if (needs_decref) {
4966+
Py_DECREF(dtstr);
49474967
}
49484968

49494969
return NULL;

0 commit comments

Comments
 (0)