Skip to content

[3.7] bpo-34454: fix .fromisoformat() methods crashing on inputs with surrogate code points (GH-8862) #8877

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 23, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion Lib/test/datetimetester.py
Original file line number Diff line number Diff line change
Expand Up @@ -1667,6 +1667,7 @@ def test_fromisoformat_fails(self):
# Test that fromisoformat() fails on invalid values
bad_strs = [
'', # Empty string
'\ud800', # bpo-34454: Surrogate code point
'009-03-04', # Not 10 characters
'123456789', # Not a date
'200a-12-04', # Invalid character in year
Expand All @@ -1675,6 +1676,7 @@ def test_fromisoformat_fails(self):
'2009-01-32', # Invalid day
'2009-02-29', # Invalid leap day
'20090228', # Valid ISO8601 output not from isoformat()
'2009\ud80002\ud80028', # Separators are surrogate codepoints
]

for bad_str in bad_strs:
Expand Down Expand Up @@ -2587,7 +2589,8 @@ def test_fromisoformat_separators(self):
' ', 'T', '\u007f', # 1-bit widths
'\u0080', 'ʁ', # 2-bit widths
'ᛇ', '時', # 3-bit widths
'🐍' # 4-bit widths
'🐍', # 4-bit widths
'\ud800', # bpo-34454: Surrogate code point
]

for sep in separators:
Expand Down Expand Up @@ -2639,6 +2642,7 @@ def test_fromisoformat_fails_datetime(self):
# Test that fromisoformat() fails on invalid values
bad_strs = [
'', # Empty string
'\ud800', # bpo-34454: Surrogate code point
'2009.04-19T03', # Wrong first separator
'2009-04.19T03', # Wrong second separator
'2009-04-19T0a', # Invalid hours
Expand All @@ -2652,6 +2656,8 @@ def test_fromisoformat_fails_datetime(self):
'2009-04-19T03:15:45.123456+24:30', # Invalid time zone offset
'2009-04-19T03:15:45.123456-24:30', # Invalid negative offset
'2009-04-10ᛇᛇᛇᛇᛇ12:15', # Too many unicode separators
'2009-04\ud80010T12:15', # Surrogate char in date
'2009-04-10T12\ud80015', # Surrogate char in time
'2009-04-19T1', # Incomplete hours
'2009-04-19T12:3', # Incomplete minutes
'2009-04-19T12:30:4', # Incomplete seconds
Expand Down Expand Up @@ -3521,6 +3527,7 @@ def test_fromisoformat_timespecs(self):
def test_fromisoformat_fails(self):
bad_strs = [
'', # Empty string
'12\ud80000', # Invalid separator - surrogate char
'12:', # Ends on a separator
'12:30:', # Ends on a separator
'12:30:15.', # Ends on a separator
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Fix the .fromisoformat() methods of datetime types crashing when given
unicode with non-UTF-8-encodable code points. Specifically,
datetime.fromisoformat() now accepts surrogate unicode code points used as
the separator. Report and tests by Alexey Izbyshev, patch by Paul Ganssle.
81 changes: 72 additions & 9 deletions Modules/_datetimemodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -2883,6 +2883,9 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
Py_ssize_t len;

const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
if (dt_ptr == NULL) {
goto invalid_string_error;
}

int year = 0, month = 0, day = 0;

Expand All @@ -2894,12 +2897,15 @@ date_fromisoformat(PyObject *cls, PyObject *dtstr) {
}

if (rv < 0) {
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s",
dt_ptr);
return NULL;
goto invalid_string_error;
}

return new_date_subclass_ex(year, month, day, cls);

invalid_string_error:
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R",
dtstr);
return NULL;
}


Expand Down Expand Up @@ -4258,15 +4264,18 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {
Py_ssize_t len;
const char *p = PyUnicode_AsUTF8AndSize(tstr, &len);

if (p == NULL) {
goto invalid_string_error;
}

int hour = 0, minute = 0, second = 0, microsecond = 0;
int tzoffset, tzimicrosecond = 0;
int rv = parse_isoformat_time(p, len,
&hour, &minute, &second, &microsecond,
&tzoffset, &tzimicrosecond);

if (rv < 0) {
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", p);
return NULL;
goto invalid_string_error;
}

PyObject *tzinfo = tzinfo_from_isoformat_results(rv, tzoffset,
Expand All @@ -4286,6 +4295,10 @@ time_fromisoformat(PyObject *cls, PyObject *tstr) {

Py_DECREF(tzinfo);
return t;

invalid_string_error:
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", tstr);
return NULL;
}


Expand Down Expand Up @@ -4839,6 +4852,33 @@ datetime_combine(PyObject *cls, PyObject *args, PyObject *kw)
return result;
}

static PyObject *
_sanitize_isoformat_str(PyObject *dtstr, int *needs_decref) {
// `fromisoformat` allows surrogate characters in exactly one position,
// the separator; to allow datetime_fromisoformat to make the simplifying
// assumption that all valid strings can be encoded in UTF-8, this function
// replaces any surrogate character separators with `T`.
Py_ssize_t len = PyUnicode_GetLength(dtstr);
*needs_decref = 0;
if (len <= 10 || !Py_UNICODE_IS_SURROGATE(PyUnicode_READ_CHAR(dtstr, 10))) {
return dtstr;
}

PyObject *str_out = PyUnicode_New(len, PyUnicode_MAX_CHAR_VALUE(dtstr));
if (str_out == NULL) {
return NULL;
}

if (PyUnicode_CopyCharacters(str_out, 0, dtstr, 0, len) == -1 ||
PyUnicode_WriteChar(str_out, 10, (Py_UCS4)'T')) {
Py_DECREF(str_out);
return NULL;
}

*needs_decref = 1;
return str_out;
}

static PyObject *
datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
assert(dtstr != NULL);
Expand All @@ -4848,9 +4888,20 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
return NULL;
}

int needs_decref = 0;
dtstr = _sanitize_isoformat_str(dtstr, &needs_decref);
if (dtstr == NULL) {
goto error;
}

Py_ssize_t len;
const char * dt_ptr = PyUnicode_AsUTF8AndSize(dtstr, &len);
const char * p = dt_ptr;

if (dt_ptr == NULL) {
goto invalid_string_error;
}

const char *p = dt_ptr;

int year = 0, month = 0, day = 0;
int hour = 0, minute = 0, second = 0, microsecond = 0;
Expand Down Expand Up @@ -4883,20 +4934,32 @@ datetime_fromisoformat(PyObject* cls, PyObject *dtstr) {
&tzoffset, &tzusec);
}
if (rv < 0) {
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %s", dt_ptr);
return NULL;
goto invalid_string_error;
}

PyObject* tzinfo = tzinfo_from_isoformat_results(rv, tzoffset, tzusec);
if (tzinfo == NULL) {
return NULL;
goto error;
}

PyObject *dt = new_datetime_subclass_ex(year, month, day, hour, minute,
second, microsecond, tzinfo, cls);

Py_DECREF(tzinfo);
if (needs_decref) {
Py_DECREF(dtstr);
}
return dt;

invalid_string_error:
PyErr_Format(PyExc_ValueError, "Invalid isoformat string: %R", dtstr);

error:
if (needs_decref) {
Py_DECREF(dtstr);
}

return NULL;
}


Expand Down