Skip to content

Commit bba966c

Browse files
committed
Move validation to _splittype & correct boundary.
1 parent 97bcc4b commit bba966c

File tree

2 files changed

+22
-9
lines changed

2 files changed

+22
-9
lines changed

Lib/urllib/parse.py

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -991,6 +991,8 @@ def splittype(url):
991991

992992

993993
_typeprog = None
994+
_control_char_re = None
995+
_schemes_disallowing_control_chars = frozenset({'http', 'https', 'ftp'})
994996
def _splittype(url):
995997
"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
996998
global _typeprog
@@ -1000,7 +1002,26 @@ def _splittype(url):
10001002
match = _typeprog.match(url)
10011003
if match:
10021004
scheme, data = match.groups()
1003-
return scheme.lower(), data
1005+
scheme = scheme.lower()
1006+
if scheme in _schemes_disallowing_control_chars:
1007+
# Sanity check url data to avoid control characters.
1008+
# https://bugs.python.org/issue14826
1009+
# https://bugs.python.org/issue36276
1010+
# The same control characters check was adopted by Golang in:
1011+
# https://go-review.googlesource.com/c/go/+/159157
1012+
# Isn't it odd to be performing validation within this utility
1013+
# function? Yes... but it is in wide use in all of the right
1014+
# places where URLs need a sanity check to avoid potential security
1015+
# issues in newline-delimited text-based protocol implementations.
1016+
# This way many things get it for free without every use needing to
1017+
# be updated to explicitly sanity check the path contents.
1018+
global _control_char_re
1019+
if _control_char_re is None:
1020+
_control_char_re = re.compile('[\x00-\x1f\x7f-\x9f]')
1021+
if _control_char_re.search(data):
1022+
raise ValueError(f"{scheme} URL can't contain control "
1023+
f"characters. {data!r}")
1024+
return scheme, data
10041025
return None, url
10051026

10061027

Lib/urllib/request.py

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -350,14 +350,6 @@ def full_url(self):
350350
def full_url(self, url):
351351
# unwrap('<URL:type://host/path>') --> 'type://host/path'
352352
self._full_url = _unwrap(url)
353-
# Sanity check self._full_url to avoid control characters in HTTP.
354-
# https://bugs.python.org/issue14826
355-
# https://bugs.python.org/issue36276
356-
# The same control characters check was adopted by Golang in:
357-
# https://go-review.googlesource.com/c/go/+/159157
358-
if (self._full_url.startswith('http') and
359-
re.search("[\x00- \x7f-\x9f]", self._full_url)):
360-
raise ValueError("URL can't contain control characters. %r" % (self._full_url,))
361353
self._full_url, self.fragment = _splittag(self._full_url)
362354
self._parse()
363355

0 commit comments

Comments
 (0)