Skip to content

Commit c4e671e

Browse files
authored
bpo-30458: Disallow control chars in http URLs. (GH-12755)
Disallow control chars in http URLs in urllib.urlopen. This addresses a potential security problem for applications that do not sanity check their URLs where http request headers could be injected.
1 parent 5f38b84 commit c4e671e

File tree

4 files changed

+71
-2
lines changed

4 files changed

+71
-2
lines changed

Lib/http/client.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,16 @@
137137
_is_legal_header_name = re.compile(rb'[^:\s][^:\r\n]*').fullmatch
138138
_is_illegal_header_value = re.compile(rb'\n(?![ \t])|\r(?![ \t\n])').search
139139

140+
# These characters are not allowed within HTTP URL paths.
141+
# See https://tools.ietf.org/html/rfc3986#section-3.3 and the
142+
# https://tools.ietf.org/html/rfc3986#appendix-A pchar definition.
143+
# Prevents CVE-2019-9740. Includes control characters such as \r\n.
144+
# We don't restrict chars above \x7f as putrequest() limits us to ASCII.
145+
_contains_disallowed_url_pchar_re = re.compile('[\x00-\x20\x7f]')
146+
# Arguably only these _should_ allowed:
147+
# _is_allowed_url_pchars_re = re.compile(r"^[/!$&'()*+,;=:@%a-zA-Z0-9._~-]+$")
148+
# We are more lenient for assumed real world compatibility purposes.
149+
140150
# We always set the Content-Length header for these methods because some
141151
# servers will otherwise respond with a 411
142152
_METHODS_EXPECTING_BODY = {'PATCH', 'POST', 'PUT'}
@@ -1079,6 +1089,10 @@ def putrequest(self, method, url, skip_host=False,
10791089
self._method = method
10801090
if not url:
10811091
url = '/'
1092+
# Prevent CVE-2019-9740.
1093+
if match := _contains_disallowed_url_pchar_re.search(url):
1094+
raise ValueError(f"URL can't contain control characters. {url!r} "
1095+
f"(found at least {match.group()!r})")
10821096
request = '%s %s %s' % (method, url, self._http_vsn_str)
10831097

10841098
# Non-ASCII characters should have been eliminated earlier

Lib/test/test_urllib.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,55 @@ def test_willclose(self):
329329
finally:
330330
self.unfakehttp()
331331

332+
def test_url_with_control_char_rejected(self):
333+
for char_no in list(range(0, 0x21)) + [0x7f]:
334+
char = chr(char_no)
335+
schemeless_url = f"//localhost:7777/test{char}/"
336+
self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.")
337+
try:
338+
# We explicitly test urllib.request.urlopen() instead of the top
339+
# level 'def urlopen()' function defined in this... (quite ugly)
340+
# test suite. They use different url opening codepaths. Plain
341+
# urlopen uses FancyURLOpener which goes via a codepath that
342+
# calls urllib.parse.quote() on the URL which makes all of the
343+
# above attempts at injection within the url _path_ safe.
344+
escaped_char_repr = repr(char).replace('\\', r'\\')
345+
with self.assertRaisesRegex(
346+
ValueError, f"contain control.*{escaped_char_repr}"):
347+
urllib.request.urlopen(f"http:{schemeless_url}")
348+
with self.assertRaisesRegex(
349+
ValueError, f"contain control.*{escaped_char_repr}"):
350+
urllib.request.urlopen(f"https:{schemeless_url}")
351+
# This code path quotes the URL so there is no injection.
352+
resp = urlopen(f"http:{schemeless_url}")
353+
self.assertNotIn(char, resp.geturl())
354+
finally:
355+
self.unfakehttp()
356+
357+
def test_url_with_newline_header_injection_rejected(self):
358+
self.fakehttp(b"HTTP/1.1 200 OK\r\n\r\nHello.")
359+
host = "localhost:7777?a=1 HTTP/1.1\r\nX-injected: header\r\nTEST: 123"
360+
schemeless_url = "//" + host + ":8080/test/?test=a"
361+
try:
362+
# We explicitly test urllib.request.urlopen() instead of the top
363+
# level 'def urlopen()' function defined in this... (quite ugly)
364+
# test suite. They use different url opening codepaths. Plain
365+
# urlopen uses FancyURLOpener which goes via a codepath that
366+
# calls urllib.parse.quote() on the URL which makes all of the
367+
# above attempts at injection within the url _path_ safe.
368+
with self.assertRaisesRegex(
369+
ValueError, r"contain control.*\\r.*(found at least . .)"):
370+
urllib.request.urlopen(f"http:{schemeless_url}")
371+
with self.assertRaisesRegex(ValueError, r"contain control.*\\n"):
372+
urllib.request.urlopen(f"https:{schemeless_url}")
373+
# This code path quotes the URL so there is no injection.
374+
resp = urlopen(f"http:{schemeless_url}")
375+
self.assertNotIn(' ', resp.geturl())
376+
self.assertNotIn('\r', resp.geturl())
377+
self.assertNotIn('\n', resp.geturl())
378+
finally:
379+
self.unfakehttp()
380+
332381
def test_read_0_9(self):
333382
# "0.9" response accepted (but not "simple responses" without
334383
# a status line)

Lib/test/test_xmlrpc.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -943,8 +943,13 @@ def test_unicode_host(self):
943943

944944
def test_partial_post(self):
945945
# Check that a partial POST doesn't make the server loop: issue #14001.
946-
with contextlib.closing(http.client.HTTPConnection(ADDR, PORT)) as conn:
947-
conn.request('POST', '/RPC2 HTTP/1.0\r\nContent-Length: 100\r\n\r\nbye')
946+
with contextlib.closing(socket.create_connection((ADDR, PORT))) as conn:
947+
conn.send('POST /RPC2 HTTP/1.0\r\n'
948+
'Content-Length: 100\r\n\r\n'
949+
'bye HTTP/1.1\r\n'
950+
f'Host: {ADDR}:{PORT}\r\n'
951+
'Accept-Encoding: identity\r\n'
952+
'Content-Length: 0\r\n\r\n'.encode('ascii'))
948953

949954
def test_context_manager(self):
950955
with xmlrpclib.ServerProxy(URL) as server:
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Address CVE-2019-9740 by disallowing URL paths with embedded whitespace or control characters through into the underlying http client request. Such potentially malicious header injection URLs now cause a ValueError to be raised.

0 commit comments

Comments
 (0)