Skip to content

Commit e37ef41

Browse files
authored
bpo-36216: Add check for characters in netloc that normalize to separators (GH-12201)
1 parent 68041e0 commit e37ef41

File tree

4 files changed

+64
-0
lines changed

4 files changed

+64
-0
lines changed

Doc/library/urlparse.rst

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,12 +119,22 @@ The :mod:`urlparse` module defines the following functions:
119119
See section :ref:`urlparse-result-object` for more information on the result
120120
object.
121121

122+
Characters in the :attr:`netloc` attribute that decompose under NFKC
123+
normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
124+
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
125+
decomposed before parsing, or is not a Unicode string, no error will be
126+
raised.
127+
122128
.. versionchanged:: 2.5
123129
Added attributes to return value.
124130

125131
.. versionchanged:: 2.7
126132
Added IPv6 URL parsing capabilities.
127133

134+
.. versionchanged:: 2.7.17
135+
Characters that affect netloc parsing under NFKC normalization will
136+
now raise :exc:`ValueError`.
137+
128138

129139
.. function:: parse_qs(qs[, keep_blank_values[, strict_parsing[, max_num_fields]]])
130140

@@ -232,11 +242,21 @@ The :mod:`urlparse` module defines the following functions:
232242
See section :ref:`urlparse-result-object` for more information on the result
233243
object.
234244

245+
Characters in the :attr:`netloc` attribute that decompose under NFKC
246+
normalization (as used by the IDNA encoding) into any of ``/``, ``?``,
247+
``#``, ``@``, or ``:`` will raise a :exc:`ValueError`. If the URL is
248+
decomposed before parsing, or is not a Unicode string, no error will be
249+
raised.
250+
235251
.. versionadded:: 2.2
236252

237253
.. versionchanged:: 2.5
238254
Added attributes to return value.
239255

256+
.. versionchanged:: 2.7.17
257+
Characters that affect netloc parsing under NFKC normalization will
258+
now raise :exc:`ValueError`.
259+
240260

241261
.. function:: urlunsplit(parts)
242262

Lib/test/test_urlparse.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
from test import test_support
2+
import sys
3+
import unicodedata
24
import unittest
35
import urlparse
46

@@ -624,6 +626,28 @@ def test_portseparator(self):
624626
self.assertEqual(urlparse.urlparse("http://www.python.org:80"),
625627
('http','www.python.org:80','','','',''))
626628

629+
def test_urlsplit_normalization(self):
    # Certain characters should never occur in the netloc,
    # including under normalization.
    # Ensure that ALL of them are detected and cause an error
    illegal_chars = u'/:#?@'
    # unicodedata.decomposition() yields space-separated 4+ digit hex code
    # points, so compare against the separators in that same format.
    hex_chars = {'{:04X}'.format(ord(c)) for c in illegal_chars}
    # Every non-ASCII character whose decomposition mentions a separator.
    denorm_chars = [
        c for c in map(unichr, range(128, sys.maxunicode))
        if (hex_chars & set(unicodedata.decomposition(c).split()))
        and c not in illegal_chars
    ]
    # Sanity check that we found at least one such character
    self.assertIn(u'\u2100', denorm_chars)
    self.assertIn(u'\uFF03', denorm_chars)

    for scheme in [u"http", u"https", u"ftp"]:
        for c in denorm_chars:
            url = u"{}://netloc{}false.netloc/path".format(scheme, c)
            # Trace each URL only in verbose mode; printing unconditionally
            # floods the output of every regular test run.
            if test_support.verbose:
                print("Checking %r" % url)
            with self.assertRaises(ValueError):
                urlparse.urlsplit(url)
650+
627651
def test_main():
628652
test_support.run_unittest(UrlParseTestCase)
629653

Lib/urlparse.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,21 @@ def _splitnetloc(url, start=0):
165165
delim = min(delim, wdelim) # use earliest delim position
166166
return url[start:delim], url[delim:] # return (domain, rest)
167167

168+
def _checknetloc(netloc):
169+
if not netloc or not isinstance(netloc, unicode):
170+
return
171+
# looking for characters like \u2100 that expand to 'a/c'
172+
# IDNA uses NFKC equivalence, so normalize for this check
173+
import unicodedata
174+
netloc2 = unicodedata.normalize('NFKC', netloc)
175+
if netloc == netloc2:
176+
return
177+
_, _, netloc = netloc.rpartition('@') # anything to the left of '@' is okay
178+
for c in '/?#@:':
179+
if c in netloc2:
180+
raise ValueError("netloc '" + netloc2 + "' contains invalid " +
181+
"characters under NFKC normalization")
182+
168183
def urlsplit(url, scheme='', allow_fragments=True):
169184
"""Parse a URL into 5 components:
170185
<scheme>://<netloc>/<path>?<query>#<fragment>
@@ -193,6 +208,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
193208
url, fragment = url.split('#', 1)
194209
if '?' in url:
195210
url, query = url.split('?', 1)
211+
_checknetloc(netloc)
196212
v = SplitResult(scheme, netloc, url, query, fragment)
197213
_parse_cache[key] = v
198214
return v
@@ -216,6 +232,7 @@ def urlsplit(url, scheme='', allow_fragments=True):
216232
url, fragment = url.split('#', 1)
217233
if '?' in url:
218234
url, query = url.split('?', 1)
235+
_checknetloc(netloc)
219236
v = SplitResult(scheme, netloc, url, query, fragment)
220237
_parse_cache[key] = v
221238
return v
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
Changes urlsplit() to raise ValueError when the URL contains characters that
2+
decompose under IDNA encoding (NFKC-normalization) into characters that
3+
affect how the URL is parsed.

0 commit comments

Comments
 (0)