Skip to content

Commit 90e01e5

Browse files
postmasters authored and vstinner committed
urllib: Simplify splithost by calling into urlparse. (#1849)
The current regex-based splitting produces a wrong result. For example, given the URL ``http://abc#@def``, web browsers parse it as ``http://abc/#@def``: the host is ``abc``, the path is ``/``, and the fragment is ``#@def``.
1 parent 5cc7ac2 commit 90e01e5

File tree

4 files changed

+47
-14
lines changed

4 files changed

+47
-14
lines changed

Lib/test/test_urlparse.py

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -755,28 +755,35 @@ def test_default_scheme(self):
755755
def test_parse_fragments(self):
756756
# Exercise the allow_fragments parameter of urlparse() and urlsplit()
757757
tests = (
758-
("http:#frag", "path"),
759-
("//example.net#frag", "path"),
760-
("index.html#frag", "path"),
761-
(";a=b#frag", "params"),
762-
("?a=b#frag", "query"),
763-
("#frag", "path"),
758+
("http:#frag", "path", "frag"),
759+
("//example.net#frag", "path", "frag"),
760+
("index.html#frag", "path", "frag"),
761+
(";a=b#frag", "params", "frag"),
762+
("?a=b#frag", "query", "frag"),
763+
("#frag", "path", "frag"),
764+
("abc#@frag", "path", "@frag"),
765+
("//abc#@frag", "path", "@frag"),
766+
("//abc:80#@frag", "path", "@frag"),
767+
("//abc#@frag:80", "path", "@frag:80"),
764768
)
765-
for url, attr in tests:
769+
for url, attr, expected_frag in tests:
766770
for func in (urllib.parse.urlparse, urllib.parse.urlsplit):
767771
if attr == "params" and func is urllib.parse.urlsplit:
768772
attr = "path"
769773
with self.subTest(url=url, function=func):
770774
result = func(url, allow_fragments=False)
771775
self.assertEqual(result.fragment, "")
772-
self.assertTrue(getattr(result, attr).endswith("#frag"))
776+
self.assertTrue(
777+
getattr(result, attr).endswith("#" + expected_frag))
773778
self.assertEqual(func(url, "", False).fragment, "")
774779

775780
result = func(url, allow_fragments=True)
776-
self.assertEqual(result.fragment, "frag")
777-
self.assertFalse(getattr(result, attr).endswith("frag"))
778-
self.assertEqual(func(url, "", True).fragment, "frag")
779-
self.assertEqual(func(url).fragment, "frag")
781+
self.assertEqual(result.fragment, expected_frag)
782+
self.assertFalse(
783+
getattr(result, attr).endswith(expected_frag))
784+
self.assertEqual(func(url, "", True).fragment,
785+
expected_frag)
786+
self.assertEqual(func(url).fragment, expected_frag)
780787

781788
def test_mixed_types_rejected(self):
782789
# Several functions that process either strings or ASCII encoded bytes
@@ -983,6 +990,26 @@ def test_splithost(self):
983990
self.assertEqual(splithost('/foo/bar/baz.html'),
984991
(None, '/foo/bar/baz.html'))
985992

993+
# bpo-30500: # starts a fragment.
994+
self.assertEqual(splithost('//127.0.0.1#@host.com'),
995+
('127.0.0.1', '/#@host.com'))
996+
self.assertEqual(splithost('//127.0.0.1#@host.com:80'),
997+
('127.0.0.1', '/#@host.com:80'))
998+
self.assertEqual(splithost('//127.0.0.1:80#@host.com'),
999+
('127.0.0.1:80', '/#@host.com'))
1000+
1001+
# Empty host is returned as empty string.
1002+
self.assertEqual(splithost("///file"),
1003+
('', '/file'))
1004+
1005+
# Trailing semicolon, question mark and hash symbol are kept.
1006+
self.assertEqual(splithost("//example.net/file;"),
1007+
('example.net', '/file;'))
1008+
self.assertEqual(splithost("//example.net/file?"),
1009+
('example.net', '/file?'))
1010+
self.assertEqual(splithost("//example.net/file#"),
1011+
('example.net', '/file#'))
1012+
9861013
def test_splituser(self):
9871014
splituser = urllib.parse.splituser
9881015
self.assertEqual(splituser('User:Pass@www.python.org:080'),

Lib/urllib/parse.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -947,7 +947,7 @@ def splithost(url):
947947
"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
948948
global _hostprog
949949
if _hostprog is None:
950-
_hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL)
950+
_hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)
951951

952952
match = _hostprog.match(url)
953953
if match:

Misc/ACKS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1091,6 +1091,7 @@ Max Neunhöffer
10911091
Anthon van der Neut
10921092
George Neville-Neil
10931093
Hieu Nguyen
1094+
Nam Nguyen
10941095
Johannes Nicolai
10951096
Samuel Nicolary
10961097
Jonathan Niehof

Misc/NEWS

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ Core and Builtins
1212

1313
- bpo-30682: Removed a too-strict assertion that failed for certain f-strings,
1414
such as eval("f'\\\n'") and eval("f'\\\r'").
15-
15+
1616
- bpo-30501: The compiler now produces more optimal code for complex condition
1717
expressions in the "if", "while" and "assert" statement, the "if" expression,
1818
and generator expressions and comprehensions.
@@ -365,6 +365,11 @@ Extension Modules
365365
Library
366366
-------
367367

368+
- [Security] bpo-30500: Fix urllib.parse.splithost() to correctly parse
369+
fragments. For example, ``splithost('http://127.0.0.1#@evil.com/')`` now
370+
correctly returns the ``127.0.0.1`` host, instead of treating ``@evil.com``
371+
as the host in an authentication (``login@host``).
372+
368373
- bpo-30038: Fix race condition between signal delivery and wakeup file
369374
descriptor. Patch by Nathaniel Smith.
370375

0 commit comments

Comments
 (0)