Skip to content

Commit 4899d84

Browse files
authored
bpo-30500: urllib: Simplify splithost by calling into urlparse. (#1849) (#2290)
The current regex based splitting produces a wrong result. For example:: http://abc#@def Web browsers parse that URL as ``http://abc/#@def``, that is, the host is ``abc``, the path is ``/``, and the fragment is ``#@def``. (cherry picked from commit 90e01e5)
1 parent 070ba85 commit 4899d84

File tree

4 files changed

+46
-13
lines changed

4 files changed

+46
-13
lines changed

Lib/test/test_urlparse.py

Lines changed: 39 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -757,28 +757,35 @@ def test_default_scheme(self):
757757
def test_parse_fragments(self):
758758
# Exercise the allow_fragments parameter of urlparse() and urlsplit()
759759
tests = (
760-
("http:#frag", "path"),
761-
("//example.net#frag", "path"),
762-
("index.html#frag", "path"),
763-
(";a=b#frag", "params"),
764-
("?a=b#frag", "query"),
765-
("#frag", "path"),
760+
("http:#frag", "path", "frag"),
761+
("//example.net#frag", "path", "frag"),
762+
("index.html#frag", "path", "frag"),
763+
(";a=b#frag", "params", "frag"),
764+
("?a=b#frag", "query", "frag"),
765+
("#frag", "path", "frag"),
766+
("abc#@frag", "path", "@frag"),
767+
("//abc#@frag", "path", "@frag"),
768+
("//abc:80#@frag", "path", "@frag"),
769+
("//abc#@frag:80", "path", "@frag:80"),
766770
)
767-
for url, attr in tests:
771+
for url, attr, expected_frag in tests:
768772
for func in (urllib.parse.urlparse, urllib.parse.urlsplit):
769773
if attr == "params" and func is urllib.parse.urlsplit:
770774
attr = "path"
771775
with self.subTest(url=url, function=func):
772776
result = func(url, allow_fragments=False)
773777
self.assertEqual(result.fragment, "")
774-
self.assertTrue(getattr(result, attr).endswith("#frag"))
778+
self.assertTrue(
779+
getattr(result, attr).endswith("#" + expected_frag))
775780
self.assertEqual(func(url, "", False).fragment, "")
776781

777782
result = func(url, allow_fragments=True)
778-
self.assertEqual(result.fragment, "frag")
779-
self.assertFalse(getattr(result, attr).endswith("frag"))
780-
self.assertEqual(func(url, "", True).fragment, "frag")
781-
self.assertEqual(func(url).fragment, "frag")
783+
self.assertEqual(result.fragment, expected_frag)
784+
self.assertFalse(
785+
getattr(result, attr).endswith(expected_frag))
786+
self.assertEqual(func(url, "", True).fragment,
787+
expected_frag)
788+
self.assertEqual(func(url).fragment, expected_frag)
782789

783790
def test_mixed_types_rejected(self):
784791
# Several functions that process either strings or ASCII encoded bytes
@@ -985,6 +992,26 @@ def test_splithost(self):
985992
self.assertEqual(splithost('/foo/bar/baz.html'),
986993
(None, '/foo/bar/baz.html'))
987994

995+
# bpo-30500: # starts a fragment.
996+
self.assertEqual(splithost('//127.0.0.1#@host.com'),
997+
('127.0.0.1', '/#@host.com'))
998+
self.assertEqual(splithost('//127.0.0.1#@host.com:80'),
999+
('127.0.0.1', '/#@host.com:80'))
1000+
self.assertEqual(splithost('//127.0.0.1:80#@host.com'),
1001+
('127.0.0.1:80', '/#@host.com'))
1002+
1003+
# Empty host is returned as empty string.
1004+
self.assertEqual(splithost("///file"),
1005+
('', '/file'))
1006+
1007+
# Trailing semicolon, question mark and hash symbol are kept.
1008+
self.assertEqual(splithost("//example.net/file;"),
1009+
('example.net', '/file;'))
1010+
self.assertEqual(splithost("//example.net/file?"),
1011+
('example.net', '/file?'))
1012+
self.assertEqual(splithost("//example.net/file#"),
1013+
('example.net', '/file#'))
1014+
9881015
def test_splituser(self):
9891016
splituser = urllib.parse.splituser
9901017
self.assertEqual(splituser('User:[email protected]:080'),

Lib/urllib/parse.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -882,7 +882,7 @@ def splithost(url):
882882
"""splithost('//host[:port]/path') --> 'host[:port]', '/path'."""
883883
global _hostprog
884884
if _hostprog is None:
885-
_hostprog = re.compile('//([^/?]*)(.*)', re.DOTALL)
885+
_hostprog = re.compile('//([^/#?]*)(.*)', re.DOTALL)
886886

887887
match = _hostprog.match(url)
888888
if match:

Misc/ACKS

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1060,6 +1060,7 @@ Max Neunhöffer
10601060
Anthon van der Neut
10611061
George Neville-Neil
10621062
Hieu Nguyen
1063+
Nam Nguyen
10631064
Johannes Nicolai
10641065
Samuel Nicolary
10651066
Jonathan Niehof

Misc/NEWS

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,11 @@ Extension Modules
5959
Library
6060
-------
6161

62+
- [Security] bpo-30500: Fix urllib.parse.splithost() to correctly parse
63+
fragments. For example, ``splithost('http://127.0.0.1#@evil.com/')`` now
64+
correctly returns the ``127.0.0.1`` host, instead of treating ``@evil.com``
65+
as the host in an authentification (``login@host``).
66+
6267
- bpo-23890: unittest.TestCase.assertRaises() now manually breaks a reference
6368
cycle to not keep objects alive longer than expected.
6469

0 commit comments

Comments
 (0)