@@ -991,6 +991,8 @@ def splittype(url):
991
991
992
992
993
993
_typeprog = None
994
+ _control_char_re = None
995
+ _schemes_disallowing_control_chars = frozenset ({'http' , 'https' , 'ftp' })
994
996
def _splittype (url ):
995
997
"""splittype('type:opaquestring') --> 'type', 'opaquestring'."""
996
998
global _typeprog
@@ -1000,7 +1002,26 @@ def _splittype(url):
1000
1002
match = _typeprog .match (url )
1001
1003
if match :
1002
1004
scheme , data = match .groups ()
1003
- return scheme .lower (), data
1005
+ scheme = scheme .lower ()
1006
+ if scheme in _schemes_disallowing_control_chars :
1007
+ # Sanity check url data to avoid control characters.
1008
+ # https://bugs.python.org/issue14826
1009
+ # https://bugs.python.org/issue36276
1010
+ # The same control characters check was adopted by Golang in:
1011
+ # https://go-review.googlesource.com/c/go/+/159157
1012
+ # Isn't it odd to be performing validation within this utility
1013
+ # function? Yes... but it is in wide use in all of the right
1014
+ # places where URLs need a sanity check to avoid potential security
1015
+ # issues in newline delimited text based protocol implementations.
1016
+ # This way many things get it for free without every use needing to
1017
+ # be updated to explicitly sanity check the path contents.
1018
+ global _control_char_re
1019
+ if _control_char_re is None :
1020
+ _control_char_re = re .compile ('[\x00-\x1f\x7f-\x9f]' )
1021
+ if _control_char_re .search (data ):
1022
+ raise ValueError (f"{ scheme } URL can't contain control "
1023
+ f"characters. {data!r}" )
1024
+ return scheme , data
1004
1025
return None , url
1005
1026
1006
1027
0 commit comments