bpo-28595: Allow shlex whitespace_split with punctuation_chars (GH-2071)

evanunderscore · vsajip · commit 56624a99a916 · 2019-06-01T20:09:22.000+01:00
diff --git a/Doc/library/shlex.rst b/Doc/library/shlex.rst
@@ -225,7 +225,8 @@ variables which either control lexical analysis or can be used for debugging:
    appear in filename specifications and command line parameters, will also be
    included in this attribute, and any characters which appear in
    ``punctuation_chars`` will be removed from ``wordchars`` if they are present
-   there.
+   there. If :attr:`whitespace_split` is set to ``True``, this attribute will
+   have no effect.
 
 
 .. attribute:: shlex.whitespace
@@ -258,11 +259,13 @@ variables which either control lexical analysis or can be used for debugging:
 
    If ``True``, tokens will only be split in whitespaces.  This is useful, for
    example, for parsing command lines with :class:`~shlex.shlex`, getting
-   tokens in a similar way to shell arguments.  If this attribute is ``True``,
-   :attr:`punctuation_chars` will have no effect, and splitting will happen
-   only on whitespaces.  When using :attr:`punctuation_chars`, which is
-   intended to provide parsing closer to that implemented by shells, it is
-   advisable to leave ``whitespace_split`` as ``False`` (the default value).
+   tokens in a similar way to shell arguments.  When used in combination with
+   :attr:`punctuation_chars`, tokens will be split on whitespace in addition to
+   those characters.
+
+   .. versionchanged:: 3.8
+      The :attr:`punctuation_chars` attribute was made compatible with the
+      :attr:`whitespace_split` attribute.
 
 
 .. attribute:: shlex.infile
@@ -398,12 +401,15 @@ otherwise.  To illustrate, you can see the difference in the following snippet:
 
     >>> import shlex
     >>> text = "a && b; c && d || e; f >'abc'; (def \"ghi\")"
-    >>> list(shlex.shlex(text))
-    ['a', '&', '&', 'b', ';', 'c', '&', '&', 'd', '|', '|', 'e', ';', 'f', '>',
-    "'abc'", ';', '(', 'def', '"ghi"', ')']
-    >>> list(shlex.shlex(text, punctuation_chars=True))
-    ['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', "'abc'",
-    ';', '(', 'def', '"ghi"', ')']
+    >>> s = shlex.shlex(text, posix=True)
+    >>> s.whitespace_split = True
+    >>> list(s)
+    ['a', '&&', 'b;', 'c', '&&', 'd', '||', 'e;', 'f', '>abc;', '(def', 'ghi)']
+    >>> s = shlex.shlex(text, posix=True, punctuation_chars=True)
+    >>> s.whitespace_split = True
+    >>> list(s)
+    ['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', 'abc', ';',
+    '(', 'def', 'ghi', ')']
 
 Of course, tokens will be returned which are not valid for shells, and you'll
 need to implement your own error checks on the returned tokens.
@@ -428,6 +434,11 @@ which characters constitute punctuation. For example::
       >>> list(s)
       ['~/a', '&&', 'b-c', '--color=auto', '||', 'd', '*.py?']
 
+   However, to match the shell as closely as possible, it is recommended to
+   always use ``posix`` and :attr:`~shlex.whitespace_split` when using
+   :attr:`~shlex.punctuation_chars`; setting ``whitespace_split`` makes
+   :attr:`~shlex.wordchars` have no effect.
+
 For best effect, ``punctuation_chars`` should be set in conjunction with
 ``posix=True``. (Note that ``posix=False`` is the default for
 :class:`~shlex.shlex`.)
diff --git a/Lib/shlex.py b/Lib/shlex.py
@@ -246,7 +246,8 @@ def read_token(self):
                     escapedstate = 'a'
                     self.state = nextchar
                 elif (nextchar in self.wordchars or nextchar in self.quotes
-                      or self.whitespace_split):
+                      or (self.whitespace_split and
+                          nextchar not in self.punctuation_chars)):
                     self.token += nextchar
                 else:
                     if self.punctuation_chars:
diff --git a/Lib/test/test_shlex.py b/Lib/test/test_shlex.py
@@ -1,4 +1,5 @@
 import io
+import itertools
 import shlex
 import string
 import unittest
@@ -183,10 +184,12 @@ def testSyntaxSplitAmpersandAndPipe(self):
             src = ['echo hi %s echo bye' % delimiter,
                    'echo hi%secho bye' % delimiter]
             ref = ['echo', 'hi', delimiter, 'echo', 'bye']
-            for ss in src:
+            for ss, ws in itertools.product(src, (False, True)):
                 s = shlex.shlex(ss, punctuation_chars=True)
+                s.whitespace_split = ws
                 result = list(s)
-                self.assertEqual(ref, result, "While splitting '%s'" % ss)
+                self.assertEqual(ref, result,
+                                 "While splitting '%s' [ws=%s]" % (ss, ws))
 
     def testSyntaxSplitSemicolon(self):
         """Test handling of syntax splitting of ;"""
@@ -197,10 +200,12 @@ def testSyntaxSplitSemicolon(self):
                    'echo hi%s echo bye' % delimiter,
                    'echo hi%secho bye' % delimiter]
             ref = ['echo', 'hi', delimiter, 'echo', 'bye']
-            for ss in src:
+            for ss, ws in itertools.product(src, (False, True)):
                 s = shlex.shlex(ss, punctuation_chars=True)
+                s.whitespace_split = ws
                 result = list(s)
-                self.assertEqual(ref, result, "While splitting '%s'" % ss)
+                self.assertEqual(ref, result,
+                                 "While splitting '%s' [ws=%s]" % (ss, ws))
 
     def testSyntaxSplitRedirect(self):
         """Test handling of syntax splitting of >"""
@@ -211,29 +216,38 @@ def testSyntaxSplitRedirect(self):
                    'echo hi%s out' % delimiter,
                    'echo hi%sout' % delimiter]
             ref = ['echo', 'hi', delimiter, 'out']
-            for ss in src:
+            for ss, ws in itertools.product(src, (False, True)):
                 s = shlex.shlex(ss, punctuation_chars=True)
+                s.whitespace_split = ws
                 result = list(s)
-                self.assertEqual(ref, result, "While splitting '%s'" % ss)
+                self.assertEqual(ref, result,
+                                 "While splitting '%s' [ws=%s]" % (ss, ws))
 
     def testSyntaxSplitParen(self):
         """Test handling of syntax splitting of ()"""
         # these should all parse to the same output
         src = ['( echo hi )',
                '(echo hi)']
         ref = ['(', 'echo', 'hi', ')']
-        for ss in src:
+        for ss, ws in itertools.product(src, (False, True)):
             s = shlex.shlex(ss, punctuation_chars=True)
+            s.whitespace_split = ws
             result = list(s)
-            self.assertEqual(ref, result, "While splitting '%s'" % ss)
+            self.assertEqual(ref, result,
+                             "While splitting '%s' [ws=%s]" % (ss, ws))
 
     def testSyntaxSplitCustom(self):
         """Test handling of syntax splitting with custom chars"""
+        ss = "~/a&&b-c --color=auto||d *.py?"
         ref = ['~/a', '&', '&', 'b-c', '--color=auto', '||', 'd', '*.py?']
-        ss = "~/a && b-c --color=auto || d *.py?"
         s = shlex.shlex(ss, punctuation_chars="|")
         result = list(s)
-        self.assertEqual(ref, result, "While splitting '%s'" % ss)
+        self.assertEqual(ref, result, "While splitting '%s' [ws=False]" % ss)
+        ref = ['~/a&&b-c', '--color=auto', '||', 'd', '*.py?']
+        s = shlex.shlex(ss, punctuation_chars="|")
+        s.whitespace_split = True
+        result = list(s)
+        self.assertEqual(ref, result, "While splitting '%s' [ws=True]" % ss)
 
     def testTokenTypes(self):
         """Test that tokens are split with types as expected."""
@@ -293,6 +306,19 @@ def testEmptyStringHandling(self):
         s = shlex.shlex("'')abc", punctuation_chars=True)
         self.assertEqual(list(s), expected)
 
+    def testUnicodeHandling(self):
+        """Test punctuation_chars and whitespace_split handle unicode."""
+        ss = "\u2119\u01b4\u2602\u210c\u00f8\u1f24"
+        # Should be parsed as one complete token (whitespace_split=True).
+        ref = ['\u2119\u01b4\u2602\u210c\u00f8\u1f24']
+        s = shlex.shlex(ss, punctuation_chars=True)
+        s.whitespace_split = True
+        self.assertEqual(list(s), ref)
+        # Without whitespace_split, every char outside wordchars is a token.
+        ref = ['\u2119', '\u01b4', '\u2602', '\u210c', '\u00f8', '\u1f24']
+        s = shlex.shlex(ss, punctuation_chars=True)
+        self.assertEqual(list(s), ref)
+
     def testQuote(self):
         safeunquoted = string.ascii_letters + string.digits + '@%_-+=:,./'
         unicode_sample = '\xe9\xe0\xdf'  # e + acute accent, a + grave, sharp s