Skip to content

Commit 56624a9

Browse files
evanunderscore authored and
vsajip committed
bpo-28595: Allow shlex whitespace_split with punctuation_chars (GH-2071)
1 parent 2b843ac commit 56624a9

File tree

3 files changed

+61
-23
lines changed

3 files changed

+61
-23
lines changed

Doc/library/shlex.rst

Lines changed: 23 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -225,7 +225,8 @@ variables which either control lexical analysis or can be used for debugging:
225225
appear in filename specifications and command line parameters, will also be
226226
included in this attribute, and any characters which appear in
227227
``punctuation_chars`` will be removed from ``wordchars`` if they are present
228-
there.
228+
there. If :attr:`whitespace_split` is set to ``True``, this will have no
229+
effect.
229230

230231

231232
.. attribute:: shlex.whitespace
@@ -258,11 +259,13 @@ variables which either control lexical analysis or can be used for debugging:
258259

259260
If ``True``, tokens will only be split in whitespaces. This is useful, for
260261
example, for parsing command lines with :class:`~shlex.shlex`, getting
261-
tokens in a similar way to shell arguments. If this attribute is ``True``,
262-
:attr:`punctuation_chars` will have no effect, and splitting will happen
263-
only on whitespaces. When using :attr:`punctuation_chars`, which is
264-
intended to provide parsing closer to that implemented by shells, it is
265-
advisable to leave ``whitespace_split`` as ``False`` (the default value).
262+
tokens in a similar way to shell arguments. When used in combination with
263+
:attr:`punctuation_chars`, tokens will be split on whitespace in addition to
264+
those characters.
265+
266+
.. versionchanged:: 3.8
267+
The :attr:`punctuation_chars` attribute was made compatible with the
268+
:attr:`whitespace_split` attribute.
266269

267270

268271
.. attribute:: shlex.infile
@@ -398,12 +401,15 @@ otherwise. To illustrate, you can see the difference in the following snippet:
398401

399402
>>> import shlex
400403
>>> text = "a && b; c && d || e; f >'abc'; (def \"ghi\")"
401-
>>> list(shlex.shlex(text))
402-
['a', '&', '&', 'b', ';', 'c', '&', '&', 'd', '|', '|', 'e', ';', 'f', '>',
403-
"'abc'", ';', '(', 'def', '"ghi"', ')']
404-
>>> list(shlex.shlex(text, punctuation_chars=True))
405-
['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', "'abc'",
406-
';', '(', 'def', '"ghi"', ')']
404+
>>> s = shlex.shlex(text, posix=True)
405+
>>> s.whitespace_split = True
406+
>>> list(s)
407+
['a', '&&', 'b;', 'c', '&&', 'd', '||', 'e;', 'f', '>abc;', '(def', 'ghi)']
408+
>>> s = shlex.shlex(text, posix=True, punctuation_chars=True)
409+
>>> s.whitespace_split = True
410+
>>> list(s)
411+
['a', '&&', 'b', ';', 'c', '&&', 'd', '||', 'e', ';', 'f', '>', 'abc', ';',
412+
'(', 'def', 'ghi', ')']
407413

408414
Of course, tokens will be returned which are not valid for shells, and you'll
409415
need to implement your own error checks on the returned tokens.
@@ -428,6 +434,11 @@ which characters constitute punctuation. For example::
428434
>>> list(s)
429435
['~/a', '&&', 'b-c', '--color=auto', '||', 'd', '*.py?']
430436

437+
However, to match the shell as closely as possible, it is recommended to
438+
always use ``posix`` and :attr:`~shlex.whitespace_split` when using
439+
:attr:`~shlex.punctuation_chars`, which will negate
440+
:attr:`~shlex.wordchars` entirely.
441+
431442
For best effect, ``punctuation_chars`` should be set in conjunction with
432443
``posix=True``. (Note that ``posix=False`` is the default for
433444
:class:`~shlex.shlex`.)

Lib/shlex.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,8 @@ def read_token(self):
246246
escapedstate = 'a'
247247
self.state = nextchar
248248
elif (nextchar in self.wordchars or nextchar in self.quotes
249-
or self.whitespace_split):
249+
or (self.whitespace_split and
250+
nextchar not in self.punctuation_chars)):
250251
self.token += nextchar
251252
else:
252253
if self.punctuation_chars:

Lib/test/test_shlex.py

Lines changed: 36 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import io
2+
import itertools
23
import shlex
34
import string
45
import unittest
@@ -183,10 +184,12 @@ def testSyntaxSplitAmpersandAndPipe(self):
183184
src = ['echo hi %s echo bye' % delimiter,
184185
'echo hi%secho bye' % delimiter]
185186
ref = ['echo', 'hi', delimiter, 'echo', 'bye']
186-
for ss in src:
187+
for ss, ws in itertools.product(src, (False, True)):
187188
s = shlex.shlex(ss, punctuation_chars=True)
189+
s.whitespace_split = ws
188190
result = list(s)
189-
self.assertEqual(ref, result, "While splitting '%s'" % ss)
191+
self.assertEqual(ref, result,
192+
"While splitting '%s' [ws=%s]" % (ss, ws))
190193

191194
def testSyntaxSplitSemicolon(self):
192195
"""Test handling of syntax splitting of ;"""
@@ -197,10 +200,12 @@ def testSyntaxSplitSemicolon(self):
197200
'echo hi%s echo bye' % delimiter,
198201
'echo hi%secho bye' % delimiter]
199202
ref = ['echo', 'hi', delimiter, 'echo', 'bye']
200-
for ss in src:
203+
for ss, ws in itertools.product(src, (False, True)):
201204
s = shlex.shlex(ss, punctuation_chars=True)
205+
s.whitespace_split = ws
202206
result = list(s)
203-
self.assertEqual(ref, result, "While splitting '%s'" % ss)
207+
self.assertEqual(ref, result,
208+
"While splitting '%s' [ws=%s]" % (ss, ws))
204209

205210
def testSyntaxSplitRedirect(self):
206211
"""Test handling of syntax splitting of >"""
@@ -211,29 +216,37 @@ def testSyntaxSplitRedirect(self):
211216
'echo hi%s out' % delimiter,
212217
'echo hi%sout' % delimiter]
213218
ref = ['echo', 'hi', delimiter, 'out']
214-
for ss in src:
219+
for ss, ws in itertools.product(src, (False, True)):
215220
s = shlex.shlex(ss, punctuation_chars=True)
216221
result = list(s)
217-
self.assertEqual(ref, result, "While splitting '%s'" % ss)
222+
self.assertEqual(ref, result,
223+
"While splitting '%s' [ws=%s]" % (ss, ws))
218224

219225
def testSyntaxSplitParen(self):
220226
"""Test handling of syntax splitting of ()"""
221227
# these should all parse to the same output
222228
src = ['( echo hi )',
223229
'(echo hi)']
224230
ref = ['(', 'echo', 'hi', ')']
225-
for ss in src:
231+
for ss, ws in itertools.product(src, (False, True)):
226232
s = shlex.shlex(ss, punctuation_chars=True)
233+
s.whitespace_split = ws
227234
result = list(s)
228-
self.assertEqual(ref, result, "While splitting '%s'" % ss)
235+
self.assertEqual(ref, result,
236+
"While splitting '%s' [ws=%s]" % (ss, ws))
229237

230238
def testSyntaxSplitCustom(self):
231239
"""Test handling of syntax splitting with custom chars"""
240+
ss = "~/a&&b-c --color=auto||d *.py?"
232241
ref = ['~/a', '&', '&', 'b-c', '--color=auto', '||', 'd', '*.py?']
233-
ss = "~/a && b-c --color=auto || d *.py?"
234242
s = shlex.shlex(ss, punctuation_chars="|")
235243
result = list(s)
236-
self.assertEqual(ref, result, "While splitting '%s'" % ss)
244+
self.assertEqual(ref, result, "While splitting '%s' [ws=False]" % ss)
245+
ref = ['~/a&&b-c', '--color=auto', '||', 'd', '*.py?']
246+
s = shlex.shlex(ss, punctuation_chars="|")
247+
s.whitespace_split = True
248+
result = list(s)
249+
self.assertEqual(ref, result, "While splitting '%s' [ws=True]" % ss)
237250

238251
def testTokenTypes(self):
239252
"""Test that tokens are split with types as expected."""
@@ -293,6 +306,19 @@ def testEmptyStringHandling(self):
293306
s = shlex.shlex("'')abc", punctuation_chars=True)
294307
self.assertEqual(list(s), expected)
295308

309+
def testUnicodeHandling(self):
310+
"""Test punctuation_chars and whitespace_split handle unicode."""
311+
ss = "\u2119\u01b4\u2602\u210c\u00f8\u1f24"
312+
# Should be parsed as one complete token (whitespace_split=True).
313+
ref = ['\u2119\u01b4\u2602\u210c\u00f8\u1f24']
314+
s = shlex.shlex(ss, punctuation_chars=True)
315+
s.whitespace_split = True
316+
self.assertEqual(list(s), ref)
317+
# Without whitespace_split, uses wordchars and splits on all.
318+
ref = ['\u2119', '\u01b4', '\u2602', '\u210c', '\u00f8', '\u1f24']
319+
s = shlex.shlex(ss, punctuation_chars=True)
320+
self.assertEqual(list(s), ref)
321+
296322
def testQuote(self):
297323
safeunquoted = string.ascii_letters + string.digits + '@%_-+=:,./'
298324
unicode_sample = '\xe9\xe0\xdf' # e + acute accent, a + grave, sharp s

0 commit comments

Comments
 (0)