Skip to content

gh-124130: Increase test coverage for \b and \B in regular expressions #124330

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 113 additions & 7 deletions Lib/test/test_re.py
Original file line number Diff line number Diff line change
Expand Up @@ -883,31 +883,137 @@ def test_named_unicode_escapes(self):
self.checkPatternError(br'\N{LESS-THAN SIGN}', r'bad escape \N', 0)
self.checkPatternError(br'[\N{LESS-THAN SIGN}]', r'bad escape \N', 1)

def test_string_boundaries(self):
def test_word_boundaries(self):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[no change needed] I think we could use subtests to provide more informative context (basically what you've got in each # comment header) when a test fails. But this is already in keeping with the existing style of this file, so it's not a big deal. These seem to nicely encode the current behavior of our re implementation.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The comments provide context to the reader of the code (so you will know where to add new tests), not when a test fails. The traceback already has all necessary context.

subTest() has two functions:

  • Provides more informative context when a test fails. This is especially useful when the traceback does not identify the failing test (in a loop, or with generated test data).
  • Allows execution to continue when a test fails, so that information about other test failures is collected in the same run. This works only when the tests are independent and subTest() is sufficiently fine-grained.

It has a drawback -- the traceback is limited and can be less informative if subTest() is used in a helper.

# See http://bugs.python.org/issue10713
self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1),
"abc")
self.assertEqual(re.search(r"\b(abc)\b", "abc").group(1), "abc")
self.assertEqual(re.search(r"\b(abc)\b", "abc", re.ASCII).group(1), "abc")
self.assertEqual(re.search(br"\b(abc)\b", b"abc").group(1), b"abc")
self.assertEqual(re.search(br"\b(abc)\b", b"abc", re.LOCALE).group(1), b"abc")
self.assertEqual(re.search(r"\b(ьюя)\b", "ьюя").group(1), "ьюя")
self.assertIsNone(re.search(r"\b(ьюя)\b", "ьюя", re.ASCII))
# There's a word boundary between a word and a non-word.
self.assertTrue(re.match(r".\b", "a="))
self.assertTrue(re.match(r".\b", "a=", re.ASCII))
self.assertTrue(re.match(br".\b", b"a="))
self.assertTrue(re.match(br".\b", b"a=", re.LOCALE))
self.assertTrue(re.match(r".\b", "я="))
self.assertIsNone(re.match(r".\b", "я=", re.ASCII))
# There's a word boundary between a non-word and a word.
self.assertTrue(re.match(r".\b", "=a"))
self.assertTrue(re.match(r".\b", "=a", re.ASCII))
self.assertTrue(re.match(br".\b", b"=a"))
self.assertTrue(re.match(br".\b", b"=a", re.LOCALE))
self.assertTrue(re.match(r".\b", "=я"))
self.assertIsNone(re.match(r".\b", "=я", re.ASCII))
# There is no word boundary inside a word.
self.assertIsNone(re.match(r".\b", "ab"))
self.assertIsNone(re.match(r".\b", "ab", re.ASCII))
self.assertIsNone(re.match(br".\b", b"ab"))
self.assertIsNone(re.match(br".\b", b"ab", re.LOCALE))
self.assertIsNone(re.match(r".\b", "юя"))
self.assertIsNone(re.match(r".\b", "юя", re.ASCII))
# There is no word boundary between non-word characters.
self.assertIsNone(re.match(r".\b", "=-"))
self.assertIsNone(re.match(r".\b", "=-", re.ASCII))
self.assertIsNone(re.match(br".\b", b"=-"))
self.assertIsNone(re.match(br".\b", b"=-", re.LOCALE))
# There is no non-boundary match between a word and a non-word.
self.assertIsNone(re.match(r".\B", "a="))
self.assertIsNone(re.match(r".\B", "a=", re.ASCII))
self.assertIsNone(re.match(br".\B", b"a="))
self.assertIsNone(re.match(br".\B", b"a=", re.LOCALE))
self.assertIsNone(re.match(r".\B", "я="))
self.assertTrue(re.match(r".\B", "я=", re.ASCII))
# There is no non-boundary match between a non-word and a word.
self.assertIsNone(re.match(r".\B", "=a"))
self.assertIsNone(re.match(r".\B", "=a", re.ASCII))
self.assertIsNone(re.match(br".\B", b"=a"))
self.assertIsNone(re.match(br".\B", b"=a", re.LOCALE))
self.assertIsNone(re.match(r".\B", "=я"))
self.assertTrue(re.match(r".\B", "=я", re.ASCII))
# There's a non-boundary match inside a word.
self.assertTrue(re.match(r".\B", "ab"))
self.assertTrue(re.match(r".\B", "ab", re.ASCII))
self.assertTrue(re.match(br".\B", b"ab"))
self.assertTrue(re.match(br".\B", b"ab", re.LOCALE))
self.assertTrue(re.match(r".\B", "юя"))
self.assertTrue(re.match(r".\B", "юя", re.ASCII))
# There's a non-boundary match between non-word characters.
self.assertTrue(re.match(r".\B", "=-"))
self.assertTrue(re.match(r".\B", "=-", re.ASCII))
self.assertTrue(re.match(br".\B", b"=-"))
self.assertTrue(re.match(br".\B", b"=-", re.LOCALE))
# There's a word boundary at the start of a string.
self.assertTrue(re.match(r"\b", "abc"))
self.assertTrue(re.match(r"\b", "abc", re.ASCII))
self.assertTrue(re.match(br"\b", b"abc"))
self.assertTrue(re.match(br"\b", b"abc", re.LOCALE))
self.assertTrue(re.match(r"\b", "ьюя"))
self.assertIsNone(re.match(r"\b", "ьюя", re.ASCII))
# There's a word boundary at the end of a string.
self.assertTrue(re.fullmatch(r".+\b", "abc"))
self.assertTrue(re.fullmatch(r".+\b", "abc", re.ASCII))
self.assertTrue(re.fullmatch(br".+\b", b"abc"))
self.assertTrue(re.fullmatch(br".+\b", b"abc", re.LOCALE))
self.assertTrue(re.fullmatch(r".+\b", "ьюя"))
self.assertIsNone(re.search(r"\b", "ьюя", re.ASCII))
# A non-empty string includes a non-boundary zero-length match.
self.assertTrue(re.search(r"\B", "abc"))
self.assertEqual(re.search(r"\B", "abc").span(), (1, 1))
self.assertEqual(re.search(r"\B", "abc", re.ASCII).span(), (1, 1))
self.assertEqual(re.search(br"\B", b"abc").span(), (1, 1))
self.assertEqual(re.search(br"\B", b"abc", re.LOCALE).span(), (1, 1))
self.assertEqual(re.search(r"\B", "ьюя").span(), (1, 1))
self.assertEqual(re.search(r"\B", "ьюя", re.ASCII).span(), (0, 0))
# There is no non-boundary match at the start of a string.
self.assertFalse(re.match(r"\B", "abc"))
self.assertIsNone(re.match(r"\B", "abc"))
self.assertIsNone(re.match(r"\B", "abc", re.ASCII))
self.assertIsNone(re.match(br"\B", b"abc"))
self.assertIsNone(re.match(br"\B", b"abc", re.LOCALE))
self.assertIsNone(re.match(r"\B", "ьюя"))
self.assertTrue(re.match(r"\B", "ьюя", re.ASCII))
# There is no non-boundary match at the end of a string.
self.assertIsNone(re.fullmatch(r".+\B", "abc"))
self.assertIsNone(re.fullmatch(r".+\B", "abc", re.ASCII))
self.assertIsNone(re.fullmatch(br".+\B", b"abc"))
self.assertIsNone(re.fullmatch(br".+\B", b"abc", re.LOCALE))
self.assertIsNone(re.fullmatch(r".+\B", "ьюя"))
self.assertTrue(re.fullmatch(r".+\B", "ьюя", re.ASCII))
# However, an empty string contains no word boundaries, and also no
# non-boundaries.
self.assertIsNone(re.search(r"\B", ""))
self.assertIsNone(re.search(r"\b", ""))
self.assertIsNone(re.search(r"\b", "", re.ASCII))
self.assertIsNone(re.search(br"\b", b""))
self.assertIsNone(re.search(br"\b", b"", re.LOCALE))
# This one is questionable and different from the perlre behaviour,
# but describes current behavior.
self.assertIsNone(re.search(r"\b", ""))
self.assertIsNone(re.search(r"\B", ""))
self.assertIsNone(re.search(r"\B", "", re.ASCII))
self.assertIsNone(re.search(br"\B", b""))
self.assertIsNone(re.search(br"\B", b"", re.LOCALE))
# A single word-character string has two boundaries, but no
# non-boundary gaps.
self.assertEqual(len(re.findall(r"\b", "a")), 2)
self.assertEqual(len(re.findall(r"\b", "a", re.ASCII)), 2)
self.assertEqual(len(re.findall(br"\b", b"a")), 2)
self.assertEqual(len(re.findall(br"\b", b"a", re.LOCALE)), 2)
self.assertEqual(len(re.findall(r"\B", "a")), 0)
self.assertEqual(len(re.findall(r"\B", "a", re.ASCII)), 0)
self.assertEqual(len(re.findall(br"\B", b"a")), 0)
self.assertEqual(len(re.findall(br"\B", b"a", re.LOCALE)), 0)
# If there are no words, there are no boundaries
self.assertEqual(len(re.findall(r"\b", " ")), 0)
self.assertEqual(len(re.findall(r"\b", " ", re.ASCII)), 0)
self.assertEqual(len(re.findall(br"\b", b" ")), 0)
self.assertEqual(len(re.findall(br"\b", b" ", re.LOCALE)), 0)
self.assertEqual(len(re.findall(r"\b", " ")), 0)
self.assertEqual(len(re.findall(r"\b", " ", re.ASCII)), 0)
self.assertEqual(len(re.findall(br"\b", b" ")), 0)
self.assertEqual(len(re.findall(br"\b", b" ", re.LOCALE)), 0)
# Can match around the whitespace.
self.assertEqual(len(re.findall(r"\B", " ")), 2)
self.assertEqual(len(re.findall(r"\B", " ", re.ASCII)), 2)
self.assertEqual(len(re.findall(br"\B", b" ")), 2)
self.assertEqual(len(re.findall(br"\B", b" ", re.LOCALE)), 2)

def test_bigcharset(self):
self.assertEqual(re.match("([\u2222\u2223])",
Expand Down
Loading