-
Notifications
You must be signed in to change notification settings - Fork 50
Implement atomic non-capturing groups #488
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
bd3ad7d
e08bb64
f3b993e
49a8dda
0db7dc6
e61f5db
e74a113
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -889,8 +889,7 @@ extension RegexTests { | |
input: "Price: 100 dollars", match: nil) | ||
firstMatchTest( | ||
#"(?=\d+ dollars)\d+"#, | ||
input: "Price: 100 dollars", match: "100", | ||
xfail: true) // TODO | ||
input: "Price: 100 dollars", match: "100") | ||
|
||
firstMatchTest( | ||
#"\d+(*pla: dollars)"#, | ||
|
@@ -915,6 +914,14 @@ extension RegexTests { | |
#"\d+(*negative_lookahead: dollars)"#, | ||
input: "Price: 100 pesos", match: "100") | ||
|
||
// More complex lookaheads | ||
firstMatchTests( | ||
#"(?=.*e)(?=.*o)(?!.*z)."#, | ||
(input: "hello", match: "h"), | ||
(input: "hzello", match: "e"), | ||
(input: "hezllo", match: nil), | ||
(input: "helloz", match: nil)) | ||
|
||
firstMatchTest( | ||
#"(?<=USD)\d+"#, input: "Price: USD100", match: "100", xfail: true) | ||
firstMatchTest( | ||
|
@@ -1046,14 +1053,93 @@ extension RegexTests { | |
firstMatchTest( | ||
#"(?:a|.b)c"#, input: "123abcacxyz", match: "abc") | ||
firstMatchTest( | ||
#"(?>a|.b)c"#, input: "123abcacxyz", match: "ac", xfail: true) | ||
#"(?>a|.b)c"#, input: "123abcacxyz", match: "ac") | ||
firstMatchTest( | ||
"(*atomic:a|.b)c", input: "123abcacxyz", match: "ac", xfail: true) | ||
"(*atomic:a|.b)c", input: "123abcacxyz", match: "ac") | ||
firstMatchTest( | ||
#"(?:a+)[a-z]c"#, input: "123aacacxyz", match: "aac") | ||
firstMatchTest( | ||
#"(?>a+)[a-z]c"#, input: "123aacacxyz", match: "ac", xfail: true) | ||
#"(?>a+)[a-z]c"#, input: "123aacacxyz", match: nil) | ||
|
||
// Atomicity should stay in the atomic group | ||
firstMatchTest( | ||
#"(?:(?>a)|.b)c"#, input: "123abcacxyz", match: "abc") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What is the atomicity getting us in this test? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This test is to make sure that the atomicity of `(?>a)` stays inside the atomic group and does not prevent the enclosing `(?:…|.b)` alternation from backtracking. |
||
|
||
// Quantifier behavior inside atomic groups | ||
|
||
// (?:a+?) matches as few 'a's as possible, after matching the first | ||
// (?>a+?) always matches exactly one 'a' | ||
firstMatchTests( | ||
#"^(?:a+?)a$"#, | ||
(input: "a", match: nil), | ||
(input: "aa", match: "aa"), | ||
(input: "aaa", match: "aaa")) | ||
firstMatchTests( | ||
#"^(?>a+?)a$"#, | ||
(input: "a", match: nil), | ||
(input: "aa", match: "aa"), | ||
(input: "aaa", match: nil)) | ||
|
||
// (?:a?+) and (?>a?+) are equivalent: they match one 'a' if available | ||
firstMatchTests( | ||
#"^(?:a?+)a$"#, | ||
(input: "a", match: nil), | ||
xfail: true) | ||
firstMatchTests( | ||
#"^(?:a?+)a$"#, | ||
(input: "aa", match: "aa"), | ||
(input: "aaa", match: nil)) | ||
firstMatchTests( | ||
#"^(?>a?+)a$"#, | ||
(input: "a", match: nil), | ||
(input: "aa", match: "aa"), | ||
(input: "aaa", match: nil)) | ||
|
||
// Capture behavior in non-atomic vs atomic groups | ||
firstMatchTests( | ||
#"(\d+)\w+\1"#, | ||
(input: "123x12", match: "123x12"), // `\w+` matches "3x" in this case | ||
(input: "23x23", match: "23x23"), | ||
(input: "123x23", match: "23x23")) | ||
firstMatchTests( | ||
#"(?>(\d+))\w+\1"#, | ||
(input: "123x12", match: nil)) | ||
firstMatchTests( | ||
#"(?>(\d+))\w+\1"#, | ||
(input: "23x23", match: "23x23"), | ||
(input: "123x23", match: "23x23"), | ||
xfail: true) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Could you add positive tests and non-atomic versions? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Noting that for these or all of them, we can also do whole-match semantics, which might be clearer. Or match-from-front, in which case we'd see what alternative was taken. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I forgot about |
||
|
||
// Backreferences in lookaheads | ||
firstMatchTests( | ||
#"^(?=.*(.)(.)\2\1).+$"#, | ||
(input: "abbba", match: nil), | ||
(input: "ABBA", match: "ABBA"), | ||
(input: "defABBAdef", match: "defABBAdef")) | ||
firstMatchTests( | ||
#"^(?=.*(.)(.)\2\1).+\2$"#, | ||
(input: "abbba", match: nil), | ||
(input: "ABBA", match: nil), | ||
(input: "defABBAdef", match: nil)) | ||
// FIXME: Backreferences don't escape positive lookaheads | ||
firstMatchTests( | ||
#"^(?=.*(.)(.)\2\1).+\2$"#, | ||
(input: "ABBAB", match: "ABBAB"), | ||
(input: "defABBAdefB", match: "defABBAdefB"), | ||
xfail: true) | ||
|
||
firstMatchTests( | ||
#"^(?!.*(.)(.)\2\1).+$"#, | ||
(input: "abbba", match: "abbba"), | ||
(input: "ABBA", match: nil), | ||
(input: "defABBAdef", match: nil)) | ||
// Backreferences don't escape negative lookaheads; | ||
// matching only proceeds when the lookahead fails | ||
firstMatchTests( | ||
#"^(?!.*(.)(.)\2\1).+\2$"#, | ||
(input: "abbba", match: nil), | ||
(input: "abbbab", match: nil), | ||
(input: "ABBAB", match: nil)) | ||
|
||
// TODO: Test example where non-atomic is significant | ||
firstMatchTest( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is the only difference between lookahead and atomic the `continuingAt` vs `restoringAt`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Right, positive lookaheads and atomic groups have the same behavior, except that lookaheads reset the matching position on exit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we parameterize and share code in that case?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I feel like we could come back and do that later, but we'd probably want to extricate the negative lookahead code in that case.