-
Notifications
You must be signed in to change notification settings - Fork 50
Allow unbounded quoted sequences \Q...
#432
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -579,7 +579,7 @@ extension Source { | |
|
||
/// Try to consume quoted content | ||
/// | ||
/// Quote -> '\Q' (!'\E' .)* '\E' | ||
/// Quote -> '\Q' (!'\E' .)* '\E'? | ||
/// | ||
/// With `SyntaxOptions.experimentalQuotes`, also accepts | ||
/// | ||
|
@@ -592,9 +592,24 @@ extension Source { | |
mutating func lexQuote(context: ParsingContext) throws -> AST.Quote? { | ||
let str = try recordLoc { src -> String? in | ||
if src.tryEat(sequence: #"\Q"#) { | ||
return try src.expectQuoted(endingWith: #"\E"#).value | ||
let contents = src.lexUntil { src in | ||
src.isEmpty || src.tryEat(sequence: #"\E"#) | ||
}.value | ||
|
||
// In multi-line literals, the quote may not span multiple lines. | ||
if context.syntax.contains(.multilineExtendedSyntax), | ||
contents.spansMultipleLinesInRegexLiteral { | ||
throw ParseError.quoteMayNotSpanMultipleLines | ||
} | ||
|
||
// The sequence must not be empty in a custom character class. | ||
if context.isInCustomCharacterClass && contents.isEmpty { | ||
throw ParseError.expectedNonEmptyContents | ||
} | ||
return contents | ||
} | ||
if context.experimentalQuotes, src.tryEat("\"") { | ||
// TODO: Can experimental quotes be empty? | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suspect we'll be dropping experimental mode in favor of |
||
return try src.expectQuoted(endingWith: "\"", ignoreEscaped: true).value | ||
} | ||
return nil | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -754,6 +754,14 @@ extension RegexTests { | |
// This follows the PCRE behavior. | ||
parseTest(#"\Q\\E"#, quote("\\")) | ||
|
||
// ICU allows quotes to be empty outside of custom character classes. | ||
parseTest(#"\Q\E"#, quote("")) | ||
|
||
// Quotes may be unterminated. | ||
parseTest(#"\Qab"#, quote("ab")) | ||
parseTest(#"\Q"#, quote("")) | ||
parseTest("\\Qab\\", quote("ab\\")) | ||
|
||
parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"), | ||
syntax: .experimental) | ||
parseTest(#"a" .""b""#, concat("a", quote(" ."), quote("b")), | ||
|
@@ -2539,8 +2547,6 @@ extension RegexTests { | |
diagnosticTest(#"(?P"#, .expected(")")) | ||
diagnosticTest(#"(?R"#, .expected(")")) | ||
|
||
diagnosticTest(#"\Qab"#, .expected("\\E")) | ||
diagnosticTest("\\Qab\\", .expected("\\E")) | ||
diagnosticTest(#""ab"#, .expected("\""), syntax: .experimental) | ||
diagnosticTest(#""ab\""#, .expected("\""), syntax: .experimental) | ||
diagnosticTest("\"ab\\", .expectedEscape, syntax: .experimental) | ||
|
@@ -2619,6 +2625,9 @@ extension RegexTests { | |
// TODO: Custom diagnostic for missing '\Q' | ||
diagnosticTest(#"\E"#, .invalidEscape("E")) | ||
|
||
diagnosticTest(#"[\Q\E]"#, .expectedNonEmptyContents) | ||
diagnosticTest(#"[\Q]"#, .expected("]")) | ||
|
||
// PCRE treats these as octal, but we require a `0` prefix. | ||
diagnosticTest(#"[\1]"#, .invalidEscape("1")) | ||
diagnosticTest(#"[\123]"#, .invalidEscape("1")) | ||
|
@@ -2711,6 +2720,26 @@ extension RegexTests { | |
""", .cannotRemoveExtendedSyntaxInMultilineMode | ||
) | ||
|
||
diagnosticWithDelimitersTest(#""" | ||
#/ | ||
\Q | ||
\E | ||
/# | ||
"""#, .quoteMayNotSpanMultipleLines) | ||
|
||
diagnosticWithDelimitersTest(#""" | ||
#/ | ||
\Qabc | ||
\E | ||
/# | ||
"""#, .quoteMayNotSpanMultipleLines) | ||
|
||
diagnosticWithDelimitersTest(#""" | ||
#/ | ||
\Q | ||
/# | ||
"""#, .quoteMayNotSpanMultipleLines) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should this just be the regex There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point! Currently we don't do any whitespace stripping, so if we just allowed it as-is, it would parse as There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm... for now, if someone wants verbatim multi-line content, they have to use a string inside builder for that. We'll eventually get at least interpolation if not quoting as well. Otherwise the multi-line literal ignores the lines and whitespaces. It might be the case these are all the same regex:
For errors:
What about:
Newlines are a consideration of the host language. So perhaps There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. By semantic line, do you mean any line that doesn't consist entirely of whitespace? So: #/
re\Q gex
/# would be legal as we'd strip all trailing whitespace after the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A related question here is whether // Okay
#/
(?-x) abc
/#
// Not okay
#/
(?-x)abc
d
/# There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I could go either way, what do you think? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we should probably have a consistent behavior between There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. SGTM |
||
|
||
// MARK: Group specifiers | ||
|
||
diagnosticTest(#"(*"#, .unknownGroupKind("*")) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can the quote get terminated by a group or does it have to be terminated by the end of the regex? In that case, it seems more like:
(or whatever our notational convention was for end of input)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It has to be terminated by the end of the regex yeah, I can update the grammar to be explicit about that. I don't believe we ever added a convention for it,
<end-of-input>
seems reasonable to me tho.