Skip to content

Commit ca109ea

Browse files
committed
[Parse] Ban trailing space for /.../ regex literals
Ban space and tab as the last character of a `/.../` regex literal, unless escaped with a backslash. This matches the banning of space and tab as the first character, and helps avoid breaking source in even more cases.
1 parent afa599c commit ca109ea

File tree

5 files changed

+204
-110
lines changed

5 files changed

+204
-110
lines changed

include/swift/AST/DiagnosticsParse.def

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,9 @@ ERROR(lex_invalid_closing_delimiter,none,
143143
ERROR(lex_regex_literal_invalid_starting_char,none,
144144
"regex literal may not start with %0; add backslash to escape",
145145
(StringRef))
146+
ERROR(lex_regex_literal_invalid_ending_char,none,
147+
"regex literal may not end with %0; use extended literal instead",
148+
(StringRef))
146149
ERROR(lex_regex_literal_unterminated,none,
147150
"unterminated regex literal", ())
148151

lib/Parse/Lexer.cpp

Lines changed: 81 additions & 68 deletions
Original file line numberDiff line numberDiff line change
@@ -1984,6 +1984,17 @@ bool Lexer::tryLexRegexLiteral(const char *TokStart) {
19841984
if (!LangOpts.EnableExperimentalStringProcessing || !regexLiteralLexingFn)
19851985
return false;
19861986

1987+
auto spaceOrTabDescription = [](char c) -> StringRef {
1988+
switch (c) {
1989+
case ' ':
1990+
return "space";
1991+
case '\t':
1992+
return "tab";
1993+
default:
1994+
llvm_unreachable("Unhandled case");
1995+
}
1996+
};
1997+
19871998
bool MustBeRegex = true;
19881999
bool IsForwardSlash = (*TokStart == '/');
19892000

@@ -2012,31 +2023,14 @@ bool Lexer::tryLexRegexLiteral(const char *TokStart) {
20122023
// TODO: This heuristic should be sunk into the Swift library once we have a
20132024
// way of doing fix-its from there.
20142025
auto *RegexContentStart = TokStart + 1;
2015-
switch (*RegexContentStart) {
2016-
case ' ':
2017-
case '\t': {
2026+
if (*RegexContentStart == ' ' || *RegexContentStart == '\t') {
20182027
if (!MustBeRegex)
20192028
return false;
20202029

20212030
// We must have a regex, so emit an error for space and tab.
2022-
StringRef DiagChar;
2023-
switch (*RegexContentStart) {
2024-
case ' ':
2025-
DiagChar = "space";
2026-
break;
2027-
case '\t':
2028-
DiagChar = "tab";
2029-
break;
2030-
default:
2031-
llvm_unreachable("Unhandled case");
2032-
}
20332031
diagnose(RegexContentStart, diag::lex_regex_literal_invalid_starting_char,
2034-
DiagChar)
2035-
.fixItInsert(getSourceLoc(RegexContentStart), "\\");
2036-
break;
2037-
}
2038-
default:
2039-
break;
2032+
spaceOrTabDescription(*RegexContentStart))
2033+
.fixItInsert(getSourceLoc(RegexContentStart), "\\");
20402034
}
20412035
}
20422036

@@ -2054,58 +2048,77 @@ bool Lexer::tryLexRegexLiteral(const char *TokStart) {
20542048
if (Ptr == TokStart)
20552049
return false;
20562050

2057-
// If we're lexing `/.../`, error if we ended on the opening of a comment.
2058-
// We prefer to lex the comment as it's more likely than not that is what
2059-
// the user is expecting.
2060-
// TODO: This should be sunk into the Swift library.
2061-
if (IsForwardSlash && Ptr[-1] == '/' && (*Ptr == '*' || *Ptr == '/')) {
2062-
if (!MustBeRegex)
2063-
return false;
2051+
// Perform some additional heuristics to see if we can lex `/.../`.
2052+
// TODO: These should all be sunk into the Swift library.
2053+
if (IsForwardSlash) {
2054+
// If we're lexing `/.../`, error if we ended on the opening of a comment.
2055+
// We prefer to lex the comment as it's more likely than not that is what
2056+
// the user is expecting.
2057+
if (Ptr[-1] == '/' && (*Ptr == '*' || *Ptr == '/')) {
2058+
if (!MustBeRegex)
2059+
return false;
20642060

2065-
diagnose(TokStart, diag::lex_regex_literal_unterminated);
2066-
2067-
// Move the pointer back to the '/' of the comment.
2068-
Ptr--;
2069-
}
2070-
2071-
// If we're tentatively lexing `/.../`, scan to make sure we don't have any
2072-
// unbalanced ')'s. This helps avoid ambiguity with unapplied operator
2073-
// references e.g `reduce(1, /)` and `foo(/, 0) / 2`. This would be invalid
2074-
// regex syntax anyways. This ensures users can surround their operator ref
2075-
// in parens `(/)` to fix the issue. This also applies to prefix operators
2076-
// that can be disambiguated as e.g `(/S.foo)`. Note we need to track whether
2077-
// or not we're in a custom character class `[...]`, as parens are literal
2078-
// there.
2079-
// TODO: This should be sunk into the Swift library.
2080-
if (IsForwardSlash && !MustBeRegex) {
2081-
unsigned CharClassDepth = 0;
2082-
unsigned GroupDepth = 0;
2083-
for (auto *Cursor = TokStart + 1; Cursor < Ptr - 1; Cursor++) {
2084-
switch (*Cursor) {
2085-
case '\\':
2086-
// Skip over the next character of an escape.
2087-
Cursor++;
2088-
break;
2089-
case '(':
2090-
if (CharClassDepth == 0)
2091-
GroupDepth += 1;
2092-
break;
2093-
case ')':
2094-
if (CharClassDepth != 0)
2061+
diagnose(TokStart, diag::lex_regex_literal_unterminated);
2062+
2063+
// Move the pointer back to the '/' of the comment.
2064+
Ptr--;
2065+
}
2066+
auto *TokEnd = Ptr - 1;
2067+
auto *ContentEnd = TokEnd - 1;
2068+
2069+
// We also ban unescaped space and tab at the end of a `/.../` literal.
2070+
if (*TokEnd == '/' && (TokEnd - TokStart > 2) && ContentEnd[-1] != '\\' &&
2071+
(*ContentEnd == ' ' || *ContentEnd == '\t')) {
2072+
if (!MustBeRegex)
2073+
return false;
2074+
2075+
// Diagnose and suggest using a `#/.../#` literal instead. We could
2076+
// suggest escaping, but that would be wrong if the user has written (?x).
2077+
// TODO: Should we suggest this for space-as-first character too?
2078+
diagnose(ContentEnd, diag::lex_regex_literal_invalid_ending_char,
2079+
spaceOrTabDescription(*ContentEnd))
2080+
.fixItInsert(getSourceLoc(TokStart), "#")
2081+
.fixItInsert(getSourceLoc(Ptr), "#");
2082+
}
2083+
2084+
// If we're tentatively lexing `/.../`, scan to make sure we don't have any
2085+
// unbalanced ')'s. This helps avoid ambiguity with unapplied operator
2086+
// references e.g `reduce(1, /)` and `foo(/, 0) / 2`. This would be invalid
2087+
// regex syntax anyways. This ensures users can surround their operator ref
2088+
// in parens `(/)` to fix the issue. This also applies to prefix operators
2089+
// that can be disambiguated as e.g `(/S.foo)`. Note we need to track whether
2090+
// or not we're in a custom character class `[...]`, as parens are literal
2091+
// there.
2092+
if (!MustBeRegex) {
2093+
unsigned CharClassDepth = 0;
2094+
unsigned GroupDepth = 0;
2095+
for (auto *Cursor = TokStart + 1; Cursor < TokEnd; Cursor++) {
2096+
switch (*Cursor) {
2097+
case '\\':
2098+
// Skip over the next character of an escape.
2099+
Cursor++;
20952100
break;
2101+
case '(':
2102+
if (CharClassDepth == 0)
2103+
GroupDepth += 1;
2104+
break;
2105+
case ')':
2106+
if (CharClassDepth != 0)
2107+
break;
20962108

2097-
// Invalid, so bail.
2098-
if (GroupDepth == 0)
2099-
return false;
2109+
// Invalid, so bail.
2110+
if (GroupDepth == 0)
2111+
return false;
21002112

2101-
GroupDepth -= 1;
2102-
break;
2103-
case '[':
2104-
CharClassDepth += 1;
2105-
break;
2106-
case ']':
2107-
if (CharClassDepth != 0)
2108-
CharClassDepth -= 1;
2113+
GroupDepth -= 1;
2114+
break;
2115+
case '[':
2116+
CharClassDepth += 1;
2117+
break;
2118+
case ']':
2119+
if (CharClassDepth != 0)
2120+
CharClassDepth -= 1;
2121+
}
21092122
}
21102123
}
21112124
}

0 commit comments

Comments
 (0)