@@ -1984,6 +1984,17 @@ bool Lexer::tryLexRegexLiteral(const char *TokStart) {
1984
1984
if (!LangOpts.EnableExperimentalStringProcessing || !regexLiteralLexingFn)
1985
1985
return false ;
1986
1986
1987
+ auto spaceOrTabDescription = [](char c) -> StringRef {
1988
+ switch (c) {
1989
+ case ' ' :
1990
+ return " space" ;
1991
+ case ' \t ' :
1992
+ return " tab" ;
1993
+ default :
1994
+ llvm_unreachable (" Unhandled case" );
1995
+ }
1996
+ };
1997
+
1987
1998
bool MustBeRegex = true ;
1988
1999
bool IsForwardSlash = (*TokStart == ' /' );
1989
2000
@@ -2012,31 +2023,14 @@ bool Lexer::tryLexRegexLiteral(const char *TokStart) {
2012
2023
// TODO: This heuristic should be sunk into the Swift library once we have a
2013
2024
// way of doing fix-its from there.
2014
2025
auto *RegexContentStart = TokStart + 1 ;
2015
- switch (*RegexContentStart) {
2016
- case ' ' :
2017
- case ' \t ' : {
2026
+ if (*RegexContentStart == ' ' || *RegexContentStart == ' \t ' ) {
2018
2027
if (!MustBeRegex)
2019
2028
return false ;
2020
2029
2021
2030
// We must have a regex, so emit an error for space and tab.
2022
- StringRef DiagChar;
2023
- switch (*RegexContentStart) {
2024
- case ' ' :
2025
- DiagChar = " space" ;
2026
- break ;
2027
- case ' \t ' :
2028
- DiagChar = " tab" ;
2029
- break ;
2030
- default :
2031
- llvm_unreachable (" Unhandled case" );
2032
- }
2033
2031
diagnose (RegexContentStart, diag::lex_regex_literal_invalid_starting_char,
2034
- DiagChar)
2035
- .fixItInsert (getSourceLoc (RegexContentStart), " \\ " );
2036
- break ;
2037
- }
2038
- default :
2039
- break ;
2032
+ spaceOrTabDescription (*RegexContentStart))
2033
+ .fixItInsert (getSourceLoc (RegexContentStart), " \\ " );
2040
2034
}
2041
2035
}
2042
2036
@@ -2054,58 +2048,77 @@ bool Lexer::tryLexRegexLiteral(const char *TokStart) {
2054
2048
if (Ptr == TokStart)
2055
2049
return false ;
2056
2050
2057
- // If we're lexing `/.../`, error if we ended on the opening of a comment.
2058
- // We prefer to lex the comment as it's more likely than not that is what
2059
- // the user is expecting.
2060
- // TODO: This should be sunk into the Swift library.
2061
- if (IsForwardSlash && Ptr[-1 ] == ' /' && (*Ptr == ' *' || *Ptr == ' /' )) {
2062
- if (!MustBeRegex)
2063
- return false ;
2051
+ // Perform some additional heuristics to see if we can lex `/.../`.
2052
+ // TODO: These should all be sunk into the Swift library.
2053
+ if (IsForwardSlash) {
2054
+ // If we're lexing `/.../`, error if we ended on the opening of a comment.
2055
+ // We prefer to lex the comment as it's more likely than not that is what
2056
+ // the user is expecting.
2057
+ if (Ptr[-1 ] == ' /' && (*Ptr == ' *' || *Ptr == ' /' )) {
2058
+ if (!MustBeRegex)
2059
+ return false ;
2064
2060
2065
- diagnose (TokStart, diag::lex_regex_literal_unterminated);
2066
-
2067
- // Move the pointer back to the '/' of the comment.
2068
- Ptr--;
2069
- }
2070
-
2071
- // If we're tentatively lexing `/.../`, scan to make sure we don't have any
2072
- // unbalanced ')'s. This helps avoid ambiguity with unapplied operator
2073
- // references e.g `reduce(1, /)` and `foo(/, 0) / 2`. This would be invalid
2074
- // regex syntax anyways. This ensures users can surround their operator ref
2075
- // in parens `(/)` to fix the issue. This also applies to prefix operators
2076
- // that can be disambiguated as e.g `(/S.foo)`. Note we need to track whether
2077
- // or not we're in a custom character class `[...]`, as parens are literal
2078
- // there.
2079
- // TODO: This should be sunk into the Swift library.
2080
- if (IsForwardSlash && !MustBeRegex) {
2081
- unsigned CharClassDepth = 0 ;
2082
- unsigned GroupDepth = 0 ;
2083
- for (auto *Cursor = TokStart + 1 ; Cursor < Ptr - 1 ; Cursor++) {
2084
- switch (*Cursor) {
2085
- case ' \\ ' :
2086
- // Skip over the next character of an escape.
2087
- Cursor++;
2088
- break ;
2089
- case ' (' :
2090
- if (CharClassDepth == 0 )
2091
- GroupDepth += 1 ;
2092
- break ;
2093
- case ' )' :
2094
- if (CharClassDepth != 0 )
2061
+ diagnose (TokStart, diag::lex_regex_literal_unterminated);
2062
+
2063
+ // Move the pointer back to the '/' of the comment.
2064
+ Ptr--;
2065
+ }
2066
+ auto *TokEnd = Ptr - 1 ;
2067
+ auto *ContentEnd = TokEnd - 1 ;
2068
+
2069
+ // We also ban unescaped space and tab at the end of a `/.../` literal.
2070
+ if (*TokEnd == ' /' && (TokEnd - TokStart > 2 ) && ContentEnd[-1 ] != ' \\ ' &&
2071
+ (*ContentEnd == ' ' || *ContentEnd == ' \t ' )) {
2072
+ if (!MustBeRegex)
2073
+ return false ;
2074
+
2075
+ // Diagnose and suggest using a `#/.../#` literal instead. We could
2076
+ // suggest escaping, but that would be wrong if the user has written (?x).
2077
+ // TODO: Should we suggest this for space-as-first character too?
2078
+ diagnose (ContentEnd, diag::lex_regex_literal_invalid_ending_char,
2079
+ spaceOrTabDescription (*ContentEnd))
2080
+ .fixItInsert (getSourceLoc (TokStart), " #" )
2081
+ .fixItInsert (getSourceLoc (Ptr), " #" );
2082
+ }
2083
+
2084
+ // If we're tentatively lexing `/.../`, scan to make sure we don't have any
2085
+ // unbalanced ')'s. This helps avoid ambiguity with unapplied operator
2086
+ // references e.g `reduce(1, /)` and `foo(/, 0) / 2`. This would be invalid
2087
+ // regex syntax anyways. This ensures users can surround their operator ref
2088
+ // in parens `(/)` to fix the issue. This also applies to prefix operators
2089
+ // that can be disambiguated as e.g `(/S.foo)`. Note we need to track whether
2090
+ // or not we're in a custom character class `[...]`, as parens are literal
2091
+ // there.
2092
+ if (!MustBeRegex) {
2093
+ unsigned CharClassDepth = 0 ;
2094
+ unsigned GroupDepth = 0 ;
2095
+ for (auto *Cursor = TokStart + 1 ; Cursor < TokEnd; Cursor++) {
2096
+ switch (*Cursor) {
2097
+ case ' \\ ' :
2098
+ // Skip over the next character of an escape.
2099
+ Cursor++;
2095
2100
break ;
2101
+ case ' (' :
2102
+ if (CharClassDepth == 0 )
2103
+ GroupDepth += 1 ;
2104
+ break ;
2105
+ case ' )' :
2106
+ if (CharClassDepth != 0 )
2107
+ break ;
2096
2108
2097
- // Invalid, so bail.
2098
- if (GroupDepth == 0 )
2099
- return false ;
2109
+ // Invalid, so bail.
2110
+ if (GroupDepth == 0 )
2111
+ return false ;
2100
2112
2101
- GroupDepth -= 1 ;
2102
- break ;
2103
- case ' [' :
2104
- CharClassDepth += 1 ;
2105
- break ;
2106
- case ' ]' :
2107
- if (CharClassDepth != 0 )
2108
- CharClassDepth -= 1 ;
2113
+ GroupDepth -= 1 ;
2114
+ break ;
2115
+ case ' [' :
2116
+ CharClassDepth += 1 ;
2117
+ break ;
2118
+ case ' ]' :
2119
+ if (CharClassDepth != 0 )
2120
+ CharClassDepth -= 1 ;
2121
+ }
2109
2122
}
2110
2123
}
2111
2124
}
0 commit comments