@@ -2040,6 +2040,14 @@ const char *Lexer::tryScanRegexLiteral(const char *TokStart, bool MustBeRegex,
2040
2040
2041
2041
bool IsForwardSlash = (*TokStart == ' /' );
2042
2042
2043
+ auto spaceOrTabDescription = [](char c) -> StringRef {
2044
+ switch (c) {
2045
+ case ' ' : return " space" ;
2046
+ case ' \t ' : return " tab" ;
2047
+ default : llvm_unreachable (" Unhandled case" );
2048
+ }
2049
+ };
2050
+
2043
2051
// Check if we're able to lex a `/.../` regex.
2044
2052
if (IsForwardSlash) {
2045
2053
// For `/.../` regex literals, we need to ban space and tab at the start of
@@ -2055,33 +2063,17 @@ const char *Lexer::tryScanRegexLiteral(const char *TokStart, bool MustBeRegex,
2055
2063
// TODO: This heuristic should be sunk into the Swift library once we have a
2056
2064
// way of doing fix-its from there.
2057
2065
auto *RegexContentStart = TokStart + 1 ;
2058
- switch (*RegexContentStart) {
2059
- case ' ' :
2060
- case ' \t ' : {
2066
+ if (*RegexContentStart == ' ' || *RegexContentStart == ' \t ' ) {
2061
2067
if (!MustBeRegex)
2062
2068
return nullptr ;
2063
2069
2064
2070
if (Diags) {
2065
2071
// We must have a regex, so emit an error for space and tab.
2066
- StringRef DiagChar;
2067
- switch (*RegexContentStart) {
2068
- case ' ' :
2069
- DiagChar = " space" ;
2070
- break ;
2071
- case ' \t ' :
2072
- DiagChar = " tab" ;
2073
- break ;
2074
- default :
2075
- llvm_unreachable (" Unhandled case" );
2076
- }
2077
2072
Diags->diagnose (getSourceLoc (RegexContentStart),
2078
- diag::lex_regex_literal_invalid_starting_char, DiagChar)
2073
+ diag::lex_regex_literal_invalid_starting_char,
2074
+ spaceOrTabDescription (*RegexContentStart))
2079
2075
.fixItInsert (getSourceLoc (RegexContentStart), " \\ " );
2080
2076
}
2081
- break ;
2082
- }
2083
- default :
2084
- break ;
2085
2077
}
2086
2078
}
2087
2079
@@ -2098,60 +2090,82 @@ const char *Lexer::tryScanRegexLiteral(const char *TokStart, bool MustBeRegex,
2098
2090
if (Ptr == TokStart)
2099
2091
return nullptr ;
2100
2092
2101
- // If we're lexing `/.../`, error if we ended on the opening of a comment.
2102
- // We prefer to lex the comment as it's more likely than not that is what
2103
- // the user is expecting.
2104
- // TODO: This should be sunk into the Swift library.
2105
- if (IsForwardSlash && Ptr[-1 ] == ' /' && (*Ptr == ' *' || *Ptr == ' /' )) {
2106
- if (!MustBeRegex)
2107
- return nullptr ;
2093
+ // Perform some additional heuristics to see if we can lex `/.../`.
2094
+ // TODO: These should all be sunk into the Swift library.
2095
+ if (IsForwardSlash) {
2096
+ // If we're lexing `/.../`, error if we ended on the opening of a comment.
2097
+ // We prefer to lex the comment as it's more likely than not that is what
2098
+ // the user is expecting.
2099
+ if (Ptr[-1 ] == ' /' && (*Ptr == ' *' || *Ptr == ' /' )) {
2100
+ if (!MustBeRegex)
2101
+ return nullptr ;
2108
2102
2109
- if (Diags) {
2110
- Diags->diagnose (getSourceLoc (TokStart),
2111
- diag::lex_regex_literal_unterminated);
2112
- }
2113
- // Move the pointer back to the '/' of the comment.
2114
- Ptr--;
2115
- }
2116
-
2117
- // If we're tentatively lexing `/.../`, scan to make sure we don't have any
2118
- // unbalanced ')'s. This helps avoid ambiguity with unapplied operator
2119
- // references e.g `reduce(1, /)` and `foo(/, 0) / 2`. This would be invalid
2120
- // regex syntax anyways. This ensures users can surround their operator ref
2121
- // in parens `(/)` to fix the issue. This also applies to prefix operators
2122
- // that can be disambiguated as e.g `(/S.foo)`. Note we need to track whether
2123
- // or not we're in a custom character class `[...]`, as parens are literal
2124
- // there.
2125
- // TODO: This should be sunk into the Swift library.
2126
- if (IsForwardSlash && !MustBeRegex) {
2127
- unsigned CharClassDepth = 0 ;
2128
- unsigned GroupDepth = 0 ;
2129
- for (auto *Cursor = TokStart + 1 ; Cursor < Ptr - 1 ; Cursor++) {
2130
- switch (*Cursor) {
2131
- case ' \\ ' :
2132
- // Skip over the next character of an escape.
2133
- Cursor++;
2134
- break ;
2135
- case ' (' :
2136
- if (CharClassDepth == 0 )
2137
- GroupDepth += 1 ;
2138
- break ;
2139
- case ' )' :
2140
- if (CharClassDepth != 0 )
2103
+ if (Diags) {
2104
+ Diags->diagnose (getSourceLoc (TokStart),
2105
+ diag::lex_regex_literal_unterminated);
2106
+ }
2107
+ // Move the pointer back to the '/' of the comment.
2108
+ Ptr--;
2109
+ }
2110
+ auto *TokEnd = Ptr - 1 ;
2111
+ auto *ContentEnd = TokEnd - 1 ;
2112
+
2113
+ // We also ban unescaped space and tab at the end of a `/.../` literal.
2114
+ if (*TokEnd == ' /' && (TokEnd - TokStart > 2 ) && ContentEnd[-1 ] != ' \\ ' &&
2115
+ (*ContentEnd == ' ' || *ContentEnd == ' \t ' )) {
2116
+ if (!MustBeRegex)
2117
+ return nullptr ;
2118
+
2119
+ if (Diags) {
2120
+ // Diagnose and suggest using a `#/.../#` literal instead. We could
2121
+ // suggest escaping, but that would be wrong if the user has written (?x).
2122
+ // TODO: Should we suggest this for space-as-first character too?
2123
+ Diags->diagnose (getSourceLoc (ContentEnd),
2124
+ diag::lex_regex_literal_invalid_ending_char,
2125
+ spaceOrTabDescription (*ContentEnd))
2126
+ .fixItInsert (getSourceLoc (TokStart), " #" )
2127
+ .fixItInsert (getSourceLoc (Ptr), " #" );
2128
+ }
2129
+ }
2130
+
2131
+ // If we're tentatively lexing `/.../`, scan to make sure we don't have any
2132
+ // unbalanced ')'s. This helps avoid ambiguity with unapplied operator
2133
+ // references, e.g. `reduce(1, /)` and `foo(/, 0) / 2`. This would be invalid
2134
+ // regex syntax anyway. This ensures users can surround their operator ref
2135
+ // in parens `(/)` to fix the issue. This also applies to prefix operators
2136
+ // that can be disambiguated as e.g. `(/S.foo)`. Note we need to track whether
2137
+ // or not we're in a custom character class `[...]`, as parens are literal
2138
+ // there.
2139
+ if (!MustBeRegex) {
2140
+ unsigned CharClassDepth = 0 ;
2141
+ unsigned GroupDepth = 0 ;
2142
+ for (auto *Cursor = TokStart + 1 ; Cursor < TokEnd; Cursor++) {
2143
+ switch (*Cursor) {
2144
+ case ' \\ ' :
2145
+ // Skip over the next character of an escape.
2146
+ Cursor++;
2147
+ break ;
2148
+ case ' (' :
2149
+ if (CharClassDepth == 0 )
2150
+ GroupDepth += 1 ;
2141
2151
break ;
2152
+ case ' )' :
2153
+ if (CharClassDepth != 0 )
2154
+ break ;
2142
2155
2143
- // Invalid, so bail.
2144
- if (GroupDepth == 0 )
2145
- return nullptr ;
2156
+ // Invalid, so bail.
2157
+ if (GroupDepth == 0 )
2158
+ return nullptr ;
2146
2159
2147
- GroupDepth -= 1 ;
2148
- break ;
2149
- case ' [' :
2150
- CharClassDepth += 1 ;
2151
- break ;
2152
- case ' ]' :
2153
- if (CharClassDepth != 0 )
2154
- CharClassDepth -= 1 ;
2160
+ GroupDepth -= 1 ;
2161
+ break ;
2162
+ case ' [' :
2163
+ CharClassDepth += 1 ;
2164
+ break ;
2165
+ case ' ]' :
2166
+ if (CharClassDepth != 0 )
2167
+ CharClassDepth -= 1 ;
2168
+ }
2155
2169
}
2156
2170
}
2157
2171
}
0 commit comments