Skip to content

Commit f1a7990

Browse files
committed
[Parse] Introduce /.../ regex literals
Start parsing regex literals with `/.../` delimiters. rdar://83253726
1 parent 9f384d3 commit f1a7990

File tree

11 files changed

+522
-31
lines changed

11 files changed

+522
-31
lines changed

include/swift/AST/DiagnosticsParse.def

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -143,7 +143,10 @@ ERROR(lex_invalid_escape_delimiter,none,
143143
ERROR(lex_invalid_closing_delimiter,none,
144144
"too many '#' characters in closing delimiter", ())
145145

146-
ERROR(lex_unterminated_regex,none,
146+
ERROR(lex_regex_literal_invalid_starting_char,none,
147+
"regex literal may not start with %0; add backslash to escape",
148+
(StringRef))
149+
ERROR(lex_regex_literal_unterminated,none,
147150
"unterminated regex literal", ())
148151

149152
ERROR(lex_invalid_unicode_scalar,none,

include/swift/Parse/Lexer.h

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,16 @@ enum class LexerMode {
6161
SIL
6262
};
6363

64+
/// Whether or not the lexer should attempt to lex a `/.../` regex literal.
65+
enum class LexerForwardSlashRegexMode {
66+
/// No `/.../` regex literals will be lexed.
67+
None,
68+
/// A `/.../` regex literal will be lexed, but only if successful.
69+
Tentative,
70+
/// A `/.../` regex literal will always be lexed for a '/' character.
71+
Always
72+
};
73+
6474
/// Kinds of conflict marker which the lexer might encounter.
6575
enum class ConflictMarkerKind {
6676
/// A normal or diff3 conflict marker, initiated by at least 7 "<"s,
@@ -112,6 +122,10 @@ class Lexer {
112122
/// a .sil file.
113123
const LexerMode LexMode;
114124

125+
/// Whether or not a `/.../` literal will be lexed.
126+
LexerForwardSlashRegexMode ForwardSlashRegexMode =
127+
LexerForwardSlashRegexMode::None;
128+
115129
/// True if we should skip past a `#!` line at the start of the file.
116130
const bool IsHashbangAllowed;
117131

@@ -551,6 +565,11 @@ class Lexer {
551565
void operator=(const SILBodyRAII&) = delete;
552566
};
553567

568+
/// Attempt to re-lex a regex literal with forward slashes `/.../` from a
569+
/// given lexing state. If \p mustBeRegex is set to true, a regex literal will
570+
/// always be lexed. Otherwise, it will not be lexed if it may be ambiguous.
571+
void tryLexForwardSlashRegexLiteralFrom(State S, bool mustBeRegex);
572+
554573
private:
555574
/// Nul character meaning kind.
556575
enum class NulCharacterKind {
@@ -615,8 +634,8 @@ class Lexer {
615634
void lexStringLiteral(unsigned CustomDelimiterLen = 0);
616635
void lexEscapedIdentifier();
617636

618-
/// Attempt to lex a regex literal, returning true if a regex literal was
619-
/// lexed, false if this is not a regex literal.
637+
/// Attempt to lex a regex literal, returning true if lexing should continue,
638+
/// false if this is not a regex literal.
620639
bool tryLexRegexLiteral(const char *TokStart);
621640

622641
void tryLexEditorPlaceholder();

include/swift/Parse/Parser.h

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -559,6 +559,11 @@ class Parser {
559559
return f(backtrackScope);
560560
}
561561

562+
/// Discard the current token. This will avoid interface hashing or updating
563+
/// the previous loc. Should only be used if you've completely re-lexed
564+
/// a different token at that position.
565+
SourceLoc discardToken();
566+
562567
/// Consume a token that we created on the fly to correct the original token
563568
/// stream from lexer.
564569
void consumeExtraToken(Token K);
@@ -1752,8 +1757,17 @@ class Parser {
17521757
ParserResult<Expr>
17531758
parseExprPoundCodeCompletion(Optional<StmtKind> ParentKind);
17541759

1760+
UnresolvedDeclRefExpr *makeExprOperator(const Token &opToken);
17551761
UnresolvedDeclRefExpr *parseExprOperator();
17561762

1763+
/// Try to re-lex a '/' operator character as a regex literal. This should be
1764+
/// called when parsing in an expression position to ensure a regex literal is
1765+
/// correctly parsed.
1766+
///
1767+
/// If \p mustBeRegex is set to true, a regex literal will always be lexed if
1768+
/// enabled. Otherwise, it will not be lexed if it may be ambiguous.
1769+
void tryLexRegexLiteral(bool mustBeRegex);
1770+
17571771
void validateCollectionElement(ParserResult<Expr> element);
17581772

17591773
//===--------------------------------------------------------------------===//

lib/Parse/Lexer.cpp

Lines changed: 108 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1969,22 +1969,109 @@ bool Lexer::tryLexRegexLiteral(const char *TokStart) {
19691969
if (!LangOpts.EnableExperimentalStringProcessing || !regexLiteralLexingFn)
19701970
return false;
19711971

1972-
// Ask libswift to try and lex a regex literal.
1972+
bool MustBeRegex = true;
1973+
bool IsForwardSlash = (*TokStart == '/');
1974+
1975+
// Check if we're able to lex a `/.../` regex.
1976+
if (IsForwardSlash) {
1977+
switch (ForwardSlashRegexMode) {
1978+
case LexerForwardSlashRegexMode::None:
1979+
return false;
1980+
case LexerForwardSlashRegexMode::Tentative:
1981+
MustBeRegex = false;
1982+
break;
1983+
case LexerForwardSlashRegexMode::Always:
1984+
break;
1985+
}
1986+
1987+
// For `/.../` regex literals, we need to ban space and tab at the start of
1988+
// a regex to avoid ambiguity with operator chains, e.g:
1989+
//
1990+
// Builder {
1991+
// 0
1992+
// / 1 /
1993+
// 2
1994+
// }
1995+
//
1996+
// This takes advantage of the consistent operator spacing rule. We also
1997+
// need to ban ')' to avoid ambiguity with unapplied operator references e.g
1998+
// `reduce(1, /)`. This would be invalid regex syntax anyway. Note this
1999+
// doesn't totally save us from e.g `foo(/, 0)`, but it should at least
2000+
// help, and it ensures users can always surround their operator ref in
2001+
// parens `(/)` to fix the issue.
2002+
// TODO: This heuristic should be sunk into the Swift library once we have a
2003+
// way of doing fix-its from there.
2004+
auto *RegexContentStart = TokStart + 1;
2005+
switch (*RegexContentStart) {
2006+
case ')': {
2007+
if (!MustBeRegex)
2008+
return false;
2009+
2010+
// ')' is invalid anyway, so we can let the parser diagnose it.
2011+
break;
2012+
}
2013+
case ' ':
2014+
case '\t': {
2015+
if (!MustBeRegex)
2016+
return false;
2017+
2018+
// We must have a regex, so emit an error for space and tab.
2019+
StringRef DiagChar;
2020+
switch (*RegexContentStart) {
2021+
case ' ':
2022+
DiagChar = "space";
2023+
break;
2024+
case '\t':
2025+
DiagChar = "tab";
2026+
break;
2027+
default:
2028+
llvm_unreachable("Unhandled case");
2029+
}
2030+
diagnose(RegexContentStart, diag::lex_regex_literal_invalid_starting_char,
2031+
DiagChar)
2032+
.fixItInsert(getSourceLoc(RegexContentStart), "\\");
2033+
break;
2034+
}
2035+
default:
2036+
break;
2037+
}
2038+
}
2039+
2040+
// Ask the Swift library to try and lex a regex literal.
19732041
// - Ptr will not be advanced if this is not for a regex literal.
19742042
// - ErrStr will be set if there is any error to emit.
19752043
// - CompletelyErroneous will be set if there was an error that cannot be
19762044
// recovered from.
19772045
auto *Ptr = TokStart;
19782046
const char *ErrStr = nullptr;
19792047
bool CompletelyErroneous = regexLiteralLexingFn(&Ptr, BufferEnd, &ErrStr);
1980-
if (ErrStr)
1981-
diagnose(TokStart, diag::regex_literal_parsing_error, ErrStr);
19822048

19832049
// If we didn't make any lexing progress, this isn't a regex literal and we
19842050
// should fallback to lexing as something else.
19852051
if (Ptr == TokStart)
19862052
return false;
19872053

2054+
if (ErrStr) {
2055+
if (!MustBeRegex)
2056+
return false;
2057+
2058+
diagnose(TokStart, diag::regex_literal_parsing_error, ErrStr);
2059+
}
2060+
2061+
// If we're lexing `/.../`, error if we ended on the opening of a comment.
2062+
// We prefer to lex the comment, since more likely than not that is what
2063+
// the user is expecting.
2064+
// TODO: This should be sunk into the Swift library.
2065+
if (IsForwardSlash && Ptr[-1] == '/' && (*Ptr == '*' || *Ptr == '/')) {
2066+
if (!MustBeRegex)
2067+
return false;
2068+
2069+
diagnose(TokStart, diag::lex_regex_literal_unterminated);
2070+
2071+
// Move the pointer back to the '/' of the comment.
2072+
Ptr--;
2073+
}
2074+
19882075
// Update to point to where we ended regex lexing.
19892076
assert(Ptr > TokStart && Ptr <= BufferEnd);
19902077
CurPtr = Ptr;
@@ -1996,12 +2083,23 @@ bool Lexer::tryLexRegexLiteral(const char *TokStart) {
19962083
return true;
19972084
}
19982085

1999-
// Otherwise, we either had a successful lex, or something that was
2000-
// recoverable.
2086+
// We either had a successful lex, or something that was recoverable.
20012087
formToken(tok::regex_literal, TokStart);
20022088
return true;
20032089
}
20042090

2091+
void Lexer::tryLexForwardSlashRegexLiteralFrom(State S, bool mustBeRegex) {
2092+
if (!LangOpts.EnableBareSlashRegexLiterals)
2093+
return;
2094+
2095+
// Try to re-lex with forward-slash regex lexing enabled.
2096+
llvm::SaveAndRestore<LexerForwardSlashRegexMode> RegexLexingScope(
2097+
ForwardSlashRegexMode, mustBeRegex
2098+
? LexerForwardSlashRegexMode::Always
2099+
: LexerForwardSlashRegexMode::Tentative);
2100+
restoreState(S, /*enableDiagnostics*/ true);
2101+
}
2102+
20052103
/// lexEscapedIdentifier:
20062104
/// identifier ::= '`' identifier '`'
20072105
///
@@ -2483,8 +2581,7 @@ void Lexer::lexImpl() {
24832581
if (unsigned CustomDelimiterLen = advanceIfCustomDelimiter(CurPtr, Diags))
24842582
return lexStringLiteral(CustomDelimiterLen);
24852583

2486-
// If we have experimental string processing enabled, try lex a regex
2487-
// literal.
2584+
// Try to lex a regex literal.
24882585
if (tryLexRegexLiteral(TokStart))
24892586
return;
24902587

@@ -2505,6 +2602,10 @@ void Lexer::lexImpl() {
25052602
"Non token comment should be eaten by lexTrivia as LeadingTrivia");
25062603
return formToken(tok::comment, TokStart);
25072604
}
2605+
// Try to lex a regex literal.
2606+
if (tryLexRegexLiteral(TokStart))
2607+
return;
2608+
25082609
return lexOperatorIdentifier();
25092610
case '%':
25102611
// Lex %[0-9a-zA-Z_]+ as a local SIL value

lib/Parse/ParseExpr.cpp

Lines changed: 66 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -511,6 +511,10 @@ ParserResult<Expr> Parser::parseExprSequenceElement(Diag<> message,
511511
ParserResult<Expr> Parser::parseExprUnary(Diag<> Message, bool isExprBasic) {
512512
SyntaxParsingContext UnaryContext(SyntaxContext, SyntaxContextKind::Expr);
513513
UnresolvedDeclRefExpr *Operator;
514+
515+
// First check to see if we have the start of a regex literal `/.../`.
516+
tryLexRegexLiteral(/*mustBeRegex*/ true);
517+
514518
switch (Tok.getKind()) {
515519
default:
516520
// If the next token is not an operator, just parse this as expr-postfix.
@@ -532,16 +536,32 @@ ParserResult<Expr> Parser::parseExprUnary(Diag<> Message, bool isExprBasic) {
532536
case tok::backslash:
533537
return parseExprKeyPath();
534538

535-
case tok::oper_postfix:
539+
case tok::oper_postfix: {
536540
// Postfix operators cannot start a subexpression, but can happen
537541
// syntactically because the operator may just follow whatever precedes this
538542
// expression (and that may not always be an expression).
539543
diagnose(Tok, diag::invalid_postfix_operator);
540544
Tok.setKind(tok::oper_prefix);
541-
LLVM_FALLTHROUGH;
542-
case tok::oper_prefix:
543545
Operator = parseExprOperator();
544546
break;
547+
}
548+
case tok::oper_prefix: {
549+
// Check to see if we can split a prefix operator containing `/`, e.g `!/`,
550+
// which might be a prefix operator on a regex literal.
551+
if (Context.LangOpts.EnableBareSlashRegexLiterals) {
552+
auto slashIdx = Tok.getText().find("/");
553+
if (slashIdx != StringRef::npos) {
554+
auto prefix = Tok.getText().take_front(slashIdx);
555+
if (!prefix.empty()) {
556+
Operator = makeExprOperator({Tok.getKind(), prefix});
557+
consumeStartingCharacterOfCurrentToken(Tok.getKind(), prefix.size());
558+
break;
559+
}
560+
}
561+
}
562+
Operator = parseExprOperator();
563+
break;
564+
}
545565
case tok::oper_binary_spaced:
546566
case tok::oper_binary_unspaced: {
547567
// For recovery purposes, accept an oper_binary here.
@@ -860,19 +880,52 @@ static DeclRefKind getDeclRefKindForOperator(tok kind) {
860880
}
861881
}
862882

863-
/// parseExprOperator - Parse an operator reference expression. These
864-
/// are not "proper" expressions; they can only appear in binary/unary
865-
/// operators.
866-
UnresolvedDeclRefExpr *Parser::parseExprOperator() {
883+
UnresolvedDeclRefExpr *Parser::makeExprOperator(const Token &Tok) {
867884
assert(Tok.isAnyOperator());
868885
DeclRefKind refKind = getDeclRefKindForOperator(Tok.getKind());
869886
SourceLoc loc = Tok.getLoc();
870887
DeclNameRef name(Context.getIdentifier(Tok.getText()));
871-
consumeToken();
872888
// Bypass local lookup.
873889
return new (Context) UnresolvedDeclRefExpr(name, refKind, DeclNameLoc(loc));
874890
}
875891

892+
/// parseExprOperator - Parse an operator reference expression. These
893+
/// are not "proper" expressions; they can only appear in binary/unary
894+
/// operators.
895+
UnresolvedDeclRefExpr *Parser::parseExprOperator() {
896+
auto *op = makeExprOperator(Tok);
897+
consumeToken();
898+
return op;
899+
}
900+
901+
void Parser::tryLexRegexLiteral(bool mustBeRegex) {
902+
if (!Context.LangOpts.EnableBareSlashRegexLiterals)
903+
return;
904+
905+
// Check to see if we have the start of a regex literal `/.../`.
906+
switch (Tok.getKind()) {
907+
case tok::oper_prefix:
908+
case tok::oper_binary_spaced:
909+
case tok::oper_binary_unspaced: {
910+
if (!Tok.getText().startswith("/"))
911+
break;
912+
913+
// Try to re-lex as a `/.../` regex literal.
914+
auto state = getParserPosition().LS;
915+
L->tryLexForwardSlashRegexLiteralFrom(state, mustBeRegex);
916+
917+
// Discard the current token, which will be replaced by the re-lexed token,
918+
// which may or may not be a regex literal token.
919+
discardToken();
920+
921+
assert(Tok.getText().startswith("/"));
922+
break;
923+
}
924+
default:
925+
break;
926+
}
927+
}
928+
876929
/// parseExprSuper
877930
///
878931
/// expr-super:
@@ -3160,6 +3213,11 @@ ParserStatus Parser::parseExprList(tok leftTok, tok rightTok,
31603213
SourceLoc FieldNameLoc;
31613214
parseOptionalArgumentLabel(FieldName, FieldNameLoc);
31623215

3216+
// First check to see if we have the start of a regex literal `/.../`. We
3217+
// need to do this before handling unapplied operator references, as e.g
3218+
// `(/, /)` might be a regex literal.
3219+
tryLexRegexLiteral(/*mustBeRegex*/ false);
3220+
31633221
// See if we have an operator decl ref '(<op>)'. The operator token in
31643222
// this case lexes as a binary operator because it neither leads nor
31653223
// follows a proper subexpression.

lib/Parse/Parser.cpp

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -579,13 +579,16 @@ const Token &Parser::peekToken() {
579579
return L->peekNextToken();
580580
}
581581

582-
SourceLoc Parser::consumeTokenWithoutFeedingReceiver() {
583-
SourceLoc Loc = Tok.getLoc();
582+
SourceLoc Parser::discardToken() {
584583
assert(Tok.isNot(tok::eof) && "Lexing past eof!");
584+
SourceLoc Loc = Tok.getLoc();
585+
L->lex(Tok, LeadingTrivia, TrailingTrivia);
586+
return Loc;
587+
}
585588

589+
SourceLoc Parser::consumeTokenWithoutFeedingReceiver() {
586590
recordTokenHash(Tok);
587-
588-
L->lex(Tok, LeadingTrivia, TrailingTrivia);
591+
auto Loc = discardToken();
589592
PreviousLoc = Loc;
590593
return Loc;
591594
}

test/IDE/complete_regex.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
// RUN: %target-swift-ide-test -enable-bare-slash-regex -batch-code-completion -source-filename %s -filecheck %raw-FileCheck -completion-output-dir %t
55

66
func testLiteral() {
7-
#/foo/#.#^RE_LITERAL_MEMBER^#
7+
/foo/.#^RE_LITERAL_MEMBER^#
88
// RE_LITERAL_MEMBER: Begin completions
99
// RE_LITERAL_MEMBER-DAG: Keyword[self]/CurrNominal: self[#Regex<Substring>#];
1010
// RE_LITERAL_MEMBER: End completions

0 commit comments

Comments
 (0)