Skip to content

Commit 128f5d4

Browse files
committed
Update regex literal lexing and emission
Update the lexing implementation to defer to the regex library, which will pass back the pointer from which to resume lexing, and update the emission to call the new `Regex(_regexString:version:)` overload, which accepts the regex string with delimiters. Because this uses the library's lexing implementation, the delimiters are now `'/.../'` and `'|...|'` instead of plain `'...'`.
1 parent 300cbab commit 128f5d4

File tree

18 files changed

+174
-111
lines changed

18 files changed

+174
-111
lines changed

include/swift/AST/ASTContext.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -621,7 +621,7 @@ class ASTContext final {
621621
KnownProtocolKind builtinProtocol,
622622
llvm::function_ref<DeclName (ASTContext &ctx)> initName) const;
623623

624-
/// Retrieve _StringProcessing.Regex.init(_regexString: String).
624+
/// Retrieve _StringProcessing.Regex.init(_regexString: String, version: Int).
625625
ConcreteDeclRef getRegexInitDecl(Type regexType) const;
626626

627627
/// Retrieve the declaration of Swift.<(Int, Int) -> Bool.

include/swift/AST/Expr.h

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -966,18 +966,23 @@ class InterpolatedStringLiteralExpr : public LiteralExpr {
966966
class RegexLiteralExpr : public LiteralExpr {
967967
SourceLoc Loc;
968968
StringRef RegexText;
969+
unsigned Version;
969970

970-
RegexLiteralExpr(SourceLoc loc, StringRef regexText, bool isImplicit)
971+
RegexLiteralExpr(SourceLoc loc, StringRef regexText, unsigned version,
972+
bool isImplicit)
971973
: LiteralExpr(ExprKind::RegexLiteral, isImplicit), Loc(loc),
972-
RegexText(regexText) {}
974+
RegexText(regexText), Version(version) {}
973975

974976
public:
975977
static RegexLiteralExpr *createParsed(ASTContext &ctx, SourceLoc loc,
976-
StringRef regexText);
978+
StringRef regexText, unsigned version);
977979

978980
/// Retrieve the raw regex text.
979981
StringRef getRegexText() const { return RegexText; }
980982

983+
/// Retrieve the version of the regex string.
984+
unsigned getVersion() const { return Version; }
985+
981986
SourceRange getSourceRange() const { return Loc; }
982987

983988
static bool classof(const Expr *E) {

include/swift/AST/KnownIdentifiers.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -252,6 +252,7 @@ IDENTIFIER(zero)
252252
// String processing
253253
IDENTIFIER(Regex)
254254
IDENTIFIER_(regexString)
255+
IDENTIFIER(version)
255256
IDENTIFIER_(StringProcessing)
256257

257258
// Distributed actors

include/swift/Parse/ExperimentalRegexBridging.h

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,43 @@
55
extern "C" {
66
#endif
77

8-
typedef const char *(* ParseRegexStrawperson)(const char *);
9-
10-
void Parser_registerParseRegexStrawperson(ParseRegexStrawperson fn);
11-
bool Parser_hasParseRegexStrawperson();
8+
/// Attempt to lex a regex literal string. Takes the following arguments:
9+
///
10+
/// - CurPtrPtr: A pointer to the current pointer of the lexer, which should be the
11+
/// start of the literal. This will be advanced to the point at
12+
/// which the lexer should resume, or will remain the same if this
13+
/// is not a regex literal.
14+
/// - BufferEnd: A pointer to the end of the buffer, which should not be lexed
15+
/// past.
16+
/// - ErrorOut: If an error is encountered, this will be set to the error
17+
/// string.
18+
///
19+
/// Returns: True if lexing was completely erroneous and cannot be recovered
20+
/// from, or false if there was either no error or a recoverable
21+
/// error.
22+
typedef bool(* RegexLiteralLexingFn)(/*CurPtrPtr*/ const char **,
23+
/*BufferEnd*/ const char *,
24+
/*ErrorOut*/ const char **);
25+
void Parser_registerRegexLiteralLexingFn(RegexLiteralLexingFn fn);
26+
27+
/// Parse a regex literal string. Takes the following arguments:
28+
///
29+
/// - InputPtr: A null-terminated C string of the regex literal.
30+
/// - ErrorOut: A buffer accepting an error string upon error.
31+
/// - VersionOut: A buffer accepting a regex literal format version.
32+
/// - CaptureStructureOut: A buffer accepting a byte sequence representing the
33+
/// capture structure of the literal.
34+
/// - CaptureStructureSize: The size of the capture structure buffer. Must be
35+
/// greater than or equal to `strlen(InputPtr)`.
36+
typedef void(* RegexLiteralParsingFn)(/*InputPtr*/ const char *,
37+
/*ErrorOut*/ const char **,
38+
/*VersionOut*/ unsigned *,
39+
/*CaptureStructureOut*/ char *,
40+
/*CaptureStructureSize*/ unsigned);
41+
void Parser_registerRegexLiteralParsingFn(RegexLiteralParsingFn fn);
1242

1343
#ifdef __cplusplus
1444
} // extern "C"
1545
#endif
1646

1747
#endif // EXPERIMENTAL_REGEX_BRIDGING
18-
19-
20-
//const char* experimental_regex_strawperson(const char *in);
21-

include/swift/Parse/Lexer.h

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -595,7 +595,9 @@ class Lexer {
595595
void lexStringLiteral(unsigned CustomDelimiterLen = 0);
596596
void lexEscapedIdentifier();
597597

598-
void lexRegexLiteral(const char *TokStart);
598+
/// Attempt to lex a regex literal, returning true if a regex literal was
599+
/// lexed, false if this is not a regex literal.
600+
bool tryLexRegexLiteral(const char *TokStart);
599601

600602
void tryLexEditorPlaceholder();
601603
const char *findEndOfCurlyQuoteStringLiteral(const char *,
@@ -612,9 +614,6 @@ class Lexer {
612614

613615
/// Emit diagnostics for single-quote string and suggest replacement
614616
/// with double-quoted equivalent.
615-
///
616-
/// Or, if we're in strawperson mode, we will emit a custom
617-
/// error message instead, determined by the Swift library.
618617
void diagnoseSingleQuoteStringLiteral(const char *TokStart,
619618
const char *TokEnd);
620619

lib/AST/ASTContext.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1221,7 +1221,7 @@ ConcreteDeclRef ASTContext::getRegexInitDecl(Type regexType) const {
12211221
auto *spModule = getLoadedModule(Id_StringProcessing);
12221222
DeclName name(*const_cast<ASTContext *>(this),
12231223
DeclBaseName::createConstructor(),
1224-
{Id_regexString});
1224+
{Id_regexString, Id_version});
12251225
SmallVector<ValueDecl *, 1> results;
12261226
spModule->lookupQualified(getRegexType(), DeclNameRef(name),
12271227
NL_IncludeUsableFromInline, results);

lib/AST/Expr.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2246,8 +2246,9 @@ SourceLoc TapExpr::getEndLoc() const {
22462246

22472247
RegexLiteralExpr *
22482248
RegexLiteralExpr::createParsed(ASTContext &ctx, SourceLoc loc,
2249-
StringRef regexText) {
2250-
return new (ctx) RegexLiteralExpr(loc, regexText, /*implicit*/ false);
2249+
StringRef regexText, unsigned version) {
2250+
return new (ctx) RegexLiteralExpr(loc, regexText, version,
2251+
/*implicit*/ false);
22512252
}
22522253

22532254
void swift::simple_display(llvm::raw_ostream &out, const ClosureExpr *CE) {

lib/Parse/Lexer.cpp

Lines changed: 46 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,13 @@
3333

3434
#include <limits>
3535

36+
// Regex lexing delivered via libSwift.
37+
#include "swift/Parse/ExperimentalRegexBridging.h"
38+
static RegexLiteralLexingFn regexLiteralLexingFn = nullptr;
39+
void Parser_registerRegexLiteralLexingFn(RegexLiteralLexingFn fn) {
40+
regexLiteralLexingFn = fn;
41+
}
42+
3643
using namespace swift;
3744
using namespace swift::syntax;
3845

@@ -1951,36 +1958,46 @@ const char *Lexer::findEndOfCurlyQuoteStringLiteral(const char *Body,
19511958
}
19521959
}
19531960

1954-
void Lexer::lexRegexLiteral(const char *TokStart) {
1961+
bool Lexer::tryLexRegexLiteral(const char *TokStart) {
19551962
assert(*TokStart == '\'');
19561963

1957-
bool HadError = false;
1958-
while (true) {
1959-
// Check if we reached the end of the literal without terminating.
1960-
if (CurPtr >= BufferEnd || *CurPtr == '\n' || *CurPtr == '\r') {
1961-
diagnose(TokStart, diag::lex_unterminated_regex);
1962-
return formToken(tok::unknown, TokStart);
1963-
}
1964+
// We need to have experimental string processing enabled, and have the
1965+
// parsing logic for regex literals available.
1966+
if (!LangOpts.EnableExperimentalStringProcessing || !regexLiteralLexingFn)
1967+
return false;
19641968

1965-
const auto *CharStart = CurPtr;
1966-
uint32_t CharValue = validateUTF8CharacterAndAdvance(CurPtr, BufferEnd);
1967-
if (CharValue == ~0U) {
1968-
diagnose(CharStart, diag::lex_invalid_utf8);
1969-
HadError = true;
1970-
continue;
1971-
}
1972-
if (CharValue == '\\' && (*CurPtr == '\'' || *CurPtr == '\\')) {
1973-
// Skip escaped delimiter or \.
1974-
CurPtr++;
1975-
} else if (CharValue == '\'') {
1976-
// End of literal, stop.
1977-
break;
1978-
}
1969+
// Ask libswift to try and lex a regex literal.
1970+
// - Ptr will not be advanced if this is not for a regex literal.
1971+
// - ErrStr will be set if there is any error to emit.
1972+
// - CompletelyErroneous will be set if there was an error that cannot be
1973+
// recovered from.
1974+
auto *Ptr = TokStart;
1975+
const char *ErrStr = nullptr;
1976+
bool CompletelyErroneous = regexLiteralLexingFn(&Ptr, BufferEnd, &ErrStr);
1977+
if (ErrStr)
1978+
diagnose(TokStart, diag::regex_literal_parsing_error, ErrStr);
1979+
1980+
// If we didn't make any lexing progress, this isn't a regex literal and we
1981+
// should fallback to lexing as something else.
1982+
if (Ptr == TokStart)
1983+
return false;
1984+
1985+
// Update to point to where we ended regex lexing.
1986+
assert(Ptr > TokStart && Ptr <= BufferEnd);
1987+
CurPtr = Ptr;
1988+
1989+
// If the lexing was completely erroneous, form an unknown token.
1990+
if (CompletelyErroneous) {
1991+
assert(ErrStr);
1992+
formToken(tok::unknown, TokStart);
1993+
return true;
19791994
}
1980-
if (HadError)
1981-
return formToken(tok::unknown, TokStart);
19821995

1996+
// Otherwise, we either had a successful lex, or something that was
1997+
// recoverable.
1998+
assert(ErrStr || CurPtr[-1] == '\'');
19831999
formToken(tok::regex_literal, TokStart);
2000+
return true;
19842001
}
19852002

19862003
/// lexEscapedIdentifier:
@@ -2528,11 +2545,11 @@ void Lexer::lexImpl() {
25282545

25292546
case '\'':
25302547
// If we have experimental string processing enabled, and have the parsing
2531-
// logic for regex literals, lex a single quoted string as a regex literal.
2532-
if (LangOpts.EnableExperimentalStringProcessing &&
2533-
Parser_hasParseRegexStrawperson()) {
2534-
return lexRegexLiteral(TokStart);
2535-
}
2548+
// logic for regex literals, try to lex a single quoted string as a regex
2549+
// literal.
2550+
if (tryLexRegexLiteral(TokStart))
2551+
return;
2552+
25362553
// Otherwise lex as a string literal and emit a diagnostic.
25372554
LLVM_FALLTHROUGH;
25382555
case '"':

lib/Parse/ParseRegex.cpp

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -22,38 +22,36 @@
2222

2323
// Regex parser delivered via libSwift
2424
#include "swift/Parse/ExperimentalRegexBridging.h"
25-
static ParseRegexStrawperson parseRegexStrawperson = nullptr;
26-
void Parser_registerParseRegexStrawperson(ParseRegexStrawperson fn) {
27-
parseRegexStrawperson = fn;
28-
}
29-
// Exposes the presence of the regex parsing function to the lexer.
30-
bool Parser_hasParseRegexStrawperson() {
31-
return parseRegexStrawperson != nullptr;
25+
static RegexLiteralParsingFn regexLiteralParsingFn = nullptr;
26+
void Parser_registerRegexLiteralParsingFn(RegexLiteralParsingFn fn) {
27+
regexLiteralParsingFn = fn;
3228
}
3329

3430
using namespace swift;
3531
using namespace swift::syntax;
3632

3733
ParserResult<Expr> Parser::parseExprRegexLiteral() {
3834
assert(Tok.is(tok::regex_literal));
39-
assert(parseRegexStrawperson);
35+
assert(regexLiteralParsingFn);
4036

4137
SyntaxParsingContext LocalContext(SyntaxContext,
4238
SyntaxKind::RegexLiteralExpr);
43-
// Strip off delimiters.
44-
auto rawText = Tok.getText();
45-
assert(rawText.front() == '\'' && rawText.back() == '\'');
46-
auto regexText = rawText.slice(1, rawText.size() - 1);
39+
40+
auto regexText = Tok.getText();
4741

4842
// Let the Swift library parse the contents, returning an error, or null if
4943
// successful.
5044
// TODO: We need to be able to pass back a source location to emit the error
5145
// at.
52-
auto *errorStr = parseRegexStrawperson(regexText.str().c_str());
46+
const char *errorStr = nullptr;
47+
unsigned version;
48+
regexLiteralParsingFn(regexText.str().c_str(), &errorStr, &version,
49+
/*captureStructureOut*/ nullptr,
50+
/*captureStructureSize*/ 0);
5351
if (errorStr)
5452
diagnose(Tok, diag::regex_literal_parsing_error, errorStr);
5553

5654
auto loc = consumeToken();
5755
return makeParserResult(
58-
RegexLiteralExpr::createParsed(Context, loc, regexText));
56+
RegexLiteralExpr::createParsed(Context, loc, regexText, version));
5957
}

lib/SILGen/SILGenApply.cpp

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1910,10 +1910,31 @@ buildBuiltinLiteralArgs(SILGenFunction &SGF, SGFContext C,
19101910
RValue string = SGF.emitApplyAllocatingInitializer(
19111911
expr, strInitDecl, std::move(strLiteralArgs),
19121912
/*overriddenSelfType*/ Type(), SGFContext());
1913-
PreparedArguments args(
1914-
ArrayRef<AnyFunctionType::Param>({
1915-
AnyFunctionType::Param(ctx.getStringType())}));
1913+
1914+
// The version of the regex string.
1915+
// %3 = integer_literal $Builtin.IntLiteral <version>
1916+
auto versionIntLiteral =
1917+
ManagedValue::forUnmanaged(SGF.B.createIntegerLiteral(
1918+
expr, SILType::getBuiltinIntegerLiteralType(SGF.getASTContext()),
1919+
expr->getVersion()));
1920+
1921+
using Param = AnyFunctionType::Param;
1922+
auto builtinIntTy = versionIntLiteral.getType().getASTType();
1923+
PreparedArguments versionIntBuiltinArgs(ArrayRef<Param>{Param(builtinIntTy)});
1924+
versionIntBuiltinArgs.add(
1925+
expr, RValue(SGF, {versionIntLiteral}, builtinIntTy));
1926+
1927+
// %4 = function_ref Int.init(_builtinIntegerLiteral: Builtin.IntLiteral)
1928+
// %5 = apply %4(%3, ...) -> $Int
1929+
auto intLiteralInit = ctx.getIntBuiltinInitDecl(ctx.getIntDecl());
1930+
RValue versionInt = SGF.emitApplyAllocatingInitializer(
1931+
expr, intLiteralInit, std::move(versionIntBuiltinArgs),
1932+
/*overriddenSelfType*/ Type(), SGFContext());
1933+
1934+
PreparedArguments args(ArrayRef<Param>{Param(ctx.getStringType()),
1935+
Param(ctx.getIntType())});
19161936
args.add(expr, std::move(string));
1937+
args.add(expr, std::move(versionInt));
19171938
return args;
19181939
}
19191940

Lines changed: 2 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,30 +1,6 @@
11
import ExperimentalRegexBridging
22

3-
public func experimental_regex_strawperson(
4-
_ ptr: UnsafePointer<CChar>?
5-
) -> UnsafePointer<CChar>? {
6-
guard let s = ptr else { return nil }
7-
8-
func makeError(_ str: String) -> UnsafePointer<CChar>? {
9-
let count = str.utf8.count + 1
10-
return str.withCString {
11-
assert($0[count-1] == 0)
12-
let ptr = UnsafeMutablePointer<CChar>.allocate(capacity: count)
13-
ptr.initialize(from: $0, count: count)
14-
return UnsafePointer(ptr)
15-
}
16-
}
17-
18-
let str = String(cString: s)
19-
do {
20-
let _ = try parse(str, .traditional)
21-
return nil
22-
} catch {
23-
return makeError(
24-
"cannot parse regular expression: \(String(describing: error))")
25-
}
26-
}
27-
283
public func registerRegexParser() {
29-
Parser_registerParseRegexStrawperson({ experimental_regex_strawperson($0) })
4+
Parser_registerRegexLiteralParsingFn(libswiftParseRegexLiteral)
5+
Parser_registerRegexLiteralLexingFn(libswiftLexRegexLiteral)
306
}
Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
// RUN: %target-typecheck-verify-swift -enable-experimental-string-processing
22
// REQUIRES: libswift
33

4-
_ = 'abc'
4+
_ = '/abc/'
55

6-
_ = ('[*', '+]', '.]')
6+
_ = ('/[*/', '/+]/', '/.]/')
77
// expected-error@-1 {{cannot parse regular expression}}
88

9-
_ = '\w+'
10-
_ = '\'\\'
9+
_ = '/\w+/'
10+
_ = '/\'\\/'
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
// RUN: %target-typecheck-verify-swift -enable-experimental-string-processing
2+
// REQUIRES: libswift
3+
4+
// Note there is purposefully no trailing newline here.
5+
// expected-error@+1 {{unterminated regex literal}}
6+
var unterminated = '/xy
Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
// RUN: %target-typecheck-verify-swift -enable-experimental-string-processing
22
// REQUIRES: libswift
33

4-
let s = '\\'' // expected-error {{unterminated regex literal}}
4+
let s = '/\\/''/ // expected-error {{unterminated regex literal}}
55

66
// expected-error@+1 {{unterminated regex literal}}
7-
var unterminated = 'xy
7+
var unterminated = '/xy

0 commit comments

Comments
 (0)