Skip to content

[5.7][SwiftSyntax] Parse regex literals using a fallback lexer implemented in C++ #58998

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions include/swift/Parse/SyntaxRegexFallbackLexing.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
//===--- SyntaxRegexFallbackLexing.h --------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

#ifndef SWIFT_PARSE_SYNTAXREGEXFALLBACKLEXING_H
#define SWIFT_PARSE_SYNTAXREGEXFALLBACKLEXING_H

namespace swift {
/// Registers fallback regex-literal lexing and parsing functions that are
/// implemented in C++.
///
/// SwiftSyntax parsing currently doesn't link against
/// SwiftExperimentalStringProcessing and is thus missing the regex lexing
/// functions defined in it. The fallback registered here is sufficient to
/// generate a valid SwiftSyntax tree, but it accepts all regex literals
/// without validating their contents and is therefore not suited for normal
/// compilation.
void registerSyntaxFallbackRegexParser();
} // end namespace swift

#endif // SWIFT_PARSE_SYNTAXREGEXFALLBACKLEXING_H
3 changes: 2 additions & 1 deletion lib/Parse/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ add_swift_host_library(swiftParse STATIC
ParseType.cpp
PersistentParserState.cpp
SyntaxParsingCache.cpp
SyntaxParsingContext.cpp)
SyntaxParsingContext.cpp
SyntaxRegexFallbackLexing.cpp)
_swift_gyb_target_sources(swiftParse PRIVATE
ParsedSyntaxBuilders.cpp.gyb
ParsedSyntaxNodes.cpp.gyb
Expand Down
153 changes: 153 additions & 0 deletions lib/Parse/SyntaxRegexFallbackLexing.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
//===--- SyntaxRegexFallbackLexing.cpp ------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

#include "swift/Parse/SyntaxRegexFallbackLexing.h"
#include "swift/AST/DiagnosticEngine.h"
#include "swift/AST/DiagnosticsParse.h"
#include "swift/Parse/Lexer.h"
#include "swift/Parse/RegexParserBridging.h"
#include <mutex>

using namespace swift;

/// Emits \p DiagID at the source location corresponding to \p ptr, forwarding
/// \p Args as the diagnostic arguments. Does nothing when \p bridgedDiag does
/// not carry a diagnostic engine.
template <typename... DiagArgTypes, typename... ArgTypes>
static void diagnose(BridgedOptionalDiagnosticEngine bridgedDiag,
                     const char *ptr, Diag<DiagArgTypes...> DiagID,
                     ArgTypes &&...Args) {
  auto *engine = static_cast<DiagnosticEngine *>(bridgedDiag.object);
  // The engine is optional; without one there is nowhere to report to.
  if (!engine)
    return;

  engine->diagnose(SourceLoc(llvm::SMLoc::getFromPointer(ptr)), DiagID,
                   std::forward<ArgTypes>(Args)...);
}

/// Fallback lexer for regex literals that only locates the end of the literal;
/// it does not interpret the regex contents.
///
/// \param InputPtr In: start of the candidate literal (zero or more '#'
///   followed by '/'). Out: where the caller should resume. It is left
///   untouched when the input is not a regex literal, set just past the
///   closing delimiter on success, and set to a recovery position when the
///   literal is unterminated.
/// \param BufferEnd End of the buffer being lexed. A '\0' found exactly at this
///   address is treated as end of file.
/// \param MustBeRegex Not read by this implementation.
/// \param BridgedDiagEngine Optional diagnostic engine used to report
///   unterminated literals and invalid UTF-8.
/// \returns true if the input does not start a regex literal, false otherwise
///   (including the cases where an error was diagnosed).
bool syntaxparse_lexRegexLiteral(
const char **InputPtr, const char *BufferEnd, bool MustBeRegex,
BridgedOptionalDiagnosticEngine BridgedDiagEngine) {

const char *Ptr = *InputPtr;

// Count leading '#'.
while (*Ptr == '#') {
++Ptr;
}
if (*Ptr != '/') {
// This wasn't a regex literal.
return true;
}

// Number of '#' in the opening delimiter; the closing delimiter must carry
// exactly the same number.
unsigned customDelimiterLen = Ptr - *InputPtr;

// Step over the opening '/'.
++Ptr;

// If the delimiter allows multi-line, try skipping over any whitespace to a
// newline character. If we can do that, we enter multi-line mode.
// Only delimiters with at least one '#' allow multi-line literals.
bool allowsMultiline = customDelimiterLen != 0;
const char *firstNewline = nullptr;
if (allowsMultiline) {
while (Ptr != BufferEnd) {
switch (*Ptr) {
case ' ':
case '\t':
++Ptr;
continue;
case '\r':
case '\n':
// Remember the newline without consuming it; the main loop below handles
// it.
firstNewline = Ptr;
break;
default:
break;
}
// Reached for anything other than a space or tab: stop scanning.
break;
}
}

bool isMultilineLiteral = (firstNewline != nullptr);

// Main scan. Note that the switch post-increments, so inside every case Ptr
// already points one past the character being handled.
while (true) {
switch (*Ptr++) {
case '\r':
case '\n':
if (!isMultilineLiteral) {
// A single-line literal cannot span a newline. The diagnostic is placed at
// Ptr (one past the newline) while lexing resumes at the newline itself.
diagnose(BridgedDiagEngine, Ptr, diag::lex_regex_literal_unterminated);
*InputPtr = Ptr - 1;
return false;
}
break;
case '\\':
if (Ptr != BufferEnd) {
// An escaped newline does not continue a single-line literal; resume
// lexing at the backslash.
if (!isMultilineLiteral && (*Ptr == '\r' || *Ptr == '\n')) {
diagnose(BridgedDiagEngine, Ptr, diag::lex_regex_literal_unterminated);
*InputPtr = Ptr - 1;
return false;
}
// Consume the escaped character so that an escaped '/' is not taken for a
// closing delimiter (presumably validateUTF8CharacterAndAdvance moves Ptr
// past one character; ~0U is treated as its invalid-UTF-8 result).
if (validateUTF8CharacterAndAdvance(Ptr, BufferEnd) == ~0U)
diagnose(BridgedDiagEngine, Ptr, diag::lex_invalid_utf8);
}
break;
case '/': {
const char *AfterSlashPos = Ptr;

// Eat '#' up to the open delimiter length. The '<=' bound lets the loop
// consume one '#' more than the opener had, so a longer run of '#' fails
// the equality check below instead of matching a prefix of it.
while (*Ptr == '#' && (Ptr - AfterSlashPos) <= customDelimiterLen) {
++Ptr;
}

if ((Ptr - AfterSlashPos) != customDelimiterLen) {
// '#' count didn't match. Reset the cursor after the '/' and move on.
Ptr = AfterSlashPos;
break;
}

// Found the closing delimiter. Finish.
*InputPtr = Ptr;
return false;
}
case '\0': {
if (Ptr - 1 == BufferEnd) {
// Reached EOF.
diagnose(BridgedDiagEngine, Ptr - 1, diag::lex_regex_literal_unterminated);
// In multi-line mode, we don't want to skip over what is likely
// otherwise valid Swift code, so resume from the first newline.
*InputPtr = firstNewline ? firstNewline : (Ptr - 1);
return false;
}

// A NUL byte before BufferEnd is embedded in the source rather than EOF.
// TODO: Warn to match the behavior of String literal lexer?
// For now, just ignore them.
break;
}
default: {
// Undo the post-increment so the whole (possibly multi-byte) character is
// validated and skipped starting from its first byte.
--Ptr;
if (validateUTF8CharacterAndAdvance(Ptr, BufferEnd) == ~0U)
diagnose(BridgedDiagEngine, Ptr, diag::lex_invalid_utf8);
break;
}
}
}
}

/// Fallback regex-literal "parser" that accepts every literal.
///
/// Nothing is parsed: the input, the capture-structure buffer, the base
/// location and the diagnostic engine are all left unused. The only effect is
/// storing ~0u into \p VersionOut.
///
/// \returns false, i.e. no error, unconditionally.
bool syntaxparse_parseRegexLiteral(const char *InputPtr, unsigned *VersionOut,
                                   void *CaptureStructureOut,
                                   unsigned CaptureStructureSize,
                                   BridgedSourceLoc DiagnosticBaseLoc,
                                   BridgedDiagnosticEngine BridgedDiagEngine) {
  // This fallback never reports a parse failure.
  const bool hasError = false;

  *VersionOut = ~0u;
  return hasError;
}

/// Installs the fallback regex lexing and parsing callbacks. The registration
/// body runs at most once, no matter how many times this is called.
void swift::registerSyntaxFallbackRegexParser() {
  static std::once_flag registeredOnce;

  // Lexing callback first, then parsing callback.
  auto installFallbacks = [] {
    Parser_registerRegexLiteralLexingFn(syntaxparse_lexRegexLiteral);
    Parser_registerRegexLiteralParsingFn(syntaxparse_parseRegexLiteral);
  };

  std::call_once(registeredOnce, installFallbacks);
}
6 changes: 6 additions & 0 deletions test/Syntax/Parser/unterminated_multiline_regex.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// RUN: %swift-syntax-parser-test -dump-diags %s | %FileCheck %s
// CHECK: 7:1 Error: unterminated regex literal
// CHECK: 1 error(s) 0 warnings(s) 0 note(s)

#/
unterminatedLiteral
6 changes: 6 additions & 0 deletions test/Syntax/Parser/unterminated_regex.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// RUN: %swift-syntax-parser-test -dump-diags %s | %FileCheck %s
// CHECK: 6:21 Error: unterminated regex literal
// CHECK: 1 error(s) 0 warnings(s) 0 note(s)

// IMPORTANT: This file must not contain a trailing newline
/unterminatedLiteral
33 changes: 33 additions & 0 deletions test/Syntax/round_trip_regex.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// RUN: %empty-directory(%t)
// RUN: %swift-syntax-test -input-source-filename %s -parse-gen -fail-on-parse-error > %t/afterRoundtrip.swift
// RUN: diff -u %s %t/afterRoundtrip.swift

_ = /abc/
_ = #/abc/#
_ = ##/abc/##

func foo<T>(_ x: T...) {}
foo(/abc/, #/abc/#, ##/abc/##)

let arr = [/abc/, #/abc/#, ##/abc/##]

_ = /\w+/.self
_ = #/\w+/#.self
_ = ##/\w+/##.self

_ = /#\/\#\\/
_ = #/#/\/\#\\/#
_ = ##/#|\|\#\\/##

_ = #/
multiline
/#

_ = #/
double
multiline
/#

_ = #/
\
/#
10 changes: 10 additions & 0 deletions test/Syntax/round_trip_regex_escape_newline.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// RUN: %empty-directory(%t)
// RUN: not %swift-syntax-test -input-source-filename %s -parse-gen -fail-on-parse-error > %t/afterRoundtrip.swift 2> %t/errors.swift
// RUN: diff -u %s %t/afterRoundtrip.swift
// RUN: cat %t/errors.swift | %FileCheck %s

// Escaping newlines is not supported
_ = /\
/

// CHECK: 7:7: error: unterminated regex literal
16 changes: 16 additions & 0 deletions test/Syntax/round_trip_regex_invalid.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// RUN: %empty-directory(%t)
// RUN: %swift-syntax-test -input-source-filename %s -parse-gen > %t/afterRoundtrip.swift
// RUN: diff -u %s %t/afterRoundtrip.swift

_ = /abc
_ = #/abc
_ = #/abc/
_ = ##/abc/#

_ #/x
/#
_ #/
x/#

_ //#
_ /x/#
6 changes: 6 additions & 0 deletions tools/libSwiftSyntaxParser/libSwiftSyntaxParser.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "swift/Basic/SourceManager.h"
#include "swift/Parse/Parser.h"
#include "swift/Parse/SyntaxParseActions.h"
#include "swift/Parse/SyntaxRegexFallbackLexing.h"
#include "swift/Syntax/Serialization/SyntaxSerialization.h"
#include "swift/Syntax/SyntaxNodes.h"
#include "swift/Subsystems.h"
Expand Down Expand Up @@ -478,6 +479,7 @@ struct SynParserDiagConsumer: public DiagnosticConsumer {
};

swiftparse_client_node_t SynParser::parse(const char *source, size_t len) {
registerSyntaxFallbackRegexParser();
SourceManager SM;
unsigned bufID = SM.addNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(
StringRef(source, len), "syntax_parse_source"));
Expand All @@ -489,6 +491,10 @@ swiftparse_client_node_t SynParser::parse(const char *source, size_t len) {
// Not ready yet:
// langOpts.EnableASTScopeLookup = true;

// Always enable bare /.../ regex literal in syntax parser.
langOpts.EnableExperimentalStringProcessing = true;
langOpts.EnableBareSlashRegexLiterals = true;

auto parseActions =
std::make_shared<CLibParseActions>(*this, SM, bufID);
// We have to use SourceFileKind::Main to avoid diagnostics like
Expand Down
Loading