[SwiftSyntax] Parse regex literals using a fallback lexer implemented in C++

ahoppen · rintaro · ahoppen · commit 00aff1b9501a · 2022-05-20T12:53:36.000+02:00
`libInternalSwiftSyntaxParser.dylib` currently doesn’t link against `SwiftExperimentalStringProcessing`, so it can’t use the regex lexing functions defined within. This caused SwiftSyntax to fail if the source code contained regex literals.

Implement a fallback regex lexing function in C++ and use it for SwiftSyntax parsing.

rdar://93580240

Co-authored-by: Rintaro Ishizaki <rishizaki@apple.com>
diff --git a/include/swift/Parse/SyntaxRegexFallbackLexing.h b/include/swift/Parse/SyntaxRegexFallbackLexing.h
@@ -0,0 +1,21 @@
+//===--- SyntaxRegexFallbackLexing.h --------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2014 - 2022 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+
+namespace swift {
+/// SwiftSyntax parsing currently doesn't link against
+/// SwiftExperimentalStringProcessing and is thus missing the regex lexing
+/// functions defined in it. This registers a fallback regex-lexing function
+/// implemented in C++ that is sufficient to generate a valid SwiftSyntax tree.
+/// The regex parser registered by this function will accept all regex literals
+/// and is not suited for normal compilation.
+void registerSyntaxFallbackRegexParser();
+} // end namespace swift
diff --git a/lib/Parse/CMakeLists.txt b/lib/Parse/CMakeLists.txt
@@ -24,7 +24,8 @@ add_swift_host_library(swiftParse STATIC
   ParseType.cpp
   PersistentParserState.cpp
   SyntaxParsingCache.cpp
-  SyntaxParsingContext.cpp)
+  SyntaxParsingContext.cpp
+  SyntaxRegexFallbackLexing.cpp)
 _swift_gyb_target_sources(swiftParse PRIVATE
     ParsedSyntaxBuilders.cpp.gyb
     ParsedSyntaxNodes.cpp.gyb
diff --git a/lib/Parse/SyntaxRegexFallbackLexing.cpp b/lib/Parse/SyntaxRegexFallbackLexing.cpp
@@ -0,0 +1,147 @@
+//===--- SyntaxRegexFallbackLexing.cpp ------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2014 - 2022 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+
+#include "swift/Parse/SyntaxRegexFallbackLexing.h"
+#include "swift/AST/DiagnosticEngine.h"
+#include "swift/AST/DiagnosticsParse.h"
+#include "swift/Parse/Lexer.h"
+#include "swift/Parse/RegexParserBridging.h"
+#include <mutex>
+
+using namespace swift;
+
+/// Emits \p DiagID, with \p Args as its arguments, at the source location
+/// that corresponds to \p ptr. Does nothing when \p bridgedDiag carries no
+/// diagnostic engine.
+template <typename... DiagArgTypes, typename... ArgTypes>
+static void diagnose(BridgedOptionalDiagnosticEngine bridgedDiag,
+                     const char *ptr, Diag<DiagArgTypes...> DiagID,
+                     ArgTypes &&...Args) {
+  auto *engine = static_cast<DiagnosticEngine *>(bridgedDiag.object);
+  if (!engine)
+    return;
+
+  engine->diagnose(SourceLoc(llvm::SMLoc::getFromPointer(ptr)), DiagID,
+                   std::forward<ArgTypes>(Args)...);
+}
+
+/// Fallback regex literal lexer, implemented in C++ for clients that do not
+/// link against SwiftExperimentalStringProcessing.
+///
+/// Scans from \c *InputPtr over an optional run of '#', a '/', the literal's
+/// contents, and a closing '/' followed by the same number of '#'. The
+/// contents are not interpreted beyond skipping the character that follows a
+/// backslash and validating UTF-8.
+///
+/// \param InputPtr In: start of the potential literal. Out: the position to
+///        resume lexing from. Left untouched if this is not a regex literal.
+/// \param BufferEnd End of the source buffer. The scan expects a '\0' to be
+///        readable at this position and treats it as end-of-file.
+/// \param MustBeRegex Currently ignored by this fallback.
+/// \param BridgedDiagEngine Optional engine that receives diagnostics.
+/// \returns \c true if no '/' follows the leading '#'s (not a regex literal);
+///          \c false otherwise, including for unterminated literals.
+bool syntaxparse_lexRegexLiteral(
+    const char **InputPtr, const char *BufferEnd, bool MustBeRegex,
+    BridgedOptionalDiagnosticEngine BridgedDiagEngine) {
+
+  const char *Ptr = *InputPtr;
+
+  // Count leading '#'.
+  while (*Ptr == '#') {
+    ++Ptr;
+  }
+  if (*Ptr != '/') {
+    // This wasn't a regex literal.
+    return true;
+  }
+
+  unsigned customDelimiterLen = Ptr - *InputPtr;
+
+  ++Ptr;
+
+  // If the delimiter allows multi-line, try skipping over any whitespace to a
+  // newline character. If we can do that, we enter multi-line mode.
+  bool allowsMultiline = customDelimiterLen != 0;
+  const char *firstNewline = nullptr;
+  if (allowsMultiline) {
+    while (Ptr != BufferEnd) {
+      switch (*Ptr) {
+      case ' ':
+      case '\t':
+        ++Ptr;
+        continue;
+      case '\r':
+      case '\n':
+        firstNewline = Ptr;
+        break;
+      default:
+        break;
+      }
+      break;
+    }
+  }
+
+  while (true) {
+    switch (*Ptr++) {
+    case '\r':
+    case '\n':
+      if (firstNewline == nullptr) {
+        // Not a multi-line literal, so it had to end on this line. Ptr has
+        // already moved past the newline; diagnose and resume at the newline
+        // itself.
+        diagnose(BridgedDiagEngine, Ptr - 1,
+                 diag::lex_regex_literal_unterminated);
+        *InputPtr = Ptr - 1;
+        return false;
+      }
+      break;
+    case '\\':
+      // Skip the escaped character so that e.g. '\/' doesn't close the
+      // literal.
+      if (Ptr != BufferEnd) {
+        const char *CharStart = Ptr;
+        if (validateUTF8CharacterAndAdvance(Ptr, BufferEnd) == ~0U)
+          diagnose(BridgedDiagEngine, CharStart, diag::lex_invalid_utf8);
+      }
+      break;
+    case '/': {
+      const char *AfterSlashPos = Ptr;
+
+      // Eat '#' up to the open delimiter length. Never eat more than that:
+      // any additional '#' is not part of the closing delimiter.
+      while (*Ptr == '#' && (Ptr - AfterSlashPos) < customDelimiterLen) {
+        ++Ptr;
+      }
+
+      if ((Ptr - AfterSlashPos) != customDelimiterLen) {
+        // '#' count didn't match. Reset the cursor after the '/' and move on.
+        Ptr = AfterSlashPos;
+        break;
+      }
+
+      // Found the closing delimiter. Finish.
+      *InputPtr = Ptr;
+      return false;
+    }
+    case '\0': {
+      if (Ptr - 1 == BufferEnd) {
+        // Reached EOF. Ptr is now one past BufferEnd, so diagnose at
+        // BufferEnd to keep the location inside the source buffer.
+        diagnose(BridgedDiagEngine, Ptr - 1,
+                 diag::lex_regex_literal_unterminated);
+        // In multi-line mode, we don't want to skip over what is likely
+        // otherwise valid Swift code, so resume from the first newline.
+        *InputPtr = firstNewline ? firstNewline : (Ptr - 1);
+        return false;
+      }
+
+      // TODO: Warn to match the behavior of String literal lexer?
+      // For now, just ignore them.
+      break;
+    }
+    default: {
+      // Step back onto the character that was just consumed and validate it
+      // as a (possibly multi-byte) UTF-8 character.
+      --Ptr;
+      const char *CharStart = Ptr;
+      if (validateUTF8CharacterAndAdvance(Ptr, BufferEnd) == ~0U)
+        diagnose(BridgedDiagEngine, CharStart, diag::lex_invalid_utf8);
+      break;
+    }
+    }
+  }
+}
+
+/// Stand-in regex literal parser: it inspects nothing, stores ~0u into
+/// \p VersionOut and always reports that no error occurred.
+bool syntaxparse_parseRegexLiteral(const char *InputPtr, unsigned *VersionOut,
+                                   void *CaptureStructureOut,
+                                   unsigned CaptureStructureSize,
+                                   BridgedSourceLoc DiagnosticBaseLoc,
+                                   BridgedDiagnosticEngine BridgedDiagEngine) {
+  // Nothing is parsed, so every input besides VersionOut goes unused.
+  (void)InputPtr;
+  (void)CaptureStructureOut;
+  (void)CaptureStructureSize;
+  (void)DiagnosticBaseLoc;
+  (void)BridgedDiagEngine;
+
+  const bool hasError = false;
+  *VersionOut = ~0u;
+  return hasError;
+}
+
+/// Installs the C++ fallback regex lexing and parsing callbacks into the
+/// parser. The installation runs at most once, however often this function
+/// is called.
+void swift::registerSyntaxFallbackRegexParser() {
+  static std::once_flag registeredOnce;
+  const auto installFallbacks = [] {
+    Parser_registerRegexLiteralLexingFn(syntaxparse_lexRegexLiteral);
+    Parser_registerRegexLiteralParsingFn(syntaxparse_parseRegexLiteral);
+  };
+  std::call_once(registeredOnce, installFallbacks);
+}
diff --git a/test/Syntax/round_trip_regex.swift b/test/Syntax/round_trip_regex.swift
@@ -0,0 +1,29 @@
+// RUN: rm -rf %t
+// RUN: %swift-syntax-test -input-source-filename %s -parse-gen -fail-on-parse-error > %t
+// RUN: diff -u %s %t
+
+// Well-formed regex literals must parse without errors and round-trip
+// byte-for-byte.
+
+// Bare and '#'-delimited literals.
+_ = /abc/
+_ = #/abc/#
+_ = ##/abc/##
+
+// Literals as call arguments and as array elements.
+func foo<T>(_ x: T...) {}
+foo(/abc/, #/abc/#, ##/abc/##)
+
+let arr = [/abc/, #/abc/#, ##/abc/##]
+
+// Literals followed by a member access.
+_ = /\w+/.self
+_ = #/\w+/#.self
+_ = ##/\w+/##.self
+
+// Contents mixing '#', '/', '|' and backslash escapes.
+_ = /#\/\#\\/
+_ = #/#/\/\#\\/#
+_ = ##/#|\|\#\\/##
+
+// '#'-delimited literals spanning several lines.
+_ = #/
+multiline
+/#
+
+_ = #/
+double
+multiline
+/#
diff --git a/test/Syntax/round_trip_regex_invalid.swift b/test/Syntax/round_trip_regex_invalid.swift
@@ -0,0 +1,16 @@
+// RUN: rm -rf %t
+// RUN: %swift-syntax-test -input-source-filename %s -parse-gen > %t
+// RUN: diff -u %s %t
+
+// Parse errors are allowed here (no -fail-on-parse-error is passed); only the
+// byte-for-byte round trip is checked.
+
+// Literals that are never closed, or closed with too few '#'.
+_ = /abc
+_ = #/abc
+_ = #/abc/
+_ = ##/abc/#
+
+// '#/' and '/#' split across two lines.
+_ #/x
+/#
+_ #/
+x/#
+
+// Slashes followed by a trailing '#'.
+_ //#
+_ /x/#
diff --git a/tools/libSwiftSyntaxParser/libSwiftSyntaxParser.cpp b/tools/libSwiftSyntaxParser/libSwiftSyntaxParser.cpp
@@ -21,6 +21,7 @@
 #include "swift/Basic/SourceManager.h"
 #include "swift/Parse/Parser.h"
 #include "swift/Parse/SyntaxParseActions.h"
+#include "swift/Parse/SyntaxRegexFallbackLexing.h"
 #include "swift/Syntax/Serialization/SyntaxSerialization.h"
 #include "swift/Syntax/SyntaxNodes.h"
 #include "swift/Subsystems.h"
@@ -478,6 +479,7 @@ struct SynParserDiagConsumer: public DiagnosticConsumer {
 };
 
 swiftparse_client_node_t SynParser::parse(const char *source, size_t len) {
+  registerSyntaxFallbackRegexParser();
   SourceManager SM;
   unsigned bufID = SM.addNewSourceBuffer(llvm::MemoryBuffer::getMemBuffer(
       StringRef(source, len), "syntax_parse_source"));
@@ -489,6 +491,10 @@ swiftparse_client_node_t SynParser::parse(const char *source, size_t len) {
   // Not ready yet:
   // langOpts.EnableASTScopeLookup = true;
 
+  // Always enable bare /.../ regex literal in syntax parser.
+  langOpts.EnableExperimentalStringProcessing = true;
+  langOpts.EnableBareSlashRegexLiterals = true;
+
   auto parseActions =
     std::make_shared<CLibParseActions>(*this, SM, bufID);
   // We have to use SourceFileKind::Main to avoid diagnostics like
diff --git a/tools/swift-syntax-test/swift-syntax-test.cpp b/tools/swift-syntax-test/swift-syntax-test.cpp