Multi-line and extended delimiter literal lexing #242

Merged · 7 commits · Apr 1, 2022
3 changes: 2 additions & 1 deletion Sources/_RegexParser/Regex/AST/MatchingOptions.swift
@@ -137,7 +137,8 @@ extension AST {
/// Global matching option specifiers. Unlike `MatchingOptionSequence`,
/// these must appear at the start of the pattern, and apply globally.
public struct GlobalMatchingOption: _ASTNode, Hashable {
/// Determines the definition of a newline for the '.' character class.
/// Determines the definition of a newline for the '.' character class and
/// when parsing end-of-line comments.
public enum NewlineMatching: Hashable {
/// (*CR*)
case carriageReturnOnly
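To illustrate the amended doc comment above (this example is not part of the diff, and the pattern is only an assumed PCRE-style illustration): a global specifier such as (*CR) makes the carriage return the sole newline, which affects both what '.' refuses to match and where an extended-syntax '#' comment ends.

// Illustrative only: with (*CR), '\r' is the only newline, so the '#' comment
// below runs until the next carriage return rather than until '\n'.
let pattern = "(*CR)(?x) a+ # trailing comment \r b+"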
238 changes: 185 additions & 53 deletions Sources/_RegexParser/Regex/Parse/DelimiterLexing.swift
@@ -9,42 +9,78 @@
//
//===----------------------------------------------------------------------===//

// TODO: mock up multi-line soon

enum Delimiter: Hashable, CaseIterable {
case traditional
case experimental
case reSingleQuote
case rxSingleQuote

var openingAndClosing: (opening: String, closing: String) {
switch self {
case .traditional: return ("#/", "/#")
case .experimental: return ("#|", "|#")
case .reSingleQuote: return ("re'", "'")
case .rxSingleQuote: return ("rx'", "'")
struct Delimiter: Hashable {
let kind: Kind
let poundCount: Int

init(_ kind: Kind, poundCount: Int) {
precondition(kind.allowsExtendedPoundSyntax || poundCount == 0)
self.kind = kind
self.poundCount = poundCount
}

var opening: String {
String(repeating: "#", count: poundCount) + kind.opening
}
var closing: String {
kind.closing + String(repeating: "#", count: poundCount)
}

/// Whether or not multi-line mode is permitted.
var allowsMultiline: Bool {
switch kind {
case .forwardSlash:
return poundCount > 0
case .experimental, .reSingleQuote, .rxSingleQuote:
return false
}
}
var opening: String { openingAndClosing.opening }
var closing: String { openingAndClosing.closing }

/// The default set of syntax options that the delimiter indicates.
var defaultSyntaxOptions: SyntaxOptions {
switch self {
case .traditional, .reSingleQuote:
return .traditional
case .experimental, .rxSingleQuote:
return .experimental

/// The delimiters which are currently enabled.
static var enabledDelimiters: [Kind] { [.forwardSlash] }

/// All known delimiters.
static var allDelimiters: [Kind] { Kind.allCases }
}

extension Delimiter {
enum Kind: Hashable, CaseIterable {
case forwardSlash
case experimental
case reSingleQuote
case rxSingleQuote

var openingAndClosing: (opening: String, closing: String) {
switch self {
case .forwardSlash: return ("/", "/")
case .experimental: return ("#|", "|#")
case .reSingleQuote: return ("re'", "'")
case .rxSingleQuote: return ("rx'", "'")
}
}
var opening: String { openingAndClosing.opening }
var closing: String { openingAndClosing.closing }

/// Whether or not extended pound syntax, e.g. `##/.../##`, is allowed with
/// this delimiter.
var allowsExtendedPoundSyntax: Bool {
switch self {
case .forwardSlash:
return true
case .experimental, .reSingleQuote, .rxSingleQuote:
return false
}
}
}
}
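As a concrete illustration of the two types above (not part of the diff): the pound count composes with the kind's opening and closing, and only the forward-slash delimiter accepts extended pound syntax.

// Minimal sketch based on the definitions above.
let plain = Delimiter(.forwardSlash, poundCount: 0)    // lexes `/.../`
let pounded = Delimiter(.forwardSlash, poundCount: 2)  // lexes `##/.../##`
assert(pounded.opening == "##/" && pounded.closing == "/##")
assert(pounded.allowsMultiline && !plain.allowsMultiline)
// Delimiter(.experimental, poundCount: 1) would trip the precondition:
// only .forwardSlash allows extended pound syntax.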

struct DelimiterLexError: Error, CustomStringConvertible {
enum Kind: Hashable {
case endOfString
case unterminated
case invalidUTF8 // TODO: better range reporting
case unknownDelimiter
case unprintableASCII
case multilineClosingNotOnNewline
}

var kind: Kind
@@ -59,10 +95,11 @@ struct DelimiterLexError: Error, CustomStringConvertible {

var description: String {
switch kind {
case .endOfString: return "unterminated regex literal"
case .unterminated: return "unterminated regex literal"
case .invalidUTF8: return "invalid UTF-8 found in source file"
case .unknownDelimiter: return "unknown regex literal delimiter"
case .unprintableASCII: return "unprintable ASCII character found in source file"
case .multilineClosingNotOnNewline: return "closing delimiter must appear on new line"
}
}
}
@@ -72,11 +109,18 @@ fileprivate struct DelimiterLexer {
var cursor: UnsafeRawPointer
let end: UnsafeRawPointer

init(start: UnsafeRawPointer, end: UnsafeRawPointer) {
var firstNewline: UnsafeRawPointer?
var isMultiline: Bool { firstNewline != nil }

let delimiters: [Delimiter.Kind]

init(start: UnsafeRawPointer, end: UnsafeRawPointer,
delimiters: [Delimiter.Kind]) {
precondition(start <= end)
self.start = start
self.cursor = start
self.end = end
self.delimiters = delimiters
}

func ascii(_ s: Unicode.Scalar) -> UInt8 {
@@ -120,25 +164,34 @@ fileprivate struct DelimiterLexer {
precondition(cursor <= end, "Cannot advance past end")
}

/// Check to see if a UTF-8 sequence can be eaten from the current cursor.
func canEat(_ utf8: String.UTF8View) -> Bool {
guard let slice = slice(utf8.count) else { return false }
return slice.elementsEqual(utf8)
/// Check to see if a byte sequence can be eaten from the current cursor.
func canEat<C : Collection>(_ bytes: C) -> Bool where C.Element == UInt8 {
guard let slice = slice(bytes.count) else { return false }
return slice.elementsEqual(bytes)
}

/// Attempt to eat a UTF-8 byte sequence, returning `true` if successful.
mutating func tryEat(_ utf8: String.UTF8View) -> Bool {
guard canEat(utf8) else { return false }
advanceCursor(utf8.count)
/// Attempt to eat a byte sequence, returning `true` if successful.
mutating func tryEat<C : Collection>(
_ bytes: C
) -> Bool where C.Element == UInt8 {
guard canEat(bytes) else { return false }
advanceCursor(bytes.count)
return true
}

/// Attempt to eat an ascii scalar, returning `true` if successful.
mutating func tryEat(ascii s: Unicode.Scalar) -> Bool {
guard load() == ascii(s) else { return false }
advanceCursor()
return true
}

/// Attempt to skip over a closing delimiter character that is unlikely to be
/// the actual closing delimiter.
mutating func trySkipDelimiter(_ delimiter: Delimiter) {
// Only the closing `'` for re'...'/rx'...' can potentially be skipped over.
switch delimiter {
case .traditional, .experimental:
switch delimiter.kind {
case .forwardSlash, .experimental:
return
case .reSingleQuote, .rxSingleQuote:
break
@@ -222,12 +275,23 @@ fileprivate struct DelimiterLexer {
let contentsEnd = cursor
guard tryEat(delimiter.closing.utf8) else { return nil }

// Form a string from the contents and make sure it's valid UTF-8.
let count = contentsEnd - contentsStart
let contents = UnsafeRawBufferPointer(
start: contentsStart, count: count)
let s = String(decoding: contents, as: UTF8.self)

// In multi-line mode, we must be on a new line. So scan backwards and make
// sure we only have whitespace until the newline.
if isMultiline {
let idx = contents.lastIndex(
where: { $0 == ascii("\n") || $0 == ascii("\r") })! + 1
guard contents[idx...].all({ $0 == ascii(" ") || $0 == ascii("\t") })
else {
throw DelimiterLexError(.multilineClosingNotOnNewline, resumeAt: cursor)
}
}

// Form a string from the contents and make sure it's valid UTF-8.
let s = String(decoding: contents, as: UTF8.self)
guard s.utf8.elementsEqual(contents) else {
throw DelimiterLexError(.invalidUTF8, resumeAt: cursor)
}
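Two illustrative pieces of literal source text (not part of the diff) for the multi-line closing check above: in multi-line mode the closing delimiter must be preceded on its line only by spaces or tabs.

// Accepted: the closing delimiter sits on its own line after whitespace.
let accepted = """
  #/
    [0-9]+
    /#
  """
// Rejected with .multilineClosingNotOnNewline: content precedes the closing
// delimiter on the same line.
let notOnItsOwnLine = """
  #/
    [0-9]+ /#
  """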
@@ -238,7 +302,10 @@ fileprivate struct DelimiterLexer {
/// the end of the buffer is reached.
mutating func advance(escaped: Bool = false) throws {
guard let next = load() else {
throw DelimiterLexError(.endOfString, resumeAt: cursor)
// We've hit the end of the buffer. In multi-line mode, we don't want to
// skip over what is likely otherwise valid Swift code, so resume from the
// first newline.
throw DelimiterLexError(.unterminated, resumeAt: firstNewline ?? cursor)
}
switch UnicodeScalar(next) {
case let next where !next.isASCII:
@@ -249,7 +316,10 @@ fileprivate struct DelimiterLexer {
advanceCursor()

case "\n", "\r":
throw DelimiterLexError(.endOfString, resumeAt: cursor)
guard isMultiline else {
throw DelimiterLexError(.unterminated, resumeAt: cursor)
}
advanceCursor()

case "\0":
// TODO: Warn to match the behavior of String literal lexer? Or should
@@ -261,8 +331,12 @@ fileprivate struct DelimiterLexer {
advanceCursor()
try advance(escaped: true)

case let next where !next.isPrintableASCII:
case let next
where !next.isPrintableASCII && !(isMultiline && next == "\t"):
// Diagnose unprintable ASCII.
// Note that tabs are allowed in multi-line literals.
// TODO: This matches the string literal behavior, but should we allow
// tabs for single-line regex literals too?
// TODO: Ideally we would recover and continue to lex until the ending
// delimiter.
throw DelimiterLexError(.unprintableASCII, resumeAt: cursor.successor())
@@ -272,17 +346,60 @@ fileprivate struct DelimiterLexer {
}
}

mutating func tryLexOpeningDelimiter(poundCount: Int) -> Delimiter? {
for kind in delimiters {
// If the delimiter allows extended pound syntax, or there are no pounds,
// we just need to lex it.
let opening = kind.opening.utf8
if kind.allowsExtendedPoundSyntax || poundCount == 0 {
guard tryEat(opening) else { continue }
return Delimiter(kind, poundCount: poundCount)
}

// The delimiter doesn't allow extended pound syntax, so the pounds must be
// part of the delimiter.
guard
poundCount < opening.count,
opening.prefix(poundCount)
.elementsEqual(repeatElement(ascii("#"), count: poundCount)),
tryEat(opening.dropFirst(poundCount))
else { continue }

return Delimiter(kind, poundCount: 0)
}
return nil
}
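A couple of illustrative checks (not part of the diff) of the pound handling this function implements, expressed through droppingRegexDelimiters from later in this file: for `/.../` the pounds are extended delimiter syntax, while for `#|...|#` the pound belongs to the opening itself. Note that droppingRegexDelimiters considers all known delimiters, not just the enabled ones.

assert(droppingRegexDelimiters("##/[a-z]+/##")
        == ("[a-z]+", Delimiter(.forwardSlash, poundCount: 2)))
assert(droppingRegexDelimiters("#|[a-z]+|#")
        == ("[a-z]+", Delimiter(.experimental, poundCount: 0)))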

/*consuming*/ mutating func lex(
) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
// We can consume any number of pound signs.
var poundCount = 0
while tryEat(ascii: "#") {
poundCount += 1
}

// Try to lex the opening delimiter.
guard let delimiter = Delimiter.allCases.first(
where: { tryEat($0.opening.utf8) }
) else {
guard let delimiter = tryLexOpeningDelimiter(poundCount: poundCount) else {
throw DelimiterLexError(.unknownDelimiter, resumeAt: cursor.successor())
}

let contentsStart = cursor

// If the delimiter allows multi-line, try skipping over any whitespace to a
// newline character. If we can do that, we enter multi-line mode.
if delimiter.allowsMultiline {
while let next = load() {
switch next {
case ascii(" "), ascii("\t"):
advanceCursor()
continue
case ascii("\n"), ascii("\r"):
firstNewline = cursor
default:
break
}
break
}
}
while true {
// Check to see if we're at a character that looks like a delimiter, but
// likely isn't. In such a case, we can attempt to skip over it.
@@ -302,20 +419,34 @@ fileprivate struct DelimiterLexer {
/// Drop a set of regex delimiters from the input string, returning the contents
/// and the delimiter used. The input string must have valid delimiters.
func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
func stripDelimiter(_ delim: Delimiter) -> String? {
func stripDelimiter(_ kind: Delimiter.Kind) -> (String, Delimiter)? {
var slice = str.utf8[...]

// Try to strip any number of opening '#'s.
var poundCount = 0
if kind.allowsExtendedPoundSyntax {
poundCount = slice.prefix(while: {
$0 == UInt8(("#" as UnicodeScalar).value)
}).count
slice = slice.dropFirst(poundCount)
}

// The opening delimiter must match.
guard var slice = str.utf8.tryDropPrefix(delim.opening.utf8)
guard var slice = slice.tryDropPrefix(kind.opening.utf8)
else { return nil }

// The closing delimiter may optionally match, as it may not be present in
// invalid code.
let delim = Delimiter(kind, poundCount: poundCount)
if let newSlice = slice.tryDropSuffix(delim.closing.utf8) {
slice = newSlice
}
return String(slice)
let result = String(decoding: slice, as: UTF8.self)
precondition(result.utf8.elementsEqual(slice))
return (result, delim)
}
for d in Delimiter.allCases {
if let contents = stripDelimiter(d) {
for kind in Delimiter.allDelimiters {
if let (contents, d) = stripDelimiter(kind) {
return (contents, d)
}
}
@@ -325,8 +456,9 @@ func droppingRegexDelimiters(_ str: String) -> (String, Delimiter) {
/// Attempt to lex a regex literal between `start` and `end`, returning either
/// the contents and pointer from which to resume lexing, or an error.
func lexRegex(
start: UnsafeRawPointer, end: UnsafeRawPointer
start: UnsafeRawPointer, end: UnsafeRawPointer,
delimiters: [Delimiter.Kind] = Delimiter.enabledDelimiters
) throws -> (contents: String, Delimiter, end: UnsafeRawPointer) {
var lexer = DelimiterLexer(start: start, end: end)
var lexer = DelimiterLexer(start: start, end: end, delimiters: delimiters)
return try lexer.lex()
}
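Putting the pieces together, here is a hedged usage sketch (not part of the diff) of the updated entry point on a multi-line literal; the buffer plumbing is illustrative only.

let source = """
#/
  [0-9]+   # comments and tabs are fine; multi-line implies extended syntax
  /#
"""
Array(source.utf8).withUnsafeBytes { buf in
  do {
    let (contents, delimiter, _) = try lexRegex(
      start: buf.baseAddress!, end: buf.baseAddress! + buf.count)
    // delimiter.kind == .forwardSlash, delimiter.poundCount == 1, and the
    // leading pound is what permits multi-line mode.
    print(contents)
  } catch let error as DelimiterLexError {
    print(error)  // e.g. "closing delimiter must appear on new line"
  } catch {
    print(error)
  }
}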
4 changes: 4 additions & 0 deletions Sources/_RegexParser/Regex/Parse/Diagnostics.swift
@@ -70,6 +70,8 @@ enum ParseError: Error, Hashable {

case cannotRemoveTextSegmentOptions
case cannotRemoveSemanticsOptions
case cannotRemoveExtendedSyntaxInMultilineMode

case expectedCalloutArgument
}

@@ -158,6 +160,8 @@ extension ParseError: CustomStringConvertible {
return "text segment mode cannot be unset, only changed"
case .cannotRemoveSemanticsOptions:
return "semantic level cannot be unset, only changed"
case .cannotRemoveExtendedSyntaxInMultilineMode:
return "extended syntax may not be disabled in multi-line mode"
case .expectedCalloutArgument:
return "expected argument to callout"
}
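An illustrative literal (not part of the diff) that would produce the new diagnostic when parsed: multi-line literals imply extended syntax, so attempting to turn it off is rejected.

// Assumed example: `(?-x)` tries to disable extended syntax inside a
// multi-line literal and would be diagnosed as
// "extended syntax may not be disabled in multi-line mode".
let multilineWithDisabledExtendedSyntax = """
  #/
    (?-x) [0-9]+
    /#
  """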