Merge pull request #493 from hamishknight/syntax-tweaks-5.7

hamishknight · web-flow · commit 9807fcdae20d · 2022-06-17T16:38:47.000+01:00
diff --git a/Sources/_RegexParser/Regex/Parse/Diagnostics.swift b/Sources/_RegexParser/Regex/Parse/Diagnostics.swift
@@ -44,6 +44,9 @@ enum ParseError: Error, Hashable {
   case invalidEscape(Character)
   case confusableCharacter(Character)
 
+  case quoteMayNotSpanMultipleLines
+  case unsetExtendedSyntaxMayNotSpanMultipleLines
+
   case cannotReferToWholePattern
 
   case quantifierRequiresOperand(String)
@@ -79,6 +82,7 @@ enum ParseError: Error, Hashable {
   case cannotRemoveTextSegmentOptions
   case cannotRemoveSemanticsOptions
   case cannotRemoveExtendedSyntaxInMultilineMode
+  case cannotResetExtendedSyntaxInMultilineMode
 
   case expectedCalloutArgument
 
@@ -139,6 +143,10 @@ extension ParseError: CustomStringConvertible {
       return "invalid escape sequence '\\\(c)'"
     case .confusableCharacter(let c):
       return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead"
+    case .quoteMayNotSpanMultipleLines:
+      return "quoted sequence may not span multiple lines in multi-line literal"
+    case .unsetExtendedSyntaxMayNotSpanMultipleLines:
+      return "group that unsets extended syntax may not span multiple lines in multi-line literal"
     case .cannotReferToWholePattern:
       return "cannot refer to whole pattern here"
     case .quantifierRequiresOperand(let q):
@@ -190,6 +198,8 @@ extension ParseError: CustomStringConvertible {
       return "semantic level cannot be unset, only changed"
     case .cannotRemoveExtendedSyntaxInMultilineMode:
       return "extended syntax may not be disabled in multi-line mode"
+    case .cannotResetExtendedSyntaxInMultilineMode:
+      return "extended syntax may not be disabled in multi-line mode; use '(?^x)' instead"
     case .expectedCalloutArgument:
       return "expected argument to callout"
     case .unrecognizedScript(let value):
diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -342,8 +342,8 @@ extension Source {
     }.value
   }
 
-  /// Eat a scalar off the front, starting from after the
-  /// backslash and base character (e.g. `\u` or `\x`).
+  /// Try to eat a scalar off the front, starting from after the backslash and
+  /// base character (e.g. `\u` or `\x`).
   ///
   ///     UniScalar -> 'u{' UniScalarSequence '}'
   ///                | 'u'  HexDigit{4}
@@ -353,60 +353,60 @@ extension Source {
   ///                | 'o{' OctalDigit{1...} '}'
   ///                | '0' OctalDigit{0...3}
   ///
-  mutating func expectUnicodeScalar(
-    escapedCharacter base: Character
-  ) throws -> AST.Atom.Kind {
+  mutating func lexUnicodeScalar() throws -> AST.Atom.Kind? {
     try recordLoc { src in
+      try src.tryEating { src in
 
-      func nullScalar() -> AST.Atom.Kind {
-        let pos = src.currentPosition
-        return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
-      }
-
-      // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
-      switch base {
-      // Hex numbers.
-      case "u" where src.tryEat("{"):
-        return try src.expectUnicodeScalarSequence(eating: "}")
-
-      case "x" where src.tryEat("{"):
-        let str = try src.lexUntil(eating: "}")
-        return .scalar(try Source.validateUnicodeScalar(str, .hex))
-
-      case "x":
-        // \x expects *up to* 2 digits.
-        guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
-        else {
-          // In PCRE, \x without any valid hex digits is \u{0}.
-          // TODO: This doesn't appear to be followed by ICU or Oniguruma, so
-          // could be changed to throw an error if we had a parsing mode for
-          // them.
-          return nullScalar()
+        func nullScalar() -> AST.Atom.Kind {
+          let pos = src.currentPosition
+          return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
         }
-        return .scalar(try Source.validateUnicodeScalar(digits, .hex))
 
-      case "u":
-        return .scalar(try src.expectUnicodeScalar(numDigits: 4))
-      case "U":
-        return .scalar(try src.expectUnicodeScalar(numDigits: 8))
+        // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
+        switch src.tryEat() {
+        // Hex numbers.
+        case "u" where src.tryEat("{"):
+          return try src.expectUnicodeScalarSequence(eating: "}")
+
+        case "x" where src.tryEat("{"):
+          let str = try src.lexUntil(eating: "}")
+          return .scalar(try Source.validateUnicodeScalar(str, .hex))
+
+        case "x":
+          // \x expects *up to* 2 digits.
+          guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
+          else {
+            // In PCRE, \x without any valid hex digits is \u{0}.
+            // TODO: This doesn't appear to be followed by ICU or Oniguruma, so
+            // could be changed to throw an error if we had a parsing mode for
+            // them.
+            return nullScalar()
+          }
+          return .scalar(try Source.validateUnicodeScalar(digits, .hex))
+
+        case "u":
+          return .scalar(try src.expectUnicodeScalar(numDigits: 4))
+        case "U":
+          return .scalar(try src.expectUnicodeScalar(numDigits: 8))
+
+        // Octal numbers.
+        case "o" where src.tryEat("{"):
+          let str = try src.lexUntil(eating: "}")
+          return .scalar(try Source.validateUnicodeScalar(str, .octal))
+
+        case "0":
+          // We can read *up to* 3 more octal digits.
+          // FIXME: PCRE can only read up to 2 octal digits, if we get a strict
+          // PCRE mode, we should limit it here.
+          guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
+          else {
+            return nullScalar()
+          }
+          return .scalar(try Source.validateUnicodeScalar(digits, .octal))
 
-      // Octal numbers.
-      case "o" where src.tryEat("{"):
-        let str = try src.lexUntil(eating: "}")
-        return .scalar(try Source.validateUnicodeScalar(str, .octal))
-
-      case "0":
-        // We can read *up to* 3 more octal digits.
-        // FIXME: PCRE can only read up to 2 octal digits, if we get a strict
-        // PCRE mode, we should limit it here.
-        guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
-        else {
-          return nullScalar()
+        default:
+          return nil
         }
-        return .scalar(try Source.validateUnicodeScalar(digits, .octal))
-
-      default:
-        fatalError("Unexpected scalar start")
       }
     }.value
   }
@@ -579,7 +579,7 @@ extension Source {
 
   /// Try to consume quoted content
   ///
-  ///     Quote -> '\Q' (!'\E' .)* '\E'
+  ///     Quote -> '\Q' (!'\E' .)* '\E'?
   ///
   /// With `SyntaxOptions.experimentalQuotes`, also accepts
   ///
@@ -592,9 +592,24 @@ extension Source {
   mutating func lexQuote(context: ParsingContext) throws -> AST.Quote? {
     let str = try recordLoc { src -> String? in
       if src.tryEat(sequence: #"\Q"#) {
-        return try src.expectQuoted(endingWith: #"\E"#).value
+        let contents = src.lexUntil { src in
+          src.isEmpty || src.tryEat(sequence: #"\E"#)
+        }.value
+
+        // In multi-line literals, the quote may not span multiple lines.
+        if context.syntax.contains(.multilineCompilerLiteral),
+            contents.spansMultipleLinesInRegexLiteral {
+          throw ParseError.quoteMayNotSpanMultipleLines
+        }
+
+        // The sequence must not be empty in a custom character class.
+        if context.isInCustomCharacterClass && contents.isEmpty {
+          throw ParseError.expectedNonEmptyContents
+        }
+        return contents
       }
       if context.experimentalQuotes, src.tryEat("\"") {
+        // TODO: Can experimental quotes be empty?
         return try src.expectQuoted(endingWith: "\"", ignoreEscaped: true).value
       }
       return nil
@@ -787,6 +802,11 @@ extension Source {
   mutating func lexMatchingOptionSequence(
     context: ParsingContext
   ) throws -> AST.MatchingOptionSequence? {
+    // PCRE accepts '(?)'
+    // TODO: This is a no-op, should we warn?
+    if peek() == ")" {
+      return .init(caretLoc: nil, adding: [], minusLoc: nil, removing: [])
+    }
     let ateCaret = recordLoc { $0.tryEat("^") }
 
     // TODO: Warn on duplicate options, and options appearing in both adding
@@ -820,11 +840,6 @@ extension Source {
         if opt.isSemanticMatchingLevel {
           throw ParseError.cannotRemoveSemanticsOptions
         }
-        // Extended syntax may not be removed if in multi-line mode.
-        if context.syntax.contains(.multilineExtendedSyntax) &&
-            opt.isAnyExtended {
-          throw ParseError.cannotRemoveExtendedSyntaxInMultilineMode
-        }
         removing.append(opt)
       }
       return .init(caretLoc: nil, adding: adding, minusLoc: ateMinus.location,
@@ -1692,6 +1707,11 @@ extension Source {
         return ref
       }
 
+      // Hexadecimal and octal unicode scalars.
+      if let scalar = try src.lexUnicodeScalar() {
+        return scalar
+      }
+
       guard let char = src.tryEat() else {
         throw ParseError.expectedEscape
       }
@@ -1703,14 +1723,6 @@ extension Source {
         return .escaped(builtin)
       }
 
-      switch char {
-      // Hexadecimal and octal unicode scalars.
-      case "u", "x", "U", "o", "0":
-        return try src.expectUnicodeScalar(escapedCharacter: char)
-      default:
-        break
-      }
-
       // We only allow unknown escape sequences for non-letter non-number ASCII,
       // and non-ASCII whitespace.
       // TODO: Once we have fix-its, suggest a `0` prefix for octal `[\7]`.
diff --git a/Sources/_RegexParser/Regex/Parse/Parse.swift b/Sources/_RegexParser/Regex/Parse/Parse.swift
@@ -289,8 +289,8 @@ extension Parser {
   /// Apply the syntax options of a given matching option sequence to the
   /// current set of options.
   private mutating func applySyntaxOptions(
-    of opts: AST.MatchingOptionSequence
-  ) {
+    of opts: AST.MatchingOptionSequence, isScoped: Bool
+  ) throws {
     func mapOption(_ option: SyntaxOptions,
                    _ pred: (AST.MatchingOption) -> Bool) {
       if opts.resetsCurrentOptions {
@@ -311,22 +311,41 @@ extension Parser {
     mapOption(.namedCapturesOnly, .namedCapturesOnly)
 
     // (?x), (?xx)
-    // We skip this for multi-line, as extended syntax is always enabled there.
+    // This cannot be unset in a multi-line literal, unless in a scoped group
+    // e.g. (?-x:...). We later enforce that such a group does not span multiple
+    // lines.
     // TODO: PCRE differentiates between (?x) and (?xx) where only the latter
     // handles non-semantic whitespace in a custom character class. Other
     // engines such as Oniguruma, Java, and ICU do this under (?x). Therefore,
     // treat (?x) and (?xx) as the same option here. If we ever get a strict
     // PCRE mode, we will need to change this to handle that.
-    if !context.syntax.contains(.multilineExtendedSyntax) {
+    if !isScoped && context.syntax.contains(.multilineCompilerLiteral) {
+      // An unscoped removal of extended syntax is not allowed in a multi-line
+      // literal.
+      if let opt = opts.removing.first(where: \.isAnyExtended) {
+        throw Source.LocatedError(
+          ParseError.cannotRemoveExtendedSyntaxInMultilineMode, opt.location)
+      }
+      if opts.resetsCurrentOptions {
+        throw Source.LocatedError(
+          ParseError.cannotResetExtendedSyntaxInMultilineMode, opts.caretLoc!)
+      }
+      // The only remaining case is an unscoped addition of extended syntax,
+      // which is a no-op.
+    } else {
+      // We either have a scoped change of extended syntax, or this is a
+      // single-line literal.
       mapOption(.extendedSyntax, \.isAnyExtended)
     }
   }
 
   /// Apply the syntax options of a matching option changing group to the
   /// current set of options.
-  private mutating func applySyntaxOptions(of group: AST.Group.Kind) {
+  private mutating func applySyntaxOptions(
+    of group: AST.Group.Kind, isScoped: Bool
+  ) throws {
     if case .changeMatchingOptions(let seq) = group {
-      applySyntaxOptions(of: seq)
+      try applySyntaxOptions(of: seq, isScoped: isScoped)
     }
   }
 
@@ -337,14 +356,25 @@ extension Parser {
     context.recordGroup(kind.value)
 
     let currentSyntax = context.syntax
-    applySyntaxOptions(of: kind.value)
+    try applySyntaxOptions(of: kind.value, isScoped: true)
     defer {
       context.syntax = currentSyntax
     }
-
+    let unsetsExtendedSyntax = currentSyntax.contains(.extendedSyntax) &&
+                              !context.syntax.contains(.extendedSyntax)
     let child = try parseNode()
     try source.expect(")")
-    return .init(kind, child, loc(start))
+    let groupLoc = loc(start)
+
+    // In multi-line literals, the body of a group that unsets extended syntax
+    // may not span multiple lines.
+    if unsetsExtendedSyntax &&
+        context.syntax.contains(.multilineCompilerLiteral) &&
+        source[child.location.range].spansMultipleLinesInRegexLiteral {
+      throw Source.LocatedError(
+        ParseError.unsetExtendedSyntaxMayNotSpanMultipleLines, groupLoc)
+    }
+    return .init(kind, child, groupLoc)
   }
 
   /// Consume the body of an absent function.
@@ -438,7 +468,7 @@ extension Parser {
       // If we have a change matching options atom, apply the syntax options. We
       // already take care of scoping syntax options within a group.
       if case .changeMatchingOptions(let opts) = atom.kind {
-        applySyntaxOptions(of: opts)
+        try applySyntaxOptions(of: opts, isScoped: false)
       }
       // TODO: track source locations
       return .atom(atom)
@@ -592,6 +622,13 @@ public func parse<S: StringProtocol>(
   return ast
 }
 
+extension StringProtocol {
+  /// Whether the given string is considered multi-line for a regex literal.
+  var spansMultipleLinesInRegexLiteral: Bool {
+    unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" })
+  }
+}
+
 /// Retrieve the default set of syntax options that a delimiter and literal
 /// contents indicates.
 fileprivate func defaultSyntaxOptions(
@@ -601,9 +638,8 @@ fileprivate func defaultSyntaxOptions(
   case .forwardSlash:
     // For an extended syntax forward slash e.g #/.../#, extended syntax is
     // permitted if it spans multiple lines.
-    if delim.poundCount > 0 &&
-        contents.unicodeScalars.contains(where: { $0 == "\n" || $0 == "\r" }) {
-      return .multilineExtendedSyntax
+    if delim.poundCount > 0 && contents.spansMultipleLinesInRegexLiteral {
+      return [.multilineCompilerLiteral, .extendedSyntax]
     }
     return .traditional
   case .reSingleQuote:
diff --git a/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift b/Sources/_RegexParser/Regex/Parse/SyntaxOptions.swift
@@ -58,10 +58,10 @@ public struct SyntaxOptions: OptionSet {
   /// `(_: .*)` == `(?:.*)`
   public static var experimentalCaptures: Self { Self(1 << 5) }
 
-  /// The default syntax for a multi-line regex literal.
-  public static var multilineExtendedSyntax: Self {
-    return [Self(1 << 6), .extendedSyntax]
-  }
+  /// The syntax kind of a multi-line literal. This will always be set when
+  /// parsing a multi-line `#/.../#` literal. Note this does not imply extended
+  /// syntax, as that may be temporarily disabled while parsing.
+  public static var multilineCompilerLiteral: Self { Self(1 << 6) }
 
   /// `(?n)`
   public static var namedCapturesOnly: Self { Self(1 << 7) }
@@ -76,8 +76,8 @@ public struct SyntaxOptions: OptionSet {
   public static var traditional: Self { Self(0) }
 
   public static var experimental: Self {
-    // Experimental syntax enables everything except end-of-line comments.
-    Self(~0).subtracting(.endOfLineComments)
+    [.nonSemanticWhitespace, .experimentalQuotes, .experimentalComments,
+     .experimentalRanges, .experimentalCaptures]
   }
 
   // TODO: Probably want to model strict-PCRE etc. options too.
diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift