swiftlang · hamishknight · Jun 16, 2022 · Jun 16, 2022 · Jun 16, 2022 · Jun 16, 2022
diff --git a/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift b/Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
@@ -342,8 +342,8 @@ extension Source {
     }.value
   }
 
-  /// Eat a scalar off the front, starting from after the
-  /// backslash and base character (e.g. `\u` or `\x`).
+  /// Try to eat a scalar off the front, starting from after the backslash, at
+  /// the base character (e.g. the `u` in `\u` or the `x` in `\x`).
   ///
   ///     UniScalar -> 'u{' UniScalarSequence '}'
   ///                | 'u'  HexDigit{4}
@@ -353,60 +353,60 @@ extension Source {
   ///                | 'o{' OctalDigit{1...} '}'
   ///                | '0' OctalDigit{0...3}
   ///
-  mutating func expectUnicodeScalar(
-    escapedCharacter base: Character
-  ) throws -> AST.Atom.Kind {
+  mutating func lexUnicodeScalar() throws -> AST.Atom.Kind? {
     try recordLoc { src in
+      try src.tryEating { src in
 
-      func nullScalar() -> AST.Atom.Kind {
-        let pos = src.currentPosition
-        return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
-      }
-
-      // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
-      switch base {
-      // Hex numbers.
-      case "u" where src.tryEat("{"):
-        return try src.expectUnicodeScalarSequence(eating: "}")
-
-      case "x" where src.tryEat("{"):
-        let str = try src.lexUntil(eating: "}")
-        return .scalar(try Source.validateUnicodeScalar(str, .hex))
-
-      case "x":
-        // \x expects *up to* 2 digits.
-        guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
-        else {
-          // In PCRE, \x without any valid hex digits is \u{0}.
-          // TODO: This doesn't appear to be followed by ICU or Oniguruma, so
-          // could be changed to throw an error if we had a parsing mode for
-          // them.
-          return nullScalar()
+        func nullScalar() -> AST.Atom.Kind {
+          let pos = src.currentPosition
+          return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
         }
-        return .scalar(try Source.validateUnicodeScalar(digits, .hex))
 
-      case "u":
-        return .scalar(try src.expectUnicodeScalar(numDigits: 4))
-      case "U":
-        return .scalar(try src.expectUnicodeScalar(numDigits: 8))
+        // TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
+        switch src.tryEat() {
+        // Hex numbers.
+        case "u" where src.tryEat("{"):
+          return try src.expectUnicodeScalarSequence(eating: "}")
+
+        case "x" where src.tryEat("{"):
+          let str = try src.lexUntil(eating: "}")
+          return .scalar(try Source.validateUnicodeScalar(str, .hex))
+
+        case "x":
+          // \x expects *up to* 2 digits.
+          guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
+          else {
+            // In PCRE, \x without any valid hex digits is \u{0}.
+            // TODO: This doesn't appear to be followed by ICU or Oniguruma, so
+            // it could be changed to throw an error if we had a parsing mode
+            // for them.
+            return nullScalar()
+          }
+          return .scalar(try Source.validateUnicodeScalar(digits, .hex))
+
+        case "u":
+          return .scalar(try src.expectUnicodeScalar(numDigits: 4))
+        case "U":
+          return .scalar(try src.expectUnicodeScalar(numDigits: 8))
+
+        // Octal numbers.
+        case "o" where src.tryEat("{"):
+          let str = try src.lexUntil(eating: "}")
+          return .scalar(try Source.validateUnicodeScalar(str, .octal))
+
+        case "0":
+          // We can read *up to* 3 more octal digits.
+          // FIXME: PCRE can only read up to 2 octal digits; if we get a strict
+          // PCRE mode, we should limit it here.
+          guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
+          else {
+            return nullScalar()
+          }
+          return .scalar(try Source.validateUnicodeScalar(digits, .octal))
 
-      // Octal numbers.
-      case "o" where src.tryEat("{"):
-        let str = try src.lexUntil(eating: "}")
-        return .scalar(try Source.validateUnicodeScalar(str, .octal))
-
-      case "0":
-        // We can read *up to* 3 more octal digits.
-        // FIXME: PCRE can only read up to 2 octal digits, if we get a strict
-        // PCRE mode, we should limit it here.
-        guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
-        else {
-          return nullScalar()
+        default:
+          return nil
         }
-        return .scalar(try Source.validateUnicodeScalar(digits, .octal))
-
-      default:
-        fatalError("Unexpected scalar start")
       }
     }.value
   }
@@ -802,6 +802,11 @@ extension Source {
   mutating func lexMatchingOptionSequence(
     context: ParsingContext
   ) throws -> AST.MatchingOptionSequence? {
+    // PCRE accepts '(?)', i.e. a matching option group with no options.
+    // TODO: This is a no-op; should we warn?
+    if peek() == ")" {
+      return .init(caretLoc: nil, adding: [], minusLoc: nil, removing: [])
+    }
     let ateCaret = recordLoc { $0.tryEat("^") }
 
     // TODO: Warn on duplicate options, and options appearing in both adding
@@ -1707,6 +1712,11 @@ extension Source {
         return ref
       }
 
+      // Hexadecimal and octal unicode scalars.
+      if let scalar = try src.lexUnicodeScalar() {
+        return scalar
+      }
+
       guard let char = src.tryEat() else {
         throw ParseError.expectedEscape
       }
@@ -1718,14 +1728,6 @@ extension Source {
         return .escaped(builtin)
       }
 
-      switch char {
-      // Hexadecimal and octal unicode scalars.
-      case "u", "x", "U", "o", "0":
-        return try src.expectUnicodeScalar(escapedCharacter: char)
-      default:
-        break
-      }
-
       // We only allow unknown escape sequences for non-letter non-number ASCII,
       // and non-ASCII whitespace.
       // TODO: Once we have fix-its, suggest a `0` prefix for octal `[\7]`.

diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift
@@ -61,41 +61,6 @@ extension RegexTests {
       _ = try src.lexNumber()
     }
 
-    func diagnoseUniScalarOverflow(_ input: String, base: Character) {
-      let scalars = input.first == "{"
-                  ? String(input.dropFirst().dropLast())
-                  : input
-      diagnose(
-        input,
-        expecting: .numberOverflow(scalars)
-      ) { src in
-        _ = try src.expectUnicodeScalar(escapedCharacter: base)
-      }
-    }
-    func diagnoseUniScalar(
-      _ input: String,
-      base: Character,
-      expectedDigits numDigits: Int
-    ) {
-      let scalars = input.first == "{"
-                  ? String(input.dropFirst().dropLast())
-                  : input
-      diagnose(
-        input,
-        expecting: .expectedNumDigits(scalars, numDigits)
-      ) { src in
-        _ = try src.expectUnicodeScalar(escapedCharacter: base)
-      }
-      _ = scalars
-    }
-
-    diagnoseUniScalar(
-      "12", base: "u", expectedDigits: 4)
-    diagnoseUniScalar(
-      "12", base: "U", expectedDigits: 8)
-    diagnoseUniScalarOverflow("{123456789}", base: "u")
-    diagnoseUniScalarOverflow("{123456789}", base: "x")
-
     // TODO: want to dummy print out source ranges, etc, test that.
   }
 

diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift
@@ -1002,6 +1002,9 @@ extension RegexTests {
               concat("a", atomicScriptRun("b"), "c"), throwsError: .unsupported)
 
     // Matching option changing groups.
+    parseTest("(?)", changeMatchingOptions(
+      matchingOptions()
+    ))
     parseTest("(?-)", changeMatchingOptions(
       matchingOptions()
     ))
@@ -2666,6 +2669,8 @@ extension RegexTests {
 
     diagnosticTest("\\", .expectedEscape)
 
+    diagnosticTest(#"\o"#, .invalidEscape("o"))
+
     // TODO: Custom diagnostic for control sequence
     diagnosticTest(#"\c"#, .unexpectedEndOfInput)
 
@@ -2877,6 +2882,11 @@ extension RegexTests {
     diagnosticTest(#"[\d--\u{a b}]"#, .unsupported("scalar sequence in custom character class"))
     diagnosticTest(#"[\d--[\u{a b}]]"#, .unsupported("scalar sequence in custom character class"))
 
+    diagnosticTest(#"\u12"#, .expectedNumDigits("12", 4))
+    diagnosticTest(#"\U12"#, .expectedNumDigits("12", 8))
+    diagnosticTest(#"\u{123456789}"#, .numberOverflow("123456789"))
+    diagnosticTest(#"\x{123456789}"#, .numberOverflow("123456789"))
+
     // MARK: Matching options
 
     diagnosticTest(#"(?^-"#, .cannotRemoveMatchingOptionsAfterCaret)