Octal disambiguation

hamishknight · hamishknight · commit 3b5533fb8efd · 2021-12-18T10:49:40.000Z
Implement octal disambiguation for the `\nnn`
syntax: a backreference is only formed if the
number is less than 10, begins with 8 or 9, or
there have been at least that many prior groups.
Otherwise it is lexed as an octal sequence.

In addition, generalize the \0nn syntax to
support arbitrary \nnn octal sequences inside and
outside character classes.
diff --git a/Sources/_MatchingEngine/Regex/AST/Atom.swift b/Sources/_MatchingEngine/Regex/AST/Atom.swift
@@ -373,9 +373,7 @@ extension AST.Atom.CharacterProperty {
 public enum Reference: Hashable {
   // \n \gn \g{n} \g<n> \g'n' (?n) (?(n)...
   // Oniguruma: \k<n>, \k'n'
-  // If the reference was written as \n, and n could potentially be an octal
-  // sequence, `couldBeOctal` will be set to true.
-  case absolute(Int, couldBeOctal: Bool = false)
+  case absolute(Int)
 
   // \g{-n} \g<+n> \g'+n' \g<-n> \g'-n' (?+n) (?-n)
   // (?(+n)... (?(-n)...
diff --git a/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift b/Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift
@@ -234,7 +234,7 @@ extension Source {
   ///                | 'x'  HexDigit{2}
   ///                | 'U'  HexDigit{8}
   ///                | 'o{' OctalDigit{1...} '}'
-  ///                | '0'  OctalDigit{0...2}
+  ///                | OctalDigit{1...3}
   ///
   mutating func expectUnicodeScalar(
     escapedCharacter base: Character
@@ -257,12 +257,12 @@ extension Source {
         let str = try src.lexUntil(eating: "}").value
         return try Source.validateUnicodeScalar(str, .octal)
 
-      case "0":
+      case let c where c.isOctalDigit:
         // We can read *up to* 2 more octal digits per PCRE.
-        // FIXME: ICU can read up to 3 octal digits, we should have a parser
-        // mode to switch.
-        guard let str = src.tryEatPrefix(maxLength: 2, \.isOctalDigit)?.string
-        else { return Unicode.Scalar(0) }
+        // FIXME: ICU can read up to 3 octal digits if the leading digit is 0,
+        // we should have a parser mode to switch.
+        let nextDigits = src.tryEatPrefix(maxLength: 2, \.isOctalDigit)
+        let str = String(c) + (nextDigits?.string ?? "")
         return try Source.validateUnicodeScalar(str, .octal)
 
       default:
@@ -661,6 +661,10 @@ extension Source {
     }
   }
 
+  /// Try to lex an absolute or relative numbered reference.
+  ///
+  ///     NumberRef -> ('+' | '-')? <Decimal Number>
+  ///
   private mutating func lexNumberedReference(
   ) throws -> Located<Reference>? {
     try recordLoc { src in
@@ -677,6 +681,10 @@ extension Source {
     }
   }
 
+  /// Try to lex a numbered reference, or otherwise a named reference.
+  ///
+  ///     NameOrNumberRef -> NumberRef | <String>
+  ///
   private mutating func expectNamedOrNumberedReference(
     endingWith ending: String
   ) throws -> Located<Reference> {
@@ -712,9 +720,8 @@ extension Source {
   ///                       | 'k{' <String> '}'
   ///                       | [1-9] [0-9]+
   ///
-  ///     NumberRef -> ('+' | '-')? <Decimal Number>
-  ///     NameOrNumberRef -> NumberRef | <String>
   private mutating func lexEscapedReference(
+    priorGroupCount: Int
   ) throws -> Located<AST.Atom.Kind>? {
     try recordLoc { src in
       if src.tryEat("g") {
@@ -754,14 +761,27 @@ extension Source {
         return .char("k")
       }
 
-      // If we can lex a number other than 0 (as that's an octal sequence),
-      // it's a backreference. Though we should make a note of whether it could
-      // feasibly be an octal sequence, as the matching engine may need to
-      // treat it as such.
-      if src.peek() != "0", let num = try src.lexNumber()  {
-        let digits = src.input[num.location.range]
-        let couldBeOctal = digits.count > 1 && digits.all(\.isOctalDigit)
-        return .backreference(.absolute(num.value, couldBeOctal: couldBeOctal))
+      // Lexing \n is tricky, as it's ambiguous with octal sequences. In PCRE it
+      // is treated as a backreference if its first digit is not 0 (as that is
+      // always octal) and one of the following holds:
+      //
+      // - It's 0 < n < 10 (as octal would be pointless here)
+      // - Its first digit is 8 or 9 (as not valid octal)
+      // - There have been as many prior groups as the reference.
+      //
+      // Oniguruma follows the same rules except the second one. e.g \81 and \91
+      // are instead treated as literal 81 and 91 respectively.
+      // TODO: If we want a strict Oniguruma mode, we'll need to add a check
+      // here.
+      if src.peek() != "0", let digits = src.peekPrefix(\.isNumber) {
+        // First lex out the decimal digits and see if we can treat this as a
+        // backreference.
+        let num = try Source.validateNumber(digits.string, Int.self, .decimal)
+        if num < 10 || digits.first == "8" || digits.first == "9" ||
+            num <= priorGroupCount {
+          src.advance(digits.count)
+          return .backreference(.absolute(num))
+        }
       }
       return nil
     }
@@ -774,7 +794,7 @@ extension Source {
   ///                       | EscapedReference
   ///
   mutating func expectEscaped(
-    isInCustomCharacterClass ccc: Bool
+    isInCustomCharacterClass ccc: Bool, priorGroupCount: Int
   ) throws -> Located<AST.Atom.Kind> {
     try recordLoc { src in
       // Keyboard control/meta
@@ -799,16 +819,13 @@ extension Source {
       }
 
       // References using escape syntax, e.g \1, \g{1}, \k<...>, ...
-      if let ref = try src.lexEscapedReference()?.value {
+      // These are not valid inside custom character classes.
+      if !ccc, let ref = try src.lexEscapedReference(
+        priorGroupCount: priorGroupCount
+      )?.value {
         return ref
       }
 
-      // Hexadecimal and octal unicode scalars.
-      if let char = src.tryEat(anyOf: "u", "x", "U", "o", "0") {
-        return try .scalar(
-          src.expectUnicodeScalar(escapedCharacter: char).value)
-      }
-
       let char = src.eat()
 
       // Single-character builtins.
@@ -817,7 +834,17 @@ extension Source {
       ) {
         return .escaped(builtin)
       }
-      return .char(char)
+
+      switch char {
+      // Hexadecimal and octal unicode scalars. This must be done after
+      // backreference lexing due to the ambiguity with \nnn.
+      case let c where c.isOctalDigit: fallthrough
+      case "u", "x", "U", "o":
+        return try .scalar(
+          src.expectUnicodeScalar(escapedCharacter: char).value)
+      default:
+        return .char(char)
+      }
     }
   }
 
@@ -834,7 +861,7 @@ extension Source {
   ///     ExpGroupStart -> '(_:'
   ///
   mutating func lexAtom(
-    isInCustomCharacterClass customCC: Bool
+    isInCustomCharacterClass customCC: Bool, priorGroupCount: Int
   ) throws -> AST.Atom? {
     let kind: Located<AST.Atom.Kind>? = try recordLoc { src in
       // Check for not-an-atom, e.g. parser recursion termination
@@ -867,7 +894,8 @@ extension Source {
 
       // Escaped
       case "\\": return try src.expectEscaped(
-          isInCustomCharacterClass: customCC).value
+        isInCustomCharacterClass: customCC,
+        priorGroupCount: priorGroupCount).value
 
       case "]":
         assert(!customCC, "parser should have prevented this")
@@ -882,13 +910,16 @@ extension Source {
 
   /// Try to lex the end of a range in a custom character class, which consists
   /// of a '-' character followed by an atom.
-  mutating func lexCustomCharClassRangeEnd() throws -> AST.Atom? {
+  mutating func lexCustomCharClassRangeEnd(
+    priorGroupCount: Int
+  ) throws -> AST.Atom? {
     // Make sure we don't have a binary operator e.g '--', and the '-' is not
     // ending the custom character class (in which case it is literal).
     guard peekCCBinOp() == nil && !starts(with: "-]") && tryEat("-") else {
       return nil
     }
-    return try lexAtom(isInCustomCharacterClass: true)
+    return try lexAtom(isInCustomCharacterClass: true,
+                       priorGroupCount: priorGroupCount)
   }
 }
 
diff --git a/Sources/_MatchingEngine/Regex/Parse/Parse.swift b/Sources/_MatchingEngine/Regex/Parse/Parse.swift
@@ -43,8 +43,14 @@ Lexical analysis provides the following:
 private struct Parser {
   var source: Source
 
+  /// Tracks the number of parent custom character classes to allow us to
+  /// determine whether or not to lex with custom character class syntax.
   fileprivate var customCharacterClassDepth = 0
 
+  /// Tracks the number of group openings we've seen, to disambiguate the '\n'
+  /// syntax as a backreference or an octal sequence.
+  fileprivate var priorGroupCount = 0
+
   init(_ source: Source) {
     self.source = source
   }
@@ -163,6 +169,7 @@ extension Parser {
     let _start = source.currentPosition
 
     if let kind = try source.lexGroupStart() {
+      priorGroupCount += 1
       let child = try parse()
       try source.expect(")")
       return .group(.init(kind, child, loc(_start)))
@@ -173,7 +180,8 @@ extension Parser {
     }
 
     if let atom = try source.lexAtom(
-      isInCustomCharacterClass: isInCustomCharacterClass
+      isInCustomCharacterClass: isInCustomCharacterClass,
+      priorGroupCount: priorGroupCount
     ) {
       // TODO: track source locations
       return .atom(atom)
@@ -247,11 +255,14 @@ extension Parser {
         continue
       }
 
-      guard let atom = try source.lexAtom(isInCustomCharacterClass: true)
-        else { break }
+      guard let atom = try source.lexAtom(
+        isInCustomCharacterClass: true, priorGroupCount: priorGroupCount)
+      else { break }
 
       // Range between atoms.
-      if let rhs = try source.lexCustomCharClassRangeEnd() {
+      if let rhs = try source.lexCustomCharClassRangeEnd(
+        priorGroupCount: priorGroupCount
+      ) {
         guard atom.literalCharacterValue != nil &&
               rhs.literalCharacterValue != nil else {
           throw ParseError.invalidCharacterClassRangeOperand
diff --git a/Sources/_MatchingEngine/Regex/Parse/Source.swift b/Sources/_MatchingEngine/Regex/Parse/Source.swift
@@ -119,6 +119,15 @@ extension Source {
   mutating func tryEatPrefix(
     maxLength: Int? = nil,
     _ f: (Char) -> Bool
+  ) -> Input.SubSequence? {
+    guard let pre = peekPrefix(maxLength: maxLength, f) else { return nil }
+    defer { self.advance(pre.count) }
+    return pre
+  }
+
+  func peekPrefix(
+    maxLength: Int? = nil,
+    _ f: (Char) -> Bool
   ) -> Input.SubSequence? {
     let chunk: Input.SubSequence
     if let maxLength = maxLength {
@@ -129,7 +138,6 @@ extension Source {
     let pre = chunk.prefix(while: f)
     guard !pre.isEmpty else { return nil }
 
-    defer { self.advance(pre.count) }
     return pre
   }
 
diff --git a/Sources/_StringProcessing/ASTBuilder.swift b/Sources/_StringProcessing/ASTBuilder.swift
@@ -30,6 +30,10 @@ func concat(_ asts: AST...) -> AST {
   concat(asts)
 }
 
+func empty() -> AST {
+  .empty(.init(.fake))
+}
+
 func group(
   _ kind: AST.Group.Kind, _ child: AST
 ) -> AST {
@@ -182,6 +186,19 @@ func escaped(
 func scalar(_ s: Unicode.Scalar) -> AST {
   atom(.scalar(s))
 }
+func scalar_m(_ s: Unicode.Scalar) -> AST.CustomCharacterClass.Member {
+  atom_m(.scalar(s))
+}
+
+func backreference(_ r: Reference) -> AST {
+  atom(.backreference(r))
+}
+func subpattern(_ r: Reference) -> AST {
+  atom(.subpattern(r))
+}
+func condition(_ r: Reference) -> AST {
+  atom(.condition(r))
+}
 
 func prop(
   _ kind: AST.Atom.CharacterProperty.Kind,
diff --git a/Tests/RegexTests/LexTests.swift b/Tests/RegexTests/LexTests.swift
@@ -29,6 +29,13 @@ func diagnose(
   }
 }
 
+extension Source {
+  @discardableResult
+  fileprivate mutating func lexBasicAtom() throws -> AST.Atom? {
+    try lexAtom(isInCustomCharacterClass: false, priorGroupCount: 0)
+  }
+}
+
 extension RegexTests {
   func testLexicalAnalysis() {
     diagnose("a", expecting: .expected("b")) { src in
@@ -92,27 +99,13 @@ extension RegexTests {
     }
 
     // Test expected closing delimiters.
-    diagnose(#"\u{5"#, expecting: .expected("}")) { src in
-      _ = try src.lexAtom(isInCustomCharacterClass: false)
-    }
-    diagnose(#"\x{5"#, expecting: .expected("}")) { src in
-      _ = try src.lexAtom(isInCustomCharacterClass: false)
-    }
-    diagnose(#"\N{A"#, expecting: .expected("}")) { src in
-      _ = try src.lexAtom(isInCustomCharacterClass: false)
-    }
-    diagnose(#"\N{U+A"#, expecting: .expected("}")) { src in
-      _ = try src.lexAtom(isInCustomCharacterClass: false)
-    }
-    diagnose(#"\p{a"#, expecting: .expected("}")) { src in
-      _ = try src.lexAtom(isInCustomCharacterClass: false)
-    }
-    diagnose(#"\p{a="#, expecting: .expected("}")) { src in
-      _ = try src.lexAtom(isInCustomCharacterClass: false)
-    }
-    diagnose(#"(?#"#, expecting: .expected(")")) { src in
-      _ = try src.lexComment()
-    }
+    diagnose(#"\u{5"#, expecting: .expected("}")) { try $0.lexBasicAtom() }
+    diagnose(#"\x{5"#, expecting: .expected("}")) { try $0.lexBasicAtom() }
+    diagnose(#"\N{A"#, expecting: .expected("}")) { try $0.lexBasicAtom() }
+    diagnose(#"\N{U+A"#, expecting: .expected("}")) { try $0.lexBasicAtom() }
+    diagnose(#"\p{a"#, expecting: .expected("}")) { try $0.lexBasicAtom() }
+    diagnose(#"\p{a="#, expecting: .expected("}")) { try $0.lexBasicAtom() }
+    diagnose(#"(?#"#, expecting: .expected(")")) { _ = try $0.lexComment() }
 
     // TODO: want to dummy print out source ranges, etc, test that.
   }
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
@@ -531,7 +531,7 @@ extension RegexTests {
     matchTest(#"(.)\1"#, input: "112", match: "11", xfail: true)
     matchTest(#"(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\10"#,
               input: "aaaaaaaaabbc", match: "aaaaaaaaabb", xfail: true)
-    matchTest(#"(.)\10"#, input: "a\u{8}b", match: "a\u{8}", xfail: true)
+    matchTest(#"(.)\10"#, input: "a\u{8}b", match: "a\u{8}")
 
     matchTest(#"(.)\g001"#, input: "112", match: "11", xfail: true)
     matchTest(#"(.)(.)\g-02"#, input: "abac", match: "aba", xfail: true)
diff --git a/Tests/RegexTests/ParseTests.swift b/Tests/RegexTests/ParseTests.swift