Convert scalar escape sequences to DSL scalars

hamishknight · hamishknight · commit ed9f72c50c5c · 2022-04-04T12:33:26.000+01:00
Convert AST escape sequences that represent a
scalar value (e.g. `\f`, `\n`, `\a`) into scalars in
the DSL tree. This allows the matching engine to
match against them.
diff --git a/Sources/_RegexParser/Regex/AST/Atom.swift b/Sources/_RegexParser/Regex/AST/Atom.swift
@@ -631,6 +631,41 @@ extension AST.Atom {
   }
 }
 
+extension AST.Atom.EscapedBuiltin {
+  /// The Unicode scalar denoted by this escape sequence (e.g. U+000A for
+  /// `.newline`), or `nil` if the escape does not represent a scalar value.
+  public var scalarValue: UnicodeScalar? {
+    switch self {
+    // TODO: Should we separate these into a separate enum? Or move the
+    // specifics of the scalar to the DSL tree?
+    case .alarm:
+      return "\u{7}"
+    case .backspace:
+      return "\u{8}"
+    case .escape:
+      return "\u{1B}"
+    case .formfeed:
+      return "\u{C}"
+    case .newline:
+      return "\n"
+    case .carriageReturn:
+      return "\r"
+    case .tab:
+      return "\t"
+
+    case .singleDataUnit, .decimalDigit, .notDecimalDigit,
+        .horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
+        .newlineSequence, .whitespace, .notWhitespace, .verticalTab,
+        .notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
+        .wordBoundary, .notWordBoundary, .startOfSubject,
+        .endOfSubjectBeforeNewline, .endOfSubject,
+        .firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
+        .textSegment, .notTextSegment:
+      return nil
+    }
+  }
+}
+
 extension AST.Atom {
   /// Retrieve the character value of the atom if it represents a literal
   /// character or unicode scalar, nil otherwise.
@@ -642,34 +677,7 @@ extension AST.Atom {
       return Character(s)
 
     case .escaped(let c):
-      switch c {
-      // TODO: Should we separate these into a separate enum? Or move the
-      // specifics of the scalar to the DSL tree?
-      case .alarm:
-        return "\u{7}"
-      case .backspace:
-        return "\u{8}"
-      case .escape:
-        return "\u{1B}"
-      case .formfeed:
-        return "\u{C}"
-      case .newline:
-        return "\n"
-      case .carriageReturn:
-        return "\r"
-      case .tab:
-        return "\t"
-
-      case .singleDataUnit, .decimalDigit, .notDecimalDigit,
-          .horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
-          .newlineSequence, .whitespace, .notWhitespace, .verticalTab,
-          .notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
-          .wordBoundary, .notWordBoundary, .startOfSubject,
-          .endOfSubjectBeforeNewline, .endOfSubject,
-          .firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
-          .textSegment, .notTextSegment:
-        return nil
-      }
+      return c.scalarValue.map(Character.init)
 
     case .keyboardControl, .keyboardMeta, .keyboardMetaControl:
       // TODO: These should have unicode scalar values.
diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift
@@ -211,6 +211,9 @@ extension AST.Atom {
     case .any:                  return .any
     case let .backreference(r): return .backreference(r)
 
+    case .escaped(let c) where c.scalarValue != nil:
+      return .scalar(c.scalarValue!)
+
     default: return .unconverted(self)
     }
   }
diff --git a/Tests/RegexTests/MatchTests.swift b/Tests/RegexTests/MatchTests.swift
@@ -281,6 +281,15 @@ extension RegexTests {
     // code point sequence
     firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true)
 
+    // Escape sequences that represent scalar values.
+    firstMatchTest(#"\a[\b]\e\f\n\r\t"#,
+                   input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t",
+                   match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t")
+    firstMatchTest(#"[\a][\b][\e][\f][\n][\r][\t]"#,
+                   input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t",
+                   match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t")
+
+    firstMatchTest(#"\r\n"#, input: "\r\n", match: "\r\n")
 
     // MARK: Quotes
 
@@ -596,24 +605,20 @@ extension RegexTests {
 
     func scalar(_ u: UnicodeScalar) -> UInt32 { u.value }
 
-    // Currently not supported in the matching engine.
     for s in scalar("\u{C}") ... scalar("\u{1B}") {
       let u = UnicodeScalar(s)!
-      firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)",
-                     xfail: true)
+      firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)")
     }
     for u: UnicodeScalar in ["\u{7}", "\u{8}"] {
-      firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)",
-                     xfail: true)
+      firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)")
     }
     for s in scalar("\u{A}") ... scalar("\u{D}") {
       let u = UnicodeScalar(s)!
-      firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)",
-                     xfail: true)
+      firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)")
     }
-    firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}",
-                   xfail: true)
+    firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}")
 
+    // Currently not supported in the matching engine.
     for c: UnicodeScalar in ["a", "b", "c"] {
       firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)",
                      xfail: true)

Original file line number	Diff line number	Diff line change
`@@ -211,6 +211,9 @@ extension AST.Atom {`
`211`	`211`	`case .any: return .any`
`212`	`212`	`case let .backreference(r): return .backreference(r)`
`213`	`213`
	`214`	`+ case .escaped(let c) where c.scalarValue != nil:`
	`215`	`+ return .scalar(c.scalarValue!)`
	`216`	`+`
`214`	`217`	`default: return .unconverted(self)`
`215`	`218`	`}`
`216`	`219`	`}`