Skip to content

Commit ed9f72c

Browse files
committed
Convert scalar escape sequences to DSL scalars
Convert AST escape sequences that represent a scalar value (e.g. `\f`, `\n`, `\a`) into scalars in the DSL tree. This allows the matching engine to match against them.
1 parent 36f7160 commit ed9f72c

File tree

3 files changed

+53
-37
lines changed

3 files changed

+53
-37
lines changed

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 36 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,41 @@ extension AST.Atom {
631631
}
632632
}
633633

634+
extension AST.Atom.EscapedBuiltin {
635+
/// The Unicode scalar denoted by this escape sequence, or `nil` if the
636+
/// builtin has no scalar value defined here.
///
/// Only `alarm` (U+0007), `backspace` (U+0008), `escape` (U+001B),
/// `formfeed` (U+000C), `newline`, `carriageReturn` and `tab` yield a
/// scalar; every other case returns `nil`.
637+
public var scalarValue: UnicodeScalar? {
638+
switch self {
639+
// TODO: Should we separate these into a separate enum? Or move the
640+
// specifics of the scalar to the DSL tree?
641+
case .alarm:
642+
return "\u{7}"
643+
case .backspace:
644+
return "\u{8}"
645+
case .escape:
646+
return "\u{1B}"
647+
case .formfeed:
648+
return "\u{C}"
649+
case .newline:
650+
return "\n"
651+
case .carriageReturn:
652+
return "\r"
653+
case .tab:
654+
return "\t"
655+
656+
// No scalar value is defined for the remaining builtins. They are listed
// explicitly rather than through a `default`, so the switch has to name
// every case it handles.
case .singleDataUnit, .decimalDigit, .notDecimalDigit,
657+
.horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
658+
.newlineSequence, .whitespace, .notWhitespace, .verticalTab,
659+
.notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
660+
.wordBoundary, .notWordBoundary, .startOfSubject,
661+
.endOfSubjectBeforeNewline, .endOfSubject,
662+
.firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
663+
.textSegment, .notTextSegment:
664+
return nil
665+
}
666+
}
667+
}
668+
634669
extension AST.Atom {
635670
/// Retrieve the character value of the atom if it represents a literal
636671
/// character or unicode scalar, nil otherwise.
@@ -642,34 +677,7 @@ extension AST.Atom {
642677
return Character(s)
643678

644679
case .escaped(let c):
645-
switch c {
646-
// TODO: Should we separate these into a separate enum? Or move the
647-
// specifics of the scalar to the DSL tree?
648-
case .alarm:
649-
return "\u{7}"
650-
case .backspace:
651-
return "\u{8}"
652-
case .escape:
653-
return "\u{1B}"
654-
case .formfeed:
655-
return "\u{C}"
656-
case .newline:
657-
return "\n"
658-
case .carriageReturn:
659-
return "\r"
660-
case .tab:
661-
return "\t"
662-
663-
case .singleDataUnit, .decimalDigit, .notDecimalDigit,
664-
.horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
665-
.newlineSequence, .whitespace, .notWhitespace, .verticalTab,
666-
.notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
667-
.wordBoundary, .notWordBoundary, .startOfSubject,
668-
.endOfSubjectBeforeNewline, .endOfSubject,
669-
.firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
670-
.textSegment, .notTextSegment:
671-
return nil
672-
}
680+
return c.scalarValue.map(Character.init)
673681

674682
case .keyboardControl, .keyboardMeta, .keyboardMetaControl:
675683
// TODO: These should have unicode scalar values.

Sources/_StringProcessing/Regex/ASTConversion.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,9 @@ extension AST.Atom {
211211
case .any: return .any
212212
case let .backreference(r): return .backreference(r)
213213

214+
case .escaped(let c) where c.scalarValue != nil:
215+
return .scalar(c.scalarValue!)
216+
214217
default: return .unconverted(self)
215218
}
216219
}

Tests/RegexTests/MatchTests.swift

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,15 @@ extension RegexTests {
281281
// code point sequence
282282
firstMatchTest(#"\u{61 62 63}"#, input: "123abcxyz", match: "abc", xfail: true)
283283

284+
// Escape sequences that represent scalar values.
285+
firstMatchTest(#"\a[\b]\e\f\n\r\t"#,
286+
input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t",
287+
match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t")
288+
firstMatchTest(#"[\a][\b][\e][\f][\n][\r][\t]"#,
289+
input: "\u{7}\u{8}\u{1B}\u{C}\n\r\t",
290+
match: "\u{7}\u{8}\u{1B}\u{C}\n\r\t")
291+
292+
firstMatchTest(#"\r\n"#, input: "\r\n", match: "\r\n")
284293

285294
// MARK: Quotes
286295

@@ -596,24 +605,20 @@ extension RegexTests {
596605

597606
func scalar(_ u: UnicodeScalar) -> UInt32 { u.value }
598607

599-
// Currently not supported in the matching engine.
600608
for s in scalar("\u{C}") ... scalar("\u{1B}") {
601609
let u = UnicodeScalar(s)!
602-
firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)",
603-
xfail: true)
610+
firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)")
604611
}
605612
for u: UnicodeScalar in ["\u{7}", "\u{8}"] {
606-
firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)",
607-
xfail: true)
613+
firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)")
608614
}
609615
for s in scalar("\u{A}") ... scalar("\u{D}") {
610616
let u = UnicodeScalar(s)!
611-
firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)",
612-
xfail: true)
617+
firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)")
613618
}
614-
firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}",
615-
xfail: true)
619+
firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}")
616620

621+
// Currently not supported in the matching engine.
617622
for c: UnicodeScalar in ["a", "b", "c"] {
618623
firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)",
619624
xfail: true)

0 commit comments

Comments
 (0)