Skip to content

Commit 87caf13

Browse files
committed
Allow certain escape sequences in character class ranges
Certain escape sequences denote a single Unicode scalar and, as such, are valid as bounds of a character class range.
1 parent 617edb9 commit 87caf13

File tree

4 files changed

+118
-7
lines changed

4 files changed

+118
-7
lines changed

Sources/_MatchingEngine/Regex/AST/Atom.swift

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -641,17 +641,67 @@ extension AST.Atom {
641641
case .scalar(let s):
642642
return Character(s)
643643

644+
case .escaped(let c):
645+
switch c {
646+
// TODO: Should we separate these into a separate enum? Or move the
647+
// specifics of the scalar to the DSL tree?
648+
case .alarm:
649+
return "\u{7}"
650+
case .backspace:
651+
return "\u{8}"
652+
case .escape:
653+
return "\u{1B}"
654+
case .formfeed:
655+
return "\u{C}"
656+
case .newline:
657+
return "\n"
658+
case .carriageReturn:
659+
return "\r"
660+
case .tab:
661+
return "\t"
662+
663+
case .singleDataUnit, .decimalDigit, .notDecimalDigit,
664+
.horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
665+
.newlineSequence, .whitespace, .notWhitespace, .verticalTab,
666+
.notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
667+
.wordBoundary, .notWordBoundary, .startOfSubject,
668+
.endOfSubjectBeforeNewline, .endOfSubject,
669+
.firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
670+
.textSegment, .notTextSegment:
671+
return nil
672+
}
673+
644674
case .keyboardControl, .keyboardMeta, .keyboardMetaControl:
645-
// TODO: Not a character per-say, what should we do?
646-
fallthrough
675+
// TODO: These should have unicode scalar values.
676+
return nil
647677

648-
case .property, .escaped, .any, .startOfLine, .endOfLine,
649-
.backreference, .subpattern, .namedCharacter, .callout,
650-
.backtrackingDirective:
678+
case .namedCharacter:
679+
// TODO: This should have a unicode scalar value depending on the name
680+
// given.
681+
// TODO: Do we want to validate and assign a scalar value when building
682+
// the AST? Or defer for the matching engine?
683+
return nil
684+
685+
case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern,
686+
.callout, .backtrackingDirective:
651687
return nil
652688
}
653689
}
654690

691+
/// Whether this atom may serve as the lower or upper bound of a custom character class range:
692+
/// true if it has a literal character value or is a control/meta sequence or named character.
693+
public var isValidCharacterClassRangeBound: Bool {
694+
// Any atom that yields a non-nil `literalCharacterValue` can be used as a bound.
695+
if literalCharacterValue != nil { return true }
696+
switch kind {
697+
// \cx, \C-x, \M-x, \M-\C-x, \N{...}: accepted as bounds even though they have no literal character value yet.
698+
case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter:
699+
return true
700+
default:
701+
return false
702+
}
703+
}
704+
655705
/// Produce a string literal representation of the atom, if possible
656706
///
657707
/// Individual characters will be returned, Unicode scalars will be

Sources/_MatchingEngine/Regex/Parse/Parse.swift

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -498,10 +498,11 @@ extension Parser {
498498
// Range between atoms.
499499
if let (dashLoc, rhs) =
500500
try source.lexCustomCharClassRangeEnd(context: context) {
501-
guard atom.literalCharacterValue != nil &&
502-
rhs.literalCharacterValue != nil else {
501+
guard atom.isValidCharacterClassRangeBound &&
502+
rhs.isValidCharacterClassRangeBound else {
503503
throw ParseError.invalidCharacterClassRangeOperand
504504
}
505+
// TODO: Validate lower <= upper?
505506
members.append(.range(.init(atom, dashLoc, rhs)))
506507
continue
507508
}

Tests/RegexTests/MatchTests.swift

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -585,6 +585,35 @@ extension RegexTests {
585585

586586
firstMatchTest("[[:script=Greek:]]", input: "123αβγxyz", match: "α")
587587

588+
func scalar(_ u: UnicodeScalar) -> UInt32 { u.value }
589+
590+
// Currently not supported in the matching engine.
591+
for s in scalar("\u{C}") ... scalar("\u{1B}") {
592+
let u = UnicodeScalar(s)!
593+
firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)",
594+
xfail: true)
595+
}
596+
for u: UnicodeScalar in ["\u{7}", "\u{8}"] {
597+
firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)",
598+
xfail: true)
599+
}
600+
for s in scalar("\u{A}") ... scalar("\u{D}") {
601+
let u = UnicodeScalar(s)!
602+
firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)",
603+
xfail: true)
604+
}
605+
firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}",
606+
xfail: true)
607+
608+
for c: UnicodeScalar in ["a", "b", "c"] {
609+
firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)",
610+
xfail: true)
611+
}
612+
for c: UnicodeScalar in ["$", "%", "&", "'"] {
613+
firstMatchTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#,
614+
input: "#()\(c)", match: "\(c)", xfail: true)
615+
}
616+
588617
// MARK: Operators
589618

590619
firstMatchTest(

Tests/RegexTests/ParseTests.swift

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -494,6 +494,25 @@ extension RegexTests {
494494
parseTest("[*]", charClass("*"))
495495
parseTest("[{0}]", charClass("{", "0", "}"))
496496

497+
parseTest(#"[\f-\e]"#, charClass(
498+
range_m(.escaped(.formfeed), .escaped(.escape))))
499+
parseTest(#"[\a-\b]"#, charClass(
500+
range_m(.escaped(.alarm), .escaped(.backspace))))
501+
parseTest(#"[\n-\r]"#, charClass(
502+
range_m(.escaped(.newline), .escaped(.carriageReturn))))
503+
parseTest(#"[\t-\t]"#, charClass(
504+
range_m(.escaped(.tab), .escaped(.tab))))
505+
506+
parseTest(#"[\cX-\cY\C-A-\C-B\M-\C-A-\M-\C-B\M-A-\M-B]"#, charClass(
507+
range_m(.keyboardControl("X"), .keyboardControl("Y")),
508+
range_m(.keyboardControl("A"), .keyboardControl("B")),
509+
range_m(.keyboardMetaControl("A"), .keyboardMetaControl("B")),
510+
range_m(.keyboardMeta("A"), .keyboardMeta("B"))
511+
))
512+
513+
parseTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, charClass(
514+
range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE"))))
515+
497516
// MARK: Operators
498517

499518
parseTest(
@@ -575,6 +594,15 @@ extension RegexTests {
575594
// Escaped U+3000 IDEOGRAPHIC SPACE.
576595
parseTest(#"\\#u{3000}"#, "\u{3000}")
577596

597+
// Control and meta controls.
598+
parseTest(#"\c "#, atom(.keyboardControl(" ")))
599+
parseTest(#"\c!"#, atom(.keyboardControl("!")))
600+
parseTest(#"\c~"#, atom(.keyboardControl("~")))
601+
parseTest(#"\C--"#, atom(.keyboardControl("-")))
602+
parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a")))
603+
parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-")))
604+
parseTest(#"\M-a"#, atom(.keyboardMeta("a")))
605+
578606
// MARK: Comments
579607

580608
parseTest(
@@ -1877,6 +1905,9 @@ extension RegexTests {
18771905

18781906
diagnosticTest("\\", .expectedEscape)
18791907

1908+
// TODO: Custom diagnostic for control sequence
1909+
diagnosticTest(#"\c"#, .unexpectedEndOfInput)
1910+
18801911
// TODO: Custom diagnostic for expected backref
18811912
diagnosticTest(#"\g"#, .invalidEscape("g"))
18821913
diagnosticTest(#"\k"#, .invalidEscape("k"))

0 commit comments

Comments
 (0)