Skip to content

Commit 9889ae7

Browse files
authored
Merge pull request #226 from hamishknight/esc
2 parents d2ff78f + cdf98c5 commit 9889ae7

File tree

7 files changed

+177
-93
lines changed

7 files changed

+177
-93
lines changed

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -641,17 +641,67 @@ extension AST.Atom {
641641
case .scalar(let s):
642642
return Character(s)
643643

644+
case .escaped(let c):
645+
switch c {
646+
// TODO: Should we separate these into a separate enum? Or move the
647+
// specifics of the scalar to the DSL tree?
648+
case .alarm:
649+
return "\u{7}"
650+
case .backspace:
651+
return "\u{8}"
652+
case .escape:
653+
return "\u{1B}"
654+
case .formfeed:
655+
return "\u{C}"
656+
case .newline:
657+
return "\n"
658+
case .carriageReturn:
659+
return "\r"
660+
case .tab:
661+
return "\t"
662+
663+
case .singleDataUnit, .decimalDigit, .notDecimalDigit,
664+
.horizontalWhitespace, .notHorizontalWhitespace, .notNewline,
665+
.newlineSequence, .whitespace, .notWhitespace, .verticalTab,
666+
.notVerticalTab, .wordCharacter, .notWordCharacter, .graphemeCluster,
667+
.wordBoundary, .notWordBoundary, .startOfSubject,
668+
.endOfSubjectBeforeNewline, .endOfSubject,
669+
.firstMatchingPositionInSubject, .resetStartOfMatch, .trueAnychar,
670+
.textSegment, .notTextSegment:
671+
return nil
672+
}
673+
644674
case .keyboardControl, .keyboardMeta, .keyboardMetaControl:
645-
// TODO: Not a character per-say, what should we do?
646-
fallthrough
675+
// TODO: These should have unicode scalar values.
676+
return nil
647677

648-
case .property, .escaped, .any, .startOfLine, .endOfLine,
649-
.backreference, .subpattern, .namedCharacter, .callout,
650-
.backtrackingDirective:
678+
case .namedCharacter:
679+
// TODO: This should have a unicode scalar value depending on the name
680+
// given.
681+
// TODO: Do we want to validate and assign a scalar value when building
682+
// the AST? Or defer for the matching engine?
683+
return nil
684+
685+
case .property, .any, .startOfLine, .endOfLine, .backreference, .subpattern,
686+
.callout, .backtrackingDirective:
651687
return nil
652688
}
653689
}
654690

691+
/// Whether this atom is valid as the operand of a custom character class
692+
/// range.
693+
public var isValidCharacterClassRangeBound: Bool {
694+
// If we have a literal character value for this, it can be used as a bound.
695+
if literalCharacterValue != nil { return true }
696+
switch kind {
697+
// \cx, \C-x, \M-x, \M-\C-x, \N{...}
698+
case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter:
699+
return true
700+
default:
701+
return false
702+
}
703+
}
704+
655705
/// Produce a string literal representation of the atom, if possible
656706
///
657707
/// Individual characters will be returned, Unicode scalars will be

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ enum ParseError: Error, Hashable {
3939

4040
case expectedNonEmptyContents
4141
case expectedEscape
42+
case invalidEscape(Character)
4243

4344
case cannotReferToWholePattern
4445

@@ -107,6 +108,8 @@ extension ParseError: CustomStringConvertible {
107108
return "expected non-empty contents"
108109
case .expectedEscape:
109110
return "expected escape sequence"
111+
case .invalidEscape(let c):
112+
return "invalid escape sequence '\\\(c)'"
110113
case .cannotReferToWholePattern:
111114
return "cannot refer to whole pattern here"
112115
case .notQuantifiable:

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1489,8 +1489,17 @@ extension Source {
14891489
return try .scalar(
14901490
src.expectUnicodeScalar(escapedCharacter: char).value)
14911491
default:
1492-
return .char(char)
1492+
break
14931493
}
1494+
1495+
// We only allow unknown escape sequences for non-letter ASCII, and
1496+
// non-ASCII whitespace.
1497+
guard (char.isASCII && !char.isLetter) ||
1498+
(!char.isASCII && char.isWhitespace)
1499+
else {
1500+
throw ParseError.invalidEscape(char)
1501+
}
1502+
return .char(char)
14941503
}
14951504
}
14961505

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -425,6 +425,12 @@ extension Parser {
425425
try source.expectNonEmpty()
426426

427427
var members: Array<Member> = []
428+
429+
// We can eat an initial ']', as PCRE, Oniguruma, and ICU forbid empty
430+
// character classes, and assume an initial ']' is literal.
431+
if let loc = source.tryEatWithLoc("]") {
432+
members.append(.atom(.init(.char("]"), loc)))
433+
}
428434
try parseCCCMembers(into: &members)
429435

430436
// If we have a binary set operator, parse it and the next members. Note
@@ -489,10 +495,11 @@ extension Parser {
489495
// Range between atoms.
490496
if let (dashLoc, rhs) =
491497
try source.lexCustomCharClassRangeEnd(context: context) {
492-
guard atom.literalCharacterValue != nil &&
493-
rhs.literalCharacterValue != nil else {
498+
guard atom.isValidCharacterClassRangeBound &&
499+
rhs.isValidCharacterClassRangeBound else {
494500
throw ParseError.invalidCharacterClassRangeOperand
495501
}
502+
// TODO: Validate lower <= upper?
496503
members.append(.range(.init(atom, dashLoc, rhs)))
497504
continue
498505
}

Sources/_StringProcessing/CharacterClass.swift

Lines changed: 0 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -319,21 +319,6 @@ extension CharacterClass {
319319
}
320320
}
321321

322-
extension AST.Node {
323-
/// If this has a character class representation, whether built-in or custom, return it.
324-
///
325-
/// TODO: Not sure if this the right model type, but I suspect we'll want to produce
326-
/// something like this on demand
327-
var characterClass: CharacterClass? {
328-
switch self {
329-
case let .customCharacterClass(cc): return cc.modelCharacterClass
330-
case let .atom(a): return a.characterClass
331-
332-
default: return nil
333-
}
334-
}
335-
}
336-
337322
extension DSLTree.Node {
338323
var characterClass: CharacterClass? {
339324
switch self {
@@ -502,66 +487,6 @@ extension DSLTree.CustomCharacterClass {
502487
}
503488
}
504489

505-
extension AST.CustomCharacterClass {
506-
/// The model character class for this custom character class.
507-
var modelCharacterClass: CharacterClass? {
508-
typealias Component = CharacterClass.CharacterSetComponent
509-
func getComponents(_ members: [Member]) -> [Component]? {
510-
var result = Array<Component>()
511-
for m in members {
512-
switch m {
513-
case .custom(let cc):
514-
guard let cc = cc.modelCharacterClass else {
515-
return nil
516-
}
517-
result.append(.characterClass(cc))
518-
case .range(let r):
519-
result.append(.range(
520-
r.lhs.literalCharacterValue! ...
521-
r.rhs.literalCharacterValue!))
522-
523-
case .atom(let a):
524-
if let cc = a.characterClass {
525-
result.append(.characterClass(cc))
526-
} else if let lit = a.literalCharacterValue {
527-
result.append(.character(lit))
528-
} else {
529-
return nil
530-
}
531-
532-
case .quote(let q):
533-
// Decompose quoted literal into literal characters.
534-
result += q.literal.map { .character($0) }
535-
536-
case .trivia:
537-
// Not semantically important.
538-
break
539-
540-
case .setOperation(let lhs, let op, let rhs):
541-
// FIXME: CharacterClass wasn't designed for set operations with
542-
// multiple components in each operand, we should fix that. For now,
543-
// just produce custom components.
544-
guard let lhs = getComponents(lhs),
545-
let rhs = getComponents(rhs)
546-
else {
547-
return nil
548-
}
549-
result.append(.setOperation(.init(
550-
lhs: .characterClass(.custom(lhs)),
551-
op: op.value,
552-
rhs: .characterClass(.custom(rhs)))))
553-
}
554-
}
555-
return result
556-
}
557-
guard let comps = getComponents(members) else {
558-
return nil
559-
}
560-
let cc = CharacterClass.custom(comps)
561-
return self.isInverted ? cc.inverted : cc
562-
}
563-
}
564-
565490
extension CharacterClass {
566491
// FIXME: Calling on inverted sets won't be the same as the
567492
// inverse of a boundary if at the start or end of the

Tests/RegexTests/MatchTests.swift

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,35 @@ extension RegexTests {
594594

595595
firstMatchTest("[[:script=Greek:]]", input: "123αβγxyz", match: "α")
596596

597+
func scalar(_ u: UnicodeScalar) -> UInt32 { u.value }
598+
599+
// Currently not supported in the matching engine.
600+
for s in scalar("\u{C}") ... scalar("\u{1B}") {
601+
let u = UnicodeScalar(s)!
602+
firstMatchTest(#"[\f-\e]"#, input: "\u{B}\u{1C}\(u)", match: "\(u)",
603+
xfail: true)
604+
}
605+
for u: UnicodeScalar in ["\u{7}", "\u{8}"] {
606+
firstMatchTest(#"[\a-\b]"#, input: "\u{6}\u{9}\(u)", match: "\(u)",
607+
xfail: true)
608+
}
609+
for s in scalar("\u{A}") ... scalar("\u{D}") {
610+
let u = UnicodeScalar(s)!
611+
firstMatchTest(#"[\n-\r]"#, input: "\u{9}\u{E}\(u)", match: "\(u)",
612+
xfail: true)
613+
}
614+
firstMatchTest(#"[\t-\t]"#, input: "\u{8}\u{A}\u{9}", match: "\u{9}",
615+
xfail: true)
616+
617+
for c: UnicodeScalar in ["a", "b", "c"] {
618+
firstMatchTest(#"[\c!-\C-#]"#, input: "def\(c)", match: "\(c)",
619+
xfail: true)
620+
}
621+
for c: UnicodeScalar in ["$", "%", "&", "'"] {
622+
firstMatchTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#,
623+
input: "#()\(c)", match: "\(c)", xfail: true)
624+
}
625+
597626
// MARK: Operators
598627

599628
firstMatchTest(

Tests/RegexTests/ParseTests.swift

Lines changed: 71 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,10 @@ extension RegexTests {
428428

429429
parseTest("[-]", charClass("-"))
430430

431+
// Empty character classes are forbidden, therefore this is a character
432+
// class of literal ']'.
433+
parseTest("[]]", charClass("]"))
434+
431435
// These are metacharacters in certain contexts, but normal characters
432436
// otherwise.
433437
parseTest(
@@ -494,6 +498,25 @@ extension RegexTests {
494498
parseTest("[*]", charClass("*"))
495499
parseTest("[{0}]", charClass("{", "0", "}"))
496500

501+
parseTest(#"[\f-\e]"#, charClass(
502+
range_m(.escaped(.formfeed), .escaped(.escape))))
503+
parseTest(#"[\a-\b]"#, charClass(
504+
range_m(.escaped(.alarm), .escaped(.backspace))))
505+
parseTest(#"[\n-\r]"#, charClass(
506+
range_m(.escaped(.newline), .escaped(.carriageReturn))))
507+
parseTest(#"[\t-\t]"#, charClass(
508+
range_m(.escaped(.tab), .escaped(.tab))))
509+
510+
parseTest(#"[\cX-\cY\C-A-\C-B\M-\C-A-\M-\C-B\M-A-\M-B]"#, charClass(
511+
range_m(.keyboardControl("X"), .keyboardControl("Y")),
512+
range_m(.keyboardControl("A"), .keyboardControl("B")),
513+
range_m(.keyboardMetaControl("A"), .keyboardMetaControl("B")),
514+
range_m(.keyboardMeta("A"), .keyboardMeta("B"))
515+
))
516+
517+
parseTest(#"[\N{DOLLAR SIGN}-\N{APOSTROPHE}]"#, charClass(
518+
range_m(.namedCharacter("DOLLAR SIGN"), .namedCharacter("APOSTROPHE"))))
519+
497520
// MARK: Operators
498521

499522
parseTest(
@@ -544,9 +567,8 @@ extension RegexTests {
544567
#"a\Q \Q \\.\Eb"#,
545568
concat("a", quote(#" \Q \\."#), "b"))
546569

547-
// These follow the PCRE behavior.
570+
// This follows the PCRE behavior.
548571
parseTest(#"\Q\\E"#, quote("\\"))
549-
parseTest(#"\E"#, "E")
550572

551573
parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"),
552574
syntax: .experimental)
@@ -566,6 +588,25 @@ extension RegexTests {
566588

567589
parseTest(#"["-"]"#, charClass(range_m("\"", "\"")))
568590

591+
// MARK: Escapes
592+
593+
// Not metachars, but we allow their escape as ASCII.
594+
parseTest(#"\<"#, "<")
595+
parseTest(#"\ "#, " ")
596+
parseTest(#"\\"#, "\\")
597+
598+
// Escaped U+3000 IDEOGRAPHIC SPACE.
599+
parseTest(#"\\#u{3000}"#, "\u{3000}")
600+
601+
// Control and meta controls.
602+
parseTest(#"\c "#, atom(.keyboardControl(" ")))
603+
parseTest(#"\c!"#, atom(.keyboardControl("!")))
604+
parseTest(#"\c~"#, atom(.keyboardControl("~")))
605+
parseTest(#"\C--"#, atom(.keyboardControl("-")))
606+
parseTest(#"\M-\C-a"#, atom(.keyboardMetaControl("a")))
607+
parseTest(#"\M-\C--"#, atom(.keyboardMetaControl("-")))
608+
parseTest(#"\M-a"#, atom(.keyboardMeta("a")))
609+
569610
// MARK: Comments
570611

571612
parseTest(
@@ -989,13 +1030,6 @@ extension RegexTests {
9891030
// Backreferences are not valid in custom character classes.
9901031
parseTest(#"[\8]"#, charClass("8"))
9911032
parseTest(#"[\9]"#, charClass("9"))
992-
parseTest(#"[\g]"#, charClass("g"))
993-
parseTest(#"[\g+30]"#, charClass("g", "+", "3", "0"))
994-
parseTest(#"[\g{1}]"#, charClass("g", "{", "1", "}"))
995-
parseTest(#"[\k'a']"#, charClass("k", "'", "a", "'"))
996-
997-
parseTest(#"\g"#, atom(.char("g")))
998-
parseTest(#"\k"#, atom(.char("k")))
9991033

10001034
// MARK: Character names.
10011035

@@ -1526,7 +1560,7 @@ extension RegexTests {
15261560
parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x"))
15271561

15281562
parseWithDelimitersTest(#"re'🔥🇩🇰'"#, concat("🔥", "🇩🇰"))
1529-
parseWithDelimitersTest(#"re'\🔥✅'"#, concat("🔥", "✅"))
1563+
parseWithDelimitersTest(#"re'🔥✅'"#, concat("🔥", "✅"))
15301564

15311565
// Printable ASCII characters.
15321566
delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##)
@@ -1871,10 +1905,37 @@ extension RegexTests {
18711905
diagnosticTest("(?<a-b", .expected(">"))
18721906
diagnosticTest("(?<a-b>", .expected(")"))
18731907

1908+
// The first ']' of a custom character class is literal, so this is missing
1909+
// the closing bracket.
1910+
diagnosticTest("[]", .expected("]"))
1911+
18741912
// MARK: Bad escapes
18751913

18761914
diagnosticTest("\\", .expectedEscape)
18771915

1916+
// TODO: Custom diagnostic for control sequence
1917+
diagnosticTest(#"\c"#, .unexpectedEndOfInput)
1918+
1919+
// TODO: Custom diagnostic for expected backref
1920+
diagnosticTest(#"\g"#, .invalidEscape("g"))
1921+
diagnosticTest(#"\k"#, .invalidEscape("k"))
1922+
1923+
// TODO: Custom diagnostic for backref in custom char class
1924+
diagnosticTest(#"[\g]"#, .invalidEscape("g"))
1925+
diagnosticTest(#"[\g+30]"#, .invalidEscape("g"))
1926+
diagnosticTest(#"[\g{1}]"#, .invalidEscape("g"))
1927+
diagnosticTest(#"[\k'a']"#, .invalidEscape("k"))
1928+
1929+
// TODO: Custom diagnostic for missing '\Q'
1930+
diagnosticTest(#"\E"#, .invalidEscape("E"))
1931+
1932+
// Non-ASCII non-whitespace cases.
1933+
diagnosticTest(#"\🔥"#, .invalidEscape("🔥"))
1934+
diagnosticTest(#"\🇩🇰"#, .invalidEscape("🇩🇰"))
1935+
diagnosticTest(#"\e\#u{301}"#, .invalidEscape("e\u{301}"))
1936+
diagnosticTest(#"\\#u{E9}"#, .invalidEscape("é"))
1937+
diagnosticTest(#"\˂"#, .invalidEscape("˂"))
1938+
18781939
// MARK: Text Segment options
18791940

18801941
diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions)

0 commit comments

Comments
 (0)