Skip to content

Commit 767a95f

Browse files
committed
Parse escaped backreferences and subpatterns
Parse the escaped syntaxes for backreferences and subpatterns (the latter are syntactically so similar to backreferences that it made sense to parse them as well). This doesn't yet handle the non-escaped syntax for either, in particular Python-style backreferences `(?P=...)`. Those are syntactically similar to groups (despite being atoms), so they will require some more thought on how to parse.
1 parent 6cfceaf commit 767a95f

File tree

5 files changed

+194
-29
lines changed

5 files changed

+194
-29
lines changed

Sources/_MatchingEngine/Regex/AST/Atom.swift

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -52,8 +52,6 @@ extension AST {
5252
case endOfLine
5353

5454
// References
55-
//
56-
// TODO: Haven't thought through these a ton
5755
case backreference(Reference)
5856
case subpattern(Reference)
5957
case condition(Reference)
@@ -375,7 +373,9 @@ extension AST.Atom.CharacterProperty {
375373
public enum Reference: Hashable {
376374
// \n \gn \g{n} \g<n> \g'n' (?n) (?(n)...
377375
// Oniguruma: \k<n>, \k'n'
378-
case absolute(Int)
376+
// If the reference was written as \n, and n could potentially be an octal
377+
// sequence, `couldBeOctal` will be set to true.
378+
case absolute(Int, couldBeOctal: Bool = false)
379379

380380
// \g{-n} \g<+n> \g'+n' \g<-n> \g'-n' (?+n) (?-n)
381381
// (?(+n)... (?(-n)...
@@ -414,12 +414,8 @@ extension AST.Atom: _ASTPrintable {
414414
case .startOfLine: return "^"
415415
case .endOfLine: return "$"
416416

417-
case .backreference(_):
418-
fatalError("TODO")
419-
case .subpattern(_):
420-
fatalError("TODO")
421-
case .condition(_):
422-
fatalError("TODO")
417+
case .backreference(let r), .subpattern(let r), .condition(let r):
418+
return "\(r)"
423419

424420
case .char, .scalar:
425421
fatalError("Unreachable")

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 129 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,13 @@ extension Source {
205205
try lexNumber(Int.self, .decimal)
206206
}
207207

208+
mutating func expectNumber() throws -> Located<Int> {
209+
guard let num = try lexNumber() else {
210+
throw ParseError.expectedNumber("", kind: .decimal)
211+
}
212+
return num
213+
}
214+
208215
/// Eat a scalar value from hexadecimal notation off the front
209216
private mutating func expectUnicodeScalar(
210217
numDigits: Int
@@ -654,12 +661,118 @@ extension Source {
654661
}
655662
}
656663

664+
private mutating func lexNumberedReference(
665+
) throws -> Located<Reference>? {
666+
try recordLoc { src in
667+
if src.tryEat("+") {
668+
return .relative(try src.expectNumber().value)
669+
}
670+
if src.tryEat("-") {
671+
return .relative(try -src.expectNumber().value)
672+
}
673+
if let num = try src.lexNumber() {
674+
return .absolute(num.value)
675+
}
676+
return nil
677+
}
678+
}
679+
680+
private mutating func expectNamedOrNumberedReference(
681+
endingWith ending: String
682+
) throws -> Located<Reference> {
683+
try recordLoc { src in
684+
if let numbered = try src.lexNumberedReference() {
685+
try src.expect(sequence: ending)
686+
return numbered.value
687+
}
688+
return .named(try src.lexUntil(eating: ending).value)
689+
}
690+
}
691+
692+
private static func getClosingDelimiter(
693+
for openChar: Character
694+
) -> Character {
695+
switch openChar {
696+
case "<": return ">"
697+
case "'": return "'"
698+
case "{": return "}"
699+
default:
700+
fatalError("Not implemented")
701+
}
702+
}
703+
704+
/// Lex an escaped reference for a backreference or subpattern.
705+
///
706+
/// EscapedReference -> 'g{' NameOrNumberRef '}'
707+
/// | 'g<' NameOrNumberRef '>'
708+
/// | "g'" NameOrNumberRef "'"
709+
/// | 'g' NumberRef
710+
/// | 'k<' <String> '>'
711+
/// | "k'" <String> "'"
712+
/// | 'k{' <String> '}'
713+
/// | [1-9] [0-9]*
714+
///
715+
/// NumberRef -> ('+' | '-')? <Decimal Number>
716+
/// NameOrNumberRef -> NumberRef | <String>
717+
private mutating func lexEscapedReference(
718+
) throws -> Located<AST.Atom.Kind>? {
719+
try recordLoc { src in
720+
if src.tryEat("g") {
721+
// PCRE-style backreferences.
722+
if src.tryEat("{") {
723+
let ref = try src.expectNamedOrNumberedReference(
724+
endingWith: "}").value
725+
return .backreference(ref)
726+
}
727+
728+
// Oniguruma-style subpatterns.
729+
if let openChar = src.tryEat(anyOf: "<", "'") {
730+
let ref = try src.expectNamedOrNumberedReference(
731+
endingWith: String(Source.getClosingDelimiter(for: openChar))).value
732+
return .subpattern(ref)
733+
}
734+
735+
// PCRE allows \g followed by a bare numeric reference.
736+
if let ref = try src.lexNumberedReference() {
737+
return .backreference(ref.value)
738+
}
739+
740+
// Fall back to a literal character. We need to return here as we've
741+
// already eaten the 'g'.
742+
return .char("g")
743+
}
744+
745+
if src.tryEat("k") {
746+
// Perl/.NET-style backreferences.
747+
if let openChar = src.tryEat(anyOf: "<", "'", "{") {
748+
let closingChar = Source.getClosingDelimiter(for: openChar)
749+
return .backreference(.named(
750+
try src.lexUntil(eating: closingChar).value))
751+
}
752+
// Fall back to a literal character. We need to return here as we've
753+
// already eaten the 'k'.
754+
return .char("k")
755+
}
756+
757+
// If we can lex a number that doesn't start with 0 (as that's an octal sequence),
758+
// it's a backreference. Though we should make a note of whether it could
759+
// feasibly be an octal sequence, as the matching engine may need to
760+
// treat it as such.
761+
if src.peek() != "0", let num = try src.lexNumber() {
762+
let digits = src.input[num.location.range]
763+
let couldBeOctal = digits.count > 1 && digits.all(\.isOctalDigit)
764+
return .backreference(.absolute(num.value, couldBeOctal: couldBeOctal))
765+
}
766+
return nil
767+
}
768+
}
769+
657770
/// Consume an escaped atom, starting from after the backslash
658771
///
659772
/// Escaped -> KeyboardModified | Builtin
660773
/// | UniScalar | Property | NamedCharacter
774+
/// | EscapedReference
661775
///
662-
/// TODO: references
663776
mutating func expectEscaped(
664777
isInCustomCharacterClass ccc: Bool
665778
) throws -> Located<AST.Atom.Kind> {
@@ -685,30 +798,26 @@ extension Source {
685798
return .property(prop.value)
686799
}
687800

688-
let char = src.eat()
689-
690-
// Single-character builtins
691-
if let builtin = AST.Atom.EscapedBuiltin(
692-
char, inCustomCharacterClass: ccc
693-
) {
694-
return .escaped(builtin)
801+
// References using escape syntax, e.g. \1, \g{1}, \k<...>, ...
802+
if let ref = try src.lexEscapedReference()?.value {
803+
return ref
695804
}
696805

697-
switch char {
698-
// Scalars
699-
case "u", "x", "U", "o", "0":
806+
// Hexadecimal and octal unicode scalars.
807+
if let char = src.tryEat(anyOf: "u", "x", "U", "o", "0") {
700808
return try .scalar(
701809
src.expectUnicodeScalar(escapedCharacter: char).value)
810+
}
702811

703-
// Unicode property checks
704-
case "p", "P":
705-
fatalError("TODO: properties")
706-
707-
case "1"..."9", "g", "k":
708-
fatalError("TODO: References")
812+
let char = src.eat()
709813

710-
default: return .char(char)
814+
// Single-character builtins.
815+
if let builtin = AST.Atom.EscapedBuiltin(
816+
char, inCustomCharacterClass: ccc
817+
) {
818+
return .escaped(builtin)
711819
}
820+
return .char(char)
712821
}
713822
}
714823

@@ -741,6 +850,8 @@ extension Source {
741850
return .property(prop)
742851
}
743852

853+
// TODO: Python-style backreferences (?P=...), which look like groups.
854+
744855
let char = src.eat()
745856
switch char {
746857
case ")", "|":
@@ -758,8 +869,6 @@ extension Source {
758869
case "\\": return try src.expectEscaped(
759870
isInCustomCharacterClass: customCC).value
760871

761-
// TODO: backreferences et al here?
762-
763872
case "]":
764873
assert(!customCC, "parser should have prevented this")
765874
fallthrough

Sources/_MatchingEngine/Regex/Parse/Source.swift

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,17 @@ extension Source {
7676
return true
7777
}
7878

79+
mutating func tryEat<C: Collection>(anyOf set: C) -> Char?
80+
where C.Element == Char
81+
{
82+
guard let c = peek(), set.contains(c) else { return nil }
83+
advance()
84+
return c
85+
}
86+
mutating func tryEat(anyOf set: Char...) -> Char? {
87+
tryEat(anyOf: set)
88+
}
89+
7990
mutating func eat(asserting c: Char) {
8091
assert(peek() == c)
8192
advance()

Tests/RegexTests/MatchTests.swift

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -525,5 +525,18 @@ extension RegexTests {
525525

526526

527527
}
528+
529+
func testMatchReferences() {
530+
// TODO: Implement backreference/subpattern matching.
531+
matchTest(#"(.)\1"#, input: "112", match: "11", xfail: true)
532+
matchTest(#"(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\10"#,
533+
input: "aaaaaaaaabbc", match: "aaaaaaaaabb", xfail: true)
534+
matchTest(#"(.)\10"#, input: "a\u{8}b", match: "a\u{8}", xfail: true)
535+
536+
matchTest(#"(.)\g001"#, input: "112", match: "11", xfail: true)
537+
matchTest(#"(.)(.)\g-02"#, input: "abac", match: "aba", xfail: true)
538+
matchTest(#"(?<a>.)(.)\k<a>"#, input: "abac", match: "aba", xfail: true)
539+
matchTest(#"\g'+2'(.)(.)"#, input: "abac", match: "aba", xfail: true)
540+
}
528541
}
529542

Tests/RegexTests/ParseTests.swift

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,6 +363,40 @@ extension RegexTests {
363363
parseTest("a(*atomic_script_run:b)c",
364364
concat("a", atomicScriptRun("b"), "c"))
365365

366+
// MARK: References
367+
368+
parseTest(#"\1"#, atom(.backreference(.absolute(1))))
369+
parseTest(#"\10"#, atom(.backreference(.absolute(10, couldBeOctal: true))))
370+
parseTest(#"\18"#, atom(.backreference(.absolute(18, couldBeOctal: false))))
371+
parseTest(#"\7777"#, atom(.backreference(.absolute(7777, couldBeOctal: true))))
372+
373+
parseTest(#"\g1"#, atom(.backreference(.absolute(1))))
374+
parseTest(#"\g001"#, atom(.backreference(.absolute(1))))
375+
parseTest(#"\g52"#, atom(.backreference(.absolute(52))))
376+
parseTest(#"\g-01"#, atom(.backreference(.relative(-1))))
377+
parseTest(#"\g+30"#, atom(.backreference(.relative(30))))
378+
379+
parseTest(#"\g{1}"#, atom(.backreference(.absolute(1))))
380+
parseTest(#"\g{001}"#, atom(.backreference(.absolute(1))))
381+
parseTest(#"\g{52}"#, atom(.backreference(.absolute(52))))
382+
parseTest(#"\g{-01}"#, atom(.backreference(.relative(-1))))
383+
parseTest(#"\g{+30}"#, atom(.backreference(.relative(30))))
384+
385+
parseTest(#"\k{a0}"#, atom(.backreference(.named("a0"))))
386+
parseTest(#"\k<bc>"#, atom(.backreference(.named("bc"))))
387+
parseTest(#"\k''"#, atom(.backreference(.named(""))))
388+
parseTest(#"\g{abc}"#, atom(.backreference(.named("abc"))))
389+
390+
parseTest(#"\g<1>"#, atom(.subpattern(.absolute(1))))
391+
parseTest(#"\g<001>"#, atom(.subpattern(.absolute(1))))
392+
parseTest(#"\g'52'"#, atom(.subpattern(.absolute(52))))
393+
parseTest(#"\g'-01'"#, atom(.subpattern(.relative(-1))))
394+
parseTest(#"\g'+30'"#, atom(.subpattern(.relative(30))))
395+
parseTest(#"\g'abc'"#, atom(.subpattern(.named("abc"))))
396+
397+
parseTest(#"\g"#, atom(.char("g")))
398+
parseTest(#"\k"#, atom(.char("k")))
399+
366400
// MARK: Character names.
367401

368402
parseTest(#"\N{abc}"#, atom(.namedCharacter("abc")))
@@ -477,6 +511,8 @@ extension RegexTests {
477511
parseNotEqualTest(#"([a-c&&e]*)+"#,
478512
#"([a-d&&e]*)+"#)
479513

514+
parseNotEqualTest(#"\1"#, #"\10"#)
515+
480516
// TODO: failure tests
481517
}
482518

0 commit comments

Comments
 (0)