@@ -205,6 +205,13 @@ extension Source {
205
205
try lexNumber ( Int . self, . decimal)
206
206
}
207
207
208
/// Lex a number off the front of the input, treating its absence as an error.
///
/// This is the throwing counterpart of `lexNumber()`: where that method
/// yields `nil`, this one reports a parse error instead.
///
/// - Returns: The `Located<Int>` produced by `lexNumber()`.
/// - Throws: `ParseError.expectedNumber` (with kind `.decimal`) when
///   `lexNumber()` returns `nil`, as well as any error `lexNumber()` throws.
mutating func expectNumber() throws -> Located<Int> {
  if let number = try lexNumber() {
    return number
  }
  throw ParseError.expectedNumber("", kind: .decimal)
}
214
+
208
215
/// Eat a scalar value from hexadecimal notation off the front
209
216
private mutating func expectUnicodeScalar(
210
217
numDigits: Int
@@ -654,12 +661,118 @@ extension Source {
654
661
}
655
662
}
656
663
664
/// Try to lex a numbered reference off the front of the input.
///
///     NumberRef -> ('+' | '-')? <Decimal Number>
///
/// A leading `+` or `-` must be followed by a number and produces a
/// `.relative` reference (negated for `-`). A number with no sign produces
/// an `.absolute` reference.
///
/// - Returns: The reference, or `nil` when the input starts with neither a
///   sign nor a number.
/// - Throws: Any error from `expectNumber()` (a sign with no number after
///   it) or from `lexNumber()`.
private mutating func lexNumberedReference() throws -> Located<Reference>? {
  try recordLoc { src in
    // Signed forms: once the sign is eaten, a number is mandatory.
    if src.tryEat("+") {
      let offset = try src.expectNumber().value
      return .relative(offset)
    }
    if src.tryEat("-") {
      let magnitude = try src.expectNumber().value
      return .relative(-magnitude)
    }

    // Unsigned form: a number is optional here, so absence is not an error.
    guard let number = try src.lexNumber() else {
      return nil
    }
    return .absolute(number.value)
  }
}
679
+
680
/// Lex a reference that is either numbered or named, followed by `ending`.
///
///     NameOrNumberRef -> NumberRef | <String>
///
/// A numbered reference is tried first; if one is found, `ending` must come
/// directly after it. Otherwise everything up to `ending` is taken as the
/// name of a `.named` reference, and `ending` is eaten.
///
/// - Parameter ending: The closing delimiter that terminates the reference.
/// - Returns: The lexed reference.
/// - Throws: Any error from `lexNumberedReference()`, `expect(sequence:)`
///   or `lexUntil(eating:)`.
private mutating func expectNamedOrNumberedReference(
  endingWith ending: String
) throws -> Located<Reference> {
  try recordLoc { src in
    if let numbered = try src.lexNumberedReference()?.value {
      // The delimiter has to follow the number immediately.
      try src.expect(sequence: ending)
      return numbered
    }
    // Not a number, so treat the text up to the delimiter as a name.
    return .named(try src.lexUntil(eating: ending).value)
  }
}
691
+
692
/// Map an opening delimiter to the character that closes it.
///
/// Supported pairs are `<`/`>`, `'`/`'` and `{`/`}`.
///
/// - Parameter openChar: The opening delimiter that was just eaten.
/// - Returns: The matching closing delimiter.
/// - Note: Any other character is a programmer error and traps via
///   `fatalError`.
private static func getClosingDelimiter(
  for openChar: Character
) -> Character {
  let closers: [Character: Character] = [
    "<": ">",
    "'": "'",
    "{": "}",
  ]
  guard let closer = closers[openChar] else {
    fatalError("Not implemented")
  }
  return closer
}
703
+
704
/// Try to lex an escaped backreference or subpattern reference, starting
/// from after the backslash.
///
///     EscapedReference -> 'g{' NameOrNumberRef '}'
///                       | 'g<' NameOrNumberRef '>'
///                       | "g'" NameOrNumberRef "'"
///                       | 'g' NumberRef
///                       | 'k<' <String> '>'
///                       | "k'" <String> "'"
///                       | 'k{' <String> '}'
///                       | [1-9] [0-9]*
///
///     NumberRef        -> ('+' | '-')? <Decimal Number>
///     NameOrNumberRef  -> NumberRef | <String>
///
/// A `g` or `k` that is not followed by one of the forms above is returned
/// as a literal `.char`, because the letter has already been eaten by the
/// time that is known.
///
/// - Returns: The lexed atom kind, or `nil` when the input starts with none
///   of `g`, `k`, or a number not beginning with `0`.
/// - Throws: Any error raised while lexing the reference body.
private mutating func lexEscapedReference(
) throws -> Located<AST.Atom.Kind>? {
  try recordLoc { src in
    if src.tryEat("g") {
      // \g{...}: PCRE-style backreference.
      if src.tryEat("{") {
        return .backreference(
          try src.expectNamedOrNumberedReference(endingWith: "}").value)
      }

      // \g<...> and \g'...': Oniguruma-style subpattern.
      if let opener = src.tryEat(anyOf: "<", "'") {
        let closer = Source.getClosingDelimiter(for: opener)
        return .subpattern(
          try src.expectNamedOrNumberedReference(
            endingWith: String(closer)).value)
      }

      // PCRE also accepts \g followed directly by a (possibly signed) number.
      // If there is none, the already-eaten 'g' is just a literal character.
      guard let bare = try src.lexNumberedReference() else {
        return .char("g")
      }
      return .backreference(bare.value)
    }

    if src.tryEat("k") {
      // \k<...>, \k'...' and \k{...}: Perl/.NET-style named backreference.
      // Without a delimiter, the already-eaten 'k' is a literal character.
      guard let opener = src.tryEat(anyOf: "<", "'", "{") else {
        return .char("k")
      }
      return .backreference(.named(
        try src.lexUntil(
          eating: Source.getClosingDelimiter(for: opener)).value))
    }

    // A number that does not start with 0 (a leading 0 is an octal sequence)
    // is a backreference.
    guard src.peek() != "0", let number = try src.lexNumber() else {
      return nil
    }

    // Record whether the digits could also be read as an octal sequence, as
    // the matching engine may need to treat it as such.
    let text = src.input[number.location.range]
    let maybeOctal = text.count > 1 && text.all(\.isOctalDigit)
    return .backreference(.absolute(number.value, couldBeOctal: maybeOctal))
  }
}
769
+
657
770
/// Consume an escaped atom, starting from after the backslash
658
771
///
659
772
/// Escaped -> KeyboardModified | Builtin
660
773
/// | UniScalar | Property | NamedCharacter
774
+ /// | EscapedReference
661
775
///
662
- /// TODO: references
663
776
mutating func expectEscaped(
664
777
isInCustomCharacterClass ccc: Bool
665
778
) throws -> Located < AST . Atom . Kind > {
@@ -685,30 +798,26 @@ extension Source {
685
798
return . property( prop. value)
686
799
}
687
800
688
- let char = src. eat ( )
689
-
690
- // Single-character builtins
691
- if let builtin = AST . Atom. EscapedBuiltin (
692
- char, inCustomCharacterClass: ccc
693
- ) {
694
- return . escaped( builtin)
801
+ // References using escape syntax, e.g \1, \g{1}, \k<...>, ...
802
+ if let ref = try src. lexEscapedReference ( ) ? . value {
803
+ return ref
695
804
}
696
805
697
- switch char {
698
- // Scalars
699
- case " u " , " x " , " U " , " o " , " 0 " :
806
+ // Hexadecimal and octal unicode scalars.
807
+ if let char = src. tryEat ( anyOf: " u " , " x " , " U " , " o " , " 0 " ) {
700
808
return try . scalar(
701
809
src. expectUnicodeScalar ( escapedCharacter: char) . value)
810
+ }
702
811
703
- // Unicode property checks
704
- case " p " , " P " :
705
- fatalError ( " TODO: properties " )
706
-
707
- case " 1 " ... " 9 " , " g " , " k " :
708
- fatalError ( " TODO: References " )
812
+ let char = src. eat ( )
709
813
710
- default : return . char( char)
814
+ // Single-character builtins.
815
+ if let builtin = AST . Atom. EscapedBuiltin (
816
+ char, inCustomCharacterClass: ccc
817
+ ) {
818
+ return . escaped( builtin)
711
819
}
820
+ return . char( char)
712
821
}
713
822
}
714
823
@@ -741,6 +850,8 @@ extension Source {
741
850
return . property( prop)
742
851
}
743
852
853
+ // TODO: Python-style backreferences (?P=...), which look like groups.
854
+
744
855
let char = src. eat ( )
745
856
switch char {
746
857
case " ) " , " | " :
@@ -758,8 +869,6 @@ extension Source {
758
869
case " \\ " : return try src. expectEscaped (
759
870
isInCustomCharacterClass: customCC) . value
760
871
761
- // TODO: backreferences et al here?
762
-
763
872
case " ] " :
764
873
assert ( !customCC, " parser should have prevented this " )
765
874
fallthrough
0 commit comments