@@ -234,7 +234,7 @@ extension Source {
234
234
/// | 'x' HexDigit{2}
235
235
/// | 'U' HexDigit{8}
236
236
/// | 'o{' OctalDigit{1...} '}'
237
- /// | '0' OctalDigit{0...2 }
237
+ /// | OctalDigit{1...3 }
238
238
///
239
239
mutating func expectUnicodeScalar(
240
240
escapedCharacter base: Character
@@ -257,12 +257,12 @@ extension Source {
257
257
let str = try src. lexUntil ( eating: " } " ) . value
258
258
return try Source . validateUnicodeScalar ( str, . octal)
259
259
260
- case " 0 " :
260
+ case let c where c . isOctalDigit :
261
261
// We can read *up to* 2 more octal digits per PCRE.
262
- // FIXME: ICU can read up to 3 octal digits, we should have a parser
263
- // mode to switch.
264
- guard let str = src. tryEatPrefix ( maxLength: 2 , \. isOctalDigit) ? . string
265
- else { return Unicode . Scalar ( 0 ) }
262
+ // FIXME: ICU can read up to 3 octal digits if the leading digit is 0,
263
+ // we should have a parser mode to switch.
264
+ let nextDigits = src. tryEatPrefix ( maxLength: 2 , \. isOctalDigit)
265
+ let str = String ( c ) + ( nextDigits ? . string ?? " " )
266
266
return try Source . validateUnicodeScalar ( str, . octal)
267
267
268
268
default :
@@ -661,6 +661,10 @@ extension Source {
661
661
}
662
662
}
663
663
664
+ /// Try to lex an absolute or relative numbered reference.
665
+ ///
666
+ /// NumberRef -> ('+' | '-')? <Decimal Number>
667
+ ///
664
668
private mutating func lexNumberedReference(
665
669
) throws -> Located < Reference > ? {
666
670
try recordLoc { src in
@@ -677,6 +681,10 @@ extension Source {
677
681
}
678
682
}
679
683
684
+ /// Try to lex a numbered reference, or otherwise a named reference.
685
+ ///
686
+ /// NameOrNumberRef -> NumberRef | <String>
687
+ ///
680
688
private mutating func expectNamedOrNumberedReference(
681
689
endingWith ending: String
682
690
) throws -> Located < Reference > {
@@ -712,9 +720,8 @@ extension Source {
712
720
/// | 'k{' <String> '}'
713
721
/// | [1-9] [0-9]*
714
722
///
715
- /// NumberRef -> ('+' | '-')? <Decimal Number>
716
- /// NameOrNumberRef -> NumberRef | <String>
717
723
private mutating func lexEscapedReference(
724
+ numberOfPriorGroups: Int
718
725
) throws -> Located < AST . Atom . Kind > ? {
719
726
try recordLoc { src in
720
727
if src. tryEat ( " g " ) {
@@ -754,14 +761,27 @@ extension Source {
754
761
return . char( " k " )
755
762
}
756
763
757
- // If we can lex a number other than 0 (as that's an octal sequence),
758
- // it's a backreference. Though we should make a note of whether it could
759
- // feasibly be an octal sequence, as the matching engine may need to
760
- // treat it as such.
761
- if src. peek ( ) != " 0 " , let num = try src. lexNumber ( ) {
762
- let digits = src. input [ num. location. range]
763
- let couldBeOctal = digits. count > 1 && digits. all ( \. isOctalDigit)
764
- return . backreference( . absolute( num. value, couldBeOctal: couldBeOctal) )
764
+ // Lexing \n is tricky. In PCRE it's treated as a backreference if its
765
+ // first digit is not 0 (as that is always octal) and one of the following
766
+ // holds:
767
+ //
768
+ // - It's 0 < n < 10 (as octal would be pointless here)
769
+ // - Its first digit is 8 or 9 (as not valid octal)
770
+ // - There have been at least as many prior groups as the reference number.
771
+ //
772
+ // Oniguruma follows the same rules except the second one, e.g. \81 and \91
773
+ // are instead treated as literal 81 and 91 respectively.
774
+ // TODO: If we want a strict Oniguruma mode, we'll need to add a check
775
+ // here.
776
+ if src. peek ( ) != " 0 " , let digits = src. peekPrefix ( \. isNumber) {
777
+ // First lex out the decimal digits and see if we can treat this as a
778
+ // backreference.
779
+ let num = try Source . validateNumber ( digits. string, Int . self, . decimal)
780
+ if num < 10 || digits. first == " 8 " || digits. first == " 9 " ||
781
+ num <= numberOfPriorGroups {
782
+ src. advance ( digits. count)
783
+ return . backreference( . absolute( num) )
784
+ }
765
785
}
766
786
return nil
767
787
}
@@ -774,7 +794,7 @@ extension Source {
774
794
/// | EscapedReference
775
795
///
776
796
mutating func expectEscaped(
777
- isInCustomCharacterClass ccc: Bool
797
+ isInCustomCharacterClass ccc: Bool , numberOfPriorGroups : Int
778
798
) throws -> Located < AST . Atom . Kind > {
779
799
try recordLoc { src in
780
800
// Keyboard control/meta
@@ -799,16 +819,13 @@ extension Source {
799
819
}
800
820
801
821
// References using escape syntax, e.g \1, \g{1}, \k<...>, ...
802
- if let ref = try src. lexEscapedReference ( ) ? . value {
822
+ // These are not valid inside custom character classes.
823
+ if !ccc, let ref = try src. lexEscapedReference (
824
+ numberOfPriorGroups: numberOfPriorGroups
825
+ ) ? . value {
803
826
return ref
804
827
}
805
828
806
- // Hexadecimal and octal unicode scalars.
807
- if let char = src. tryEat ( anyOf: " u " , " x " , " U " , " o " , " 0 " ) {
808
- return try . scalar(
809
- src. expectUnicodeScalar ( escapedCharacter: char) . value)
810
- }
811
-
812
829
let char = src. eat ( )
813
830
814
831
// Single-character builtins.
@@ -817,7 +834,17 @@ extension Source {
817
834
) {
818
835
return . escaped( builtin)
819
836
}
820
- return . char( char)
837
+
838
+ switch char {
839
+ // Hexadecimal and octal unicode scalars. This must be done after
840
+ // backreference lexing due to the ambiguity with \nnn.
841
+ case let c where c. isOctalDigit: fallthrough
842
+ case " u " , " x " , " U " , " o " :
843
+ return try . scalar(
844
+ src. expectUnicodeScalar ( escapedCharacter: char) . value)
845
+ default :
846
+ return . char( char)
847
+ }
821
848
}
822
849
}
823
850
@@ -834,7 +861,8 @@ extension Source {
834
861
/// ExpGroupStart -> '(_:'
835
862
///
836
863
mutating func lexAtom(
837
- isInCustomCharacterClass customCC: Bool
864
+ isInCustomCharacterClass customCC: Bool ,
865
+ numberOfPriorGroups: Int
838
866
) throws -> AST . Atom ? {
839
867
let kind : Located < AST . Atom . Kind > ? = try recordLoc { src in
840
868
// Check for not-an-atom, e.g. parser recursion termination
@@ -867,7 +895,8 @@ extension Source {
867
895
868
896
// Escaped
869
897
case " \\ " : return try src. expectEscaped (
870
- isInCustomCharacterClass: customCC) . value
898
+ isInCustomCharacterClass: customCC,
899
+ numberOfPriorGroups: numberOfPriorGroups) . value
871
900
872
901
case " ] " :
873
902
assert ( !customCC, " parser should have prevented this " )
@@ -882,13 +911,16 @@ extension Source {
882
911
883
912
/// Try to lex the end of a range in a custom character class, which consists
884
913
/// of a '-' character followed by an atom.
885
- mutating func lexCustomCharClassRangeEnd( ) throws -> AST . Atom ? {
914
+ mutating func lexCustomCharClassRangeEnd(
915
+ numberOfPriorGroups: Int
916
+ ) throws -> AST . Atom ? {
886
917
// Make sure we don't have a binary operator e.g '--', and the '-' is not
887
918
// ending the custom character class (in which case it is literal).
888
919
guard peekCCBinOp ( ) == nil && !starts( with: " -] " ) && tryEat ( " - " ) else {
889
920
return nil
890
921
}
891
- return try lexAtom ( isInCustomCharacterClass: true )
922
+ return try lexAtom ( isInCustomCharacterClass: true ,
923
+ numberOfPriorGroups: numberOfPriorGroups)
892
924
}
893
925
}
894
926
0 commit comments