Skip to content

Commit 3b5533f

Browse files
committed
Octal disambiguation
Implement octal disambiguation for the `\nnn` syntax: a backreference is only formed if the number is less than 10, or it begins with 8 or 9, or there have been at least that many prior groups. Otherwise the sequence is treated as octal. In addition, generalize the `\0nn` syntax to support arbitrary `\nnn` octal sequences inside and outside character classes.
1 parent 767a95f commit 3b5533f

File tree

8 files changed

+246
-87
lines changed

8 files changed

+246
-87
lines changed

Sources/_MatchingEngine/Regex/AST/Atom.swift

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -373,9 +373,7 @@ extension AST.Atom.CharacterProperty {
373373
public enum Reference: Hashable {
374374
// \n \gn \g{n} \g<n> \g'n' (?n) (?(n)...
375375
// Oniguruma: \k<n>, \k'n'
376-
// If the reference was written as \n, and n could potentially be an octal
377-
// sequence, `couldBeOctal` will be set to true.
378-
case absolute(Int, couldBeOctal: Bool = false)
376+
case absolute(Int)
379377

380378
// \g{-n} \g<+n> \g'+n' \g<-n> \g'-n' (?+n) (?-n)
381379
// (?(+n)... (?(-n)...

Sources/_MatchingEngine/Regex/Parse/LexicalAnalysis.swift

Lines changed: 60 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,7 @@ extension Source {
234234
/// | 'x' HexDigit{2}
235235
/// | 'U' HexDigit{8}
236236
/// | 'o{' OctalDigit{1...} '}'
237-
/// | '0' OctalDigit{0...2}
237+
/// | OctalDigit{1...3}
238238
///
239239
mutating func expectUnicodeScalar(
240240
escapedCharacter base: Character
@@ -257,12 +257,12 @@ extension Source {
257257
let str = try src.lexUntil(eating: "}").value
258258
return try Source.validateUnicodeScalar(str, .octal)
259259

260-
case "0":
260+
case let c where c.isOctalDigit:
261261
// We can read *up to* 2 more octal digits per PCRE.
262-
// FIXME: ICU can read up to 3 octal digits, we should have a parser
263-
// mode to switch.
264-
guard let str = src.tryEatPrefix(maxLength: 2, \.isOctalDigit)?.string
265-
else { return Unicode.Scalar(0) }
262+
// FIXME: ICU can read up to 3 octal digits if the leading digit is 0,
263+
// we should have a parser mode to switch.
264+
let nextDigits = src.tryEatPrefix(maxLength: 2, \.isOctalDigit)
265+
let str = String(c) + (nextDigits?.string ?? "")
266266
return try Source.validateUnicodeScalar(str, .octal)
267267

268268
default:
@@ -661,6 +661,10 @@ extension Source {
661661
}
662662
}
663663

664+
/// Try to lex an absolute or relative numbered reference.
665+
///
666+
/// NumberRef -> ('+' | '-')? <Decimal Number>
667+
///
664668
private mutating func lexNumberedReference(
665669
) throws -> Located<Reference>? {
666670
try recordLoc { src in
@@ -677,6 +681,10 @@ extension Source {
677681
}
678682
}
679683

684+
/// Try to lex a numbered reference, or otherwise a named reference.
685+
///
686+
/// NameOrNumberRef -> NumberRef | <String>
687+
///
680688
private mutating func expectNamedOrNumberedReference(
681689
endingWith ending: String
682690
) throws -> Located<Reference> {
@@ -712,9 +720,8 @@ extension Source {
712720
/// | 'k{' <String> '}'
713721
/// | [1-9] [0-9]*
714722
///
715-
/// NumberRef -> ('+' | '-')? <Decimal Number>
716-
/// NameOrNumberRef -> NumberRef | <String>
717723
private mutating func lexEscapedReference(
724+
priorGroupCount: Int
718725
) throws -> Located<AST.Atom.Kind>? {
719726
try recordLoc { src in
720727
if src.tryEat("g") {
@@ -754,14 +761,27 @@ extension Source {
754761
return .char("k")
755762
}
756763

757-
// If we can lex a number other than 0 (as that's an octal sequence),
758-
// it's a backreference. Though we should make a note of whether it could
759-
// feasibly be an octal sequence, as the matching engine may need to
760-
// treat it as such.
761-
if src.peek() != "0", let num = try src.lexNumber() {
762-
let digits = src.input[num.location.range]
763-
let couldBeOctal = digits.count > 1 && digits.all(\.isOctalDigit)
764-
return .backreference(.absolute(num.value, couldBeOctal: couldBeOctal))
764+
// Lexing \n is tricky, as it's ambiguous with octal sequences. In PCRE it
765+
// is treated as a backreference if its first digit is not 0 (as that is
766+
// always octal) and one of the following holds:
767+
//
768+
// - It's 0 < n < 10 (as octal would be pointless here)
769+
// - Its first digit is 8 or 9 (as not valid octal)
770+
// - There have been at least as many prior groups as the reference number.
771+
//
772+
// Oniguruma follows the same rules except the second one. e.g. \81 and \91
773+
// are instead treated as literal 81 and 91 respectively.
774+
// TODO: If we want a strict Oniguruma mode, we'll need to add a check
775+
// here.
776+
if src.peek() != "0", let digits = src.peekPrefix(\.isNumber) {
777+
// First lex out the decimal digits and see if we can treat this as a
778+
// backreference.
779+
let num = try Source.validateNumber(digits.string, Int.self, .decimal)
780+
if num < 10 || digits.first == "8" || digits.first == "9" ||
781+
num <= priorGroupCount {
782+
src.advance(digits.count)
783+
return .backreference(.absolute(num))
784+
}
765785
}
766786
return nil
767787
}
@@ -774,7 +794,7 @@ extension Source {
774794
/// | EscapedReference
775795
///
776796
mutating func expectEscaped(
777-
isInCustomCharacterClass ccc: Bool
797+
isInCustomCharacterClass ccc: Bool, priorGroupCount: Int
778798
) throws -> Located<AST.Atom.Kind> {
779799
try recordLoc { src in
780800
// Keyboard control/meta
@@ -799,16 +819,13 @@ extension Source {
799819
}
800820

801821
// References using escape syntax, e.g \1, \g{1}, \k<...>, ...
802-
if let ref = try src.lexEscapedReference()?.value {
822+
// These are not valid inside custom character classes.
823+
if !ccc, let ref = try src.lexEscapedReference(
824+
priorGroupCount: priorGroupCount
825+
)?.value {
803826
return ref
804827
}
805828

806-
// Hexadecimal and octal unicode scalars.
807-
if let char = src.tryEat(anyOf: "u", "x", "U", "o", "0") {
808-
return try .scalar(
809-
src.expectUnicodeScalar(escapedCharacter: char).value)
810-
}
811-
812829
let char = src.eat()
813830

814831
// Single-character builtins.
@@ -817,7 +834,17 @@ extension Source {
817834
) {
818835
return .escaped(builtin)
819836
}
820-
return .char(char)
837+
838+
switch char {
839+
// Hexadecimal and octal unicode scalars. This must be done after
840+
// backreference lexing due to the ambiguity with \nnn.
841+
case let c where c.isOctalDigit: fallthrough
842+
case "u", "x", "U", "o":
843+
return try .scalar(
844+
src.expectUnicodeScalar(escapedCharacter: char).value)
845+
default:
846+
return .char(char)
847+
}
821848
}
822849
}
823850

@@ -834,7 +861,7 @@ extension Source {
834861
/// ExpGroupStart -> '(_:'
835862
///
836863
mutating func lexAtom(
837-
isInCustomCharacterClass customCC: Bool
864+
isInCustomCharacterClass customCC: Bool, priorGroupCount: Int
838865
) throws -> AST.Atom? {
839866
let kind: Located<AST.Atom.Kind>? = try recordLoc { src in
840867
// Check for not-an-atom, e.g. parser recursion termination
@@ -867,7 +894,8 @@ extension Source {
867894

868895
// Escaped
869896
case "\\": return try src.expectEscaped(
870-
isInCustomCharacterClass: customCC).value
897+
isInCustomCharacterClass: customCC,
898+
priorGroupCount: priorGroupCount).value
871899

872900
case "]":
873901
assert(!customCC, "parser should have prevented this")
@@ -882,13 +910,16 @@ extension Source {
882910

883911
/// Try to lex the end of a range in a custom character class, which consists
884912
/// of a '-' character followed by an atom.
885-
mutating func lexCustomCharClassRangeEnd() throws -> AST.Atom? {
913+
mutating func lexCustomCharClassRangeEnd(
914+
priorGroupCount: Int
915+
) throws -> AST.Atom? {
886916
// Make sure we don't have a binary operator e.g '--', and the '-' is not
887917
// ending the custom character class (in which case it is literal).
888918
guard peekCCBinOp() == nil && !starts(with: "-]") && tryEat("-") else {
889919
return nil
890920
}
891-
return try lexAtom(isInCustomCharacterClass: true)
921+
return try lexAtom(isInCustomCharacterClass: true,
922+
priorGroupCount: priorGroupCount)
892923
}
893924
}
894925

Sources/_MatchingEngine/Regex/Parse/Parse.swift

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,14 @@ Lexical analysis provides the following:
4343
private struct Parser {
4444
var source: Source
4545

46+
/// Tracks the number of parent custom character classes to allow us to
47+
/// determine whether or not to lex with custom character class syntax.
4648
fileprivate var customCharacterClassDepth = 0
4749

50+
/// Tracks the number of group openings we've seen, to disambiguate the '\n'
51+
/// syntax as a backreference or an octal sequence.
52+
fileprivate var priorGroupCount = 0
53+
4854
init(_ source: Source) {
4955
self.source = source
5056
}
@@ -163,6 +169,7 @@ extension Parser {
163169
let _start = source.currentPosition
164170

165171
if let kind = try source.lexGroupStart() {
172+
priorGroupCount += 1
166173
let child = try parse()
167174
try source.expect(")")
168175
return .group(.init(kind, child, loc(_start)))
@@ -173,7 +180,8 @@ extension Parser {
173180
}
174181

175182
if let atom = try source.lexAtom(
176-
isInCustomCharacterClass: isInCustomCharacterClass
183+
isInCustomCharacterClass: isInCustomCharacterClass,
184+
priorGroupCount: priorGroupCount
177185
) {
178186
// TODO: track source locations
179187
return .atom(atom)
@@ -247,11 +255,14 @@ extension Parser {
247255
continue
248256
}
249257

250-
guard let atom = try source.lexAtom(isInCustomCharacterClass: true)
251-
else { break }
258+
guard let atom = try source.lexAtom(
259+
isInCustomCharacterClass: true, priorGroupCount: priorGroupCount)
260+
else { break }
252261

253262
// Range between atoms.
254-
if let rhs = try source.lexCustomCharClassRangeEnd() {
263+
if let rhs = try source.lexCustomCharClassRangeEnd(
264+
priorGroupCount: priorGroupCount
265+
) {
255266
guard atom.literalCharacterValue != nil &&
256267
rhs.literalCharacterValue != nil else {
257268
throw ParseError.invalidCharacterClassRangeOperand

Sources/_MatchingEngine/Regex/Parse/Source.swift

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,15 @@ extension Source {
119119
mutating func tryEatPrefix(
120120
maxLength: Int? = nil,
121121
_ f: (Char) -> Bool
122+
) -> Input.SubSequence? {
123+
guard let pre = peekPrefix(maxLength: maxLength, f) else { return nil }
124+
defer { self.advance(pre.count) }
125+
return pre
126+
}
127+
128+
func peekPrefix(
129+
maxLength: Int? = nil,
130+
_ f: (Char) -> Bool
122131
) -> Input.SubSequence? {
123132
let chunk: Input.SubSequence
124133
if let maxLength = maxLength {
@@ -129,7 +138,6 @@ extension Source {
129138
let pre = chunk.prefix(while: f)
130139
guard !pre.isEmpty else { return nil }
131140

132-
defer { self.advance(pre.count) }
133141
return pre
134142
}
135143

Sources/_StringProcessing/ASTBuilder.swift

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,10 @@ func concat(_ asts: AST...) -> AST {
3030
concat(asts)
3131
}
3232

33+
func empty() -> AST {
34+
.empty(.init(.fake))
35+
}
36+
3337
func group(
3438
_ kind: AST.Group.Kind, _ child: AST
3539
) -> AST {
@@ -182,6 +186,19 @@ func escaped(
182186
func scalar(_ s: Unicode.Scalar) -> AST {
183187
atom(.scalar(s))
184188
}
189+
func scalar_m(_ s: Unicode.Scalar) -> AST.CustomCharacterClass.Member {
190+
atom_m(.scalar(s))
191+
}
192+
193+
func backreference(_ r: Reference) -> AST {
194+
atom(.backreference(r))
195+
}
196+
func subpattern(_ r: Reference) -> AST {
197+
atom(.subpattern(r))
198+
}
199+
func condition(_ r: Reference) -> AST {
200+
atom(.condition(r))
201+
}
185202

186203
func prop(
187204
_ kind: AST.Atom.CharacterProperty.Kind,

Tests/RegexTests/LexTests.swift

Lines changed: 14 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,13 @@ func diagnose(
2929
}
3030
}
3131

32+
extension Source {
33+
@discardableResult
34+
fileprivate mutating func lexBasicAtom() throws -> AST.Atom? {
35+
try lexAtom(isInCustomCharacterClass: false, priorGroupCount: 0)
36+
}
37+
}
38+
3239
extension RegexTests {
3340
func testLexicalAnalysis() {
3441
diagnose("a", expecting: .expected("b")) { src in
@@ -92,27 +99,13 @@ extension RegexTests {
9299
}
93100

94101
// Test expected closing delimiters.
95-
diagnose(#"\u{5"#, expecting: .expected("}")) { src in
96-
_ = try src.lexAtom(isInCustomCharacterClass: false)
97-
}
98-
diagnose(#"\x{5"#, expecting: .expected("}")) { src in
99-
_ = try src.lexAtom(isInCustomCharacterClass: false)
100-
}
101-
diagnose(#"\N{A"#, expecting: .expected("}")) { src in
102-
_ = try src.lexAtom(isInCustomCharacterClass: false)
103-
}
104-
diagnose(#"\N{U+A"#, expecting: .expected("}")) { src in
105-
_ = try src.lexAtom(isInCustomCharacterClass: false)
106-
}
107-
diagnose(#"\p{a"#, expecting: .expected("}")) { src in
108-
_ = try src.lexAtom(isInCustomCharacterClass: false)
109-
}
110-
diagnose(#"\p{a="#, expecting: .expected("}")) { src in
111-
_ = try src.lexAtom(isInCustomCharacterClass: false)
112-
}
113-
diagnose(#"(?#"#, expecting: .expected(")")) { src in
114-
_ = try src.lexComment()
115-
}
102+
diagnose(#"\u{5"#, expecting: .expected("}")) { try $0.lexBasicAtom() }
103+
diagnose(#"\x{5"#, expecting: .expected("}")) { try $0.lexBasicAtom() }
104+
diagnose(#"\N{A"#, expecting: .expected("}")) { try $0.lexBasicAtom() }
105+
diagnose(#"\N{U+A"#, expecting: .expected("}")) { try $0.lexBasicAtom() }
106+
diagnose(#"\p{a"#, expecting: .expected("}")) { try $0.lexBasicAtom() }
107+
diagnose(#"\p{a="#, expecting: .expected("}")) { try $0.lexBasicAtom() }
108+
diagnose(#"(?#"#, expecting: .expected(")")) { _ = try $0.lexComment() }
116109

117110
// TODO: want to dummy print out source ranges, etc, test that.
118111
}

Tests/RegexTests/MatchTests.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -531,7 +531,7 @@ extension RegexTests {
531531
matchTest(#"(.)\1"#, input: "112", match: "11", xfail: true)
532532
matchTest(#"(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)\10"#,
533533
input: "aaaaaaaaabbc", match: "aaaaaaaaabb", xfail: true)
534-
matchTest(#"(.)\10"#, input: "a\u{8}b", match: "a\u{8}", xfail: true)
534+
matchTest(#"(.)\10"#, input: "a\u{8}b", match: "a\u{8}")
535535

536536
matchTest(#"(.)\g001"#, input: "112", match: "11", xfail: true)
537537
matchTest(#"(.)(.)\g-02"#, input: "abac", match: "aba", xfail: true)

0 commit comments

Comments
 (0)