Skip to content

Commit 49dcaa1

Browse files
committed
Fix \o parsing crash
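The crash occurred because the parser called `expectUnicodeScalar` whenever it saw `\o`, even though only `\o{` begins a valid octal scalar; a bare `\o` matched no case and fell through to the `fatalError` in the `default` branch. Instead, turn it into a `lex…`-style method (`lexUnicodeScalar`) that returns `nil` when no scalar can be lexed, so the caller bails out and continues with its regular escape handling.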
1 parent d8d1d9c commit 49dcaa1

File tree

2 files changed

+57
-58
lines changed

2 files changed

+57
-58
lines changed

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 55 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -342,8 +342,8 @@ extension Source {
342342
}.value
343343
}
344344

345-
/// Eat a scalar off the front, starting from after the
346-
/// backslash and base character (e.g. `\u` or `\x`).
345+
/// Try to eat a scalar off the front, starting from after the backslash and
346+
/// base character (e.g. `\u` or `\x`).
347347
///
348348
/// UniScalar -> 'u{' UniScalarSequence '}'
349349
/// | 'u' HexDigit{4}
@@ -353,60 +353,60 @@ extension Source {
353353
/// | 'o{' OctalDigit{1...} '}'
354354
/// | '0' OctalDigit{0...3}
355355
///
356-
mutating func expectUnicodeScalar(
357-
escapedCharacter base: Character
358-
) throws -> AST.Atom.Kind {
356+
mutating func lexUnicodeScalar() throws -> AST.Atom.Kind? {
359357
try recordLoc { src in
358+
try src.tryEating { src in
360359

361-
func nullScalar() -> AST.Atom.Kind {
362-
let pos = src.currentPosition
363-
return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
364-
}
365-
366-
// TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
367-
switch base {
368-
// Hex numbers.
369-
case "u" where src.tryEat("{"):
370-
return try src.expectUnicodeScalarSequence(eating: "}")
371-
372-
case "x" where src.tryEat("{"):
373-
let str = try src.lexUntil(eating: "}")
374-
return .scalar(try Source.validateUnicodeScalar(str, .hex))
375-
376-
case "x":
377-
// \x expects *up to* 2 digits.
378-
guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
379-
else {
380-
// In PCRE, \x without any valid hex digits is \u{0}.
381-
// TODO: This doesn't appear to be followed by ICU or Oniguruma, so
382-
// could be changed to throw an error if we had a parsing mode for
383-
// them.
384-
return nullScalar()
360+
func nullScalar() -> AST.Atom.Kind {
361+
let pos = src.currentPosition
362+
return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
385363
}
386-
return .scalar(try Source.validateUnicodeScalar(digits, .hex))
387364

388-
case "u":
389-
return .scalar(try src.expectUnicodeScalar(numDigits: 4))
390-
case "U":
391-
return .scalar(try src.expectUnicodeScalar(numDigits: 8))
365+
// TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
366+
switch src.tryEat() {
367+
// Hex numbers.
368+
case "u" where src.tryEat("{"):
369+
return try src.expectUnicodeScalarSequence(eating: "}")
370+
371+
case "x" where src.tryEat("{"):
372+
let str = try src.lexUntil(eating: "}")
373+
return .scalar(try Source.validateUnicodeScalar(str, .hex))
374+
375+
case "x":
376+
// \x expects *up to* 2 digits.
377+
guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
378+
else {
379+
// In PCRE, \x without any valid hex digits is \u{0}.
380+
// TODO: This doesn't appear to be followed by ICU or Oniguruma, so
381+
// could be changed to throw an error if we had a parsing mode for
382+
// them.
383+
return nullScalar()
384+
}
385+
return .scalar(try Source.validateUnicodeScalar(digits, .hex))
386+
387+
case "u":
388+
return .scalar(try src.expectUnicodeScalar(numDigits: 4))
389+
case "U":
390+
return .scalar(try src.expectUnicodeScalar(numDigits: 8))
391+
392+
// Octal numbers.
393+
case "o" where src.tryEat("{"):
394+
let str = try src.lexUntil(eating: "}")
395+
return .scalar(try Source.validateUnicodeScalar(str, .octal))
396+
397+
case "0":
398+
// We can read *up to* 3 more octal digits.
399+
// FIXME: PCRE can only read up to 2 octal digits, if we get a strict
400+
// PCRE mode, we should limit it here.
401+
guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
402+
else {
403+
return nullScalar()
404+
}
405+
return .scalar(try Source.validateUnicodeScalar(digits, .octal))
392406

393-
// Octal numbers.
394-
case "o" where src.tryEat("{"):
395-
let str = try src.lexUntil(eating: "}")
396-
return .scalar(try Source.validateUnicodeScalar(str, .octal))
397-
398-
case "0":
399-
// We can read *up to* 3 more octal digits.
400-
// FIXME: PCRE can only read up to 2 octal digits, if we get a strict
401-
// PCRE mode, we should limit it here.
402-
guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
403-
else {
404-
return nullScalar()
407+
default:
408+
return nil
405409
}
406-
return .scalar(try Source.validateUnicodeScalar(digits, .octal))
407-
408-
default:
409-
fatalError("Unexpected scalar start")
410410
}
411411
}.value
412412
}
@@ -1712,6 +1712,11 @@ extension Source {
17121712
return ref
17131713
}
17141714

1715+
// Hexadecimal and octal unicode scalars.
1716+
if let scalar = try src.lexUnicodeScalar() {
1717+
return scalar
1718+
}
1719+
17151720
guard let char = src.tryEat() else {
17161721
throw ParseError.expectedEscape
17171722
}
@@ -1723,14 +1728,6 @@ extension Source {
17231728
return .escaped(builtin)
17241729
}
17251730

1726-
switch char {
1727-
// Hexadecimal and octal unicode scalars.
1728-
case "u", "x", "U", "o", "0":
1729-
return try src.expectUnicodeScalar(escapedCharacter: char)
1730-
default:
1731-
break
1732-
}
1733-
17341731
// We only allow unknown escape sequences for non-letter non-number ASCII,
17351732
// and non-ASCII whitespace.
17361733
// TODO: Once we have fix-its, suggest a `0` prefix for octal `[\7]`.

Tests/RegexTests/ParseTests.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2669,6 +2669,8 @@ extension RegexTests {
26692669

26702670
diagnosticTest("\\", .expectedEscape)
26712671

2672+
diagnosticTest(#"\o"#, .invalidEscape("o"))
2673+
26722674
// TODO: Custom diagnostic for control sequence
26732675
diagnosticTest(#"\c"#, .unexpectedEndOfInput)
26742676

0 commit comments

Comments
 (0)