Skip to content

Fix \o crasher and allow (?) #487

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Jun 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
118 changes: 60 additions & 58 deletions Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -342,8 +342,8 @@ extension Source {
}.value
}

/// Eat a scalar off the front, starting from after the
/// backslash and base character (e.g. `\u` or `\x`).
/// Try to eat a scalar off the front, starting from after the backslash and
/// base character (e.g. `\u` or `\x`).
///
/// UniScalar -> 'u{' UniScalarSequence '}'
/// | 'u' HexDigit{4}
Expand All @@ -353,60 +353,60 @@ extension Source {
/// | 'o{' OctalDigit{1...} '}'
/// | '0' OctalDigit{0...3}
///
mutating func expectUnicodeScalar(
escapedCharacter base: Character
) throws -> AST.Atom.Kind {
mutating func lexUnicodeScalar() throws -> AST.Atom.Kind? {
try recordLoc { src in
try src.tryEating { src in

func nullScalar() -> AST.Atom.Kind {
let pos = src.currentPosition
return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
}

// TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
switch base {
// Hex numbers.
case "u" where src.tryEat("{"):
return try src.expectUnicodeScalarSequence(eating: "}")

case "x" where src.tryEat("{"):
let str = try src.lexUntil(eating: "}")
return .scalar(try Source.validateUnicodeScalar(str, .hex))

case "x":
// \x expects *up to* 2 digits.
guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
else {
// In PCRE, \x without any valid hex digits is \u{0}.
// TODO: This doesn't appear to be followed by ICU or Oniguruma, so
// could be changed to throw an error if we had a parsing mode for
// them.
return nullScalar()
func nullScalar() -> AST.Atom.Kind {
let pos = src.currentPosition
return .scalar(.init(UnicodeScalar(0), SourceLocation(pos ..< pos)))
}
return .scalar(try Source.validateUnicodeScalar(digits, .hex))

case "u":
return .scalar(try src.expectUnicodeScalar(numDigits: 4))
case "U":
return .scalar(try src.expectUnicodeScalar(numDigits: 8))
// TODO: PCRE offers a different behavior if PCRE2_ALT_BSUX is set.
switch src.tryEat() {
// Hex numbers.
case "u" where src.tryEat("{"):
return try src.expectUnicodeScalarSequence(eating: "}")

case "x" where src.tryEat("{"):
let str = try src.lexUntil(eating: "}")
return .scalar(try Source.validateUnicodeScalar(str, .hex))

case "x":
// \x expects *up to* 2 digits.
guard let digits = src.tryEatLocatedPrefix(maxLength: 2, \.isHexDigit)
else {
// In PCRE, \x without any valid hex digits is \u{0}.
// TODO: This doesn't appear to be followed by ICU or Oniguruma, so
// could be changed to throw an error if we had a parsing mode for
// them.
return nullScalar()
}
return .scalar(try Source.validateUnicodeScalar(digits, .hex))

case "u":
return .scalar(try src.expectUnicodeScalar(numDigits: 4))
case "U":
return .scalar(try src.expectUnicodeScalar(numDigits: 8))

// Octal numbers.
case "o" where src.tryEat("{"):
let str = try src.lexUntil(eating: "}")
return .scalar(try Source.validateUnicodeScalar(str, .octal))

case "0":
// We can read *up to* 3 more octal digits.
// FIXME: PCRE can only read up to 2 octal digits, if we get a strict
// PCRE mode, we should limit it here.
guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
else {
return nullScalar()
}
return .scalar(try Source.validateUnicodeScalar(digits, .octal))

// Octal numbers.
case "o" where src.tryEat("{"):
let str = try src.lexUntil(eating: "}")
return .scalar(try Source.validateUnicodeScalar(str, .octal))

case "0":
// We can read *up to* 3 more octal digits.
// FIXME: PCRE can only read up to 2 octal digits, if we get a strict
// PCRE mode, we should limit it here.
guard let digits = src.tryEatLocatedPrefix(maxLength: 3, \.isOctalDigit)
else {
return nullScalar()
default:
return nil
}
return .scalar(try Source.validateUnicodeScalar(digits, .octal))

default:
fatalError("Unexpected scalar start")
}
}.value
}
Expand Down Expand Up @@ -802,6 +802,11 @@ extension Source {
mutating func lexMatchingOptionSequence(
context: ParsingContext
) throws -> AST.MatchingOptionSequence? {
// PCRE accepts '(?)'
// TODO: This is a no-op, should we warn?
if peek() == ")" {
return .init(caretLoc: nil, adding: [], minusLoc: nil, removing: [])
}
let ateCaret = recordLoc { $0.tryEat("^") }

// TODO: Warn on duplicate options, and options appearing in both adding
Expand Down Expand Up @@ -1707,6 +1712,11 @@ extension Source {
return ref
}

// Hexadecimal and octal unicode scalars.
if let scalar = try src.lexUnicodeScalar() {
return scalar
}

guard let char = src.tryEat() else {
throw ParseError.expectedEscape
}
Expand All @@ -1718,14 +1728,6 @@ extension Source {
return .escaped(builtin)
}

switch char {
// Hexadecimal and octal unicode scalars.
case "u", "x", "U", "o", "0":
return try src.expectUnicodeScalar(escapedCharacter: char)
default:
break
}

// We only allow unknown escape sequences for non-letter non-number ASCII,
// and non-ASCII whitespace.
// TODO: Once we have fix-its, suggest a `0` prefix for octal `[\7]`.
Expand Down
35 changes: 0 additions & 35 deletions Tests/RegexTests/LexTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -61,41 +61,6 @@ extension RegexTests {
_ = try src.lexNumber()
}

/// Test helper: runs `expectUnicodeScalar(escapedCharacter:)` on `input`
/// through `diagnose`, passing `.numberOverflow` as the expected error.
///
/// - Parameters:
///   - input: Source text handed to `diagnose`, i.e. what follows the
///     backslash and base character (e.g. `{123456789}`).
///   - base: The escape's base character (e.g. `u` or `x`), forwarded as
///     `escapedCharacter`.
func diagnoseUniScalarOverflow(_ input: String, base: Character) {
// The expected error payload is the digits alone: when `input` starts with
// `{`, its first and last characters are dropped; otherwise it is used as-is.
let scalars = input.first == "{"
? String(input.dropFirst().dropLast())
: input
// NOTE(review): `diagnose` is defined outside this view; presumably it checks
// that the closure throws the `expecting:` error — confirm.
diagnose(
input,
expecting: .numberOverflow(scalars)
) { src in
// The lexed result itself is discarded.
_ = try src.expectUnicodeScalar(escapedCharacter: base)
}
}
/// Test helper: runs `expectUnicodeScalar(escapedCharacter:)` on `input`
/// through `diagnose`, passing `.expectedNumDigits` as the expected error.
///
/// - Parameters:
///   - input: Source text handed to `diagnose`, i.e. what follows the
///     backslash and base character (e.g. `12`).
///   - base: The escape's base character (e.g. `u` or `U`), forwarded as
///     `escapedCharacter`.
///   - numDigits: The digit count placed in the expected
///     `.expectedNumDigits` error (called with 4 for `u` and 8 for `U`).
func diagnoseUniScalar(
_ input: String,
base: Character,
expectedDigits numDigits: Int
) {
// The expected error payload is the digits alone: when `input` starts with
// `{`, its first and last characters are dropped; otherwise it is used as-is.
let scalars = input.first == "{"
? String(input.dropFirst().dropLast())
: input
// NOTE(review): `diagnose` is defined outside this view; presumably it checks
// that the closure throws the `expecting:` error — confirm.
diagnose(
input,
expecting: .expectedNumDigits(scalars, numDigits)
) { src in
// The lexed result itself is discarded.
_ = try src.expectUnicodeScalar(escapedCharacter: base)
}
// NOTE(review): `scalars` is already read in the `diagnose` call above, so
// this discard has no effect and looks like a leftover.
_ = scalars
}

diagnoseUniScalar(
"12", base: "u", expectedDigits: 4)
diagnoseUniScalar(
"12", base: "U", expectedDigits: 8)
diagnoseUniScalarOverflow("{123456789}", base: "u")
diagnoseUniScalarOverflow("{123456789}", base: "x")

// TODO: want to dummy print out source ranges, etc, test that.
}

Expand Down
10 changes: 10 additions & 0 deletions Tests/RegexTests/ParseTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1002,6 +1002,9 @@ extension RegexTests {
concat("a", atomicScriptRun("b"), "c"), throwsError: .unsupported)

// Matching option changing groups.
parseTest("(?)", changeMatchingOptions(
matchingOptions()
))
parseTest("(?-)", changeMatchingOptions(
matchingOptions()
))
Expand Down Expand Up @@ -2666,6 +2669,8 @@ extension RegexTests {

diagnosticTest("\\", .expectedEscape)

diagnosticTest(#"\o"#, .invalidEscape("o"))

// TODO: Custom diagnostic for control sequence
diagnosticTest(#"\c"#, .unexpectedEndOfInput)

Expand Down Expand Up @@ -2877,6 +2882,11 @@ extension RegexTests {
diagnosticTest(#"[\d--\u{a b}]"#, .unsupported("scalar sequence in custom character class"))
diagnosticTest(#"[\d--[\u{a b}]]"#, .unsupported("scalar sequence in custom character class"))

diagnosticTest(#"\u12"#, .expectedNumDigits("12", 4))
diagnosticTest(#"\U12"#, .expectedNumDigits("12", 8))
diagnosticTest(#"\u{123456789}"#, .numberOverflow("123456789"))
diagnosticTest(#"\x{123456789}"#, .numberOverflow("123456789"))

// MARK: Matching options

diagnosticTest(#"(?^-"#, .cannotRemoveMatchingOptionsAfterCaret)
Expand Down