Skip to content

[5.7] Reject .NET subtraction and quote range operands #518

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions Sources/_RegexParser/Regex/AST/CustomCharClass.swift
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@ extension AST {
self.rhs = rhs
self.trivia = trivia
}

/// The source location of this value, obtained by combining the location of
/// `lhs` with the location of `rhs` via `union(with:)`.
public var location: SourceLocation {
  let lhsLocation = lhs.location
  return lhsLocation.union(with: rhs.location)
}
}
public enum SetOp: String, Hashable {
case subtraction = "--"
Expand Down Expand Up @@ -108,6 +112,25 @@ extension CustomCC.Member {
public var isSemantic: Bool {
!isTrivia
}

/// The source location of this member.
///
/// Every case except `.setOperation` forwards to the location of its payload.
/// A set operation starts from the operator's location and is combined (via
/// `union(with:)`) with the first left-hand member and the last right-hand
/// member, when those exist.
public var location: SourceLocation {
  switch self {
  case .custom(let customClass):
    return customClass.location
  case .range(let range):
    return range.location
  case .atom(let atom):
    return atom.location
  case .quote(let quote):
    return quote.location
  case .trivia(let trivia):
    return trivia.location
  case .setOperation(let lhsMembers, let op, let rhsMembers):
    // Either operand list may be empty, so only the outermost members that
    // are actually present contribute to the result.
    let outermost = [lhsMembers.first, rhsMembers.last].compactMap { $0 }
    return outermost.reduce(op.location) { $0.union(with: $1.location) }
  }
}
}

extension AST.CustomCharacterClass {
Expand Down
5 changes: 4 additions & 1 deletion Sources/_RegexParser/Regex/Parse/Diagnostics.swift
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ enum ParseError: Error, Hashable {
case invalidNamedReference(String)
case duplicateNamedCapture(String)
case invalidCharacterClassRangeOperand
case unsupportedDotNetSubtraction
case invalidQuantifierRange(Int, Int)
case invalidCharacterRange(from: Character, to: Character)
case notQuantifiable
Expand Down Expand Up @@ -174,7 +175,9 @@ extension ParseError: CustomStringConvertible {
case .expectedCustomCharacterClassMembers:
return "expected custom character class members"
case .invalidCharacterClassRangeOperand:
return "invalid character class range"
return "invalid bound for character class range"
case .unsupportedDotNetSubtraction:
return "subtraction with '-' is unsupported; use '--' instead"
case .emptyProperty:
return "empty property"
case .unknownProperty(let key, let value):
Expand Down
19 changes: 19 additions & 0 deletions Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1245,6 +1245,25 @@ extension Source {
return nil
}

/// Checks whether a .NET-style character class subtraction could be lexed
/// from the current position.
///
///     DotNetSubtraction -> Trivia* '-' Trivia* CustomCharClass
///
/// The check runs inside `lookahead`, so presumably nothing is consumed from
/// the receiver (this method is non-mutating).
///
/// - Parameter context: The parsing context forwarded to `lexTrivia`.
/// - Returns: The location of the `-` if it is followed (ignoring trivia) by
///   the start of a custom character class, otherwise `nil`.
func canLexDotNetCharClassSubtraction(
  context: ParsingContext
) -> SourceLocation? {
  return lookahead { probe in
    // Skip any leading trivia; `try?` also ends the skip if lexing throws.
    while (try? probe.lexTrivia(context: context)) != nil {}
    guard let dashLocation = probe.tryEatWithLoc("-") else { return nil }

    // Skip trivia between the '-' and whatever follows it.
    while (try? probe.lexTrivia(context: context)) != nil {}

    // Only a '-' directly preceding a custom character class counts as a
    // .NET subtraction.
    return probe.lexCustomCCStart() == nil ? nil : dashLocation
  }
}

private mutating func lexPOSIXCharacterProperty(
) throws -> Located<AST.Atom.CharacterProperty>? {
try recordLoc { src in
Expand Down
124 changes: 83 additions & 41 deletions Sources/_RegexParser/Regex/Parse/Parse.swift
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,12 @@ extension Parser {
var members: Array<Member> = []
try parseCCCMembers(into: &members)

// Make sure we have at least one semantic member.
if members.none(\.isSemantic) {
throw Source.LocatedError(
ParseError.expectedCustomCharacterClassMembers, start.location)
}

// If we have a binary set operator, parse it and the next members. Note
// that this means we left associate for a chain of operators.
// TODO: We may want to diagnose and require users to disambiguate, at least
Expand All @@ -511,16 +517,12 @@ extension Parser {
var rhs: Array<Member> = []
try parseCCCMembers(into: &rhs)

if members.none(\.isSemantic) || rhs.none(\.isSemantic) {
if rhs.none(\.isSemantic) {
throw Source.LocatedError(
ParseError.expectedCustomCharacterClassMembers, start.location)
}
members = [.setOperation(members, binOp, rhs)]
}
if members.none(\.isSemantic) {
throw Source.LocatedError(
ParseError.expectedCustomCharacterClassMembers, start.location)
}
try source.expect("]")
return CustomCC(start, members, loc(start.location.start))
}
Expand Down Expand Up @@ -550,48 +552,88 @@ extension Parser {
return nil
}

mutating func parseCCCMembers(
into members: inout Array<CustomCC.Member>
/// Attempt to parse a custom character class range into `members`, or regular
/// members if a range cannot be formed.
mutating func parsePotentialCCRange(
into members: inout [CustomCC.Member]
) throws {
// Parse members until we see the end of the custom char class or an
// operator.
while let member = try parseCCCMember() {
members.append(member)

// If we have an atom, we can try to parse a character class range. Each
// time we parse a component of the range, we append to `members` in case
// it ends up not being a range, and we bail. If we succeed in parsing, we
// remove the intermediate members.
if case .atom(let lhs) = member {
let membersBeforeRange = members.count - 1

while let t = try source.lexTrivia(context: context) {
members.append(.trivia(t))
}
guard let lhs = members.last, lhs.isSemantic else { return }

// Try and see if we can parse a character class range. Each time we parse
// a component of the range, we append to `members` in case it ends up not
// being a range, and we bail. If we succeed in parsing, we remove the
// intermediate members.
let membersBeforeRange = members.count - 1
while let t = try source.lexTrivia(context: context) {
members.append(.trivia(t))
}
guard let dash = source.lexCustomCharacterClassRangeOperator() else {
return
}

guard let dash = source.lexCustomCharacterClassRangeOperator() else {
continue
}
// If we can't parse a range, '-' becomes literal, e.g `[6-]`.
members.append(.atom(.init(.char("-"), dash)))
// If we can't parse a range, '-' becomes literal, e.g `[6-]`.
members.append(.atom(.init(.char("-"), dash)))

while let t = try source.lexTrivia(context: context) {
members.append(.trivia(t))
while let t = try source.lexTrivia(context: context) {
members.append(.trivia(t))
}
guard let rhs = try parseCCCMember() else { return }
members.append(rhs)

func makeOperand(_ m: CustomCC.Member, isLHS: Bool) throws -> AST.Atom {
switch m {
case .atom(let a):
return a
case .custom:
// Not supported. While .NET allows `x-[...]` to spell subtraction, we
// require `x--[...]`. We also ban `[...]-x` for consistency.
if isLHS {
throw Source.LocatedError(
ParseError.invalidCharacterClassRangeOperand, m.location)
} else {
throw Source.LocatedError(
ParseError.unsupportedDotNetSubtraction, m.location)
}
guard let rhs = try parseCCCMember() else { continue }
members.append(rhs)

guard case let .atom(rhs) = rhs else { continue }

// We've successfully parsed an atom LHS and RHS, so form a range,
// collecting the trivia we've parsed, and replacing the members that
// would have otherwise been added to the custom character class.
let rangeMemberCount = members.count - membersBeforeRange
let trivia = members.suffix(rangeMemberCount).compactMap(\.asTrivia)
members.removeLast(rangeMemberCount)
members.append(.range(.init(lhs, dash, rhs, trivia: trivia)))
case .quote:
// Currently unsupported, we need to figure out what the semantics
// would be for grapheme/scalar modes.
throw Source.LocatedError(
ParseError.unsupported("range with quoted sequence"), m.location)
case .trivia:
throw Unreachable("Should have been lexed separately")
case .range, .setOperation:
throw Unreachable("Parsed later")
}
}
let lhsOp = try makeOperand(lhs, isLHS: true)
let rhsOp = try makeOperand(rhs, isLHS: false)

// We've successfully parsed an atom LHS and RHS, so form a range,
// collecting the trivia we've parsed, and replacing the members that
// would have otherwise been added to the custom character class.
let rangeMemberCount = members.count - membersBeforeRange
let trivia = members.suffix(rangeMemberCount).compactMap(\.asTrivia)
members.removeLast(rangeMemberCount)
members.append(.range(.init(lhsOp, dash, rhsOp, trivia: trivia)))

// We need to specially check whether we can lex a .NET character class
// subtraction here, since e.g. `[a-c-[...]]` is allowed in .NET. Otherwise
// we'd treat the second `-` as a literal.
if let dashLoc = source.canLexDotNetCharClassSubtraction(context: context) {
throw Source.LocatedError(
ParseError.unsupportedDotNetSubtraction, dashLoc)
}
}

/// Parses custom character class members into `members`, stopping when
/// `parseCCCMember()` yields no further member.
///
/// After each member is appended, `parsePotentialCCRange(into:)` is given the
/// chance to fold it into a range together with whatever follows.
///
/// - Parameter members: The array that receives the parsed members.
/// - Throws: Any error thrown by `parseCCCMember()` or
///   `parsePotentialCCRange(into:)`.
mutating func parseCCCMembers(
  into members: inout Array<CustomCC.Member>
) throws {
  while true {
    // No further member: we've reached the end of the custom character class
    // or an operator.
    guard let next = try parseCCCMember() else { return }
    members.append(next)
    try parsePotentialCCRange(into: &members)
  }
}
}

Expand Down
65 changes: 64 additions & 1 deletion Tests/RegexTests/ParseTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -515,10 +515,36 @@ extension RegexTests {

parseTest(
"[a-b-c]", charClass(range_m("a", "b"), "-", "c"))
parseTest(
"[a-b-c-d]", charClass(range_m("a", "b"), "-", range_m("c", "d")))

parseTest("[a-c---]", charClass(
setOp(range_m("a", "c"), op: .subtraction, "-")
))

parseTest("(?x)[a-c -- -]", concat(
changeMatchingOptions(matchingOptions(adding: .extended)),
charClass(setOp(range_m("a", "c"), op: .subtraction, "-"))
))

parseTest("(?x)[a-c - - -]", concat(
changeMatchingOptions(matchingOptions(adding: .extended)),
charClass(range_m("a", "c"), range_m("-", "-"))
))

parseTest("[-a-]", charClass("-", "a", "-"))
parseTest("[[a]-]", charClass(charClass("a"), "-"))
parseTest("[[a]-b]", charClass(charClass("a"), "-", "b"))
parseTest("[-[a]]", charClass("-", charClass("a")))

parseTest(#"(?x)[ -[b]]"#, concat(
changeMatchingOptions(matchingOptions(adding: .extended)),
charClass("-", charClass("b"))
))

parseTest(#"[ - [ ]]"#, charClass(range_m(" ", " "), charClass(" ")))
parseTest(#"[ - [ ] ]"#, charClass(range_m(" ", " "), charClass(" "), " "))

parseTest(#"[a-c-\Qd\E]"#, charClass(range_m("a", "c"), "-", quote_m("d")))

parseTest("[a-z]", charClass(range_m("a", "z")))
parseTest("[a-a]", charClass(range_m("a", "a")))
Expand Down Expand Up @@ -2688,6 +2714,32 @@ extension RegexTests {
diagnosticTest("[[:=:]]", .emptyProperty)

diagnosticTest(#"|([\d-c])?"#, .invalidCharacterClassRangeOperand)
diagnosticTest("[[a]-b]", .invalidCharacterClassRangeOperand)

// .NET subtraction is banned, we require explicit '--'.
diagnosticTest("[a-[b]]", .unsupportedDotNetSubtraction)
diagnosticTest(#"[abc-[def]]"#, .unsupportedDotNetSubtraction)
diagnosticTest(#"[abc-[^def]]"#, .unsupportedDotNetSubtraction)
diagnosticTest(#"[\d\u{0}[a]-[b-[c]]]"#, .unsupportedDotNetSubtraction)
diagnosticTest("[a-z-[d-w-[m-o]]]", .unsupportedDotNetSubtraction)
diagnosticTest(#"[a-[:b]]"#, .unsupportedDotNetSubtraction)
diagnosticTest(#"[[a]-[b]]"#, .invalidCharacterClassRangeOperand)
diagnosticTest(#"[ -[ ]]"#, .unsupportedDotNetSubtraction)
diagnosticTest(#"(?x)[a - [b] ]"#, .unsupportedDotNetSubtraction)

diagnosticTest(#"[a-[]]"#, .expectedCustomCharacterClassMembers)
diagnosticTest(#"[-[]]"#, .expectedCustomCharacterClassMembers)
diagnosticTest(#"(?x)[ - [ ] ]"#, .expectedCustomCharacterClassMembers)
diagnosticTest(#"(?x)[a-[ ] ]"#, .expectedCustomCharacterClassMembers)
diagnosticTest(#"[a-[:digit:]]"#, .invalidCharacterClassRangeOperand)

diagnosticTest("[--]", .expectedCustomCharacterClassMembers)
diagnosticTest("[---]", .expectedCustomCharacterClassMembers)
diagnosticTest("[----]", .expectedCustomCharacterClassMembers)

// Quoted sequences aren't currently supported as range operands.
diagnosticTest(#"[a-\Qbc\E]"#, .unsupported("range with quoted sequence"))
diagnosticTest(#"[\Qbc\E-de]"#, .unsupported("range with quoted sequence"))

diagnosticTest(#"[_-A]"#, .invalidCharacterRange(from: "_", to: "A"))
diagnosticTest(#"(?i)[_-A]"#, .invalidCharacterRange(from: "_", to: "A"))
Expand Down Expand Up @@ -2873,6 +2925,17 @@ extension RegexTests {
/#
"""#, .quoteMayNotSpanMultipleLines)

// .NET subtraction
diagnosticWithDelimitersTest(#"""
#/
[
a # interesting
- #a
[ b] # comment
]
/#
"""#, .unsupportedDotNetSubtraction)

// MARK: Group specifiers

diagnosticTest(#"(*"#, .unknownGroupKind("*"))
Expand Down