Skip to content

Commit dd333f3

Browse files
committed
Reject .NET subtraction and quote range operands
Tighten up validation of character class range operands such that we reject quotes and custom character classes. This includes rejecting syntax that would be a subtraction in .NET. We throw a custom error that suggests using `--` instead.
1 parent a722b20 commit dd333f3

File tree

5 files changed

+147
-5
lines changed

5 files changed

+147
-5
lines changed

Sources/_RegexParser/Regex/AST/CustomCharClass.swift

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,10 @@ extension AST {
6262
self.rhs = rhs
6363
self.trivia = trivia
6464
}
65+
66+
public var location: SourceLocation {
67+
lhs.location.union(with: rhs.location)
68+
}
6569
}
6670
public enum SetOp: String, Hashable {
6771
case subtraction = "--"
@@ -108,6 +112,25 @@ extension CustomCC.Member {
108112
public var isSemantic: Bool {
109113
!isTrivia
110114
}
115+
116+
public var location: SourceLocation {
117+
switch self {
118+
case let .custom(c): return c.location
119+
case let .range(r): return r.location
120+
case let .atom(a): return a.location
121+
case let .quote(q): return q.location
122+
case let .trivia(t): return t.location
123+
case let .setOperation(lhs, dash, rhs):
124+
var loc = dash.location
125+
if let lhs = lhs.first {
126+
loc = loc.union(with: lhs.location)
127+
}
128+
if let rhs = rhs.last {
129+
loc = loc.union(with: rhs.location)
130+
}
131+
return loc
132+
}
133+
}
111134
}
112135

113136
extension AST.CustomCharacterClass {

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ enum ParseError: Error, Hashable {
9494
case invalidNamedReference(String)
9595
case duplicateNamedCapture(String)
9696
case invalidCharacterClassRangeOperand
97+
case unsupportedDotNetSubtraction
9798
case invalidQuantifierRange(Int, Int)
9899
case invalidCharacterRange(from: Character, to: Character)
99100
case notQuantifiable
@@ -174,7 +175,9 @@ extension ParseError: CustomStringConvertible {
174175
case .expectedCustomCharacterClassMembers:
175176
return "expected custom character class members"
176177
case .invalidCharacterClassRangeOperand:
177-
return "invalid character class range"
178+
return "invalid bound for character class range"
179+
case .unsupportedDotNetSubtraction:
180+
return "subtraction with '-' is unsupported; use '--' instead"
178181
case .emptyProperty:
179182
return "empty property"
180183
case .unknownProperty(let key, let value):

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1245,6 +1245,25 @@ extension Source {
12451245
return nil
12461246
}
12471247

1248+
/// Check to see if we can lex a .NET subtraction. Returns the
1249+
/// location of the `-`.
1250+
///
1251+
/// DotNetSubtraction -> Trivia* '-' Trivia* CustomCharClass
1252+
///
1253+
func canLexDotNetCharClassSubtraction(
1254+
context: ParsingContext
1255+
) -> SourceLocation? {
1256+
lookahead { src in
1257+
// We can lex '-' as a .NET subtraction if it precedes a custom character
1258+
// class.
1259+
while (try? src.lexTrivia(context: context)) != nil {}
1260+
guard let dashLoc = src.tryEatWithLoc("-") else { return nil }
1261+
while (try? src.lexTrivia(context: context)) != nil {}
1262+
guard src.lexCustomCCStart() != nil else { return nil }
1263+
return dashLoc
1264+
}
1265+
}
1266+
12481267
private mutating func lexPOSIXCharacterProperty(
12491268
) throws -> Located<AST.Atom.CharacterProperty>? {
12501269
try recordLoc { src in

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 37 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -557,7 +557,7 @@ extension Parser {
557557
mutating func parsePotentialCCRange(
558558
into members: inout [CustomCC.Member]
559559
) throws {
560-
guard case .atom(let lhs)? = members.last else { return }
560+
guard let lhs = members.last, lhs.isSemantic else { return }
561561

562562
// Try and see if we can parse a character class range. Each time we parse
563563
// a component of the range, we append to `members` in case it ends up not
@@ -580,15 +580,49 @@ extension Parser {
580580
guard let rhs = try parseCCCMember() else { return }
581581
members.append(rhs)
582582

583-
guard case let .atom(rhs) = rhs else { return }
583+
func makeOperand(_ m: CustomCC.Member, isLHS: Bool) throws -> AST.Atom {
584+
switch m {
585+
case .atom(let a):
586+
return a
587+
case .custom:
588+
// Not supported. While .NET allows `x-[...]` to spell subtraction, we
589+
// require `x--[...]`. We also ban `[...]-x` for consistency.
590+
if isLHS {
591+
throw Source.LocatedError(
592+
ParseError.invalidCharacterClassRangeOperand, m.location)
593+
} else {
594+
throw Source.LocatedError(
595+
ParseError.unsupportedDotNetSubtraction, m.location)
596+
}
597+
case .quote:
598+
// Currently unsupported; we need to figure out what the semantics
599+
// would be for grapheme/scalar modes.
600+
throw Source.LocatedError(
601+
ParseError.unsupported("range with quoted sequence"), m.location)
602+
case .trivia:
603+
throw Unreachable("Should have been lexed separately")
604+
case .range, .setOperation:
605+
throw Unreachable("Parsed later")
606+
}
607+
}
608+
let lhsOp = try makeOperand(lhs, isLHS: true)
609+
let rhsOp = try makeOperand(rhs, isLHS: false)
584610

585611
// We've successfully parsed an atom LHS and RHS, so form a range,
586612
// collecting the trivia we've parsed, and replacing the members that
587613
// would have otherwise been added to the custom character class.
588614
let rangeMemberCount = members.count - membersBeforeRange
589615
let trivia = members.suffix(rangeMemberCount).compactMap(\.asTrivia)
590616
members.removeLast(rangeMemberCount)
591-
members.append(.range(.init(lhs, dash, rhs, trivia: trivia)))
617+
members.append(.range(.init(lhsOp, dash, rhsOp, trivia: trivia)))
618+
619+
// We need to specially check if we can lex a .NET character class
620+
// subtraction here, as e.g. `[a-c-[...]]` is allowed in .NET. Otherwise we'd
621+
// treat the second `-` as literal.
622+
if let dashLoc = source.canLexDotNetCharClassSubtraction(context: context) {
623+
throw Source.LocatedError(
624+
ParseError.unsupportedDotNetSubtraction, dashLoc)
625+
}
592626
}
593627

594628
mutating func parseCCCMembers(

Tests/RegexTests/ParseTests.swift

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -515,10 +515,36 @@ extension RegexTests {
515515

516516
parseTest(
517517
"[a-b-c]", charClass(range_m("a", "b"), "-", "c"))
518+
parseTest(
519+
"[a-b-c-d]", charClass(range_m("a", "b"), "-", range_m("c", "d")))
520+
521+
parseTest("[a-c---]", charClass(
522+
setOp(range_m("a", "c"), op: .subtraction, "-")
523+
))
524+
525+
parseTest("(?x)[a-c -- -]", concat(
526+
changeMatchingOptions(matchingOptions(adding: .extended)),
527+
charClass(setOp(range_m("a", "c"), op: .subtraction, "-"))
528+
))
529+
530+
parseTest("(?x)[a-c - - -]", concat(
531+
changeMatchingOptions(matchingOptions(adding: .extended)),
532+
charClass(range_m("a", "c"), range_m("-", "-"))
533+
))
518534

519535
parseTest("[-a-]", charClass("-", "a", "-"))
520536
parseTest("[[a]-]", charClass(charClass("a"), "-"))
521-
parseTest("[[a]-b]", charClass(charClass("a"), "-", "b"))
537+
parseTest("[-[a]]", charClass("-", charClass("a")))
538+
539+
parseTest(#"(?x)[ -[b]]"#, concat(
540+
changeMatchingOptions(matchingOptions(adding: .extended)),
541+
charClass("-", charClass("b"))
542+
))
543+
544+
parseTest(#"[ - [ ]]"#, charClass(range_m(" ", " "), charClass(" ")))
545+
parseTest(#"[ - [ ] ]"#, charClass(range_m(" ", " "), charClass(" "), " "))
546+
547+
parseTest(#"[a-c-\Qd\E]"#, charClass(range_m("a", "c"), "-", quote_m("d")))
522548

523549
parseTest("[a-z]", charClass(range_m("a", "z")))
524550
parseTest("[a-a]", charClass(range_m("a", "a")))
@@ -2688,6 +2714,32 @@ extension RegexTests {
26882714
diagnosticTest("[[:=:]]", .emptyProperty)
26892715

26902716
diagnosticTest(#"|([\d-c])?"#, .invalidCharacterClassRangeOperand)
2717+
diagnosticTest("[[a]-b]", .invalidCharacterClassRangeOperand)
2718+
2719+
// .NET subtraction is banned; we require an explicit '--'.
2720+
diagnosticTest("[a-[b]]", .unsupportedDotNetSubtraction)
2721+
diagnosticTest(#"[abc-[def]]"#, .unsupportedDotNetSubtraction)
2722+
diagnosticTest(#"[abc-[^def]]"#, .unsupportedDotNetSubtraction)
2723+
diagnosticTest(#"[\d\u{0}[a]-[b-[c]]]"#, .unsupportedDotNetSubtraction)
2724+
diagnosticTest("[a-z-[d-w-[m-o]]]", .unsupportedDotNetSubtraction)
2725+
diagnosticTest(#"[a-[:b]]"#, .unsupportedDotNetSubtraction)
2726+
diagnosticTest(#"[[a]-[b]]"#, .invalidCharacterClassRangeOperand)
2727+
diagnosticTest(#"[ -[ ]]"#, .unsupportedDotNetSubtraction)
2728+
diagnosticTest(#"(?x)[a - [b] ]"#, .unsupportedDotNetSubtraction)
2729+
2730+
diagnosticTest(#"[a-[]]"#, .expectedCustomCharacterClassMembers)
2731+
diagnosticTest(#"[-[]]"#, .expectedCustomCharacterClassMembers)
2732+
diagnosticTest(#"(?x)[ - [ ] ]"#, .expectedCustomCharacterClassMembers)
2733+
diagnosticTest(#"(?x)[a-[ ] ]"#, .expectedCustomCharacterClassMembers)
2734+
diagnosticTest(#"[a-[:digit:]]"#, .invalidCharacterClassRangeOperand)
2735+
2736+
diagnosticTest("[--]", .expectedCustomCharacterClassMembers)
2737+
diagnosticTest("[---]", .expectedCustomCharacterClassMembers)
2738+
diagnosticTest("[----]", .expectedCustomCharacterClassMembers)
2739+
2740+
// Quoted sequences aren't currently supported as range operands.
2741+
diagnosticTest(#"[a-\Qbc\E]"#, .unsupported("range with quoted sequence"))
2742+
diagnosticTest(#"[\Qbc\E-de]"#, .unsupported("range with quoted sequence"))
26912743

26922744
diagnosticTest(#"[_-A]"#, .invalidCharacterRange(from: "_", to: "A"))
26932745
diagnosticTest(#"(?i)[_-A]"#, .invalidCharacterRange(from: "_", to: "A"))
@@ -2873,6 +2925,17 @@ extension RegexTests {
28732925
/#
28742926
"""#, .quoteMayNotSpanMultipleLines)
28752927

2928+
// .NET subtraction
2929+
diagnosticWithDelimitersTest(#"""
2930+
#/
2931+
[
2932+
a # interesting
2933+
- #a
2934+
[ b] # comment
2935+
]
2936+
/#
2937+
"""#, .unsupportedDotNetSubtraction)
2938+
28762939
// MARK: Group specifiers
28772940

28782941
diagnosticTest(#"(*"#, .unknownGroupKind("*"))

0 commit comments

Comments
 (0)