Skip to content

Commit 4328e73

Browse files
committed
Allow trivia between character class range operands
Factor out the logic that deals with parsing an individual character class member, and interleave `lexTrivia` calls between range operand parsing.
1 parent a6bf9b8 commit 4328e73

File tree

6 files changed

+100
-45
lines changed

6 files changed

+100
-45
lines changed

Sources/_RegexParser/Regex/AST/CustomCharClass.swift

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,11 +51,16 @@ extension AST {
5151
public var lhs: Atom
5252
public var dashLoc: SourceLocation
5353
public var rhs: Atom
54+
public var trivia: [AST.Trivia]
5455

55-
public init(_ lhs: Atom, _ dashLoc: SourceLocation, _ rhs: Atom) {
56+
public init(
57+
_ lhs: Atom, _ dashLoc: SourceLocation, _ rhs: Atom,
58+
trivia: [AST.Trivia]
59+
) {
5660
self.lhs = lhs
5761
self.dashLoc = dashLoc
5862
self.rhs = rhs
63+
self.trivia = trivia
5964
}
6065
}
6166
public enum SetOp: String, Hashable {
@@ -95,6 +100,11 @@ extension CustomCC.Member {
95100
return false
96101
}
97102

103+
public var asTrivia: AST.Trivia? {
104+
guard case .trivia(let t) = self else { return nil }
105+
return t
106+
}
107+
98108
public var isSemantic: Bool {
99109
!isTrivia
100110
}

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 5 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2039,20 +2039,11 @@ extension Source {
20392039
return AST.Atom(kind.value, kind.location)
20402040
}
20412041

2042-
/// Try to lex the end of a range in a custom character class, which consists
2043-
/// of a '-' character followed by an atom.
2044-
mutating func lexCustomCharClassRangeEnd(
2045-
context: ParsingContext
2046-
) throws -> (dashLoc: SourceLocation, AST.Atom)? {
2047-
// Make sure we don't have a binary operator e.g '--', and the '-' is not
2048-
// ending the custom character class (in which case it is literal).
2049-
guard peekCCBinOp() == nil, !starts(with: "-]"),
2050-
let dash = tryEatWithLoc("-"),
2051-
let end = try lexAtom(context: context)
2052-
else {
2053-
return nil
2054-
}
2055-
return (dash, end)
2042+
/// Try to lex the range operator '-' for a custom character class.
2043+
mutating func lexCustomCharacterClassRangeOperator() -> SourceLocation? {
2044+
// Eat a '-', making sure we don't have a binary op such as '--'.
2045+
guard peekCCBinOp() == nil else { return nil }
2046+
return tryEatWithLoc("-")
20562047
}
20572048

20582049
/// Try to consume a newline sequence matching option kind.

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 58 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -495,43 +495,72 @@ extension Parser {
495495
return CustomCC(start, members, loc(start.location.start))
496496
}
497497

498+
mutating func parseCCCMember() throws -> CustomCC.Member? {
499+
guard !source.isEmpty && source.peek() != "]" && source.peekCCBinOp() == nil
500+
else { return nil }
501+
502+
// Nested custom character class.
503+
if let cccStart = source.lexCustomCCStart() {
504+
return .custom(try parseCustomCharacterClass(cccStart))
505+
}
506+
507+
// Quoted sequence.
508+
if let quote = try source.lexQuote(context: context) {
509+
return .quote(quote)
510+
}
511+
512+
// Lex trivia if we're allowed.
513+
if let trivia = try source.lexTrivia(context: context) {
514+
return .trivia(trivia)
515+
}
516+
517+
if let atom = try source.lexAtom(context: context) {
518+
return .atom(atom)
519+
}
520+
return nil
521+
}
522+
498523
mutating func parseCCCMembers(
499524
into members: inout Array<CustomCC.Member>
500525
) throws {
501526
// Parse members until we see the end of the custom char class or an
502527
// operator.
503-
while !source.isEmpty && source.peek() != "]" &&
504-
source.peekCCBinOp() == nil {
505-
506-
// Nested custom character class.
507-
if let cccStart = source.lexCustomCCStart() {
508-
members.append(.custom(try parseCustomCharacterClass(cccStart)))
509-
continue
510-
}
511-
512-
// Quoted sequence.
513-
if let quote = try source.lexQuote(context: context) {
514-
members.append(.quote(quote))
515-
continue
516-
}
517-
518-
// Lex trivia if we're allowed.
519-
if let trivia = try source.lexTrivia(context: context) {
520-
members.append(.trivia(trivia))
521-
continue
522-
}
528+
while let member = try parseCCCMember() {
529+
members.append(member)
530+
531+
// If we have an atom, we can try to parse a character class range. Each
532+
// time we parse a component of the range, we append to `members` in case
533+
// it ends up not being a range, and we bail. If we succeed in parsing, we
534+
// remove the intermediate members.
535+
if case .atom(let lhs) = member {
536+
let membersBeforeRange = members.count - 1
537+
538+
while let t = try source.lexTrivia(context: context) {
539+
members.append(.trivia(t))
540+
}
523541

524-
guard let atom = try source.lexAtom(context: context) else { break }
542+
guard let dash = source.lexCustomCharacterClassRangeOperator() else {
543+
continue
544+
}
545+
// If we can't parse a range, '-' becomes literal, e.g. `[6-]`.
546+
members.append(.atom(.init(.char("-"), dash)))
525547

526-
// Range between atoms.
527-
if let (dashLoc, rhs) =
528-
try source.lexCustomCharClassRangeEnd(context: context) {
529-
members.append(.range(.init(atom, dashLoc, rhs)))
530-
continue
548+
while let t = try source.lexTrivia(context: context) {
549+
members.append(.trivia(t))
550+
}
551+
guard let rhs = try parseCCCMember() else { continue }
552+
members.append(rhs)
553+
554+
guard case let .atom(rhs) = rhs else { continue }
555+
556+
// We've successfully parsed an atom LHS and RHS, so form a range,
557+
// collecting the trivia we've parsed, and replacing the members that
558+
// would have otherwise been added to the custom character class.
559+
let rangeMemberCount = members.count - membersBeforeRange
560+
let trivia = members.suffix(rangeMemberCount).compactMap(\.asTrivia)
561+
members.removeLast(rangeMemberCount)
562+
members.append(.range(.init(lhs, dash, rhs, trivia: trivia)))
531563
}
532-
533-
members.append(.atom(atom))
534-
continue
535564
}
536565
}
537566
}

Sources/_StringProcessing/Utility/ASTBuilder.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -416,7 +416,7 @@ func prop_m(
416416
func range_m(
417417
_ lower: AST.Atom, _ upper: AST.Atom
418418
) -> AST.CustomCharacterClass.Member {
419-
.range(.init(lower, .fake, upper))
419+
.range(.init(lower, .fake, upper, trivia: []))
420420
}
421421
func range_m(
422422
_ lower: AST.Atom.Kind, _ upper: AST.Atom.Kind

Tests/RegexTests/MatchTests.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -594,6 +594,8 @@ extension RegexTests {
594594
firstMatchTest("[a-z]", input: "123ABCxyz", match: "x")
595595
firstMatchTest("[a-z]", input: "123-abcxyz", match: "a")
596596

597+
firstMatchTest("(?x)[ a - z ]+", input: " 123-abcxyz", match: "abcxyz")
598+
597599
// Character class subtraction
598600
firstMatchTest("[a-d--a-c]", input: "123abcdxyz", match: "d")
599601

Tests/RegexTests/ParseTests.swift

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -517,6 +517,8 @@ extension RegexTests {
517517
"[a-b-c]", charClass(range_m("a", "b"), "-", "c"))
518518

519519
parseTest("[-a-]", charClass("-", "a", "-"))
520+
parseTest("[[a]-]", charClass(charClass("a"), "-"))
521+
parseTest("[[a]-b]", charClass(charClass("a"), "-", "b"))
520522

521523
parseTest("[a-z]", charClass(range_m("a", "z")))
522524
parseTest("[a-a]", charClass(range_m("a", "a")))
@@ -679,6 +681,16 @@ extension RegexTests {
679681
throwsError: .unsupported
680682
)
681683

684+
parseTest(#"(?x)[ a - b ]"#, concat(
685+
changeMatchingOptions(matchingOptions(adding: .extended)),
686+
charClass(range_m("a", "b"))
687+
))
688+
689+
parseTest(#"(?x)[a - b]"#, concat(
690+
changeMatchingOptions(matchingOptions(adding: .extended)),
691+
charClass(range_m("a", "b"))
692+
))
693+
682694
// MARK: Operators
683695

684696
parseTest(
@@ -2120,6 +2132,17 @@ extension RegexTests {
21202132
/#
21212133
"""#, charClass("a", range_m("b", "c"), "d"))
21222134

2135+
parseWithDelimitersTest(#"""
2136+
#/
2137+
[
2138+
a # interesting
2139+
- #a
2140+
b
2141+
]
2142+
/#
2143+
"""#, charClass(range_m("a", "b")))
2144+
2145+
21232146
// MARK: Delimiter skipping: Make sure we can skip over the ending delimiter
21242147
// if it's clear that it's part of the regex syntax.
21252148

0 commit comments

Comments
 (0)