Skip to content

Commit 120ffc9

Browse files
committed
Fix end-of-line-comment lexing
Previously we would just lex to the end of the input, as it was assumed that only single-line regexes would be supported. Update the implementation to handle multi-line regexes, and take the PCRE global newline options into account.
1 parent a96648b commit 120ffc9

File tree

5 files changed

+250
-6
lines changed

5 files changed

+250
-6
lines changed

Sources/_RegexParser/Regex/AST/MatchingOptions.swift

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,8 @@ extension AST {
137137
/// Global matching option specifiers. Unlike `MatchingOptionSequence`,
138138
/// these must appear at the start of the pattern, and apply globally.
139139
public struct GlobalMatchingOption: _ASTNode, Hashable {
140-
/// Determines the definition of a newline for the '.' character class.
140+
/// Determines the definition of a newline for the '.' character class and
141+
/// when parsing end-of-line comments.
141142
public enum NewlineMatching: Hashable {
142143
/// (*CR*)
143144
case carriageReturnOnly

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -528,11 +528,27 @@ extension Source {
528528
return try src.expectQuoted(endingWith: "*/").value
529529
}
530530
if context.endOfLineComments, src.tryEat("#") {
531-
// TODO: If we ever support multi-line regex literals, this will need
532-
// to be updated to stop at a newline. Note though that PCRE specifies
533-
// that the newline it matches against can be controlled by the global
534-
// matching options e.g `(*CR)`, `(*ANY)`, ...
535-
return src.lexUntil(\.isEmpty).value
531+
// Eat until we either exhaust the input, or hit a newline. Note
532+
// that the definition of newline can be altered depending on the global
533+
// matching options. By default we consider a newline to be `\n`, `\r`, or
534+
// `\r\n`.
535+
return src.lexUntil { src in
536+
if src.isEmpty { return true }
537+
switch context.newlineMode {
538+
case .carriageReturnOnly:
539+
return src.tryEat("\r")
540+
case .linefeedOnly:
541+
return src.tryEat("\n")
542+
case .carriageAndLinefeedOnly:
543+
return src.tryEat("\r\n")
544+
case .anyCarriageReturnOrLinefeed:
545+
return src.tryEat(anyOf: "\r", "\n", "\r\n") != nil
546+
case .anyUnicode:
547+
return src.tryEat(where: \.isNewline)
548+
case .nulCharacter:
549+
return src.tryEat("\0")
550+
}
551+
}.value
536552
}
537553
return nil
538554
}

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,10 @@ struct ParsingContext {
7676
/// The syntax options currently set.
7777
fileprivate(set) var syntax: SyntaxOptions
7878

79+
/// The current newline matching mode.
80+
fileprivate(set) var newlineMode: AST.GlobalMatchingOption.NewlineMatching
81+
= .anyCarriageReturnOrLinefeed
82+
7983
fileprivate mutating func recordGroup(_ g: AST.Group.Kind) {
8084
// TODO: Needs to track group number resets (?|...).
8185
priorGroupCount += 1
@@ -139,6 +143,15 @@ extension Parser {
139143
// First parse any global matching options if present.
140144
let opts = try source.lexGlobalMatchingOptionSequence()
141145

146+
// If we have a newline mode global option, update the context accordingly.
147+
if let opts = opts {
148+
for opt in opts.options.reversed() {
149+
guard case .newlineMatching(let newline) = opt.kind else { continue }
150+
context.newlineMode = newline
151+
break
152+
}
153+
}
154+
142155
// Then parse the root AST node.
143156
let ast = try parseNode()
144157
guard source.isEmpty else {

Sources/_RegexParser/Regex/Parse/Source.swift

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,12 @@ extension Source {
6868
return true
6969
}
7070

71+
/// Attempts to consume the next character if it satisfies `pred`.
///
/// - Parameter pred: A predicate applied to the character returned by
///   `peek()`.
/// - Returns: `true` if `pred` accepted the character and `advance()` was
///   called; `false` if `peek()` returned `nil` or `pred` rejected the
///   character, in which case `advance()` is not called.
/// - Throws: Rethrows any error thrown by `pred`.
mutating func tryEat(where pred: (Char) throws -> Bool) rethrows -> Bool {
72+
guard let next = peek(), try pred(next) else { return false }
73+
advance()
74+
return true
75+
}
76+
7177
mutating func tryEat<C: Collection>(sequence c: C) -> Bool
7278
where C.Element == Char {
7379
guard _slice.starts(with: c) else { return false }

Tests/RegexTests/ParseTests.swift

Lines changed: 208 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1526,6 +1526,214 @@ extension RegexTests {
15261526
matchingOptions(adding: .extended), isIsolated: true, charClass("a", "b"))
15271527
)
15281528

1529+
// Test multi-line comment handling.
1530+
parseTest(
1531+
"""
1532+
# a
1533+
bc # d
1534+
ef# g
1535+
# h
1536+
""",
1537+
concat("b", "c", "e", "f"),
1538+
syntax: .extendedSyntax
1539+
)
1540+
parseTest(
1541+
"""
1542+
# a\r\
1543+
bc # d\r\
1544+
ef# g\r\
1545+
# h\r
1546+
""",
1547+
concat("b", "c", "e", "f"),
1548+
syntax: .extendedSyntax
1549+
)
1550+
parseTest(
1551+
"""
1552+
# a\r\
1553+
bc # d\r\
1554+
ef# g\r\
1555+
# h\r
1556+
""",
1557+
concat("b", "c", "e", "f"),
1558+
syntax: .extendedSyntax
1559+
)
1560+
parseTest(
1561+
"""
1562+
# a\r
1563+
bc # d\r
1564+
ef# g\r
1565+
# h\r
1566+
""",
1567+
concat("b", "c", "e", "f"),
1568+
syntax: .extendedSyntax
1569+
)
1570+
parseTest(
1571+
"""
1572+
# a\n\r\
1573+
bc # d\n\r\
1574+
ef# g\n\r\
1575+
# h\n\r
1576+
""",
1577+
concat("b", "c", "e", "f"),
1578+
syntax: .extendedSyntax
1579+
)
1580+
parseTest(
1581+
"""
1582+
(*CR)
1583+
# a
1584+
bc # d
1585+
ef# g
1586+
# h
1587+
""",
1588+
ast(empty(), opts: .newlineMatching(.carriageReturnOnly)),
1589+
syntax: .extendedSyntax
1590+
)
1591+
parseTest(
1592+
"""
1593+
(*CR)\r\
1594+
# a\r\
1595+
bc # d\r\
1596+
ef# g\r\
1597+
# h
1598+
""",
1599+
ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageReturnOnly)),
1600+
syntax: .extendedSyntax
1601+
)
1602+
parseTest(
1603+
"""
1604+
(*LF)
1605+
# a
1606+
bc # d
1607+
ef# g
1608+
# h
1609+
""",
1610+
ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.linefeedOnly)),
1611+
syntax: .extendedSyntax
1612+
)
1613+
parseTest(
1614+
"""
1615+
(*CRLF)
1616+
# a
1617+
bc # d
1618+
ef# g
1619+
# h
1620+
""",
1621+
ast(empty(), opts: .newlineMatching(.carriageAndLinefeedOnly)),
1622+
syntax: .extendedSyntax
1623+
)
1624+
parseTest(
1625+
"""
1626+
(*CRLF)
1627+
# a\r
1628+
bc # d\r
1629+
ef# g\r
1630+
# h
1631+
""",
1632+
ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.carriageAndLinefeedOnly)),
1633+
syntax: .extendedSyntax
1634+
)
1635+
parseTest(
1636+
"""
1637+
(*ANYCRLF)
1638+
# a
1639+
bc # d
1640+
ef# g
1641+
# h
1642+
""",
1643+
ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)),
1644+
syntax: .extendedSyntax
1645+
)
1646+
parseTest(
1647+
"""
1648+
(*ANYCRLF)
1649+
# a\r\
1650+
bc # d\r\
1651+
ef# g\r\
1652+
# h
1653+
""",
1654+
ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)),
1655+
syntax: .extendedSyntax
1656+
)
1657+
parseTest(
1658+
"""
1659+
(*ANYCRLF)
1660+
# a\r
1661+
bc # d\r
1662+
ef# g\r
1663+
# h
1664+
""",
1665+
ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyCarriageReturnOrLinefeed)),
1666+
syntax: .extendedSyntax
1667+
)
1668+
parseTest(
1669+
"""
1670+
(*ANY)
1671+
# a
1672+
bc # d
1673+
ef# g
1674+
# h
1675+
""",
1676+
ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)),
1677+
syntax: .extendedSyntax
1678+
)
1679+
parseTest(
1680+
"""
1681+
# a\u{2028}\
1682+
bc # d
1683+
ef# g\u{2028}\
1684+
# h
1685+
""",
1686+
concat("e", "f"),
1687+
syntax: .extendedSyntax
1688+
)
1689+
parseTest(
1690+
"""
1691+
(*ANY)
1692+
# a\u{2028}\
1693+
bc # d\u{2028}\
1694+
ef# g\u{2028}\
1695+
# h
1696+
""",
1697+
ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.anyUnicode)),
1698+
syntax: .extendedSyntax
1699+
)
1700+
parseTest(
1701+
"""
1702+
(*NUL)
1703+
# a
1704+
bc # d\0\
1705+
ef# g
1706+
# h
1707+
""",
1708+
ast(concat("e", "f"), opts: .newlineMatching(.nulCharacter)),
1709+
syntax: .extendedSyntax
1710+
)
1711+
parseTest(
1712+
"""
1713+
(*NUL)
1714+
# a\0\
1715+
bc # d\0\
1716+
ef# g\0\
1717+
# h
1718+
""",
1719+
ast(concat("b", "c", "e", "f"), opts: .newlineMatching(.nulCharacter)),
1720+
syntax: .extendedSyntax
1721+
)
1722+
parseTest(
1723+
"""
1724+
(*CR)(*NUL)
1725+
# a\0\
1726+
bc # d\0\
1727+
ef# g\0\
1728+
# h
1729+
""",
1730+
ast(concat("b", "c", "e", "f"),
1731+
opts: .newlineMatching(.carriageReturnOnly),
1732+
.newlineMatching(.nulCharacter)
1733+
),
1734+
syntax: .extendedSyntax
1735+
)
1736+
15291737
// MARK: Parse with delimiters
15301738

15311739
parseWithDelimitersTest("#/a b/#", concat("a", " ", "b"))

0 commit comments

Comments
 (0)