Skip to content

Commit 3e848aa

Browse files
committed
Lex whitespace in range quantifiers
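PCRE does not allow whitespace inside a range quantifier; if whitespace is present, it treats the whole sequence as literal instead. However, this behavior is quite unintuitive. Instead, lex whitespace around the range operands (before and after each bound) and record it as trivia, so that e.g. `x{ 3 , 5 }` parses as the range quantifier `x{3,5}`.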
1 parent d27b896 commit 3e848aa

File tree

3 files changed

+27
-9
lines changed

3 files changed

+27
-9
lines changed

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -430,7 +430,7 @@ extension Source {
430430

431431
return try src.tryEating { src in
432432
guard src.tryEat("{"),
433-
let range = try src.lexRange(context: context),
433+
let range = try src.lexRange(context: context, trivia: &trivia),
434434
src.tryEat("}")
435435
else { return nil }
436436
return range.value
@@ -456,11 +456,17 @@ extension Source {
456456
/// | ExpRange
457457
/// ExpRange -> '..<' <Int> | '...' <Int>
458458
/// | <Int> '..<' <Int> | <Int> '...' <Int>?
459-
mutating func lexRange(context: ParsingContext) throws -> Located<Quant.Amount>? {
459+
mutating func lexRange(
460+
context: ParsingContext, trivia: inout [AST.Trivia]
461+
) throws -> Located<Quant.Amount>? {
460462
try recordLoc { src in
461463
try src.tryEating { src in
464+
if let t = src.lexWhitespace() { trivia.append(t) }
465+
462466
let lowerOpt = try src.lexNumber()
463467

468+
if let t = src.lexWhitespace() { trivia.append(t) }
469+
464470
// ',' or '...' or '..<' or nothing
465471
// TODO: We ought to try and consume whitespace here and emit a
466472
// diagnostic for the user warning them that it would cause the range to
@@ -480,11 +486,15 @@ extension Source {
480486
closedRange = nil
481487
}
482488

489+
if let t = src.lexWhitespace() { trivia.append(t) }
490+
483491
let upperOpt = try src.lexNumber()?.map { upper in
484492
// If we have an open range, the upper bound should be adjusted down.
485493
closedRange == true ? upper : upper - 1
486494
}
487495

496+
if let t = src.lexWhitespace() { trivia.append(t) }
497+
488498
switch (lowerOpt, closedRange, upperOpt) {
489499
case let (l?, nil, nil):
490500
return .exactly(l)

Tests/RegexTests/MatchTests.swift

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -341,8 +341,12 @@ extension RegexTests {
341341

342342
firstMatchTest(
343343
#"a{1,2}"#, input: "123aaaxyz", match: "aa")
344+
firstMatchTest(
345+
#"a{ 1 , 2 }"#, input: "123aaaxyz", match: "aa")
344346
firstMatchTest(
345347
#"a{,2}"#, input: "123aaaxyz", match: "")
348+
firstMatchTest(
349+
#"a{ , 2 }"#, input: "123aaaxyz", match: "")
346350
firstMatchTest(
347351
#"a{,2}x"#, input: "123aaaxyz", match: "aax")
348352
firstMatchTest(
@@ -351,6 +355,8 @@ extension RegexTests {
351355
#"a{2,}"#, input: "123aaaxyz", match: "aaa")
352356
firstMatchTest(
353357
#"a{1}"#, input: "123aaaxyz", match: "a")
358+
firstMatchTest(
359+
#"a{ 1 }"#, input: "123aaaxyz", match: "a")
354360
firstMatchTest(
355361
#"a{1,2}?"#, input: "123aaaxyz", match: "a")
356362
firstMatchTest(

Tests/RegexTests/ParseTests.swift

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -832,6 +832,10 @@ extension RegexTests {
832832
#"a{1,1}"#,
833833
quantRange(1...1, of: "a"))
834834

835+
parseTest("x{3, 5}", quantRange(3 ... 5, of: "x"))
836+
parseTest("x{ 3 , 5 }", quantRange(3 ... 5, of: "x"))
837+
parseTest("x{3 }", exactly(3, of: "x"))
838+
835839
// Make sure ranges get treated as literal if invalid.
836840
parseTest("{", "{")
837841
parseTest("{,", concat("{", ","))
@@ -851,11 +855,6 @@ extension RegexTests {
851855
parseTest("x{+", concat("x", oneOrMore(of: "{")))
852856
parseTest("x{6,+", concat("x", "{", "6", oneOrMore(of: ",")))
853857

854-
// TODO: We should emit a diagnostic for this.
855-
parseTest("x{3, 5}", concat("x", "{", "3", ",", " ", "5", "}"))
856-
parseTest("{3, 5}", concat("{", "3", ",", " ", "5", "}"))
857-
parseTest("{3 }", concat("{", "3", " ", "}"))
858-
859858
// MARK: Groups
860859

861860
// Named captures
@@ -1771,10 +1770,10 @@ extension RegexTests {
17711770

17721771
// PCRE states that whitespace won't be ignored within a range.
17731772
// http://pcre.org/current/doc/html/pcre2api.html#SEC20
1774-
// TODO: We ought to warn on this, and produce a range anyway.
1773+
// We however do ignore it.
17751774
parseTest("(?x)a{1, 3}", concat(
17761775
changeMatchingOptions(matchingOptions(adding: .extended)),
1777-
"a", "{", "1", ",", "3", "}"
1776+
quantRange(1 ... 3, of: "a")
17781777
))
17791778

17801779
// Test that we cover the list of whitespace characters covered by PCRE.
@@ -2695,6 +2694,9 @@ extension RegexTests {
26952694
diagnosticTest("{1,3}", .quantifierRequiresOperand("{1,3}"))
26962695
diagnosticTest("a{3,2}", .invalidQuantifierRange(3, 2))
26972696

2697+
diagnosticTest("{3, 5}", .quantifierRequiresOperand("{3, 5}"))
2698+
diagnosticTest("{3 }", .quantifierRequiresOperand("{3 }"))
2699+
26982700
// These are not quantifiable.
26992701
diagnosticTest(#"\b?"#, .notQuantifiable)
27002702
diagnosticTest(#"\B*"#, .notQuantifiable)

0 commit comments

Comments
 (0)