Skip to content

Commit 29df207

Browse files
committed
Allow coalescing through trivia
While here, I noticed that `lexQuantifier` could silently consume trivia even when it failed to lex a quantifier. This commit fixes that as well by wrapping the whole lexing attempt in `tryEating`.
1 parent 6d7e168 commit 29df207

File tree

5 files changed

+104
-22
lines changed

5 files changed

+104
-22
lines changed

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 24 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -480,35 +480,37 @@ extension Parser {
480480
///
481481
mutating func lexQuantifier(
482482
) -> (Located<Quant.Amount>, Located<Quant.Kind>, [AST.Trivia])? {
483-
var trivia: [AST.Trivia] = []
483+
tryEating { p in
484+
var trivia: [AST.Trivia] = []
484485

485-
if let t = lexNonSemanticWhitespace() { trivia.append(t) }
486+
if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }
486487

487-
let amt: Located<Quant.Amount>? = recordLoc { p in
488-
if p.tryEat("*") { return .zeroOrMore }
489-
if p.tryEat("+") { return .oneOrMore }
490-
if p.tryEat("?") { return .zeroOrOne }
488+
let amt: Located<Quant.Amount>? = p.recordLoc { p in
489+
if p.tryEat("*") { return .zeroOrMore }
490+
if p.tryEat("+") { return .oneOrMore }
491+
if p.tryEat("?") { return .zeroOrOne }
491492

492-
return p.tryEating { p in
493-
guard p.tryEat("{"),
494-
let range = p.lexRange(trivia: &trivia),
495-
p.tryEat("}")
496-
else { return nil }
497-
return range.value
493+
return p.tryEating { p in
494+
guard p.tryEat("{"),
495+
let range = p.lexRange(trivia: &trivia),
496+
p.tryEat("}")
497+
else { return nil }
498+
return range.value
499+
}
498500
}
499-
}
500-
guard let amt = amt else { return nil }
501+
guard let amt = amt else { return nil }
501502

502-
// PCRE allows non-semantic whitespace here in extended syntax mode.
503-
if let t = lexNonSemanticWhitespace() { trivia.append(t) }
503+
// PCRE allows non-semantic whitespace here in extended syntax mode.
504+
if let t = p.lexNonSemanticWhitespace() { trivia.append(t) }
504505

505-
let kind: Located<Quant.Kind> = recordLoc { p in
506-
if p.tryEat("?") { return .reluctant }
507-
if p.tryEat("+") { return .possessive }
508-
return .eager
509-
}
506+
let kind: Located<Quant.Kind> = p.recordLoc { p in
507+
if p.tryEat("?") { return .reluctant }
508+
if p.tryEat("+") { return .possessive }
509+
return .eager
510+
}
510511

511-
return (amt, kind, trivia)
512+
return (amt, kind, trivia)
513+
}
512514
}
513515

514516
/// Try to consume a range, returning `nil` if unsuccessful.

Sources/_StringProcessing/ByteCodeGen.swift

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -788,6 +788,10 @@ fileprivate extension Compiler.ByteCodeGen {
788788
current.append(lhs)
789789
rangeOperands.append(String(rhs))
790790
return true
791+
case .trivia:
792+
// Trivia can be completely ignored if we've already coalesced
793+
// something.
794+
return !current.isEmpty
791795
default:
792796
return false
793797
}
@@ -903,6 +907,10 @@ fileprivate extension Compiler.ByteCodeGen {
903907
case .quotedLiteral(let q):
904908
str += q
905909
return true
910+
case .trivia:
911+
// Trivia can be completely ignored if we've already coalesced
912+
// something.
913+
return !str.isEmpty
906914
default:
907915
return false
908916
}

Sources/_StringProcessing/PrintAsPattern.swift

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,10 @@ extension PrettyPrinter {
304304
case .quotedLiteral(let q):
305305
literal.append(q)
306306
return true
307+
case .trivia:
308+
// Trivia can be completely ignored if we've already coalesced
309+
// something.
310+
return !literal.isEmpty
307311
default:
308312
return false
309313
}

Tests/RegexTests/MatchTests.swift

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,23 @@ extension RegexTests {
345345
input: "e\u{301}0e\u{302}",
346346
match: "e\u{301}0e\u{302}"
347347
)
348+
firstMatchTest(
349+
#"(?x) e \u{35C} \u{315}(?#hello)\u{301}"#,
350+
input: "e\u{301}\u{315}\u{35C}",
351+
match: "e\u{301}\u{315}\u{35C}"
352+
)
353+
firstMatchTest(
354+
#"(?x) e \u{35C} \u{315 301}"#,
355+
input: "e\u{301}\u{315}\u{35C}",
356+
match: "e\u{301}\u{315}\u{35C}"
357+
)
358+
359+
// We don't coalesce across groups.
360+
firstMatchTests(
361+
#"e\u{301}(?:\u{315}\u{35C})?"#,
362+
("e\u{301}", "e\u{301}"),
363+
("e\u{301}\u{315}\u{35C}", nil)
364+
)
348365

349366
// Escape sequences that represent scalar values.
350367
firstMatchTest(#"\a[\b]\e\f\n\r\t"#,
@@ -790,6 +807,30 @@ extension RegexTests {
790807
("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"),
791808
("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}")
792809
)
810+
firstMatchTests(
811+
#"(?x) [ e \u{315} \u{301} \u{35C} ]"#,
812+
("e", nil),
813+
("e\u{315}", nil),
814+
("e\u{301}", nil),
815+
("e\u{315}\u{301}\u{35C}", "e\u{315}\u{301}\u{35C}"),
816+
("e\u{301}\u{315}\u{35C}", "e\u{301}\u{315}\u{35C}"),
817+
("e\u{35C}\u{301}\u{315}", "e\u{35C}\u{301}\u{315}")
818+
)
819+
820+
// We don't coalesce across character classes.
821+
firstMatchTests(
822+
#"e[\u{315}\u{301}\u{35C}]"#,
823+
("e", nil),
824+
("e\u{315}", nil),
825+
("e\u{315}\u{301}", nil),
826+
("e\u{301}\u{315}\u{35C}", nil)
827+
)
828+
firstMatchTests(
829+
#"[e[\u{301}]]"#,
830+
("e", "e"),
831+
("\u{301}", "\u{301}"),
832+
("e\u{301}", nil)
833+
)
793834

794835
firstMatchTests(
795836
#"[a-z1\u{E9}-\u{302}\u{E1}3-59]"#,
@@ -948,6 +989,16 @@ extension RegexTests {
948989
("e\u{302}", nil),
949990
("e\u{303}", "e\u{303}"))
950991

992+
firstMatchTests(
993+
#"(?x) [ e \u{301} [ e \u{303} ] -- [ [ e \u{301} ] e \u{302} ] ]"#,
994+
("e", nil),
995+
("\u{301}", nil),
996+
("\u{302}", nil),
997+
("\u{303}", nil),
998+
("e\u{301}", nil),
999+
("e\u{302}", nil),
1000+
("e\u{303}", "e\u{303}"))
1001+
9511002
firstMatchTest("[-]", input: "123-abcxyz", match: "-")
9521003

9531004
// These are metacharacters in certain contexts, but normal characters
@@ -2118,6 +2169,11 @@ extension RegexTests {
21182169
#"\u{65 301}"#,
21192170
(eComposed, true),
21202171
(eDecomposed, true))
2172+
2173+
matchTest(
2174+
#"(?x) \u{65} \u{301}"#,
2175+
(eComposed, true),
2176+
(eDecomposed, true))
21212177
}
21222178

21232179
func testCanonicalEquivalenceCharacterClass() throws {

Tests/RegexTests/RenderDSLTests.swift

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,18 @@ extension RenderDSLTests {
146146
}
147147
"""#)
148148

149+
try testConversion(#"(?x) a \u{301}"#, #"""
150+
Regex {
151+
"a\u{301}"
152+
}
153+
"""#)
154+
155+
try testConversion(#"(?x) [ a b c \u{301} ] "#, #"""
156+
Regex {
157+
One(.anyOf("abc\u{301}"))
158+
}
159+
"""#)
160+
149161
try testConversion(#"👨\u{200D}👨\u{200D}👧\u{200D}👦"#, #"""
150162
Regex {
151163
"👨\u{200D}👨\u{200D}👧\u{200D}👦"

0 commit comments

Comments
 (0)