Skip to content

Commit d9d02c1

Browse files
committed
Ban ] as literal first character of custom character class
PCRE, Oniguruma, and ICU allow `]` to appear as the first member of a custom character class and treat it as a literal, because empty character classes are invalid in those engines. However, this behavior isn't particularly intuitive, and it makes lexing heuristics harder to implement properly. Instead, reject such character classes as being empty, and require escaping if `]` is meant as the first character (e.g. write `[\]]` rather than `[]]`).
1 parent 87ea119 commit d9d02c1

File tree

3 files changed

+10
-32
lines changed

3 files changed

+10
-32
lines changed

Sources/_RegexParser/Regex/Parse/Parse.swift

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -465,16 +465,6 @@ extension Parser {
465465
var members: Array<Member> = []
466466
try parseCCCMembers(into: &members)
467467

468-
// If we didn't parse any semantic members, we can eat a ']' character, as
469-
// PCRE, Oniguruma, and ICU forbid empty character classes, and assume an
470-
// initial ']' is literal.
471-
if members.none(\.isSemantic) {
472-
if let loc = source.tryEatWithLoc("]") {
473-
members.append(.atom(.init(.char("]"), loc)))
474-
try parseCCCMembers(into: &members)
475-
}
476-
}
477-
478468
// If we have a binary set operator, parse it and the next members. Note
479469
// that this means we left associate for a chain of operators.
480470
// TODO: We may want to diagnose and require users to disambiguate, at least

Tests/RegexTests/MatchTests.swift

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1282,7 +1282,6 @@ extension RegexTests {
12821282
firstMatchTest(#"(?xx)[ \t]+"#, input: " \t \t", match: "\t")
12831283

12841284
firstMatchTest("(?xx)[ a && ab ]+", input: " aaba ", match: "aa")
1285-
firstMatchTest("(?xx)[ ] a ]+", input: " a]]a ] ", match: "a]]a")
12861285
}
12871286

12881287
func testASCIIClasses() {

Tests/RegexTests/ParseTests.swift

Lines changed: 10 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -528,23 +528,7 @@ extension RegexTests {
528528
))
529529

530530
parseTest("[-]", charClass("-"))
531-
532-
// Empty character classes are forbidden, therefore these are character
533-
// classes containing literal ']'.
534-
parseTest("[]]", charClass("]"))
535-
parseTest("[]a]", charClass("]", "a"))
536-
parseTest("(?x)[ ]]", concat(
537-
changeMatchingOptions(matchingOptions(adding: .extended)),
538-
charClass("]")
539-
))
540-
parseTest("(?x)[ ] ]", concat(
541-
changeMatchingOptions(matchingOptions(adding: .extended)),
542-
charClass("]")
543-
))
544-
parseTest("(?x)[ ] a ]", concat(
545-
changeMatchingOptions(matchingOptions(adding: .extended)),
546-
charClass("]", "a")
547-
))
531+
parseTest(#"[\]]"#, charClass("]"))
548532

549533
// These are metacharacters in certain contexts, but normal characters
550534
// otherwise.
@@ -2497,10 +2481,15 @@ extension RegexTests {
24972481

24982482
diagnosticTest("[a", .expected("]"))
24992483

2500-
// The first ']' of a custom character class is literal, so these are
2501-
// missing the closing bracket.
2502-
diagnosticTest("[]", .expected("]"))
2503-
diagnosticTest("(?x)[ ]", .expected("]"))
2484+
// Character classes may not be empty.
2485+
diagnosticTest("[]", .expectedCustomCharacterClassMembers)
2486+
diagnosticTest("[]]", .expectedCustomCharacterClassMembers)
2487+
diagnosticTest("[]a]", .expectedCustomCharacterClassMembers)
2488+
diagnosticTest("(?x)[ ]", .expectedCustomCharacterClassMembers)
2489+
diagnosticTest("(?x)[ ] ]", .expectedCustomCharacterClassMembers)
2490+
diagnosticTest("(?x)[ ] a ]", .expectedCustomCharacterClassMembers)
2491+
diagnosticTest("(?xx)[ ] a ]+", .expectedCustomCharacterClassMembers)
2492+
diagnosticTest("(?x)[ ]]", .expectedCustomCharacterClassMembers)
25042493

25052494
diagnosticTest("[&&]", .expectedCustomCharacterClassMembers)
25062495
diagnosticTest("[a&&]", .expectedCustomCharacterClassMembers)

0 commit comments

Comments
 (0)