Skip to content

Commit 390d9ee

Browse files
committed
Ban confusable multi-scalar ASCII characters
Ban multi-scalar characters that start with an ASCII scalar and are not letters, numbers, or `\r\n`. Such characters may be confused with metacharacters, so they should be spelled explicitly (e.g. with `\u{...}`).
1 parent b563ce4 commit 390d9ee

File tree

4 files changed

+42
-2
lines changed

4 files changed

+42
-2
lines changed

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ enum ParseError: Error, Hashable {
4242
case expectedNonEmptyContents
4343
case expectedEscape
4444
case invalidEscape(Character)
45+
case confusableCharacter(Character)
4546

4647
case cannotReferToWholePattern
4748

@@ -128,6 +129,8 @@ extension ParseError: CustomStringConvertible {
128129
return "expected escape sequence"
129130
case .invalidEscape(let c):
130131
return "invalid escape sequence '\\\(c)'"
132+
case .confusableCharacter(let c):
133+
return "'\(c)' is confusable for a metacharacter; use '\\u{...}' instead"
131134
case .cannotReferToWholePattern:
132135
return "cannot refer to whole pattern here"
133136
case .quantifierRequiresOperand(let q):

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1982,10 +1982,21 @@ extension Source {
19821982

19831983
case "]":
19841984
assert(!customCC, "parser should have prevented this")
1985-
fallthrough
1985+
break
19861986

1987-
default: return .char(char)
1987+
default:
1988+
// Reject non-letter non-number non-`\r\n` ASCII characters that have
1989+
// multiple scalars. These may be confusable for metacharacters, e.g.
1990+
// `[\u{301}]` wouldn't be interpreted as a custom character class due
1991+
// to the combining accent (assuming it is literal, not `\u{...}`).
1992+
let scalars = char.unicodeScalars
1993+
if scalars.count > 1 && scalars.first!.isASCII && char != "\r\n" &&
1994+
!char.isLetter && !char.isNumber {
1995+
throw ParseError.confusableCharacter(char)
1996+
}
1997+
break
19881998
}
1999+
return .char(char)
19892000
}
19902001
guard let kind = kind else { return nil }
19912002
return AST.Atom(kind.value, kind.location)

Tests/RegexTests/MatchTests.swift

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -218,6 +218,12 @@ extension RegexTests {
218218
firstMatchTest(
219219
#"abc\d"#, input: "xyzabc123", match: "abc1")
220220

221+
// MARK: Allowed combining characters
222+
223+
firstMatchTest("e\u{301}", input: "e\u{301}", match: "e\u{301}")
224+
firstMatchTest("1\u{358}", input: "1\u{358}", match: "1\u{358}")
225+
firstMatchTest(#"\ \#u{361}"#, input: " \u{361}", match: " \u{361}")
226+
221227
// MARK: Alternations
222228

223229
firstMatchTest(

Tests/RegexTests/ParseTests.swift

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,12 @@ extension RegexTests {
394394
#"abc\d"#,
395395
concat("a", "b", "c", escaped(.decimalDigit)))
396396

397+
// MARK: Allowed combining characters
398+
399+
parseTest("e\u{301}", "e\u{301}")
400+
parseTest("1\u{358}", "1\u{358}")
401+
parseTest(#"\ \#u{361}"#, " \u{361}")
402+
397403
// MARK: Alternations
398404

399405
parseTest(
@@ -476,6 +482,8 @@ extension RegexTests {
476482
parseTest(#"\u{ a }"#, scalar("\u{A}"))
477483
parseTest(#"\u{ a }\u{ B }"#, concat(scalar("\u{A}"), scalar("\u{B}")))
478484

485+
parseTest(#"[\u{301}]"#, charClass(scalar_m("\u{301}")))
486+
479487
// MARK: Scalar sequences
480488

481489
parseTest(#"\u{A bC}"#, scalarSeq("\u{A}", "\u{BC}"))
@@ -2543,6 +2551,18 @@ extension RegexTests {
25432551
diagnosticTest(#"\˂"#, .invalidEscape("˂"))
25442552
diagnosticTest(#"\d\#u{301}"#, .invalidEscape("d\u{301}"))
25452553

2554+
// MARK: Confusable characters
2555+
2556+
diagnosticTest("[\u{301}]", .confusableCharacter("[\u{301}"))
2557+
diagnosticTest("(\u{358})", .confusableCharacter("(\u{358}"))
2558+
diagnosticTest("{\u{35B}}", .confusableCharacter("{\u{35B}"))
2559+
diagnosticTest(#"\\#u{35C}"#, .confusableCharacter(#"\\#u{35C}"#))
2560+
diagnosticTest("^\u{35D}", .confusableCharacter("^\u{35D}"))
2561+
diagnosticTest("$\u{35E}", .confusableCharacter("$\u{35E}"))
2562+
diagnosticTest(".\u{35F}", .confusableCharacter(".\u{35F}"))
2563+
diagnosticTest("|\u{360}", .confusableCharacter("|\u{360}"))
2564+
diagnosticTest(" \u{361}", .confusableCharacter(" \u{361}"))
2565+
25462566
// MARK: Character properties
25472567

25482568
diagnosticTest(#"\p{Lx}"#, .unknownProperty(key: nil, value: "Lx"))

0 commit comments

Comments
 (0)