Skip to content

Commit d3bd6ad

Browse files
committed
Error on unknown escape sequences
Throw an error for unknown ASCII letter (a-z, A-Z) escape sequences, as well as for escape sequences of non-ASCII characters that are not whitespace.
1 parent d2ff78f commit d3bd6ad

File tree

3 files changed

+45
-11
lines changed

3 files changed

+45
-11
lines changed

Sources/_RegexParser/Regex/Parse/Diagnostics.swift

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -39,6 +39,7 @@ enum ParseError: Error, Hashable {
3939

4040
case expectedNonEmptyContents
4141
case expectedEscape
42+
case invalidEscape(Character)
4243

4344
case cannotReferToWholePattern
4445

@@ -107,6 +108,8 @@ extension ParseError: CustomStringConvertible {
107108
return "expected non-empty contents"
108109
case .expectedEscape:
109110
return "expected escape sequence"
111+
case .invalidEscape(let c):
112+
return "invalid escape sequence '\\\(c)'"
110113
case .cannotReferToWholePattern:
111114
return "cannot refer to whole pattern here"
112115
case .notQuantifiable:

Sources/_RegexParser/Regex/Parse/LexicalAnalysis.swift

Lines changed: 10 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1489,8 +1489,17 @@ extension Source {
14891489
return try .scalar(
14901490
src.expectUnicodeScalar(escapedCharacter: char).value)
14911491
default:
1492-
return .char(char)
1492+
break
14931493
}
1494+
1495+
// We only allow unknown escape sequences for non-letter ASCII, and
1496+
// non-ASCII whitespace.
1497+
guard (char.isASCII && !char.isLetter) ||
1498+
(!char.isASCII && char.isWhitespace)
1499+
else {
1500+
throw ParseError.invalidEscape(char)
1501+
}
1502+
return .char(char)
14941503
}
14951504
}
14961505

Tests/RegexTests/ParseTests.swift

Lines changed: 32 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -544,9 +544,8 @@ extension RegexTests {
544544
#"a\Q \Q \\.\Eb"#,
545545
concat("a", quote(#" \Q \\."#), "b"))
546546

547-
// These follow the PCRE behavior.
547+
// This follows the PCRE behavior.
548548
parseTest(#"\Q\\E"#, quote("\\"))
549-
parseTest(#"\E"#, "E")
550549

551550
parseTest(#"a" ."b"#, concat("a", quote(" ."), "b"),
552551
syntax: .experimental)
@@ -566,6 +565,16 @@ extension RegexTests {
566565

567566
parseTest(#"["-"]"#, charClass(range_m("\"", "\"")))
568567

568+
// MARK: Escapes
569+
570+
// Not metachars, but we allow their escape as ASCII.
571+
parseTest(#"\<"#, "<")
572+
parseTest(#"\ "#, " ")
573+
parseTest(#"\\"#, "\\")
574+
575+
// Escaped U+3000 IDEOGRAPHIC SPACE.
576+
parseTest(#"\\#u{3000}"#, "\u{3000}")
577+
569578
// MARK: Comments
570579

571580
parseTest(
@@ -989,13 +998,6 @@ extension RegexTests {
989998
// Backreferences are not valid in custom character classes.
990999
parseTest(#"[\8]"#, charClass("8"))
9911000
parseTest(#"[\9]"#, charClass("9"))
992-
parseTest(#"[\g]"#, charClass("g"))
993-
parseTest(#"[\g+30]"#, charClass("g", "+", "3", "0"))
994-
parseTest(#"[\g{1}]"#, charClass("g", "{", "1", "}"))
995-
parseTest(#"[\k'a']"#, charClass("k", "'", "a", "'"))
996-
997-
parseTest(#"\g"#, atom(.char("g")))
998-
parseTest(#"\k"#, atom(.char("k")))
9991001

10001002
// MARK: Character names.
10011003

@@ -1526,7 +1528,7 @@ extension RegexTests {
15261528
parseWithDelimitersTest("re'x*'", zeroOrMore(of: "x"))
15271529

15281530
parseWithDelimitersTest(#"re'🔥🇩🇰'"#, concat("🔥", "🇩🇰"))
1529-
parseWithDelimitersTest(#"re'\🔥✅'"#, concat("🔥", "✅"))
1531+
parseWithDelimitersTest(#"re'🔥✅'"#, concat("🔥", "✅"))
15301532

15311533
// Printable ASCII characters.
15321534
delimiterLexingTest(##"re' !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'"##)
@@ -1875,6 +1877,26 @@ extension RegexTests {
18751877

18761878
diagnosticTest("\\", .expectedEscape)
18771879

1880+
// TODO: Custom diagnostic for expected backref
1881+
diagnosticTest(#"\g"#, .invalidEscape("g"))
1882+
diagnosticTest(#"\k"#, .invalidEscape("k"))
1883+
1884+
// TODO: Custom diagnostic for backref in custom char class
1885+
diagnosticTest(#"[\g]"#, .invalidEscape("g"))
1886+
diagnosticTest(#"[\g+30]"#, .invalidEscape("g"))
1887+
diagnosticTest(#"[\g{1}]"#, .invalidEscape("g"))
1888+
diagnosticTest(#"[\k'a']"#, .invalidEscape("k"))
1889+
1890+
// TODO: Custom diagnostic for missing '\Q'
1891+
diagnosticTest(#"\E"#, .invalidEscape("E"))
1892+
1893+
// Non-ASCII non-whitespace cases.
1894+
diagnosticTest(#"\🔥"#, .invalidEscape("🔥"))
1895+
diagnosticTest(#"\🇩🇰"#, .invalidEscape("🇩🇰"))
1896+
diagnosticTest(#"\e\#u{301}"#, .invalidEscape("e\u{301}"))
1897+
diagnosticTest(#"\\#u{E9}"#, .invalidEscape("é"))
1898+
diagnosticTest(#"\˂"#, .invalidEscape("˂"))
1899+
18781900
// MARK: Text Segment options
18791901

18801902
diagnosticTest("(?-y{g})", .cannotRemoveTextSegmentOptions)

0 commit comments

Comments
 (0)