Skip to content

Commit 3146746

Browse files
committed
Limit CC ranges to single-scalar bounds
Character class ranges don't work well with multi-scalar inputs, in either the range or the matched character. This change limits range endpoints to single-scalar characters and matches only characters that are themselves a single scalar.

Fixes issue swiftlang#407, which now displays this behavior:

```
try /[1-2]/.wholeMatch(in: "1️⃣") // nil
try /[12]/.wholeMatch(in: "1️⃣") // nil
try /(?U)[\d]/.wholeMatch(in: "1️⃣") // nil
```
1 parent 7969272 commit 3146746

File tree

6 files changed

+19
-10
lines changed

6 files changed

+19
-10
lines changed

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -771,7 +771,7 @@ extension AST.Atom {
771771
/// range.
772772
public var isValidCharacterClassRangeBound: Bool {
773773
// If we have a literal character value for this, it can be used as a bound.
774-
if literalCharacterValue != nil { return true }
774+
if literalCharacterValue?.hasExactlyOneScalar == true { return true }
775775
switch kind {
776776
// \cx, \C-x, \M-x, \M-\C-x, \N{...}
777777
case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter:

Sources/_RegexParser/Utility/Misc.swift

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,13 @@ extension Substring {
1919
var string: String { String(self) }
2020
}
2121

22+
extension Character {
23+
/// Whether this character is made up of exactly one Unicode scalar value.
24+
public var hasExactlyOneScalar: Bool {
25+
unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex
26+
}
27+
}
28+
2229
extension CustomStringConvertible {
2330
@_alwaysEmitIntoClient
2431
public var halfWidthCornerQuoted: String {

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -248,10 +248,10 @@ extension DSLTree.CustomCharacterClass.Member {
248248
return c
249249
case let .range(low, high):
250250
// TODO:
251-
guard let lhs = low.literalCharacterValue else {
251+
guard let lhs = low.literalCharacterValue, lhs.hasExactlyOneScalar else {
252252
throw Unsupported("\(low) in range")
253253
}
254-
guard let rhs = high.literalCharacterValue else {
254+
guard let rhs = high.literalCharacterValue, rhs.hasExactlyOneScalar else {
255255
throw Unsupported("\(high) in range")
256256
}
257257

@@ -262,6 +262,7 @@ extension DSLTree.CustomCharacterClass.Member {
262262
return { input, bounds in
263263
// TODO: check for out of bounds?
264264
let curIdx = bounds.lowerBound
265+
guard input[curIdx].hasExactlyOneScalar else { return nil }
265266
if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) {
266267
// TODO: semantic level
267268
return input.index(after: curIdx)
@@ -273,6 +274,7 @@ extension DSLTree.CustomCharacterClass.Member {
273274
return { input, bounds in
274275
// TODO: check for out of bounds?
275276
let curIdx = bounds.lowerBound
277+
guard input[curIdx].hasExactlyOneScalar else { return nil }
276278
if (lhs...rhs).contains(input[curIdx]) {
277279
// TODO: semantic level
278280
return input.index(after: curIdx)

Sources/_StringProcessing/Unicode/CharacterProps.swift

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,3 @@
1111

1212

1313
// TODO
14-
15-
extension Character {
16-
/// Whether this character is made up of exactly one Unicode scalar value.
17-
var hasExactlyOneScalar: Bool {
18-
unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex
19-
}
20-
}

Sources/_StringProcessing/_CharacterClassModel.swift

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,11 @@ public struct _CharacterClassModel: Hashable {
103103
return c == character
104104
}
105105
case .range(let range):
106+
// Ranges can be formed with single-scalar characters, and can only
107+
// match as such.
108+
// FIXME: Convert to canonical composed version before testing?
109+
guard character.hasExactlyOneScalar else { return false }
110+
106111
if options.isCaseInsensitive {
107112
let newLower = range.lowerBound.lowercased()
108113
let newUpper = range.upperBound.lowercased()

Tests/RegexTests/ParseTests.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2534,6 +2534,8 @@ extension RegexTests {
25342534
diagnosticTest("[[:=:]]", .emptyProperty)
25352535

25362536
diagnosticTest(#"|([\d-c])?"#, .invalidCharacterClassRangeOperand)
2537+
diagnosticTest(#"|([🇦🇫-🇿🇼])?"#, .invalidCharacterClassRangeOperand)
2538+
diagnosticTest(#"|([👨‍👩‍👦-👩‍👩‍👧‍👧])?"#, .invalidCharacterClassRangeOperand)
25372539

25382540
diagnosticTest(#"[_-A]"#, .invalidCharacterRange(from: "_", to: "A"))
25392541
diagnosticTest(#"(?i)[_-A]"#, .invalidCharacterRange(from: "_", to: "A"))

0 commit comments

Comments
 (0)