Skip to content

Commit 6c442e5

Browse files
committed
Fix character class range matching
Previously we performed a lexicographic comparison with the bounds of a character class range. However this produced surprising results, and our implementation didn't properly handle case sensitivity. Update the logic to instead only allow single scalar NFC bounds. The input is then converted to NFC in grapheme semantic mode, and checked against the range. In scalar semantic mode, the input scalar is checked on its own. Additionally, fix the case sensitivity handling such that we check both the lowercase and uppercase version of the input against the range.
1 parent d9e26bb commit 6c442e5

File tree

9 files changed

+287
-74
lines changed

9 files changed

+287
-74
lines changed

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -755,8 +755,10 @@ extension AST.Atom {
755755
/// Whether this atom is valid as the operand of a custom character class
756756
/// range.
757757
public var isValidCharacterClassRangeBound: Bool {
758-
// If we have a literal character value for this, it can be used as a bound.
759-
if literalCharacterValue != nil { return true }
758+
if let c = literalCharacterValue {
759+
// We only match character range bounds that are single scalar NFC.
760+
return c.hasExactlyOneScalar && c.isNFC
761+
}
760762
switch kind {
761763
// \cx, \C-x, \M-x, \M-\C-x, \N{...}
762764
case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter:

Sources/_RegexParser/Utility/Misc.swift

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,21 @@ extension Substring {
1919
var string: String { String(self) }
2020
}
2121

22+
extension Character {
23+
/// Whether this character is made up of exactly one Unicode scalar value.
24+
public var hasExactlyOneScalar: Bool {
25+
let scalars = unicodeScalars
26+
return scalars.index(after: scalars.startIndex) == scalars.endIndex
27+
}
28+
29+
/// Whether the given character is in NFC form.
30+
internal var isNFC: Bool {
31+
if isASCII { return true }
32+
let str = String(self)
33+
return str._nfcCodeUnits.elementsEqual(str.utf8)
34+
}
35+
}
36+
2237
extension CustomStringConvertible {
2338
@_alwaysEmitIntoClient
2439
public var halfWidthCornerQuoted: String {

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 46 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -361,38 +361,60 @@ extension DSLTree.CustomCharacterClass.Member {
361361
}
362362
return c
363363
case let .range(low, high):
364-
// TODO:
365-
guard let lhs = low.literalCharacterValue else {
364+
guard let lhs = low.literalCharacterValue?.singleScalar, lhs.isNFC else {
366365
throw Unsupported("\(low) in range")
367366
}
368-
guard let rhs = high.literalCharacterValue else {
367+
guard let rhs = high.literalCharacterValue?.singleScalar, rhs.isNFC else {
369368
throw Unsupported("\(high) in range")
370369
}
370+
guard lhs <= rhs else {
371+
throw Unsupported("Invalid range \(low)-\(high)")
372+
}
371373

372-
if opts.isCaseInsensitive {
373-
let lhsLower = lhs.lowercased()
374-
let rhsLower = rhs.lowercased()
375-
guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
376-
return { input, bounds in
377-
// TODO: check for out of bounds?
378-
let curIdx = bounds.lowerBound
379-
if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) {
380-
// TODO: semantic level
381-
return input.index(after: curIdx)
382-
}
383-
return nil
374+
let isCaseInsensitive = opts.isCaseInsensitive
375+
let isCharacterSemantic = opts.semanticLevel == .graphemeCluster
376+
377+
return { input, bounds in
378+
let curIdx = bounds.lowerBound
379+
let nextIndex = isCharacterSemantic
380+
? input.index(after: curIdx)
381+
: input.unicodeScalars.index(after: curIdx)
382+
383+
// Under grapheme semantics, we compare based on single NFC scalars. If
384+
// such a character is not single scalar under NFC, the match fails. In
385+
// scalar semantics, we compare the exact scalar value to the NFC
386+
// bounds.
387+
let scalar = isCharacterSemantic ? input[curIdx].singleNFCScalar
388+
: input.unicodeScalars[curIdx]
389+
guard let scalar = scalar else { return nil }
390+
let scalarRange = lhs ... rhs
391+
if scalarRange.contains(scalar) {
392+
return nextIndex
384393
}
385-
} else {
386-
guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
387-
return { input, bounds in
388-
// TODO: check for out of bounds?
389-
let curIdx = bounds.lowerBound
390-
if (lhs...rhs).contains(input[curIdx]) {
391-
// TODO: semantic level
392-
return input.index(after: curIdx)
394+
395+
// Check for case insensitive matches.
396+
func matchesCased(
397+
_ cased: (UnicodeScalar.Properties) -> String
398+
) -> Bool {
399+
let casedStr = cased(scalar.properties)
400+
// In character semantic mode, we need to map to NFC. In scalar
401+
// semantics, we should have an exact scalar.
402+
let mapped = isCharacterSemantic ? casedStr.singleNFCScalar
403+
: casedStr.singleScalar
404+
guard let mapped = mapped else { return false }
405+
return scalarRange.contains(mapped)
406+
}
407+
if isCaseInsensitive {
408+
if scalar.properties.changesWhenLowercased,
409+
matchesCased(\.lowercaseMapping) {
410+
return nextIndex
411+
}
412+
if scalar.properties.changesWhenUppercased,
413+
matchesCased(\.uppercaseMapping) {
414+
return nextIndex
393415
}
394-
return nil
395416
}
417+
return nil
396418
}
397419

398420
case let .custom(ccc):

Sources/_StringProcessing/Unicode/CharacterProps.swift

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,3 @@
1111

1212

1313
// TODO
14-
15-
extension Character {
16-
/// Whether this character is made up of exactly one Unicode scalar value.
17-
var hasExactlyOneScalar: Bool {
18-
unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex
19-
}
20-
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
@_spi(_Unicode)
13+
import Swift
14+
15+
extension UnicodeScalar {
16+
/// Checks whether the scalar is in NFC form.
17+
var isNFC: Bool { Character(self).singleNFCScalar == self }
18+
}
19+
20+
extension Character {
21+
/// If the given character consists of a single NFC scalar, returns it. If
22+
/// there are multiple NFC scalars, returns `nil`.
23+
var singleNFCScalar: UnicodeScalar? {
24+
// SwiftStdlib is always >= 5.7 for a shipped StringProcessing.
25+
guard #available(SwiftStdlib 5.7, *) else { return nil }
26+
var nfcIter = String(self)._nfc.makeIterator()
27+
guard let scalar = nfcIter.next(), nfcIter.next() == nil else { return nil }
28+
return scalar
29+
}
30+
31+
/// If the given character contains a single scalar, returns it. If none or
32+
/// multiple scalars are present, returns `nil`.
33+
var singleScalar: UnicodeScalar? {
34+
hasExactlyOneScalar ? unicodeScalars.first! : nil
35+
}
36+
}
37+
38+
extension String {
39+
/// If the given string consists of a single NFC scalar, returns it. If none
40+
/// or multiple NFC scalars are present, returns `nil`.
41+
var singleNFCScalar: UnicodeScalar? {
42+
guard !isEmpty && index(after: startIndex) == endIndex else { return nil }
43+
return first!.singleNFCScalar
44+
}
45+
46+
/// If the given string contains a single scalar, returns it. If none or
47+
/// multiple scalars are present, returns `nil`.
48+
var singleScalar: UnicodeScalar? {
49+
let scalars = unicodeScalars
50+
guard !scalars.isEmpty &&
51+
scalars.index(after: scalars.startIndex) == scalars.endIndex
52+
else { return nil }
53+
return scalars.first!
54+
}
55+
}

Tests/RegexBuilderTests/RegexDSLTests.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,9 @@ class RegexDSLTests: XCTestCase {
7474
let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n"
7575

7676
func testCharacterClasses() throws {
77+
// Must have new stdlib for character class ranges.
78+
guard ensureNewStdlib() else { return }
79+
7780
try _testDSLCaptures(
7881
("a c", ("a c", " ", "c")),
7982
matchType: (Substring, Substring, Substring).self, ==)
@@ -248,6 +251,9 @@ class RegexDSLTests: XCTestCase {
248251
}
249252

250253
func testCharacterClassOperations() throws {
254+
// Must have new stdlib for character class ranges.
255+
guard ensureNewStdlib() else { return }
256+
251257
try _testDSLCaptures(
252258
("bcdefn1a", "bcdefn1a"),
253259
("nbcdef1a", nil), // fails symmetric difference lookahead
@@ -591,6 +597,9 @@ class RegexDSLTests: XCTestCase {
591597
}
592598

593599
func testQuantificationBehavior() throws {
600+
// Must have new stdlib for character class ranges.
601+
guard ensureNewStdlib() else { return }
602+
594603
// Eager by default
595604
try _testDSLCaptures(
596605
("abc1def2", ("abc1def2", "2")),

0 commit comments

Comments
 (0)