Skip to content

Commit b8756ae

Browse files
committed
Fix character class range matching
Previously we performed a lexicographic comparison with the bounds of a character class range. However this produced surprising results, and our implementation didn't properly handle case sensitivity. Update the logic to instead only allow single scalar NFC bounds. The input is then converted to NFC in grapheme semantic mode, and checked against the range. In scalar semantic mode, the input scalar is checked on its own. Additionally, fix the case sensitivity handling such that we check both the lowercase and uppercase version of the input against the range.
1 parent ba5b2ca commit b8756ae

File tree

9 files changed

+299
-75
lines changed

9 files changed

+299
-75
lines changed

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -816,8 +816,10 @@ extension AST.Atom {
816816
/// Whether this atom is valid as the operand of a custom character class
817817
/// range.
818818
public var isValidCharacterClassRangeBound: Bool {
819-
// If we have a literal character value for this, it can be used as a bound.
820-
if literalCharacterValue != nil { return true }
819+
if let c = literalCharacterValue {
820+
// We only match character range bounds that are single scalar NFC.
821+
return c.hasExactlyOneScalar && c.isNFC
822+
}
821823
switch kind {
822824
// \cx, \C-x, \M-x, \M-\C-x, \N{...}
823825
case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter:

Sources/_RegexParser/Utility/Misc.swift

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,21 @@ extension Substring {
1919
var string: String { String(self) }
2020
}
2121

22+
extension Character {
23+
/// Whether this character is made up of exactly one Unicode scalar value.
24+
public var hasExactlyOneScalar: Bool {
25+
let scalars = unicodeScalars
26+
return scalars.index(after: scalars.startIndex) == scalars.endIndex
27+
}
28+
29+
/// Whether the given character is in NFC form.
30+
internal var isNFC: Bool {
31+
if isASCII { return true }
32+
let str = String(self)
33+
return str._nfcCodeUnits.elementsEqual(str.utf8)
34+
}
35+
}
36+
2237
extension CustomStringConvertible {
2338
@_alwaysEmitIntoClient
2439
public var halfWidthCornerQuoted: String {

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 46 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -342,38 +342,60 @@ extension DSLTree.CustomCharacterClass.Member {
342342
}
343343
return c
344344
case let .range(low, high):
345-
// TODO:
346-
guard let lhs = low.literalCharacterValue else {
345+
guard let lhs = low.literalCharacterValue?.singleScalar, lhs.isNFC else {
347346
throw Unsupported("\(low) in range")
348347
}
349-
guard let rhs = high.literalCharacterValue else {
348+
guard let rhs = high.literalCharacterValue?.singleScalar, rhs.isNFC else {
350349
throw Unsupported("\(high) in range")
351350
}
351+
guard lhs <= rhs else {
352+
throw Unsupported("Invalid range \(low)-\(high)")
353+
}
352354

353-
if opts.isCaseInsensitive {
354-
let lhsLower = lhs.lowercased()
355-
let rhsLower = rhs.lowercased()
356-
guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
357-
return { input, bounds in
358-
// TODO: check for out of bounds?
359-
let curIdx = bounds.lowerBound
360-
if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) {
361-
// TODO: semantic level
362-
return input.index(after: curIdx)
363-
}
364-
return nil
355+
let isCaseInsensitive = opts.isCaseInsensitive
356+
let isCharacterSemantic = opts.semanticLevel == .graphemeCluster
357+
358+
return { input, bounds in
359+
let curIdx = bounds.lowerBound
360+
let nextIndex = isCharacterSemantic
361+
? input.index(after: curIdx)
362+
: input.unicodeScalars.index(after: curIdx)
363+
364+
// Under grapheme semantics, we compare based on single NFC scalars. If
365+
// such a character is not single scalar under NFC, the match fails. In
366+
// scalar semantics, we compare the exact scalar value to the NFC
367+
// bounds.
368+
let scalar = isCharacterSemantic ? input[curIdx].singleNFCScalar
369+
: input.unicodeScalars[curIdx]
370+
guard let scalar = scalar else { return nil }
371+
let scalarRange = lhs ... rhs
372+
if scalarRange.contains(scalar) {
373+
return nextIndex
365374
}
366-
} else {
367-
guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
368-
return { input, bounds in
369-
// TODO: check for out of bounds?
370-
let curIdx = bounds.lowerBound
371-
if (lhs...rhs).contains(input[curIdx]) {
372-
// TODO: semantic level
373-
return input.index(after: curIdx)
375+
376+
// Check for case insensitive matches.
377+
func matchesCased(
378+
_ cased: (UnicodeScalar.Properties) -> String
379+
) -> Bool {
380+
let casedStr = cased(scalar.properties)
381+
// In character semantic mode, we need to map to NFC. In scalar
382+
// semantics, we should have an exact scalar.
383+
let mapped = isCharacterSemantic ? casedStr.singleNFCScalar
384+
: casedStr.singleScalar
385+
guard let mapped = mapped else { return false }
386+
return scalarRange.contains(mapped)
387+
}
388+
if isCaseInsensitive {
389+
if scalar.properties.changesWhenLowercased,
390+
matchesCased(\.lowercaseMapping) {
391+
return nextIndex
392+
}
393+
if scalar.properties.changesWhenUppercased,
394+
matchesCased(\.uppercaseMapping) {
395+
return nextIndex
374396
}
375-
return nil
376397
}
398+
return nil
377399
}
378400

379401
case let .custom(ccc):

Sources/_StringProcessing/Unicode/CharacterProps.swift

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,3 @@
1111

1212

1313
// TODO
14-
15-
extension Character {
16-
/// Whether this character is made up of exactly one Unicode scalar value.
17-
var hasExactlyOneScalar: Bool {
18-
unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex
19-
}
20-
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
@_spi(_Unicode)
13+
import Swift
14+
15+
extension UnicodeScalar {
16+
/// Checks whether the scalar is in NFC form.
17+
var isNFC: Bool { Character(self).singleNFCScalar == self }
18+
}
19+
20+
extension Character {
21+
/// If the given character consists of a single NFC scalar, returns it. If
22+
/// there are multiple NFC scalars, returns `nil`.
23+
var singleNFCScalar: UnicodeScalar? {
24+
// SwiftStdlib is always >= 5.7 for a shipped StringProcessing.
25+
guard #available(SwiftStdlib 5.7, *) else { return nil }
26+
var nfcIter = String(self)._nfc.makeIterator()
27+
guard let scalar = nfcIter.next(), nfcIter.next() == nil else { return nil }
28+
return scalar
29+
}
30+
31+
/// If the given character contains a single scalar, returns it. If none or
32+
/// multiple scalars are present, returns `nil`.
33+
var singleScalar: UnicodeScalar? {
34+
hasExactlyOneScalar ? unicodeScalars.first! : nil
35+
}
36+
}
37+
38+
extension String {
39+
/// If the given string consists of a single NFC scalar, returns it. If none
40+
/// or multiple NFC scalars are present, returns `nil`.
41+
var singleNFCScalar: UnicodeScalar? {
42+
guard !isEmpty && index(after: startIndex) == endIndex else { return nil }
43+
return first!.singleNFCScalar
44+
}
45+
46+
/// If the given string contains a single scalar, returns it. If none or
47+
/// multiple scalars are present, returns `nil`.
48+
var singleScalar: UnicodeScalar? {
49+
let scalars = unicodeScalars
50+
guard !scalars.isEmpty &&
51+
scalars.index(after: scalars.startIndex) == scalars.endIndex
52+
else { return nil }
53+
return scalars.first!
54+
}
55+
}

Tests/RegexBuilderTests/RegexDSLTests.swift

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,9 @@ class RegexDSLTests: XCTestCase {
7171
}
7272

7373
func testCharacterClasses() throws {
74+
// Must have new stdlib for character class ranges.
75+
guard ensureNewStdlib() else { return }
76+
7477
try _testDSLCaptures(
7578
("a c", ("a c", " ", "c")),
7679
matchType: (Substring, Substring, Substring).self, ==)
@@ -114,6 +117,9 @@ class RegexDSLTests: XCTestCase {
114117
}
115118

116119
func testCharacterClassOperations() throws {
120+
// Must have new stdlib for character class ranges.
121+
guard ensureNewStdlib() else { return }
122+
117123
try _testDSLCaptures(
118124
("bcdefn1a", "bcdefn1a"),
119125
("nbcdef1a", nil), // fails symmetric difference lookahead
@@ -345,6 +351,9 @@ class RegexDSLTests: XCTestCase {
345351
}
346352

347353
func testQuantificationBehavior() throws {
354+
// Must have new stdlib for character class ranges.
355+
guard ensureNewStdlib() else { return }
356+
348357
// Eager by default
349358
try _testDSLCaptures(
350359
("abc1def2", ("abc1def2", "2")),

0 commit comments

Comments
 (0)