Skip to content

Commit b8a729c

Browse files
authored
Merge pull request #570 from hamishknight/is-this-nfc
2 parents 55e1571 + cd5cc37 commit b8a729c

File tree

11 files changed

+419
-96
lines changed

11 files changed

+419
-96
lines changed

Package.swift

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -75,15 +75,17 @@ let package = Package(
7575
name: "RegexBuilder",
7676
dependencies: ["_StringProcessing", "_RegexParser"],
7777
swiftSettings: publicStdlibSettings),
78+
.target(name: "TestSupport",
79+
swiftSettings: [availabilityDefinition]),
7880
.testTarget(
7981
name: "RegexTests",
80-
dependencies: ["_StringProcessing"],
82+
dependencies: ["_StringProcessing", "TestSupport"],
8183
swiftSettings: [
8284
.unsafeFlags(["-Xfrontend", "-disable-availability-checking"]),
8385
]),
8486
.testTarget(
8587
name: "RegexBuilderTests",
86-
dependencies: ["_StringProcessing", "RegexBuilder"],
88+
dependencies: ["_StringProcessing", "RegexBuilder", "TestSupport"],
8789
swiftSettings: [
8890
.unsafeFlags(["-Xfrontend", "-disable-availability-checking"])
8991
]),

Sources/TestSupport/TestSupport.swift

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
import XCTest
13+
14+
// We need to split this out of the test files, as it needs to be compiled
15+
// *without* `-disable-availability-checking` to ensure the #available check is
16+
// not compiled into a no-op.
17+
18+
#if os(Linux)
19+
public func XCTExpectFailure(
20+
_ message: String? = nil, body: () throws -> Void
21+
) rethrows {}
22+
#endif
23+
24+
/// Guards certain tests to make sure we have a new stdlib available.
25+
public func ensureNewStdlib(
26+
file: StaticString = #file, line: UInt = #line
27+
) -> Bool {
28+
guard #available(SwiftStdlib 5.7, *) else {
29+
XCTExpectFailure { XCTFail("Unsupported stdlib", file: file, line: line) }
30+
return false
31+
}
32+
return true
33+
}

Sources/_RegexParser/Regex/AST/Atom.swift

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -755,8 +755,10 @@ extension AST.Atom {
755755
/// Whether this atom is valid as the operand of a custom character class
756756
/// range.
757757
public var isValidCharacterClassRangeBound: Bool {
758-
// If we have a literal character value for this, it can be used as a bound.
759-
if literalCharacterValue != nil { return true }
758+
if let c = literalCharacterValue {
759+
// We only match character range bounds that are single scalar NFC.
760+
return c.hasExactlyOneScalar && c.isNFC
761+
}
760762
switch kind {
761763
// \cx, \C-x, \M-x, \M-\C-x, \N{...}
762764
case .keyboardControl, .keyboardMeta, .keyboardMetaControl, .namedCharacter:

Sources/_RegexParser/Utility/Misc.swift

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,21 @@ extension Substring {
1919
var string: String { String(self) }
2020
}
2121

22+
extension Character {
23+
/// Whether this character is made up of exactly one Unicode scalar value.
24+
public var hasExactlyOneScalar: Bool {
25+
let scalars = unicodeScalars
26+
return scalars.index(after: scalars.startIndex) == scalars.endIndex
27+
}
28+
29+
/// Whether the given character is in NFC form.
30+
internal var isNFC: Bool {
31+
if isASCII { return true }
32+
let str = String(self)
33+
return str._nfcCodeUnits.elementsEqual(str.utf8)
34+
}
35+
}
36+
2237
extension CustomStringConvertible {
2338
@_alwaysEmitIntoClient
2439
public var halfWidthCornerQuoted: String {

Sources/_StringProcessing/ConsumerInterface.swift

Lines changed: 46 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -361,38 +361,60 @@ extension DSLTree.CustomCharacterClass.Member {
361361
}
362362
return c
363363
case let .range(low, high):
364-
// TODO:
365-
guard let lhs = low.literalCharacterValue else {
364+
guard let lhs = low.literalCharacterValue?.singleScalar, lhs.isNFC else {
366365
throw Unsupported("\(low) in range")
367366
}
368-
guard let rhs = high.literalCharacterValue else {
367+
guard let rhs = high.literalCharacterValue?.singleScalar, rhs.isNFC else {
369368
throw Unsupported("\(high) in range")
370369
}
370+
guard lhs <= rhs else {
371+
throw Unsupported("Invalid range \(low)-\(high)")
372+
}
371373

372-
if opts.isCaseInsensitive {
373-
let lhsLower = lhs.lowercased()
374-
let rhsLower = rhs.lowercased()
375-
guard lhsLower <= rhsLower else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
376-
return { input, bounds in
377-
// TODO: check for out of bounds?
378-
let curIdx = bounds.lowerBound
379-
if (lhsLower...rhsLower).contains(input[curIdx].lowercased()) {
380-
// TODO: semantic level
381-
return input.index(after: curIdx)
382-
}
383-
return nil
374+
let isCaseInsensitive = opts.isCaseInsensitive
375+
let isCharacterSemantic = opts.semanticLevel == .graphemeCluster
376+
377+
return { input, bounds in
378+
let curIdx = bounds.lowerBound
379+
let nextIndex = isCharacterSemantic
380+
? input.index(after: curIdx)
381+
: input.unicodeScalars.index(after: curIdx)
382+
383+
// Under grapheme semantics, we compare based on single NFC scalars. If
384+
// such a character is not single scalar under NFC, the match fails. In
385+
// scalar semantics, we compare the exact scalar value to the NFC
386+
// bounds.
387+
let scalar = isCharacterSemantic ? input[curIdx].singleNFCScalar
388+
: input.unicodeScalars[curIdx]
389+
guard let scalar = scalar else { return nil }
390+
let scalarRange = lhs ... rhs
391+
if scalarRange.contains(scalar) {
392+
return nextIndex
384393
}
385-
} else {
386-
guard lhs <= rhs else { throw Unsupported("Invalid range \(lhs)-\(rhs)") }
387-
return { input, bounds in
388-
// TODO: check for out of bounds?
389-
let curIdx = bounds.lowerBound
390-
if (lhs...rhs).contains(input[curIdx]) {
391-
// TODO: semantic level
392-
return input.index(after: curIdx)
394+
395+
// Check for case insensitive matches.
396+
func matchesCased(
397+
_ cased: (UnicodeScalar.Properties) -> String
398+
) -> Bool {
399+
let casedStr = cased(scalar.properties)
400+
// In character semantic mode, we need to map to NFC. In scalar
401+
// semantics, we should have an exact scalar.
402+
let mapped = isCharacterSemantic ? casedStr.singleNFCScalar
403+
: casedStr.singleScalar
404+
guard let mapped = mapped else { return false }
405+
return scalarRange.contains(mapped)
406+
}
407+
if isCaseInsensitive {
408+
if scalar.properties.changesWhenLowercased,
409+
matchesCased(\.lowercaseMapping) {
410+
return nextIndex
411+
}
412+
if scalar.properties.changesWhenUppercased,
413+
matchesCased(\.uppercaseMapping) {
414+
return nextIndex
393415
}
394-
return nil
395416
}
417+
return nil
396418
}
397419

398420
case let .custom(ccc):

Sources/_StringProcessing/Unicode/CharacterProps.swift

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,3 @@
1111

1212

1313
// TODO
14-
15-
extension Character {
16-
/// Whether this character is made up of exactly one Unicode scalar value.
17-
var hasExactlyOneScalar: Bool {
18-
unicodeScalars.index(after: unicodeScalars.startIndex) == unicodeScalars.endIndex
19-
}
20-
}
Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2022 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
//
10+
//===----------------------------------------------------------------------===//
11+
12+
@_spi(_Unicode)
13+
import Swift
14+
15+
extension UnicodeScalar {
16+
/// Checks whether the scalar is in NFC form.
17+
var isNFC: Bool { Character(self).singleNFCScalar == self }
18+
}
19+
20+
extension Character {
21+
/// If the given character consists of a single NFC scalar, returns it. If
22+
/// there are multiple NFC scalars, returns `nil`.
23+
var singleNFCScalar: UnicodeScalar? {
24+
// SwiftStdlib is always >= 5.7 for a shipped StringProcessing.
25+
guard #available(SwiftStdlib 5.7, *) else { return nil }
26+
var nfcIter = String(self)._nfc.makeIterator()
27+
guard let scalar = nfcIter.next(), nfcIter.next() == nil else { return nil }
28+
return scalar
29+
}
30+
31+
/// If the given character contains a single scalar, returns it. If none or
32+
/// multiple scalars are present, returns `nil`.
33+
var singleScalar: UnicodeScalar? {
34+
hasExactlyOneScalar ? unicodeScalars.first! : nil
35+
}
36+
}
37+
38+
extension String {
39+
/// If the given string consists of a single NFC scalar, returns it. If none
40+
/// or multiple NFC scalars are present, returns `nil`.
41+
var singleNFCScalar: UnicodeScalar? {
42+
guard !isEmpty && index(after: startIndex) == endIndex else { return nil }
43+
return first!.singleNFCScalar
44+
}
45+
46+
/// If the given string contains a single scalar, returns it. If none or
47+
/// multiple scalars are present, returns `nil`.
48+
var singleScalar: UnicodeScalar? {
49+
let scalars = unicodeScalars
50+
guard !scalars.isEmpty &&
51+
scalars.index(after: scalars.startIndex) == scalars.endIndex
52+
else { return nil }
53+
return scalars.first!
54+
}
55+
}

Tests/RegexBuilderTests/RegexDSLTests.swift

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,7 @@
1212
import XCTest
1313
import _StringProcessing
1414
import RegexBuilder
15-
16-
#if os(Linux)
17-
func XCTExpectFailure(_ message: String? = nil, body: () throws -> Void) rethrows {}
18-
#endif
15+
import TestSupport
1916

2017
class RegexDSLTests: XCTestCase {
2118
func _testDSLCaptures<Content: RegexComponent, MatchType>(
@@ -77,6 +74,9 @@ class RegexDSLTests: XCTestCase {
7774
let asciiNewlines = "\u{A}\u{B}\u{C}\u{D}\r\n"
7875

7976
func testCharacterClasses() throws {
77+
// Must have new stdlib for character class ranges.
78+
guard ensureNewStdlib() else { return }
79+
8080
try _testDSLCaptures(
8181
("a c", ("a c", " ", "c")),
8282
matchType: (Substring, Substring, Substring).self, ==)
@@ -251,6 +251,9 @@ class RegexDSLTests: XCTestCase {
251251
}
252252

253253
func testCharacterClassOperations() throws {
254+
// Must have new stdlib for character class ranges.
255+
guard ensureNewStdlib() else { return }
256+
254257
try _testDSLCaptures(
255258
("bcdefn1a", "bcdefn1a"),
256259
("nbcdef1a", nil), // fails symmetric difference lookahead
@@ -594,6 +597,9 @@ class RegexDSLTests: XCTestCase {
594597
}
595598

596599
func testQuantificationBehavior() throws {
600+
// Must have new stdlib for character class ranges.
601+
guard ensureNewStdlib() else { return }
602+
597603
// Eager by default
598604
try _testDSLCaptures(
599605
("abc1def2", ("abc1def2", "2")),

0 commit comments

Comments
 (0)