Skip to content

Commit f8e5f28

Browse files
authored
Merge pull request #38922 from Azoy/native-normalization
[stdlib] Implement native normalization for String
2 parents 9233975 + 014e822 commit f8e5f28

32 files changed

+48416
-1480
lines changed

stdlib/public/SwiftShims/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ set(sources
1919
System.h
2020
Target.h
2121
ThreadLocalStorage.h
22+
UnicodeData.h
2223
UnicodeShims.h
2324
Visibility.h
2425
_SwiftConcurrency.h
Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
#ifndef SWIFT_STDLIB_SHIMS_UNICODEDATA_H
14+
#define SWIFT_STDLIB_SHIMS_UNICODEDATA_H
15+
16+
#include "SwiftStdint.h"
17+
#include "Visibility.h"
18+
19+
#ifdef __cplusplus
20+
extern "C" {
21+
#endif
22+
23+
// Returns a 16-bit value for `scalar`.
// NOTE(review): presumably the scalar's packed normalization data (judging by
// the name); the bit layout is not visible in this header -- confirm.
SWIFT_RUNTIME_STDLIB_INTERNAL
24+
__swift_uint16_t _swift_stdlib_getNormData(__swift_uint32_t scalar);
25+
26+
// Read-only byte table.
// NOTE(review): presumably the NFD decomposition data (judging by the name);
// the entry encoding is not shown here -- confirm.
SWIFT_RUNTIME_STDLIB_INTERNAL
27+
const __swift_uint8_t * const _swift_stdlib_nfd_decompositions;
28+
29+
// Returns a 32-bit entry for `scalar`.
// NOTE(review): presumably describes where the scalar's decomposition lives in
// the table above -- confirm the encoding against the implementation.
SWIFT_RUNTIME_STDLIB_INTERNAL
30+
__swift_uint32_t _swift_stdlib_getDecompositionEntry(__swift_uint32_t scalar);
31+
32+
// Looks up the composition of scalar `x` followed by scalar `y`. The Swift
// caller in NFC.swift treats a result equal to the maximum 32-bit value as
// "no composition exists".
SWIFT_RUNTIME_STDLIB_INTERNAL
33+
__swift_uint32_t _swift_stdlib_getComposition(__swift_uint32_t x,
34+
__swift_uint32_t y);
35+
36+
// Computes an index for `scalar` from the supplied per-level tables.
// NOTE(review): "Mph" presumably stands for minimal perfect hash, with `levels`
// giving the number of entries in `keys`, `ranks` and `sizes` -- confirm.
SWIFT_RUNTIME_STDLIB_INTERNAL
37+
__swift_intptr_t _swift_stdlib_getMphIdx(__swift_uint32_t scalar,
38+
__swift_intptr_t levels,
39+
const __swift_uint64_t * const *keys,
40+
const __swift_uint16_t * const *ranks,
41+
const __swift_uint16_t * const sizes);
42+
43+
#ifdef __cplusplus
44+
} // extern "C"
45+
#endif
46+
47+
#endif // SWIFT_STDLIB_SHIMS_UNICODEDATA_H

stdlib/public/SwiftShims/UnicodeShims.h

Lines changed: 0 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -432,7 +432,6 @@ typedef enum __swift_stdlib_UNumericType {
432432

433433
typedef struct __swift_stdlib_UBreakIterator __swift_stdlib_UBreakIterator;
434434
typedef struct __swift_stdlib_UText __swift_stdlib_UText;
435-
typedef struct __swift_stdlib_UNormalizer2 __swift_stdlib_UNormalizer2;
436435
typedef __swift_int8_t __swift_stdlib_UBool;
437436
typedef __swift_int32_t __swift_stdlib_UChar32;
438437
#if defined(__APPLE__)
@@ -489,27 +488,6 @@ SWIFT_RUNTIME_STDLIB_API
489488
__swift_int32_t __swift_stdlib_ubrk_following(__swift_stdlib_UBreakIterator *bi,
490489
__swift_int32_t offset);
491490

492-
SWIFT_RUNTIME_STDLIB_API
493-
__swift_stdlib_UBool
494-
__swift_stdlib_unorm2_hasBoundaryBefore(const __swift_stdlib_UNormalizer2 *,
495-
__swift_stdlib_UChar32);
496-
497-
SWIFT_RUNTIME_STDLIB_API
498-
const __swift_stdlib_UNormalizer2 *
499-
__swift_stdlib_unorm2_getNFCInstance(__swift_stdlib_UErrorCode *);
500-
501-
SWIFT_RUNTIME_STDLIB_API
502-
__swift_int32_t
503-
__swift_stdlib_unorm2_normalize(const __swift_stdlib_UNormalizer2 *,
504-
const __swift_stdlib_UChar *, __swift_int32_t,
505-
__swift_stdlib_UChar *, __swift_int32_t,
506-
__swift_stdlib_UErrorCode *);
507-
508-
SWIFT_RUNTIME_STDLIB_API
509-
__swift_int32_t __swift_stdlib_unorm2_spanQuickCheckYes(
510-
const __swift_stdlib_UNormalizer2 *, const __swift_stdlib_UChar *,
511-
__swift_int32_t, __swift_stdlib_UErrorCode *);
512-
513491
SWIFT_RUNTIME_STDLIB_API
514492
__swift_stdlib_UBool
515493
__swift_stdlib_u_hasBinaryProperty(__swift_stdlib_UChar32,

stdlib/public/SwiftShims/module.modulemap

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ module SwiftShims {
1717
header "SwiftStdint.h"
1818
header "System.h"
1919
header "ThreadLocalStorage.h"
20+
header "UnicodeData.h"
2021
header "UnicodeShims.h"
2122
header "Visibility.h"
2223
export *

stdlib/public/core/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,8 @@ set(SWIFTLIB_ESSENTIAL
101101
NativeDictionary.swift
102102
NativeSet.swift
103103
NewtypeWrapper.swift
104+
NFC.swift
105+
NFD.swift
104106
ObjectIdentifier.swift
105107
Optional.swift
106108
OptionSet.swift
@@ -172,6 +174,7 @@ set(SWIFTLIB_ESSENTIAL
172174
ThreadLocalStorage.swift
173175
UIntBuffer.swift
174176
UnavailableStringAPIs.swift
177+
UnicodeData.swift
175178
UnicodeEncoding.swift
176179
UnicodeHelpers.swift
177180
UnicodeParser.swift

stdlib/public/core/GroupInfo.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@
99
"Character.swift",
1010
"CharacterProperties.swift",
1111
"ICU.swift",
12-
"NormalizedCodeUnitIterator.swift",
12+
"NFC.swift",
13+
"NFD.swift",
1314
"SmallString.swift",
1415
"StaticString.swift",
1516
"String.swift",
@@ -42,6 +43,7 @@
4243
"StringUnicodeScalarView.swift",
4344
"Substring.swift",
4445
"Unicode.swift",
46+
"UnicodeData.swift",
4547
"UnicodeEncoding.swift",
4648
"UnicodeHelpers.swift",
4749
"UnicodeParser.swift",

stdlib/public/core/NFC.swift

Lines changed: 223 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,223 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2021 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
import SwiftShims
14+
15+
extension Unicode {
16+
/// A wrapper around a string-like value. Its `Sequence` conformance in this
/// file iterates `base._nfd` and composes the scalars that come out of it.
internal struct _NFC<S: StringProtocol> {
17+
/// The wrapped string-like value.
let base: S
18+
}
19+
}
20+
21+
extension Unicode._NFC {
22+
/// Iteration state for `_NFC`: the pending composee, the scalars that could
/// not be composed with it, and the upstream iterator feeding both.
internal struct Iterator {
23+
// Scalars that did not compose with `composee`. `next()` drains this buffer
// before it pulls any further input from `iterator`.
var buffer = Unicode._NormDataBuffer()
24+
25+
// This is our starter that is currently being composed with other scalars
26+
// into new scalars. For example, "e\u{301}", here our first scalar is 'e',
27+
// which is a starter, thus we assign composee to this 'e' and move to the
28+
// next scalar. We attempt to compose our composee, 'e', with '\u{301}' and
29+
// find that there is a composition. Thus our new composee is now 'é' and
30+
// we continue to try and compose following scalars with this composee.
31+
var composee: Unicode.Scalar? = nil
32+
33+
// Upstream iterator. `next()` reads each element's `scalar` and `normData`
// from it and relies on the scalars arriving already in NFD order.
var iterator: Unicode._NFD<S>.Iterator
34+
}
35+
}
36+
37+
extension Unicode._NFC.Iterator: IteratorProtocol {
38+
/// Tries to merge the scalar `y` into the scalar `x` that precedes it.
///
/// - Parameters:
///   - x: The left-hand scalar (the current composee).
///   - y: The right-hand scalar that may combine with `x`.
/// - Returns: The single scalar that `x` followed by `y` composes to, or `nil`
///   when the pair has no composition.
internal func compose(
  _ x: Unicode.Scalar,
  and y: Unicode.Scalar
) -> Unicode.Scalar? {
  // Fast path: ASCII and some Latin scalars never compose when they appear on
  // the right-hand side, so anything below 0x300 is rejected immediately.
  let rhsBelowFastPathLimit = y.value < 0x300
  if _fastPath(rhsBelowFastPathLimit) {
    return nil
  }

  // Hangul pairs are composed arithmetically rather than through the table.
  if let syllable = composeHangul(x, and: y) {
    return syllable
  }

  // Everything else goes through the composition lookup, which reports
  // "no composition" with the maximum UInt32 value.
  let tableEntry = _swift_stdlib_getComposition(x.value, y.value)
  if tableEntry == .max {
    return nil
  }

  return Unicode.Scalar(_value: tableEntry)
}
61+
62+
/// Composes a pair of Hangul scalars arithmetically.
///
/// Two pairings are recognised:
/// - a leading consonant followed by a vowel, giving an LV syllable;
/// - an LV syllable followed by a tail consonant, giving an LVT syllable.
///
/// - Returns: The precomposed syllable, or `nil` when `x` and `y` do not form
///   one of the pairings above.
@inline(never)
internal func composeHangul(
  _ x: Unicode.Scalar,
  and y: Unicode.Scalar
) -> Unicode.Scalar? {
  // Leading consonants (L).
  let leadBase: UInt32 = 0x1100
  let leadCount: UInt32 = 19
  // Vowels (V).
  let vowelBase: UInt32 = 0x1161
  let vowelCount: UInt32 = 21
  // Tail consonants (T). The base itself is one below the first real tail.
  let tailBase: UInt32 = 0x11A7
  let tailCount: UInt32 = 28
  // Number of precomposed syllables sharing the same leading consonant (N).
  let syllablesPerLead: UInt32 = 588
  // Precomposed syllables (S).
  let syllableBase: UInt32 = 0xAC00
  let syllableCount: UInt32 = 11172

  let lhs = x.value
  let rhs = y.value

  // (L, V) -> LV
  let leads = leadBase ..< leadBase &+ leadCount
  let vowels = vowelBase ..< vowelBase &+ vowelCount
  if leads.contains(lhs), vowels.contains(rhs) {
    let leadIndex = lhs &- leadBase
    let vowelIndex = rhs &- vowelBase
    let offset = leadIndex &* syllablesPerLead &+ vowelIndex &* tailCount
    return Unicode.Scalar(_value: syllableBase &+ offset)
  }

  // (LV, T) -> LVT. Only syllables without a tail (offset divisible by the
  // tail count) can take one.
  let syllables = syllableBase ..< syllableBase &+ syllableCount
  let tails = tailBase &+ 1 ..< tailBase &+ tailCount
  if syllables.contains(lhs), tails.contains(rhs),
     (lhs &- syllableBase) % tailCount == 0 {
    return Unicode.Scalar(_value: lhs &+ rhs &- tailBase)
  }

  return nil
}
100+
101+
/// Produces the next scalar of the composed output, or `nil` once the buffer,
/// the underlying iterator and the pending composee are all exhausted.
///
/// Scalars are pulled from `iterator`. A scalar with CCC == 0 becomes the
/// `composee`; each later scalar is either folded into the composee through
/// `compose(_:and:)` or appended to `buffer`, whose contents are emitted by
/// subsequent calls after the composee itself has been returned.
internal mutating func next() -> Unicode.Scalar? {
102+
// Empty out our buffer before attempting to compose anything with our new
103+
// composee.
104+
if let nextBuffered = buffer.next() {
105+
return nextBuffered.scalar
106+
}
107+
108+
while let current = iterator.next() {
109+
guard let currentComposee = composee else {
110+
// If we don't have a composee at this point, we're most likely looking
111+
// at the start of a string. If our class is 0, then attempt to compose
112+
// the following scalars with this one. Otherwise, it's a one-off scalar
113+
// that needs to be emitted.
114+
if current.normData.ccc == 0 {
115+
composee = current.scalar
116+
continue
117+
} else {
118+
return current.scalar
119+
}
120+
}
121+
122+
// If we have any scalars in the buffer, it means those scalars couldn't
123+
// compose with our composee to form a new scalar. However, scalars
124+
// following them may still compose with our composee, so take the last
125+
// scalar in the buffer and get its normalization data so that we can
126+
// perform the check underneath this one about whether this current scalar
127+
// is "blocked". We get the last scalar because the scalars we receive are
128+
// already NFD, so the last scalar in the buffer will have the highest
129+
// CCC value in this normalization segment.
130+
guard let lastBufferedNormData = buffer.last?.normData else {
131+
// If we do not have any scalars in our buffer yet, then this step is
132+
// trivial. Attempt to compose our current scalar with whatever composee
133+
// we're currently building up.
134+
135+
// If our right hand side scalar IS NFC_QC, then that means it can
136+
// never compose with any scalars previous to it. So, if our current
137+
// scalar is NFC_QC, then we have no composition.
138+
guard !current.normData.isNFCQC,
139+
let composed = compose(currentComposee, and: current.scalar) else {
140+
// We did not find a composition between the two. If our current class
141+
// is 0, then set that as the new composee and return whatever built
142+
// up scalar we have. Otherwise, add our current scalar to the buffer
143+
// for eventual removal!
144+
145+
if current.normData.ccc == 0 {
146+
composee = current.scalar
147+
return currentComposee
148+
}
149+
150+
buffer.append(current)
151+
continue
152+
}
153+
154+
// We found a composition! Record it as our new composee and repeat the
155+
// process.
156+
composee = composed
157+
continue
158+
}
159+
160+
// Check if our current scalar is not blocked from our current composee.
161+
// In this case blocked means there is some scalar whose class
162+
// (lastBufferedNormData.ccc) is either == 0 or >= current.normData.ccc.
163+
//
164+
// Example:
165+
//
166+
// "z\u{0335}\u{0327}\u{0324}\u{0301}"
167+
//
168+
// In this example, there are several combining marks following a 'z', but
169+
// none of them actually compose with the composee 'z'. However, the last
170+
// scalar U+0301 does actually compose. So this check makes sure that the
171+
// last scalar doesn't have any scalar in between it and the composee that
172+
// would otherwise "block" it from composing.
173+
guard lastBufferedNormData.ccc < current.normData.ccc else {
174+
// We had a scalar block it. That means our current scalar is either a
175+
// starter or has the same class (preserve ordering).
176+
177+
// Starters are the "start" of a new normalization segment. Set it as
178+
// the new composee and return our current composee. This will trigger
179+
// any other scalars in the buffer to be emitted before we handle
180+
// normalizing this new segment.
181+
if current.normData.ccc == 0 {
182+
composee = current.scalar
183+
return currentComposee
184+
}
185+
186+
_internalInvariant(current.normData.ccc == lastBufferedNormData.ccc)
187+
buffer.append(current)
188+
continue
189+
}
190+
191+
// There were no blockers! Attempt to compose the two! (Again, if our rhs
192+
// scalar IS NFC_QC, then it can never compose with anything previous to
193+
// it).
194+
guard !current.normData.isNFCQC,
195+
let composed = compose(currentComposee, and: current.scalar) else {
196+
// No composition found. Stick it at the end of the buffer with the rest
197+
// of non-composed scalars.
198+
199+
buffer.append(current)
200+
continue
201+
}
202+
203+
// They composed! Assign the composition as our new composee and iterate
204+
// to the next scalar.
205+
composee = composed
206+
}
207+
208+
// If we have a leftover composee, make sure to return it.
// NOTE(review): `_take()` presumably hands back the value and resets
// `composee` to nil so that later calls end the iteration -- confirm.
209+
return composee._take()
210+
}
211+
}
212+
213+
extension Unicode._NFC: Sequence {
  /// Creates an iterator that composes the scalars produced by `base._nfd`.
  internal func makeIterator() -> Iterator {
    let decomposed = base._nfd.makeIterator()
    return Iterator(iterator: decomposed)
  }
}
218+
219+
extension StringProtocol {
  /// A `Unicode._NFC` view wrapping this string.
  internal var _nfc: Unicode._NFC<Self> {
    get {
      return Unicode._NFC(base: self)
    }
  }
}

0 commit comments

Comments
 (0)