Skip to content

Commit 014e822

Browse files
committed
Address Michael's comments
fix infinite recursion bug NFC: Remove early ccc check remember that false is turned on
1 parent 98aaa15 commit 014e822

File tree

11 files changed

+1424
-1034
lines changed

11 files changed

+1424
-1034
lines changed

stdlib/public/core/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -174,6 +174,7 @@ set(SWIFTLIB_ESSENTIAL
174174
ThreadLocalStorage.swift
175175
UIntBuffer.swift
176176
UnavailableStringAPIs.swift
177+
UnicodeData.swift
177178
UnicodeEncoding.swift
178179
UnicodeHelpers.swift
179180
UnicodeParser.swift

stdlib/public/core/GroupInfo.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,6 @@
1111
"ICU.swift",
1212
"NFC.swift",
1313
"NFD.swift",
14-
"NormalizedCodeUnitIterator.swift",
1514
"SmallString.swift",
1615
"StaticString.swift",
1716
"String.swift",
@@ -44,6 +43,7 @@
4443
"StringUnicodeScalarView.swift",
4544
"Substring.swift",
4645
"Unicode.swift",
46+
"UnicodeData.swift",
4747
"UnicodeEncoding.swift",
4848
"UnicodeHelpers.swift",
4949
"UnicodeParser.swift",

stdlib/public/core/NFC.swift

Lines changed: 111 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -13,131 +13,153 @@
1313
import SwiftShims
1414

1515
extension Unicode {
16-
internal struct NFC<S: StringProtocol> {
16+
internal struct _NFC<S: StringProtocol> {
1717
let base: S
1818
}
1919
}
2020

21-
extension Unicode.NFC {
21+
extension Unicode._NFC {
2222
internal struct Iterator {
23-
var buffer: [(scalar: Unicode.Scalar, normData: UInt16)] = []
24-
23+
var buffer = Unicode._NormDataBuffer()
24+
25+
// This is our starter that is currently being composed with other scalars
26+
// into new scalars. For example, "e\u{301}", here our first scalar is 'e',
27+
// which is a starter, thus we assign composee to this 'e' and move to the
28+
// next scalar. We attempt to compose our composee, 'e', with '\u{301}' and
29+
// find that there is a composition. Thus our new composee is now 'é' and
30+
// we continue to try and compose following scalars with this composee.
2531
var composee: Unicode.Scalar? = nil
26-
27-
var hasBeenReversed = false
28-
29-
var iterator: Unicode.NFD<S>.Iterator
32+
33+
var iterator: Unicode._NFD<S>.Iterator
3034
}
3135
}
3236

33-
extension Unicode.NFC.Iterator: IteratorProtocol {
37+
extension Unicode._NFC.Iterator: IteratorProtocol {
3438
internal func compose(
3539
_ x: Unicode.Scalar,
3640
and y: Unicode.Scalar
3741
) -> Unicode.Scalar? {
3842
// Fast path: ASCII and some Latin scalars never compose when they're on
3943
// the rhs.
40-
if y.value < 0x300 {
44+
if _fastPath(y.value < 0x300) {
4145
return nil
4246
}
43-
47+
48+
if let hangul = composeHangul(x, and: y) {
49+
return hangul
50+
}
51+
52+
// Otherwise, lookup the composition.
53+
let composition = _swift_stdlib_getComposition(x.value, y.value)
54+
55+
guard composition != .max else {
56+
return nil
57+
}
58+
59+
return Unicode.Scalar(_value: composition)
60+
}
61+
62+
@inline(never)
63+
internal func composeHangul(
64+
_ x: Unicode.Scalar,
65+
and y: Unicode.Scalar
66+
) -> Unicode.Scalar? {
67+
// L = Hangul leading consonants
68+
let L: (base: UInt32, count: UInt32) = (base: 0x1100, count: 19)
69+
// V = Hangul vowels
70+
let V: (base: UInt32, count: UInt32) = (base: 0x1161, count: 21)
71+
// T = Hangul tail consonants
72+
let T: (base: UInt32, count: UInt32) = (base: 0x11A7, count: 28)
73+
// N = Number of precomposed Hangul syllables that start with the same
74+
// leading consonant. (There is no base for N).
75+
let N: (base: UInt32, count: UInt32) = (base: 0x0, count: 588)
76+
// S = Hangul precomposed syllables
77+
let S: (base: UInt32, count: UInt32) = (base: 0xAC00, count: 11172)
78+
4479
switch (x.value, y.value) {
4580
// Check for Hangul (L, V) -> LV compositions.
46-
case (0x1100 ... 0x1112, 0x1161 ... 0x1175):
47-
let lIdx = x.value &- 0x1100
48-
let vIdx = y.value &- 0x1161
49-
let lvIdx = lIdx &* 588 &+ vIdx &* 28
50-
let s = 0xAC00 &+ lvIdx
81+
case (L.base ..< L.base &+ L.count, V.base ..< V.base &+ V.count):
82+
let lIdx = x.value &- L.base
83+
let vIdx = y.value &- V.base
84+
let lvIdx = lIdx &* N.count &+ vIdx &* T.count
85+
let s = S.base &+ lvIdx
5186
return Unicode.Scalar(_value: s)
52-
87+
5388
// Check for Hangul (LV, T) -> LVT compositions.
54-
case (0xAC00 ... 0xD7A3, 0x11A7 &+ 1 ... 0x11C2):
55-
if (x.value &- 0xAC00) % 28 == 0 {
56-
return Unicode.Scalar(_value: x.value &+ y.value &- 0x11A7)
89+
case (S.base ..< S.base &+ S.count, T.base &+ 1 ..< T.base &+ T.count):
90+
if (x.value &- S.base) % T.count == 0 {
91+
return Unicode.Scalar(_value: x.value &+ y.value &- T.base)
5792
} else {
5893
fallthrough
5994
}
60-
61-
// Otherwise, look it up.
95+
6296
default:
63-
let composition = _swift_stdlib_getComposition(x.value, y.value)
64-
65-
guard composition != .max else {
66-
return nil
67-
}
68-
69-
return Unicode.Scalar(_value: composition)
97+
return nil
7098
}
7199
}
72100

73101
internal mutating func next() -> Unicode.Scalar? {
74102
// Empty out our buffer before attempting to compose anything with our new
75103
// composee.
76-
if !buffer.isEmpty {
77-
if !hasBeenReversed {
78-
buffer.reverse()
79-
hasBeenReversed = true
80-
}
81-
82-
return buffer.removeLast().scalar
104+
if let nextBuffered = buffer.next() {
105+
return nextBuffered.scalar
83106
}
84-
85-
hasBeenReversed = false
86-
107+
87108
while let current = iterator.next() {
88-
let currentCCC = current.normData >> 3
89-
let currentIsNFCQC = current.normData & 0x6 == 0
90-
91-
guard let l = composee else {
109+
guard let currentComposee = composee else {
92110
// If we don't have a composee at this point, we're most likely looking
93111
// at the start of a string. If our class is 0, then attempt to compose
94112
// the following scalars with this one. Otherwise, it's a one off scalar
95113
// that needs to be emitted.
96-
if currentCCC == 0 {
114+
if current.normData.ccc == 0 {
97115
composee = current.scalar
98116
continue
99117
} else {
100118
return current.scalar
101119
}
102120
}
103-
104-
// Check if we have any scalars within the buffer, and if so get the last
105-
// scalar's normalization data.
106-
guard let lastNormData = buffer.last?.normData else {
121+
122+
// If we have any scalars in the buffer, it means those scalars couldn't
123+
// compose with our composee to form a new scalar. However, scalars
124+
// following them may still compose with our composee, so take the last
125+
// scalar in the buffer and get its normalization data so that we can
126+
// perform the check underneath this one about whether this current scalar
127+
// is "blocked". We get the last scalar because the scalars we receive are
128+
// already NFD, so the last scalar in the buffer will have the highest
129+
// CCC value in this normalization segment.
130+
guard let lastBufferedNormData = buffer.last?.normData else {
107131
// If we do not have any scalars in our buffer yet, then this step is
108132
// trivial. Attempt to compose our current scalar with whatever composee
109133
// we're currently building up.
110-
134+
111135
// If our right hand side scalar IS NFC_QC, then that means it can
112136
// never compose with any scalars previous to it. So, if our current
113137
// scalar is NFC_QC, then we have no composition.
114-
guard !currentIsNFCQC, let p = compose(l, and: current.scalar) else {
138+
guard !current.normData.isNFCQC,
139+
let composed = compose(currentComposee, and: current.scalar) else {
115140
// We did not find a composition between the two. If our current class
116141
// is 0, then set that as the new composee and return whatever built
117142
// up scalar we have. Otherwise, add our current scalar to the buffer
118-
// for eventually removal!
119-
120-
guard currentCCC == 0 else {
121-
buffer.append(current)
122-
continue
143+
// for eventual removal!
144+
145+
if current.normData.ccc == 0 {
146+
composee = current.scalar
147+
return currentComposee
123148
}
124-
125-
composee = current.scalar
126-
return l
149+
150+
buffer.append(current)
151+
continue
127152
}
128-
153+
129154
// We found a composition! Record it as our new composee and repeat the
130155
// process.
131-
composee = p
156+
composee = composed
132157
continue
133158
}
134-
135-
// We only care about the last's ccc.
136-
let lastCCC = lastNormData >> 3
137-
159+
138160
// Check if our current scalar is not blocked from our current composee.
139-
// In this case blocked means there is some scalar whose class (lastClass)
140-
// is either == 0 or >= currentClass.
161+
// In this case blocked means there is some scalar whose class
162+
// (lastBufferedNormData.ccc) is either == 0 or >= current.normData.ccc.
141163
//
142164
// Example:
143165
//
@@ -148,54 +170,54 @@ extension Unicode.NFC.Iterator: IteratorProtocol {
148170
// scalar U+0301 does actually compose. So this check makes sure that the
149171
// last scalar doesn't have any scalar in between it and the composee that
150172
// would otherwise "block" it from composing.
151-
guard lastCCC < currentCCC else {
173+
guard lastBufferedNormData.ccc < current.normData.ccc else {
152174
// We had a scalar block it. That means our current scalar is either a
153175
// starter or has the same class (preserve ordering).
154-
155-
guard currentCCC == 0 else {
156-
// Not a starter, stick it at the end of the buffer and keep going!
157-
158-
buffer.append(current)
159-
continue
160-
}
161-
176+
162177
// Starters are the "start" of a new normalization segment. Set it as
163178
// the new composee and return our current composee. This will trigger
164179
// any other scalars in the buffer to be emitted before we handle
165-
// composing this new composee.
166-
composee = current.scalar
167-
return l
180+
// normalizing this new segment.
181+
if current.normData.ccc == 0 {
182+
composee = current.scalar
183+
return currentComposee
184+
}
185+
186+
_internalInvariant(current.normData.ccc == lastBufferedNormData.ccc)
187+
buffer.append(current)
188+
continue
168189
}
169-
190+
170191
// There were no blockers! Attempt to compose the two! (Again, if our rhs
171192
// scalar IS NFC_QC, then it can never compose with anything previous to
172193
// it).
173-
guard !currentIsNFCQC, let p = compose(l, and: current.scalar) else {
194+
guard !current.normData.isNFCQC,
195+
let composed = compose(currentComposee, and: current.scalar) else {
174196
// No composition found. Stick it at the end of the buffer with the rest
175197
// of non-composed scalars.
176-
198+
177199
buffer.append(current)
178200
continue
179201
}
180-
202+
181203
// They composed! Assign the composition as our new composee and iterate
182204
// to the next scalar.
183-
composee = p
205+
composee = composed
184206
}
185-
207+
186208
// If we have a leftover composee, make sure to return it.
187-
return composee.take()
209+
return composee._take()
188210
}
189211
}
190212

191-
extension Unicode.NFC: Sequence {
213+
extension Unicode._NFC: Sequence {
192214
internal func makeIterator() -> Iterator {
193-
Iterator(iterator: base.nfd.makeIterator())
215+
Iterator(iterator: base._nfd.makeIterator())
194216
}
195217
}
196218

197219
extension StringProtocol {
198-
internal var nfc: Unicode.NFC<Self> {
199-
Unicode.NFC(base: self)
220+
internal var _nfc: Unicode._NFC<Self> {
221+
Unicode._NFC(base: self)
200222
}
201223
}

0 commit comments

Comments
 (0)