Skip to content

Commit 76377ca

Browse files
committed
Implement native scalar names
1 parent b9b06f2 commit 76377ca

File tree

9 files changed

+66845
-43
lines changed

9 files changed

+66845
-43
lines changed

stdlib/public/SwiftShims/UnicodeData.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ __swift_int32_t _swift_stdlib_getMapping(__swift_uint32_t scalar,
8383
SWIFT_RUNTIME_STDLIB_INTERNAL
8484
const __swift_uint32_t *_swift_stdlib_getSpecialMapping(__swift_uint32_t scalar);
8585

86+
// Writes the name stored for `scalar` into `buffer` as bytes and returns the
// number of bytes that were initialized; returns 0 when no name is stored for
// the scalar. No buffer capacity is passed, so the definition cannot bounds
// check its writes: the caller has to supply a buffer large enough for the
// longest name (the Swift caller in this commit allocates 90 bytes).
SWIFT_RUNTIME_STDLIB_INTERNAL
87+
__swift_intptr_t _swift_stdlib_getScalarName(__swift_uint32_t scalar,
88+
__swift_uint8_t *buffer);
89+
8690
#ifdef __cplusplus
8791
} // extern "C"
8892
#endif

stdlib/public/SwiftShims/UnicodeShims.h

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -280,13 +280,6 @@ __swift_int32_t
280280
__swift_stdlib_u_getIntPropertyValue(__swift_stdlib_UChar32,
281281
__swift_stdlib_UProperty);
282282

283-
SWIFT_RUNTIME_STDLIB_API
284-
__swift_int32_t __swift_stdlib_u_charName(
285-
__swift_stdlib_UChar32 code, __swift_stdlib_UCharNameChoice nameChoice,
286-
char *_Nullable buffer, __swift_int32_t bufferLength,
287-
__swift_stdlib_UErrorCode *pErrorCode);
288-
289-
290283
#ifdef __cplusplus
291284
} // extern "C"
292285
#endif

stdlib/public/core/UnicodeScalarProperties.swift

Lines changed: 91 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1179,34 +1179,90 @@ extension Unicode.Scalar.Properties {
11791179
}
11801180

11811181
extension Unicode.Scalar.Properties {
1182+
// Builds the name of a precomposed Hangul syllable by decomposing the scalar
// arithmetically into its leading consonant, vowel and trailing consonant and
// concatenating their short names after the "HANGUL SYLLABLE " prefix.
//
// The only caller visible here checks that the scalar lies in
// U+AC00 ... U+D7A3 before calling; the table lookups below rely on that.
internal func _hangulName() -> String {
  // Short names of the 19 leading consonants.
  let leadNames = [
    "G", "GG", "N", "D", "DD", "R", "M", "B", "BB", "S",
    "SS", "", "J", "JJ", "C", "K", "T", "P", "H"
  ]

  // Short names of the 21 vowels.
  let vowelNames = [
    "A", "AE", "YA", "YAE", "EO", "E", "YEO", "YE", "O",
    "WA", "WAE", "OE", "YO", "U", "WEO", "WE", "WI", "YU",
    "EU", "YI", "I"
  ]

  // Short names of the 28 trailing consonants; the first entry is empty for
  // syllables without a trailing consonant.
  let trailNames = [
    "", "G", "GG", "GS", "N", "NJ", "NH", "D", "L", "LG",
    "LM", "LB", "LS", "LT", "LP", "LH", "M", "B", "BS", "S",
    "SS", "NG", "J", "C", "K", "T", "P", "H"
  ]

  // First precomposed Hangul syllable.
  let syllableBase: UInt32 = 0xAC00
  // Number of trailing consonant slots per (lead, vowel) pair.
  let trailCount: UInt32 = 28
  // Number of precomposed syllables sharing one leading consonant (21 * 28).
  let syllablesPerLead: UInt32 = 588

  let syllableIndex = _scalar.value &- syllableBase
  let leadIndex = Int(syllableIndex / syllablesPerLead)
  let vowelIndex = Int((syllableIndex % syllablesPerLead) / trailCount)
  let trailIndex = Int(syllableIndex % trailCount)

  let scalarName =
    leadNames[leadIndex] + vowelNames[vowelIndex] + trailNames[trailIndex]
  return "HANGUL SYLLABLE \(scalarName)"
}
1210+
1211+
// Returns the name of a scalar whose name can be generated instead of stored:
// either because it is derived algorithmically (the Hangul syllables) or
// because a large contiguous range shares one prefix followed by a number
// derived from the code point.
//
// Returns nil when the scalar is not covered by any of the ranges below, in
// which case the caller has to look the name up.
internal func _fastScalarName() -> String? {
  let value = _scalar.value

  // Prefix shared by every scalar of a range whose name ends in the scalar's
  // uppercase hexadecimal code point.
  let prefix: String

  switch value {
  // Hangul Syllable *
  case 0xAC00 ... 0xD7A3:
    return _hangulName()

  // Variation Selector-17 through Variation Selector-256
  case 0xE0100 ... 0xE01EF:
    return "VARIATION SELECTOR-\(value - 0xE0100 + 17)"

  // CJK Unified Ideograph-*
  case 0x3400 ... 0x4DBF,
       0x4E00 ... 0x9FFF,
       0x20000 ... 0x2A6DF,
       0x2A700 ... 0x2B738,
       0x2B740 ... 0x2B81D,
       0x2B820 ... 0x2CEA1,
       0x2CEB0 ... 0x2EBE0,
       0x30000 ... 0x3134A:
    prefix = "CJK UNIFIED IDEOGRAPH-"

  // CJK Compatibility Ideograph-*
  //
  // U+2F800 ... U+2FA1D is the CJK Compatibility Ideographs Supplement block;
  // the Unicode name derivation rules give it the COMPATIBILITY prefix, not
  // the UNIFIED one.
  case 0xF900 ... 0xFA6D,
       0xFA70 ... 0xFAD9,
       0x2F800 ... 0x2FA1D:
    prefix = "CJK COMPATIBILITY IDEOGRAPH-"

  // Tangut Ideograph-*
  case 0x17000 ... 0x187F7,
       0x18D00 ... 0x18D08:
    prefix = "TANGUT IDEOGRAPH-"

  // Khitan Small Script Character-*
  case 0x18B00 ... 0x18CD5:
    prefix = "KHITAN SMALL SCRIPT CHARACTER-"

  // Nushu Character-*
  case 0x1B170 ... 0x1B2FB:
    prefix = "NUSHU CHARACTER-"

  // Otherwise, go look it up.
  default:
    return nil
  }

  // Only build the hexadecimal suffix once a range has actually matched.
  return prefix + String(value, radix: 16, uppercase: true)
}
12111267

12121268
/// The published name of the scalar.
@@ -1218,7 +1274,16 @@ extension Unicode.Scalar.Properties {
12181274
/// This property corresponds to the "Name" property in the
12191275
/// [Unicode Standard](http://www.unicode.org/versions/latest/).
12201276
public var name: String? {
  // Names that can be generated from the code point never hit the tables.
  guard let generatedName = _fastScalarName() else {
    // The longest name that Unicode defines is 88 characters long, so a
    // 90 byte buffer leaves room for the name lookup to write it in full.
    let storedName = String(_uninitializedCapacity: 90) { buffer in
      _swift_stdlib_getScalarName(_scalar.value, buffer.baseAddress)
    }

    // An empty result means no name was written for this scalar.
    if storedName.isEmpty {
      return nil
    }

    return storedName
  }

  return generatedName
}
12231288

12241289
/// The normative formal alias of the scalar.

stdlib/public/stubs/UnicodeScalarProps.cpp

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,3 +189,71 @@ const __swift_uint32_t *_swift_stdlib_getSpecialMapping(__swift_uint32_t scalar)
189189

190190
return _swift_stdlib_special_mappings_data + index;
191191
}
192+
193+
// Writes the name stored for `scalar` into `buffer`, one byte per character
// with single spaces between words, and returns the number of bytes written.
// Returns 0 when the tables hold no name for the scalar.
//
// `buffer` is written without any bounds check, so the caller must provide
// enough room for the longest name plus the temporary trailing space.
SWIFT_RUNTIME_STDLIB_INTERNAL
__swift_intptr_t _swift_stdlib_getScalarName(__swift_uint32_t scalar,
                                             __swift_uint8_t *buffer) {
  // Scalars are grouped into sets of 128; the maximum 16 bit value marks a set
  // in which no scalar has a stored name.
  auto setOffset = _swift_stdlib_names_scalar_sets[scalar >> 7];
  if (setOffset == std::numeric_limits<__swift_uint16_t>::max()) {
    return 0;
  }

  // Position of this scalar inside its set, then the start of its name data.
  auto scalarIndex = (setOffset << 7) + (scalar & 0x7F);
  auto nameStart = _swift_stdlib_names_scalars[scalarIndex];

  // An offset of 0 means this particular scalar has no stored name.
  if (nameStart == 0) {
    return 0;
  }

  // The name ends where the next named scalar's data begins, so scan forward
  // past unnamed scalars until a non-zero offset turns up.
  __swift_uint32_t nameEnd = 0;
  for (int step = 1; nameEnd == 0; step += 1) {
    nameEnd = _swift_stdlib_names_scalars[scalarIndex + step];
  }

  auto nameSize = nameEnd - nameStart;

  // Number of bytes written to `buffer` so far.
  int written = 0;

  for (__swift_uint32_t cursor = 0; cursor < nameSize; cursor += 1) {
    __swift_uint16_t wordIndex =
        (__swift_uint16_t) _swift_stdlib_names[nameStart + cursor];

    // 0xFF is an escape: the word index does not fit in one byte and is
    // stored in the following two bytes, low byte first.
    if (wordIndex == 0xFF) {
      cursor += 1;
      auto lowByte = _swift_stdlib_names[nameStart + cursor];
      wordIndex = lowByte;

      cursor += 1;
      auto highByte = _swift_stdlib_names[nameStart + cursor];
      wordIndex |= highByte << 8;
    }

    auto wordOffset = _swift_stdlib_word_indices[wordIndex];
    auto word = _swift_stdlib_words + wordOffset;

    // Every character but the last of a word is below 0x80; the last one has
    // the 0x80 bit set as a terminator, which is masked off when copied.
    for (; *word < 0x80; word += 1) {
      buffer[written++] = *word;
    }
    buffer[written++] = *word & 0x7F;

    // Separate words with a space.
    buffer[written++] = ' ';
  }

  // Drop the space appended after the final word; what remains is the number
  // of initialized bytes.
  written -= 1;
  return written;
}

0 commit comments

Comments
 (0)