Skip to content

Commit a8ae6e7

Browse files
committed
Address Michael's and others' comments
Fix special mappings; fix bug
1 parent ce0e6ff commit a8ae6e7

File tree

7 files changed

+247
-152
lines changed

7 files changed

+247
-152
lines changed

stdlib/public/SwiftShims/UnicodeData.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
extern "C" {
2222
#endif
2323

24+
#define SWIFT_STDLIB_LARGEST_NAME_COUNT 88
25+
2426
//===----------------------------------------------------------------------===//
2527
// Utilities
2628
//===----------------------------------------------------------------------===//
@@ -81,11 +83,14 @@ __swift_int32_t _swift_stdlib_getMapping(__swift_uint32_t scalar,
8183
__swift_uint8_t mapping);
8284

8385
SWIFT_RUNTIME_STDLIB_INTERNAL
84-
const __swift_uint32_t *_swift_stdlib_getSpecialMapping(__swift_uint32_t scalar);
86+
const __swift_uint8_t *_swift_stdlib_getSpecialMapping(__swift_uint32_t scalar,
87+
__swift_uint8_t mapping,
88+
__swift_intptr_t *length);
8589

8690
SWIFT_RUNTIME_STDLIB_INTERNAL
8791
__swift_intptr_t _swift_stdlib_getScalarName(__swift_uint32_t scalar,
88-
__swift_uint8_t *buffer);
92+
__swift_uint8_t *buffer,
93+
__swift_intptr_t capacity);
8994

9095
SWIFT_RUNTIME_STDLIB_INTERNAL
9196
__swift_uint16_t _swift_stdlib_getAge(__swift_uint32_t scalar);

stdlib/public/core/String.swift

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -850,6 +850,7 @@ extension String {
850850
}
851851

852852
var result = ""
853+
result.reserveCapacity(utf8.count)
853854

854855
for scalar in unicodeScalars {
855856
result += scalar.properties.lowercaseMapping
@@ -883,6 +884,7 @@ extension String {
883884
}
884885

885886
var result = ""
887+
result.reserveCapacity(utf8.count)
886888

887889
for scalar in unicodeScalars {
888890
result += scalar.properties.uppercaseMapping

stdlib/public/core/UnicodeScalarProperties.swift

Lines changed: 72 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -743,50 +743,32 @@ extension Unicode.Scalar.Properties {
743743

744744
/// Case mapping properties.
745745
extension Unicode.Scalar.Properties {
746-
fileprivate enum _CaseMapping: UInt8 {
747-
case uppercase
748-
case lowercase
749-
case titlecase
746+
fileprivate struct _CaseMapping {
747+
let rawValue: UInt8
748+
749+
static let uppercase = _CaseMapping(rawValue: 0)
750+
static let lowercase = _CaseMapping(rawValue: 1)
751+
static let titlecase = _CaseMapping(rawValue: 2)
750752
}
751753

752754
fileprivate func _getMapping(_ mapping: _CaseMapping) -> String {
753755
// First, check if our scalar has a special mapping where it's mapped to
754756
// more than 1 scalar.
755-
let specialMappingPtr = _swift_stdlib_getSpecialMapping(_scalar.value)
756-
757-
if let specialMapping = specialMappingPtr {
758-
func readSpecialMapping(_ ptr: UnsafePointer<UInt32>) -> String {
759-
let count = Int(ptr.pointee)
760-
761-
if count == 0 {
762-
return "\(_scalar)"
763-
}
764-
765-
var result = ""
766-
767-
for i in 0 ..< count {
768-
result += "\(Unicode.Scalar(_unchecked: ptr[1 + i]))"
769-
}
757+
var specialMappingLength = 0
770758

771-
return result
772-
}
773-
774-
switch mapping {
775-
case .uppercase:
776-
return readSpecialMapping(specialMapping)
777-
778-
case .lowercase:
779-
let upperCount = Int(specialMapping.pointee)
780-
781-
return readSpecialMapping(specialMapping + upperCount + 1)
759+
let specialMappingPtr = _swift_stdlib_getSpecialMapping(
760+
_scalar.value,
761+
mapping.rawValue,
762+
&specialMappingLength
763+
)
782764

783-
case .titlecase:
784-
let upperCount = Int(specialMapping.pointee)
785-
let lowerPtr = specialMapping + upperCount + 1
786-
let lowerCount = Int(lowerPtr.pointee)
765+
if let specialMapping = specialMappingPtr, specialMappingLength != 0 {
766+
let buffer = UnsafeBufferPointer<UInt8>(
767+
start: specialMapping,
768+
count: specialMappingLength
769+
)
787770

788-
return readSpecialMapping(lowerPtr + lowerCount + 1)
789-
}
771+
return String._uncheckedFromUTF8(buffer, isASCII: false)
790772
}
791773

792774
// If we did not have a special mapping, check if we have a direct scalar
@@ -800,11 +782,11 @@ extension Unicode.Scalar.Properties {
800782
let scalar = Unicode.Scalar(
801783
_unchecked: UInt32(Int(_scalar.value) &+ Int(mappingDistance))
802784
)
803-
return "\(scalar)"
785+
return String(scalar)
804786
}
805787

806788
// We did not have any mapping. Return the scalar as is.
807-
return "\(_scalar)"
789+
return String(_scalar)
808790
}
809791

810792
/// The lowercase mapping of the scalar.
@@ -867,7 +849,7 @@ extension Unicode.Scalar.Properties {
867849
/// This property corresponds to the "Age" property in the
868850
/// [Unicode Standard](http://www.unicode.org/versions/latest/).
869851
public var age: Unicode.Version? {
870-
let age = _swift_stdlib_getAge(_scalar.value)
852+
let age: UInt16 = _swift_stdlib_getAge(_scalar.value)
871853

872854
if age == .max {
873855
return nil
@@ -1207,55 +1189,46 @@ extension Unicode.Scalar.Properties {
12071189
internal func _fastScalarName() -> String? {
12081190
// Define a couple of algorithmic names below.
12091191

1192+
let scalarName = String(_scalar.value, radix: 16, uppercase: true)
1193+
1194+
switch _scalar.value {
12101195
// Hangul Syllable *
1211-
if (0xAC00 ... 0xD7A3).contains(_scalar.value) {
1196+
case (0xAC00 ... 0xD7A3):
12121197
return _hangulName()
1213-
}
12141198

12151199
// Variation Selector-17 through Variation Selector-256
1216-
if (0xE0100 ... 0xE01EF).contains(_scalar.value) {
1200+
case (0xE0100 ... 0xE01EF):
12171201
return "VARIATION SELECTOR-\(_scalar.value - 0xE0100 + 17)"
1218-
}
1219-
1220-
let scalarName = String(_scalar.value, radix: 16, uppercase: true)
12211202

1222-
// CJK Unified Ideograph-*
1223-
if (0x3400 ... 0x4DBF).contains(_scalar.value) ||
1224-
(0x4E00 ... 0x9FFF).contains(_scalar.value) ||
1225-
(0x20000 ... 0x2A6DF).contains(_scalar.value) ||
1226-
(0x2A700 ... 0x2B738).contains(_scalar.value) ||
1227-
(0x2B740 ... 0x2B81D).contains(_scalar.value) ||
1228-
(0x2B820 ... 0x2CEA1).contains(_scalar.value) ||
1229-
(0x2CEB0 ... 0x2EBE0).contains(_scalar.value) ||
1230-
(0x2F800 ... 0x2FA1D).contains(_scalar.value) ||
1231-
(0x30000 ... 0x3134A).contains(_scalar.value) {
1203+
case (0x3400 ... 0x4DBF),
1204+
(0x4E00 ... 0x9FFF),
1205+
(0x20000 ... 0x2A6DF),
1206+
(0x2A700 ... 0x2B738),
1207+
(0x2B740 ... 0x2B81D),
1208+
(0x2B820 ... 0x2CEA1),
1209+
(0x2CEB0 ... 0x2EBE0),
1210+
(0x2F800 ... 0x2FA1D),
1211+
(0x30000 ... 0x3134A):
12321212
return "CJK UNIFIED IDEOGRAPH-\(scalarName)"
1233-
}
12341213

1235-
// CJK Compatibility Ideograph-*
1236-
if (0xF900 ... 0xFA6D).contains(_scalar.value) ||
1237-
(0xFA70 ... 0xFAD9).contains(_scalar.value) {
1214+
case (0xF900 ... 0xFA6D),
1215+
(0xFA70 ... 0xFAD9):
12381216
return "CJK COMPATIBILITY IDEOGRAPH-\(scalarName)"
1239-
}
12401217

1241-
// Tangut Ideograph-*
1242-
if (0x17000 ... 0x187F7).contains(_scalar.value) ||
1243-
(0x18D00 ... 0x18D08).contains(_scalar.value) {
1218+
case (0x17000 ... 0x187F7),
1219+
(0x18D00 ... 0x18D08):
12441220
return "TANGUT IDEOGRAPH-\(scalarName)"
1245-
}
12461221

1247-
// Khitan Small Script Character-*
1248-
if (0x18B00 ... 0x18CD5).contains(_scalar.value) {
1222+
case (0x18B00 ... 0x18CD5):
12491223
return "KHITAN SMALL SCRIPT CHARACTER-\(scalarName)"
1250-
}
12511224

1252-
// Nushu Character-*
1253-
if (0x1B170 ... 0x1B2FB).contains(_scalar.value) {
1225+
case (0x1B170 ... 0x1B2FB):
12541226
return "NUSHU CHARACTER-\(scalarName)"
1255-
}
12561227

12571228
// Otherwise, go look it up.
1258-
return nil
1229+
default:
1230+
return nil
1231+
}
12591232
}
12601233

12611234
/// The published name of the scalar.
@@ -1272,8 +1245,14 @@ extension Unicode.Scalar.Properties {
12721245
}
12731246

12741247
// The longest name that Unicode defines is 88 characters long.
1275-
let name = String(_uninitializedCapacity: 90) { buffer in
1276-
_swift_stdlib_getScalarName(_scalar.value, buffer.baseAddress)
1248+
let largestCount = Int(SWIFT_STDLIB_LARGEST_NAME_COUNT)
1249+
1250+
let name = String(_uninitializedCapacity: largestCount) { buffer in
1251+
_swift_stdlib_getScalarName(
1252+
_scalar.value,
1253+
buffer.baseAddress,
1254+
buffer.count
1255+
)
12771256
}
12781257

12791258
return name.isEmpty ? nil : name
@@ -1293,13 +1272,11 @@ extension Unicode.Scalar.Properties {
12931272
/// This property corresponds to the "Name_Alias" property in the
12941273
/// [Unicode Standard](http://www.unicode.org/versions/latest/).
12951274
public var nameAlias: String? {
1296-
let nameAliasPtr = _swift_stdlib_getNameAlias(_scalar.value)
1297-
1298-
guard nameAliasPtr != nil else {
1275+
guard let nameAliasPtr = _swift_stdlib_getNameAlias(_scalar.value) else {
12991276
return nil
13001277
}
13011278

1302-
return String(cString: nameAliasPtr!)
1279+
return String(cString: nameAliasPtr)
13031280
}
13041281
}
13051282

@@ -1449,9 +1426,8 @@ extension Unicode.Scalar.Properties {
14491426
/// This property corresponds to the "Canonical_Combining_Class" property in
14501427
/// the [Unicode Standard](http://www.unicode.org/versions/latest/).
14511428
public var canonicalCombiningClass: Unicode.CanonicalCombiningClass {
1452-
let normData = _swift_stdlib_getNormData(_scalar.value)
1453-
let rawValue = UInt8(normData >> 3)
1454-
return Unicode.CanonicalCombiningClass(rawValue: rawValue)
1429+
let normData = Unicode._NormData(_scalar)
1430+
return Unicode.CanonicalCombiningClass(rawValue: normData.ccc)
14551431
}
14561432
}
14571433

@@ -1498,6 +1474,19 @@ extension Unicode {
14981474
/// meet the requirements of `decimal` will have numeric type `numeric`,
14991475
/// and programs can treat `digit` and `numeric` equivalently.
15001476
case numeric
1477+
1478+
internal init(rawValue: UInt8) {
1479+
switch rawValue {
1480+
case 0:
1481+
self = .numeric
1482+
case 1:
1483+
self = .digit
1484+
case 2:
1485+
self = .decimal
1486+
default:
1487+
fatalError("Unknown numeric type \(rawValue)")
1488+
}
1489+
}
15011490
}
15021491
}
15031492

@@ -1523,16 +1512,11 @@ extension Unicode.Scalar.Properties {
15231512
public var numericType: Unicode.NumericType? {
15241513
let rawValue = _swift_stdlib_getNumericType(_scalar.value)
15251514

1526-
switch rawValue {
1527-
case 0:
1528-
return .numeric
1529-
case 1:
1530-
return .digit
1531-
case 2:
1532-
return .decimal
1533-
default:
1515+
guard rawValue != .max else {
15341516
return nil
15351517
}
1518+
1519+
return Unicode.NumericType(rawValue: rawValue)
15361520
}
15371521

15381522
/// The numeric value of the scalar.

stdlib/public/stubs/UnicodeScalarProps.cpp

Lines changed: 42 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -175,7 +175,9 @@ __swift_int32_t _swift_stdlib_getMapping(__swift_uint32_t scalar,
175175
}
176176

177177
SWIFT_RUNTIME_STDLIB_INTERNAL
178-
const __swift_uint32_t *_swift_stdlib_getSpecialMapping(__swift_uint32_t scalar) {
178+
const __swift_uint8_t *_swift_stdlib_getSpecialMapping(__swift_uint32_t scalar,
179+
__swift_uint8_t mapping,
180+
__swift_intptr_t *length) {
179181
auto dataIdx = _swift_stdlib_getScalarBitArrayIdx(scalar,
180182
_swift_stdlib_special_mappings,
181183
_swift_stdlib_special_mappings_ranks);
@@ -186,12 +188,36 @@ const __swift_uint32_t *_swift_stdlib_getSpecialMapping(__swift_uint32_t scalar)
186188

187189
auto index = _swift_stdlib_special_mappings_data_indices[dataIdx];
188190

189-
return _swift_stdlib_special_mappings_data + index;
191+
auto uppercase = _swift_stdlib_special_mappings_data + index;
192+
auto lowercase = uppercase + 1 + *uppercase;
193+
auto titlecase = lowercase + 1 + *lowercase;
194+
195+
switch (mapping) {
196+
// Uppercase
197+
case 0:
198+
*length = *uppercase;
199+
return uppercase + 1;
200+
201+
// Lowercase
202+
case 1:
203+
*length = *lowercase;
204+
return lowercase + 1;
205+
206+
// Titlecase
207+
case 2:
208+
*length = *titlecase;
209+
return titlecase + 1;
210+
211+
// Unknown mapping.
212+
default:
213+
return nullptr;
214+
}
190215
}
191216

192217
SWIFT_RUNTIME_STDLIB_INTERNAL
193218
__swift_intptr_t _swift_stdlib_getScalarName(__swift_uint32_t scalar,
194-
__swift_uint8_t *buffer) {
219+
__swift_uint8_t *buffer,
220+
__swift_intptr_t capacity) {
195221
auto setOffset = _swift_stdlib_names_scalar_sets[scalar >> 7];
196222

197223
if (setOffset == std::numeric_limits<__swift_uint16_t>::max()) {
@@ -217,6 +243,7 @@ __swift_intptr_t _swift_stdlib_getScalarName(__swift_uint32_t scalar,
217243

218244
auto nameSize = nextScalarOffset - scalarOffset;
219245

246+
// The total number of initialized bytes in the name string.
220247
int c = 0;
221248

222249
for (__swift_uint32_t i = 0; i < nameSize; i += 1) {
@@ -242,11 +269,23 @@ __swift_intptr_t _swift_stdlib_getScalarName(__swift_uint32_t scalar,
242269

243270
// The last character in a word has the 7th bit set.
244271
while (*word < 0x80) {
272+
if (c >= capacity) {
273+
return c;
274+
}
275+
245276
buffer[c++] = *word++;
246277
}
247278

279+
if (c >= capacity) {
280+
return c;
281+
}
282+
248283
buffer[c++] = *word & 0x7F;
249284

285+
if (c >= capacity) {
286+
return c;
287+
}
288+
250289
buffer[c++] = ' ';
251290
}
252291

0 commit comments

Comments
 (0)