Skip to content

Commit d0e93ac

Browse files
committed
Various fixes to Unicode.Scalar.Properties.
- numericValue returns nil instead of .nan for non-numerics - Remove small-string optimizations from _scalarName that failed on 32-bit archs - Put case mappings back into U.S.Properties - Added more sanity tests
1 parent 8eef50f commit d0e93ac

File tree

5 files changed

+322
-202
lines changed

5 files changed

+322
-202
lines changed

stdlib/public/SwiftShims/UnicodeShims.h

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -497,9 +497,6 @@ SWIFT_RUNTIME_STDLIB_INTERFACE
497497
__swift_stdlib_UBool
498498
__swift_stdlib_u_hasBinaryProperty(__swift_stdlib_UChar32,
499499
__swift_stdlib_UProperty);
500-
SWIFT_RUNTIME_STDLIB_INTERFACE
501-
__swift_stdlib_UBool
502-
__swift_stdlib_u_isdefined(__swift_stdlib_UChar32);
503500

504501
SWIFT_RUNTIME_STDLIB_INTERFACE
505502
void __swift_stdlib_u_charAge(
@@ -513,7 +510,7 @@ __swift_int32_t
513510
SWIFT_RUNTIME_STDLIB_INTERFACE
514511
__swift_int32_t __swift_stdlib_u_charName(
515512
__swift_stdlib_UChar32 code, __swift_stdlib_UCharNameChoice nameChoice,
516-
char *buffer, __swift_int32_t bufferLength,
513+
char *_Nullable buffer, __swift_int32_t bufferLength,
517514
__swift_stdlib_UErrorCode *pErrorCode);
518515

519516
SWIFT_RUNTIME_STDLIB_INTERFACE

stdlib/public/core/StringComparison.swift

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -835,7 +835,7 @@ private struct _UnicodeScalarExceptions {
835835
guard let scalar = UnicodeScalar(rawValue) else { continue }
836836

837837
// Fast path: skip unassigned code points
838-
guard scalar.properties.isDefined else { continue }
838+
guard scalar.properties.generalCategory != .unassigned else { continue }
839839

840840
// Fast path: skip unless QC_FCD=no
841841
if _fastPath(!scalar.properties.isFullCompositionExclusion) {

stdlib/public/core/UnicodeScalarProperties.swift

Lines changed: 146 additions & 175 deletions
Original file line numberDiff line numberDiff line change
@@ -34,132 +34,6 @@ extension Unicode.Scalar {
3434
public var properties: Properties {
3535
return Properties(_scalar: self)
3636
}
37-
38-
/// Returns the lowercase mapping of the scalar.
39-
///
40-
/// This function returns a `String`, not a `Unicode.Scalar` or `Character`,
41-
/// because some mappings may transform a scalar into multiple scalars or
42-
/// graphemes. For example, the character "İ" (U+0130 LATIN CAPITAL LETTER I
43-
/// WITH DOT ABOVE) becomes two scalars (U+0069 LATIN SMALL LETTER I, U+0307
44-
/// COMBINING DOT ABOVE) when converted to lowercase.
45-
///
46-
/// This function corresponds to the `Lowercase_Mapping` property in the
47-
/// [Unicode Standard](http://www.unicode.org/versions/latest/).
48-
///
49-
/// - Returns: The lowercase mapping of the scalar.
50-
public func lowercased() -> String {
51-
return _applyMapping(__swift_stdlib_u_strToLower)
52-
}
53-
54-
/// Returns the titlecase mapping of the scalar.
55-
///
56-
/// This function returns a `String`, not a `Unicode.Scalar` or `Character`,
57-
/// because some mappings may transform a scalar into multiple scalars or
58-
/// graphemes. For example, the ligature "fi" (U+FB01 LATIN SMALL LIGATURE FI)
59-
/// becomes "Fi" (U+0046 LATIN CAPITAL LETTER F, U+0069 LATIN SMALL LETTER I)
60-
/// when converted to titlecase.
61-
///
62-
/// This function corresponds to the `Titlecase_Mapping` property in the
63-
/// [Unicode Standard](http://www.unicode.org/versions/latest/).
64-
///
65-
/// - Returns: The titlecase mapping of the scalar.
66-
public func titlecased() -> String {
67-
return _applyMapping { ptr, cap, src, len, locale, err in
68-
return __swift_stdlib_u_strToTitle(ptr, cap, src, len, nil, locale, err)
69-
}
70-
}
71-
72-
/// Returns the uppercase mapping of the scalar.
73-
///
74-
/// This function returns a `String`, not a `Unicode.Scalar` or `Character`,
75-
/// because some mappings may transform a scalar into multiple scalars or
76-
/// graphemes. For example, the German letter "ß" (U+00DF LATIN SMALL LETTER
77-
/// SHARP S) becomes "SS" (U+0053 LATIN CAPITAL LETTER S, U+0053 LATIN CAPITAL
78-
/// LETTER S) when converted to uppercase.
79-
///
80-
/// This function corresponds to the `Uppercase_Mapping` property in the
81-
/// [Unicode Standard](http://www.unicode.org/versions/latest/).
82-
///
83-
/// - Returns: The titlecase mapping of the scalar.
84-
public func uppercased() -> String {
85-
return _applyMapping(__swift_stdlib_u_strToUpper)
86-
}
87-
88-
/// The UTF-16 encoding of the scalar, represented as a tuple of 2 elements.
89-
///
90-
/// If the scalar only encodes to one code unit, the second element is zero.
91-
@_transparent
92-
internal var _utf16CodeUnits: (UTF16.CodeUnit, UTF16.CodeUnit) {
93-
let utf16 = UnicodeScalar(UInt32(_value))!.utf16
94-
return (utf16[0], utf16.count > 1 ? utf16[1] : 0)
95-
}
96-
97-
// The type of ICU case conversion functions.
98-
internal typealias _U_StrToX = (
99-
/* dest */ UnsafeMutablePointer<__swift_stdlib_UChar>,
100-
/* destCapacity */ Int32,
101-
/* src */ UnsafePointer<__swift_stdlib_UChar>,
102-
/* srcLength */ Int32,
103-
/* locale */ UnsafePointer<Int8>,
104-
/* pErrorCode */ UnsafeMutablePointer<__swift_stdlib_UErrorCode>
105-
) -> Int32
106-
107-
/// Applies the given ICU string mapping to the scalar.
108-
///
109-
/// This function attempts first to write the mapping into a stack-based
110-
/// UTF-16 buffer capable of holding 16 code units, which should be enough for
111-
/// all current case mappings. In the event more space is needed, it will be
112-
/// allocated on the heap.
113-
internal func _applyMapping(_ u_strTo: _U_StrToX) -> String {
114-
let utf16Length = UnicodeScalar(UInt32(_value))!.utf16.count
115-
var utf16 = _utf16CodeUnits
116-
var scratchBuffer = _Normalization._SegmentOutputBuffer(allZeros: ())
117-
let count = scratchBuffer.withUnsafeMutableBufferPointer { bufPtr -> Int in
118-
return withUnsafePointer(to: &utf16) { tuplePtr in
119-
return tuplePtr.withMemoryRebound(to: UInt16.self, capacity: 2) {
120-
utf16Pointer in
121-
var err = __swift_stdlib_U_ZERO_ERROR
122-
let correctSize = u_strTo(
123-
bufPtr.baseAddress._unsafelyUnwrappedUnchecked,
124-
Int32(bufPtr.count),
125-
utf16Pointer,
126-
Int32(utf16Length),
127-
"",
128-
&err)
129-
guard err.isSuccess ||
130-
err == __swift_stdlib_U_BUFFER_OVERFLOW_ERROR else {
131-
fatalError("Unexpected error case-converting Unicode scalar.")
132-
}
133-
return Int(correctSize)
134-
}
135-
}
136-
}
137-
if _fastPath(count <= scratchBuffer.count) {
138-
scratchBuffer.count = count
139-
return String._fromWellFormedUTF16CodeUnits(scratchBuffer)
140-
}
141-
var array = Array<UInt16>(repeating: 0, count: count)
142-
array.withUnsafeMutableBufferPointer { bufPtr in
143-
withUnsafePointer(to: &utf16) { tuplePtr in
144-
tuplePtr.withMemoryRebound(to: UInt16.self, capacity: 2) {
145-
utf16Pointer in
146-
var err = __swift_stdlib_U_ZERO_ERROR
147-
let correctSize = u_strTo(
148-
bufPtr.baseAddress._unsafelyUnwrappedUnchecked,
149-
Int32(bufPtr.count),
150-
utf16Pointer,
151-
Int32(utf16Length),
152-
"",
153-
&err)
154-
guard err.isSuccess else {
155-
fatalError("Unexpected error case-converting Unicode scalar.")
156-
}
157-
_sanityCheck(count == correctSize, "inconsistent ICU behavior")
158-
}
159-
}
160-
}
161-
return String._fromWellFormedUTF16CodeUnits(array[..<count])
162-
}
16337
}
16438

16539
/// Boolean properties that are defined by the Unicode Standard (i.e., not
@@ -784,6 +658,136 @@ extension Unicode.Scalar.Properties {
784658
}
785659
}
786660

661+
/// Case mapping properties.
662+
extension Unicode.Scalar.Properties {
663+
664+
/// The UTF-16 encoding of the scalar, represented as a tuple of 2 elements.
665+
///
666+
/// If the scalar only encodes to one code unit, the second element is zero.
667+
@_transparent
668+
internal var _utf16CodeUnits: (UTF16.CodeUnit, UTF16.CodeUnit) {
669+
let utf16 = UnicodeScalar(UInt32(_value))!.utf16
670+
return (utf16[0], utf16.count > 1 ? utf16[1] : 0)
671+
}
672+
673+
// The type of ICU case conversion functions.
674+
internal typealias _U_StrToX = (
675+
/* dest */ UnsafeMutablePointer<__swift_stdlib_UChar>,
676+
/* destCapacity */ Int32,
677+
/* src */ UnsafePointer<__swift_stdlib_UChar>,
678+
/* srcLength */ Int32,
679+
/* locale */ UnsafePointer<Int8>,
680+
/* pErrorCode */ UnsafeMutablePointer<__swift_stdlib_UErrorCode>
681+
) -> Int32
682+
683+
/// Applies the given ICU string mapping to the scalar.
684+
///
685+
/// This function attempts first to write the mapping into a stack-based
686+
/// UTF-16 buffer capable of holding 16 code units, which should be enough for
687+
/// all current case mappings. In the event more space is needed, it will be
688+
/// allocated on the heap.
689+
internal func _applyMapping(_ u_strTo: _U_StrToX) -> String {
690+
let utf16Length = UnicodeScalar(UInt32(_value))!.utf16.count
691+
var utf16 = _utf16CodeUnits
692+
var scratchBuffer = _Normalization._SegmentOutputBuffer(allZeros: ())
693+
let count = scratchBuffer.withUnsafeMutableBufferPointer { bufPtr -> Int in
694+
return withUnsafePointer(to: &utf16) { tuplePtr in
695+
return tuplePtr.withMemoryRebound(to: UInt16.self, capacity: 2) {
696+
utf16Pointer in
697+
var err = __swift_stdlib_U_ZERO_ERROR
698+
let correctSize = u_strTo(
699+
bufPtr.baseAddress._unsafelyUnwrappedUnchecked,
700+
Int32(bufPtr.count),
701+
utf16Pointer,
702+
Int32(utf16Length),
703+
"",
704+
&err)
705+
guard err.isSuccess ||
706+
err == __swift_stdlib_U_BUFFER_OVERFLOW_ERROR else {
707+
fatalError("Unexpected error case-converting Unicode scalar.")
708+
}
709+
return Int(correctSize)
710+
}
711+
}
712+
}
713+
if _fastPath(count <= scratchBuffer.count) {
714+
scratchBuffer.count = count
715+
return String._fromWellFormedUTF16CodeUnits(scratchBuffer)
716+
}
717+
var array = Array<UInt16>(repeating: 0, count: count)
718+
array.withUnsafeMutableBufferPointer { bufPtr in
719+
withUnsafePointer(to: &utf16) { tuplePtr in
720+
tuplePtr.withMemoryRebound(to: UInt16.self, capacity: 2) {
721+
utf16Pointer in
722+
var err = __swift_stdlib_U_ZERO_ERROR
723+
let correctSize = u_strTo(
724+
bufPtr.baseAddress._unsafelyUnwrappedUnchecked,
725+
Int32(bufPtr.count),
726+
utf16Pointer,
727+
Int32(utf16Length),
728+
"",
729+
&err)
730+
guard err.isSuccess else {
731+
fatalError("Unexpected error case-converting Unicode scalar.")
732+
}
733+
_sanityCheck(count == correctSize, "inconsistent ICU behavior")
734+
}
735+
}
736+
}
737+
return String._fromWellFormedUTF16CodeUnits(array[..<count])
738+
}
739+
740+
/// The lowercase mapping of the scalar.
741+
///
742+
/// This property is a `String`, not a `Unicode.Scalar` or `Character`,
743+
/// because some mappings may transform a scalar into multiple scalars or
744+
/// graphemes. For example, the character "İ" (U+0130 LATIN CAPITAL LETTER I
745+
/// WITH DOT ABOVE) becomes two scalars (U+0069 LATIN SMALL LETTER I, U+0307
746+
/// COMBINING DOT ABOVE) when converted to lowercase.
747+
///
748+
/// This property corresponds to the `Lowercase_Mapping` property in the
749+
/// [Unicode Standard](http://www.unicode.org/versions/latest/).
750+
///
751+
/// - Returns: The lowercase mapping of the scalar.
752+
public var lowercaseMapping: String {
753+
return _applyMapping(__swift_stdlib_u_strToLower)
754+
}
755+
756+
/// The titlecase mapping of the scalar.
757+
///
758+
/// This property is a `String`, not a `Unicode.Scalar` or `Character`,
759+
/// because some mappings may transform a scalar into multiple scalars or
760+
/// graphemes. For example, the ligature "fi" (U+FB01 LATIN SMALL LIGATURE FI)
761+
/// becomes "Fi" (U+0046 LATIN CAPITAL LETTER F, U+0069 LATIN SMALL LETTER I)
762+
/// when converted to titlecase.
763+
///
764+
/// This property corresponds to the `Titlecase_Mapping` property in the
765+
/// [Unicode Standard](http://www.unicode.org/versions/latest/).
766+
///
767+
/// - Returns: The titlecase mapping of the scalar.
768+
public var titlecaseMapping: String {
769+
return _applyMapping { ptr, cap, src, len, locale, err in
770+
return __swift_stdlib_u_strToTitle(ptr, cap, src, len, nil, locale, err)
771+
}
772+
}
773+
774+
/// The uppercase mapping of the scalar.
775+
///
776+
/// This property is a `String`, not a `Unicode.Scalar` or `Character`,
777+
/// because some mappings may transform a scalar into multiple scalars or
778+
/// graphemes. For example, the German letter "ß" (U+00DF LATIN SMALL LETTER
779+
/// SHARP S) becomes "SS" (U+0053 LATIN CAPITAL LETTER S, U+0053 LATIN CAPITAL
780+
/// LETTER S) when converted to uppercase.
781+
///
782+
/// This property corresponds to the `Uppercase_Mapping` property in the
783+
/// [Unicode Standard](http://www.unicode.org/versions/latest/).
784+
///
785+
/// - Returns: The uppercase mapping of the scalar.
786+
public var uppercaseMapping: String {
787+
return _applyMapping(__swift_stdlib_u_strToUpper)
788+
}
789+
}
790+
787791
extension Unicode {
788792

789793
/// A version of the Unicode Standard represented by its `major.minor`
@@ -1091,40 +1095,28 @@ extension Unicode.Scalar.Properties {
10911095
internal func _scalarName(
10921096
_ choice: __swift_stdlib_UCharNameChoice
10931097
) -> String? {
1094-
// ICU writes a trailing nul. We allow ICU to store up to and including
1095-
// `capacity` bytes. If ICU writes `capacity` bytes, then ICU will also
1096-
// write nul into the count, which we will overwrite after.
1097-
var smol = _SmallUTF8String()
1098-
let count = smol._withAllUnsafeMutableBytes { bufPtr -> Int in
1099-
var err = __swift_stdlib_U_ZERO_ERROR
1100-
let count32 = __swift_stdlib_u_charName(
1101-
_value, choice,
1102-
bufPtr.baseAddress._unsafelyUnwrappedUnchecked.assumingMemoryBound(
1103-
to: Int8.self),
1104-
Int32(bufPtr.count), &err)
1105-
return Int(count32)
1106-
}
1098+
var err = __swift_stdlib_U_ZERO_ERROR
1099+
let count = Int(__swift_stdlib_u_charName(_value, choice, nil, 0, &err))
11071100
guard count > 0 else { return nil }
1108-
if count <= smol.capacity {
1109-
smol.count = count
1110-
return String(_StringGuts(smol))
1111-
}
11121101

1113-
// Save room for nul
1114-
var array = Array<UInt8>(repeating: 0, count: 1 + count)
1115-
array.withUnsafeMutableBufferPointer { bufPtr in
1102+
// ICU writes a trailing null, so we have to save room for it as well.
1103+
var array = Array<UInt8>(repeating: 0, count: count + 1)
1104+
return array.withUnsafeMutableBufferPointer { bufPtr in
11161105
var err = __swift_stdlib_U_ZERO_ERROR
11171106
let correctSize = __swift_stdlib_u_charName(
1118-
_value, choice,
1107+
_value,
1108+
choice,
11191109
UnsafeMutableRawPointer(bufPtr.baseAddress._unsafelyUnwrappedUnchecked)
11201110
.assumingMemoryBound(to: Int8.self),
1121-
Int32(bufPtr.count), &err)
1111+
Int32(bufPtr.count),
1112+
&err)
11221113
guard err.isSuccess else {
11231114
fatalError("Unexpected error case-converting Unicode scalar.")
11241115
}
11251116
_sanityCheck(count == correctSize, "inconsistent ICU behavior")
1117+
return String._fromASCII(
1118+
UnsafeBufferPointer(rebasing: bufPtr[..<count]))
11261119
}
1127-
return String._fromWellFormedUTF8CodeUnitSequence(array[..<count])
11281120
}
11291121

11301122
/// The published name of the scalar.
@@ -1376,11 +1368,11 @@ extension Unicode.Scalar.Properties {
13761368

13771369
/// The numeric value of the scalar.
13781370
///
1379-
/// The value of this property is `Double.nan` for scalars that do not
1380-
/// represent a number.
1371+
/// The value of this property is `nil` for scalars that do not represent a
1372+
/// number.
13811373
///
13821374
/// The numeric value of a scalar is represented as a `Double` because some
1383-
/// scalars represent fractions.
1375+
/// scalars represent fractions:
13841376
///
13851377
/// ```
13861378
/// print("X", ("X" as Unicode.Scalar).properties.numericValue)
@@ -1395,30 +1387,9 @@ extension Unicode.Scalar.Properties {
13951387
///
13961388
/// This property corresponds to the `Numeric_Value` property in the
13971389
/// [Unicode Standard](http://www.unicode.org/versions/latest/).
1398-
public var numericValue: Double {
1390+
public var numericValue: Double? {
13991391
let icuNoNumericValue: Double = -123456789
14001392
let result = __swift_stdlib_u_getNumericValue(_value)
1401-
return result != icuNoNumericValue ? result : .nan
1402-
}
1403-
}
1404-
1405-
/// Additional queries that do not correspond precisely to a named Unicode
1406-
/// property.
1407-
extension Unicode.Scalar.Properties {
1408-
1409-
/// A Boolean property indicating whether the supported version of Unicode has
1410-
/// assigned a code point to the value of this scalar.
1411-
///
1412-
/// The value of this property may change depending on the platform, operating
1413-
/// system version, and versions of system libraries in use.
1414-
///
1415-
/// ```
1416-
/// print(("A" as Unicode.Scalar).properties.isDefined)
1417-
/// // Prints "true"
1418-
/// print(("\u{ABCDE}" as Unicode.Scalar).properties.isDefined)
1419-
/// // Prints "false"
1420-
/// ```
1421-
public var isDefined: Bool {
1422-
return __swift_stdlib_u_isdefined(_value) != 0
1393+
return result != icuNoNumericValue ? result : nil
14231394
}
14241395
}

0 commit comments

Comments
 (0)