Skip to content

Commit 2ebc898

Browse files
authored
Merge pull request #42289 from Azoy/add-case-folding (#59212)
[stdlib] Add caseFolded to scalar properties
1 parent 66d6acf commit 2ebc898

File tree

7 files changed

+2316
-1
lines changed

7 files changed

+2316
-1
lines changed

stdlib/private/StdlibUnicodeUnittest/UnicodeScalarProperties.swift

Lines changed: 49 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
//
1111
//===----------------------------------------------------------------------===//
1212

13-
// Unicode scalar tests are currently only avaible on Darwin, awaiting a sensible
13+
// Unicode scalar tests are currently only available on Darwin, awaiting a sensible
1414
// file API...
1515
#if _runtime(_ObjC)
1616
import Foundation
@@ -638,6 +638,54 @@ public let names: [Unicode.Scalar: String] = {
638638
return result
639639
}()
640640

641+
//===----------------------------------------------------------------------===//
642+
// Case Folding
643+
//===----------------------------------------------------------------------===//
644+
645+
/// Parses the text of a case folding data file and records every Common
/// (`C`) and Full (`F`) folding in `result`.
///
/// Each data line is read as `;`-separated fields: the source code point,
/// the folding status, and the space-separated hexadecimal scalars of the
/// mapping. Lines starting with `#` and lines with fewer than three fields
/// are skipped.
///
/// - Parameters:
///   - data: The full text of the case folding data file.
///   - result: The dictionary that receives one entry per folded scalar,
///     keyed by the source scalar and holding the mapped scalars as a
///     `String`. An existing entry for the same scalar is overwritten.
func parseCaseFoldings(
  _ data: String,
  into result: inout [Unicode.Scalar: String]
) {
  for line in data.split(separator: "\n") {
    // Skip comments
    guard !line.hasPrefix("#") else {
      continue
    }

    let components = line.split(separator: ";")

    // A data line needs at least the code point, status, and mapping
    // fields. Anything shorter (for example a whitespace-only line) is not
    // data and would otherwise trap on the subscripts below.
    guard components.count >= 3 else {
      continue
    }

    let status = components[1].filter { !$0.isWhitespace }

    // We only care about Common and Full case mappings.
    guard status == "C" || status == "F" else {
      continue
    }

    let scalar = Unicode.Scalar(parseScalars(String(components[0])).lowerBound)!

    // Invalid hex or an invalid scalar value in a data line means the data
    // file itself is corrupt, so trapping here is intentional.
    let mapping = components[2].split(separator: " ").map {
      Unicode.Scalar(UInt32($0, radix: 16)!)!
    }

    var mappingString = ""
    mappingString.unicodeScalars.append(contentsOf: mapping)

    result[scalar] = mappingString
  }
}
679+
680+
/// The Common and Full case foldings parsed from `CaseFolding.txt`, keyed by
/// the scalar being folded.
public let caseFolding: [Unicode.Scalar: String] = {
  var foldings = [Unicode.Scalar: String]()

  parseCaseFoldings(readInputFile("CaseFolding.txt"), into: &foldings)

  return foldings
}()
688+
641689
//===----------------------------------------------------------------------===//
642690
// Script/Script Extensions
643691
//===----------------------------------------------------------------------===//

stdlib/public/SwiftShims/UnicodeData.h

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,10 @@ const __swift_uint8_t *_swift_stdlib_getScriptExtensions(
108108
__swift_uint32_t scalar,
109109
__swift_uint8_t *count);
110110

111+
// Writes the case mapping of `scalar` into `buffer` as one to three scalar
// values, starting at `buffer[0]`. Elements past the written ones are left
// untouched, so `buffer` must have room for three elements and callers
// should pre-fill it with a sentinel to detect where the mapping ends.
SWIFT_RUNTIME_STDLIB_INTERNAL
112+
void _swift_stdlib_getCaseMapping(__swift_uint32_t scalar,
113+
__swift_uint32_t *buffer);
114+
111115
#ifdef __cplusplus
112116
} // extern "C"
113117
#endif

stdlib/public/core/UnicodeSPI.swift

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,3 +165,37 @@ extension Unicode.Scalar.Properties {
165165
return result
166166
}
167167
}
168+
169+
//===----------------------------------------------------------------------===//
170+
// Case folding
171+
//===----------------------------------------------------------------------===//
172+
173+
extension Unicode.Scalar.Properties {
  /// The case mapping of this scalar as reported by the runtime's
  /// `_swift_stdlib_getCaseMapping`, returned as a string of up to three
  /// scalars.
  @_spi(_Unicode)
  @available(SwiftStdlib 5.7, *)
  public var _caseFolded: String {
    // Every slot starts as `UInt32.max`, which serves as the "unused slot"
    // marker when the runtime writes fewer than three scalars.
    var storage: (UInt32, UInt32, UInt32) = (.max, .max, .max)

    withUnsafeMutableBytes(of: &storage) { rawBytes in
      // The tuple is homogeneous UInt32 storage, so the memory is already
      // bound to UInt32.
      let slots = rawBytes.baseAddress!.assumingMemoryBound(to: UInt32.self)
      _swift_stdlib_getCaseMapping(_scalar.value, slots)
    }

    var folded = ""
    // At most 3 scalars, each of which needs at most 4 UTF-8 bytes.
    folded.reserveCapacity(12)

    withUnsafeBytes(of: &storage) { rawBytes in
      let slots = rawBytes.bindMemory(to: UInt32.self)

      // Stop at the first slot the runtime did not fill in.
      for value in slots.prefix(while: { $0 != .max }) {
        folded.unicodeScalars.append(Unicode.Scalar(value)!)
      }
    }

    return folded
  }
}

stdlib/public/stubs/Unicode/Common/CaseData.h

Lines changed: 522 additions & 0 deletions
Large diffs are not rendered by default.

stdlib/public/stubs/Unicode/UnicodeScalarProps.cpp

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
#include "Common/ScalarPropsData.h"
1919
#endif
2020

21+
#include "Common/CaseData.h"
2122
#include "Common/ScriptData.h"
2223

2324
#else
@@ -510,3 +511,62 @@ const __swift_uint8_t *_swift_stdlib_getScriptExtensions(__swift_uint32_t scalar
510511
return _swift_stdlib_script_extensions_data + (scalarDataIdx & 0x7FF);
511512
#endif
512513
}
514+
515+
// Writes the case mapping of `scalar` into `buffer`.
//
// One element is written when the scalar maps to itself or to a single other
// scalar; otherwise the number of elements encoded in the full-mapping table
// entry is written. Elements that are not written are left untouched.
SWIFT_RUNTIME_STDLIB_INTERNAL
void _swift_stdlib_getCaseMapping(__swift_uint32_t scalar,
                                  __swift_uint32_t *buffer) {
#if !SWIFT_STDLIB_ENABLE_UNICODE_DATA
  swift::swift_abortDisabledUnicodeSupport();
#else
  auto simpleIdx = _swift_stdlib_getMphIdx(scalar, CASE_FOLD_LEVEL_COUNT,
                                           _swift_stdlib_case_keys,
                                           _swift_stdlib_case_ranks,
                                           _swift_stdlib_case_sizes);

  auto entry = _swift_stdlib_case[simpleIdx];

  // The scalar this entry was generated for is stored in the entry's low
  // bits (presumably the low 21 bits of a 64-bit value; the table's element
  // type is declared in CaseData.h).
  __swift_uint32_t entryScalar = (entry << 43) >> 43;

  // The lookup yields an entry even for scalars that were never hashed. A
  // mismatch means this scalar has no case mapping and maps to itself.
  if (entryScalar != scalar) {
    buffer[0] = scalar;
    return;
  }

  // The top bit distinguishes single-scalar mappings (clear) from
  // multi-scalar mappings (set).
  auto hasFullMapping = (entry & ((__swift_uint64_t)(0x1) << 63)) != 0;

  if (!hasFullMapping) {
    // Single-scalar mapping: the entry stores the distance from this scalar
    // to the scalar it maps to.
    auto distance = (__swift_int32_t)((entry << 1) >> 22);

    buffer[0] = (__swift_uint32_t)((__swift_int32_t)(scalar) - distance);
    return;
  }

  // Multi-scalar mapping: the details live in the full mapping table.
  auto fullIdx = _swift_stdlib_getMphIdx(scalar, CASE_FULL_FOLD_LEVEL_COUNT,
                                         _swift_stdlib_case_full_keys,
                                         _swift_stdlib_case_full_ranks,
                                         _swift_stdlib_case_full_sizes);

  auto fullEntry = _swift_stdlib_case_full[fullIdx];

  // The scalar count (either 2 or 3) is stored in the top bits.
  auto count = fullEntry >> 62;

  // Each mapped scalar occupies a 17-bit field: a 16-bit distance magnitude
  // followed by one bit that marks the distance as negative.
  for (__swift_uint64_t i = 0; i < count; ++i) {
    auto magnitude = (__swift_int32_t)(fullEntry & 0xFFFF);
    auto distance = (fullEntry & 0x10000) != 0 ? -magnitude : magnitude;

    // Advance to the next scalar's field.
    fullEntry >>= 17;

    buffer[i] = (__swift_uint32_t)((__swift_int32_t)(scalar) - distance);
  }
#endif
}

0 commit comments

Comments
 (0)