Skip to content

[DNM] [stdlib] Implement string case-folding and normalization APIs #17933

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions stdlib/public/SwiftShims/UnicodeShims.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,11 @@ __swift_int32_t _swift_stdlib_unicode_strToLower(
__swift_uint16_t *Destination, __swift_int32_t DestinationCapacity,
const __swift_uint16_t *Source, __swift_int32_t SourceLength);

// Case-folding shim. Takes a UTF-16 source buffer (`Source`, `SourceLength`
// code units) and a UTF-16 destination buffer (`Destination`, with room for
// `DestinationCapacity` code units).
//
// The Swift caller treats the return value as the number of code units the
// complete folded result requires, and calls again with a larger buffer when
// that number exceeds the capacity it passed in.
//
// NOTE(review): presumably forwards to ICU's u_strFoldCase and handles the ICU
// error code internally -- confirm against the implementation file.
SWIFT_RUNTIME_STDLIB_INTERFACE
__swift_int32_t _swift_stdlib_unicode_strFoldCase(
__swift_uint16_t *Destination, __swift_int32_t DestinationCapacity,
const __swift_uint16_t *Source, __swift_int32_t SourceLength);

typedef enum __swift_stdlib_UProperty {
__swift_stdlib_UCHAR_ALPHABETIC = 0,
__swift_stdlib_UCHAR_BINARY_START = __swift_stdlib_UCHAR_ALPHABETIC,
Expand Down Expand Up @@ -420,6 +425,13 @@ typedef enum __swift_stdlib_UCharNameChoice {
#endif
} __swift_stdlib_UCharNameChoice;

// Normalization mode selector for the unorm2 shims.
//
// No enumerator has an explicit value, so they are numbered 0 through 3 in
// declaration order; reordering or inserting entries changes those values.
//
// NOTE(review): the names mirror ICU's UNormalization2Mode (UNORM2_COMPOSE,
// UNORM2_DECOMPOSE, UNORM2_FCD, UNORM2_COMPOSE_CONTIGUOUS); the numeric values
// presumably have to match ICU's -- confirm against unicode/unorm2.h.
typedef enum __swift_stdlib_UNormalization2Mode {
__swift_stdlib_UNORM2_COMPOSE,
__swift_stdlib_UNORM2_DECOMPOSE,
__swift_stdlib_UNORM2_FCD,
__swift_stdlib_UNORM2_COMPOSE_CONTIGUOUS
} __swift_stdlib_UNormalization2Mode;

typedef enum __swift_stdlib_UNumericType {
__swift_stdlib_U_NT_NONE,
__swift_stdlib_U_NT_DECIMAL,
Expand Down Expand Up @@ -494,10 +506,34 @@ __swift_stdlib_UBool
__swift_stdlib_unorm2_hasBoundaryBefore(const __swift_stdlib_UNormalizer2 *,
__swift_stdlib_UChar32);

// Accessors for normalizer instances, one per normalization form. Each takes
// a pointer to an error code and returns a pointer to a const
// __swift_stdlib_UNormalizer2.
//
// NOTE(review): these presumably wrap the corresponding ICU unorm2 instance
// getters, with the error code used as an in/out status -- confirm ownership
// and error behavior against the implementation file.

// Normalizer for form NFD.
SWIFT_RUNTIME_STDLIB_INTERFACE
const __swift_stdlib_UNormalizer2 *
__swift_stdlib_unorm2_getNFDInstance(__swift_stdlib_UErrorCode *);

// Normalizer for form NFC.
SWIFT_RUNTIME_STDLIB_INTERFACE
const __swift_stdlib_UNormalizer2 *
__swift_stdlib_unorm2_getNFCInstance(__swift_stdlib_UErrorCode *);

// Normalizer for form NFKD.
SWIFT_RUNTIME_STDLIB_INTERFACE
const __swift_stdlib_UNormalizer2 *
__swift_stdlib_unorm2_getNFKDInstance(__swift_stdlib_UErrorCode *);

// Normalizer for form NFKC.
SWIFT_RUNTIME_STDLIB_INTERFACE
const __swift_stdlib_UNormalizer2 *
__swift_stdlib_unorm2_getNFKCInstance(__swift_stdlib_UErrorCode *);

// Normalizer for NFKC combined with case folding (per the function name).
SWIFT_RUNTIME_STDLIB_INTERFACE
const __swift_stdlib_UNormalizer2 *
__swift_stdlib_unorm2_getNFKCCasefoldInstance(__swift_stdlib_UErrorCode *);

// Normalizer for FCD.
// NOTE(review): ICU has no dedicated public getter for FCD; presumably this is
// built from a generic instance lookup using the UNORM2_FCD mode -- confirm.
SWIFT_RUNTIME_STDLIB_INTERFACE
const __swift_stdlib_UNormalizer2 *
__swift_stdlib_unorm2_getFCDInstance(__swift_stdlib_UErrorCode *);

// Normalizer for FCC.
// NOTE(review): presumably built from a generic instance lookup using the
// UNORM2_COMPOSE_CONTIGUOUS mode -- confirm.
SWIFT_RUNTIME_STDLIB_INTERFACE
const __swift_stdlib_UNormalizer2 *
__swift_stdlib_unorm2_getFCCInstance(__swift_stdlib_UErrorCode *);

SWIFT_RUNTIME_STDLIB_INTERFACE
__swift_int32_t
__swift_stdlib_unorm2_normalize(const __swift_stdlib_UNormalizer2 *,
Expand Down
146 changes: 146 additions & 0 deletions stdlib/public/core/String.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1199,6 +1199,90 @@ internal func _nativeUnicodeUppercaseString(_ str: String) -> String {
}
#endif

/// Produces the case-folded form of `str` by running its contiguous UTF-16
/// contents through the `_swift_stdlib_unicode_strFoldCase` shim.
///
/// - Parameter str: The string to case-fold.
/// - Returns: A new string backed by freshly created UTF-16 storage that holds
///   the folded code units.
@usableFromInline // FIXME(sil-serialize-all)
internal func _nativeUnicodeCaseFoldString(_ str: String) -> String {

  // TODO (TODO: JIRA): check for small

  let guts = str._guts._extractContiguousUTF16()
  defer { _fixLifetime(guts) }
  let source = guts._unmanagedUTF16View
  let sourceCount = source.count

  // First attempt: give the shim a buffer as long as the input.
  var buffer = _SwiftStringStorage<UTF16.CodeUnit>.create(
    capacity: sourceCount,
    count: sourceCount)
  let requiredCount = Int(
    _swift_stdlib_unicode_strFoldCase(
      buffer.start, Int32(buffer.capacity), // FIXME: handle overflow case
      source.start, Int32(sourceCount)))

  // The reported size did not fit: allocate exactly that much and fold again.
  if buffer.capacity < requiredCount {
    buffer = _SwiftStringStorage<UTF16.CodeUnit>.create(
      capacity: requiredCount,
      count: requiredCount)
    _swift_stdlib_unicode_strFoldCase(
      buffer.start, Int32(buffer.capacity), // FIXME: handle overflow case
      source.start, Int32(sourceCount))
  }

  // The result may be shorter than the buffer it was written into.
  buffer.count = requiredCount
  return String(_largeStorage: buffer)
}

/// Normalizes `str` to the given Unicode normalization form using the
/// `unorm2` shims.
///
/// The string's contiguous UTF-16 contents are first passed to
/// `_Normalization._prenormalQuickCheckYes`; when that check succeeds, `str`
/// is returned unchanged. Otherwise the text is normalized into freshly
/// created UTF-16 storage. If the first attempt reports
/// `U_BUFFER_OVERFLOW_ERROR`, the call is repeated once with a buffer of the
/// size the first attempt reported.
///
/// Any other error from the normalizer terminates the process via
/// `fatalError`.
///
/// - Parameters:
///   - str: The string to normalize.
///   - form: The normalization form to apply.
/// - Returns: `str` itself when the quick check passes, otherwise a new string
///   holding the normalized code units.
@usableFromInline // FIXME(sil-serialize-all)
internal func _nativeUnicodeNormalizeString(
  _ str: String, _ form: Unicode.NormalizationForm
) -> String {

  // TODO (TODO: JIRA): check for small

  let guts = str._guts._extractContiguousUTF16()
  defer { _fixLifetime(guts) }
  let utf16 = guts._unmanagedUTF16View

  let norm = _Normalization._normalizer(form)
  guard !_Normalization._prenormalQuickCheckYes(normalizer: norm, utf16) else {
    return str
  }

  var storage = _SwiftStringStorage<UTF16.CodeUnit>.create(
    capacity: utf16.count, count: utf16.count)

  // Try to write it out to the same length.
  var err = __swift_stdlib_U_ZERO_ERROR
  let z = __swift_stdlib_unorm2_normalize(
    norm,
    utf16.start,
    numericCast(utf16.count),
    storage.start,
    numericCast(storage.capacity), // FIXME: handle overflow case
    &err)
  guard err.isSuccess || err == __swift_stdlib_U_BUFFER_OVERFLOW_ERROR else {
    fatalError("unorm2_normalize: Unexpected error normalizing Unicode string.")
  }
  let correctSize = Int(z)

  // If more space is needed, do it again with the correct buffer size.
  if correctSize > storage.capacity {
    storage = _SwiftStringStorage<UTF16.CodeUnit>.create(
      capacity: correctSize, count: correctSize)

    // ICU functions return immediately, without doing any work, when the
    // incoming error code is already a failure. The first attempt left `err`
    // at U_BUFFER_OVERFLOW_ERROR, so it must be cleared before retrying;
    // otherwise the new buffer is never filled and the check below always
    // traps.
    err = __swift_stdlib_U_ZERO_ERROR
    _ = __swift_stdlib_unorm2_normalize(
      norm,
      utf16.start,
      numericCast(utf16.count),
      storage.start,
      numericCast(storage.capacity), // FIXME: handle overflow case
      &err)
  }
  guard err.isSuccess else {
    fatalError("unorm2_normalize: Unexpected error normalizing Unicode string.")
  }
  storage.count = correctSize
  return String(_largeStorage: storage)
}

// Unicode algorithms
extension String {
// FIXME: implement case folding without relying on Foundation.
Expand Down Expand Up @@ -1308,6 +1392,68 @@ extension String {
#endif
}

/// Returns a copy of the string with locale-independent default case folding
/// applied.
///
/// Case folding exists primarily to enable caseless matching of strings. It
/// is derived from the case conversion operations, but it is language-neutral
/// and leaves out context-sensitive mappings, which makes it unsuitable for
/// transforming natural language text that people will read.
///
/// Most characters fold to their lowercase counterparts, but the result is
/// not guaranteed to be lowercase. Cherokee letters, for example, fold to
/// their uppercase counterparts.
///
/// - Note: Case folding does not preserve Unicode normalization forms.
///
/// - Returns: A case-folded copy of the string.
///
/// - Complexity: O(*n*)
public func caseFolded() -> String {
  // Anything that is not pure ASCII takes the general Unicode path.
  guard _guts.isASCII else {
    return _nativeUnicodeCaseFoldString(self)
  }

  var foldedGuts = _guts
  foldedGuts.withMutableASCIIStorage(unusedCapacity: 0) { ascii in
    for offset in 0..<ascii._value.count {
      // See the comment above in lowercased.
      let codeUnit = ascii._value.start[offset]
      let shift = UInt64(((codeUnit &- 1) & 0b0111_1111) &>> 1)
      let tableBit = (_asciiUpperCaseTable &>> shift) & 0x1
      // `delta` is either 0x00 or 0x20, depending on the table bit.
      let delta = tableBit &<< 5
      ascii._value.start[offset] =
        codeUnit &+ UInt8(truncatingIfNeeded: delta)
    }
  }
  return String(foldedGuts)
}

/// Returns a copy of the string normalized to the given Unicode
/// normalization form.
///
/// Two Unicode extended grapheme clusters can be equivalent even though their
/// underlying representations differ. Normalization eliminates the unwanted
/// differences; roughly speaking, the normalization form decides which
/// differences count as unwanted and what they are normalized to.
///
/// - Parameter form: The normalization form to be used.
/// - Returns: A normalized copy of the string.
///
/// - Complexity: O(*n*)
public func normalized(_ form: Unicode.NormalizationForm) -> String {
  // Non-ASCII contents need the full normalizer.
  guard _guts.isASCII else {
    return _nativeUnicodeNormalizeString(self, form)
  }

  // ASCII contents are returned as-is for every form listed here. The switch
  // is kept exhaustive so that a newly added form has to be handled
  // explicitly.
  switch form {
  case .nfd, .nfc, .nfkd, .nfkc, .fcd, .fcc:
    return self
  // @unknown default: fatalError("Unrecognized normalization form")
  }
}

/// Creates an instance from the description of a given
/// `LosslessStringConvertible` instance.
@inlinable // FIXME(sil-serialize-all)
Expand Down
Loading