Skip to content

Commit a73f49e

Browse files
committed
Implement SE-0221: Character Properties
Provide convenience properties on Character.
1 parent 25d380c commit a73f49e

File tree

6 files changed

+592
-0
lines changed

6 files changed

+592
-0
lines changed

stdlib/public/core/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,7 @@ set(SWIFTLIB_ESSENTIAL
176176
UnicodeHelpers.swift
177177
UnicodeParser.swift
178178
UnicodeScalarProperties.swift
179+
CharacterProperties.swift # ORDER DEPENDENCY: UnicodeScalarProperties.swift
179180
Unmanaged.swift
180181
UnmanagedOpaqueString.swift
181182
UnmanagedString.swift
Lines changed: 289 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,289 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2014 - 2018 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
extension Character {
14+
/// The first Unicode scalar in this Character's `unicodeScalars` view.
///
/// The force-unwrap traps if the view has no first element.
@inlinable
internal var _firstScalar: Unicode.Scalar {
  let scalars = self.unicodeScalars
  return scalars.first!
}
18+
/// Whether the position after the first scalar is already the end of
/// `unicodeScalars`, i.e. this Character is made of a single scalar.
@inlinable
internal var _isSingleScalar: Bool {
  let scalars = self.unicodeScalars
  let afterFirst = scalars.index(after: scalars.startIndex)
  return afterFirst == scalars.endIndex
}
24+
25+
/// Whether this Character is ASCII.
///
/// This is true exactly when `asciiValue` is non-nil.
@inlinable
public var isASCII: Bool {
  guard asciiValue != nil else { return false }
  return true
}
30+
31+
/// The ASCII encoding value of this Character, or nil if it is not ASCII.
///
/// Note: "\r\n" (CR-LF) is normalized to "\n" (LF), so it yields 0x0A.
@inlinable
public var asciiValue: UInt8? {
  // CR-LF is one Character built from two scalars; report it as LF.
  if _slowPath(self == "\r\n") {
    return 0x000A /* LINE FEED (LF) */
  }
  // Anything else must be a single scalar below 0x80 to be ASCII.
  guard _fastPath(_isSingleScalar) else { return nil }
  let codePoint = _firstScalar.value
  guard _fastPath(codePoint < 0x80) else { return nil }
  return UInt8(codePoint)
}
40+
41+
/// Whether this Character represents whitespace, including newlines.
///
/// The answer is taken from the `isWhitespace` property of the Character's
/// first Unicode scalar.
///
/// Examples:
///   * "\t" (U+0009 CHARACTER TABULATION)
///   * " " (U+0020 SPACE)
///   * U+2029 PARAGRAPH SEPARATOR
///   * U+3000 IDEOGRAPHIC SPACE
///
public var isWhitespace: Bool {
  let leadingProperties = _firstScalar.properties
  return leadingProperties.isWhitespace
}
52+
53+
/// Whether this Character represents a newline.
///
/// Only the first Unicode scalar of the Character is examined, which is why
/// the two-scalar CR-LF sequence is recognized through its leading CR.
///
/// Examples:
///   * "\n" (U+000A): LINE FEED (LF)
///   * U+000B: LINE TABULATION (VT)
///   * U+000C: FORM FEED (FF)
///   * "\r" (U+000D): CARRIAGE RETURN (CR)
///   * "\r\n" (U+000D U+000A): CR-LF
///   * U+0085: NEXT LINE (NEL)
///   * U+2028: LINE SEPARATOR
///   * U+2029: PARAGRAPH SEPARATOR
///
@inlinable
public var isNewline: Bool {
  switch _firstScalar.value {
  case 0x000A...0x000D, // LF, VT, FF, CR
       0x0085,          // NEXT LINE (NEL)
       0x2028,          // LINE SEPARATOR
       0x2029:          // PARAGRAPH SEPARATOR
    return true
  default:
    return false
  }
}
75+
76+
/// Whether this Character represents a number.
///
/// True when the first Unicode scalar of the Character has a non-nil
/// `numericType`.
///
/// Examples:
///   * "7" (U+0037 DIGIT SEVEN)
///   * "⅚" (U+215A VULGAR FRACTION FIVE SIXTHS)
///   * "㊈" (U+3288 CIRCLED IDEOGRAPH NINE)
///   * "𝟠" (U+1D7E0 MATHEMATICAL DOUBLE-STRUCK DIGIT EIGHT)
///   * "๒" (U+0E52 THAI DIGIT TWO)
///
public var isNumber: Bool {
  let leadingProperties = _firstScalar.properties
  guard leadingProperties.numericType != nil else { return false }
  return true
}
88+
89+
/// Whether this Character represents a whole number, i.e. whether
/// `Character.wholeNumberValue` is non-nil.
@inlinable
public var isWholeNumber: Bool {
  guard wholeNumberValue != nil else { return false }
  return true
}
95+
96+
/// The whole-number value this Character represents, or nil if it does not
/// represent one.
///
/// Multi-scalar Characters always yield nil. For a single scalar, its
/// `numericValue` is converted with `Int(exactly:)`.
///
/// Examples:
///   * "1" (U+0031 DIGIT ONE) => 1
///   * "५" (U+096B DEVANAGARI DIGIT FIVE) => 5
///   * "๙" (U+0E59 THAI DIGIT NINE) => 9
///   * "万" (U+4E07 CJK UNIFIED IDEOGRAPH-4E07) => 10_000
///
/// Note: Returns nil on 32-bit platforms if the result would overflow `Int`.
public var wholeNumberValue: Int? {
  guard _isSingleScalar,
        let numeric = _firstScalar.properties.numericValue
  else {
    return nil
  }
  return Int(exactly: numeric)
}
111+
112+
/// Whether this Character represents a hexadecimal digit.
///
/// Hexadecimal digits include 0-9, Latin letters a-f and A-F, and their
/// fullwidth compatibility forms. The result is true exactly when
/// `Character.hexDigitValue` is non-nil; use that property to get the value.
@inlinable
public var isHexDigit: Bool {
  guard hexDigitValue != nil else { return false }
  return true
}
121+
122+
/// The value of this Character as a hexadecimal digit, or nil if it is not
/// one.
///
/// Only single-scalar Characters qualify. Accepted scalars are 0-9, A-F,
/// a-f, and the fullwidth forms of each of those runs.
public var hexDigitValue: Int? {
  guard _isSingleScalar else { return nil }
  let codePoint = _firstScalar.value

  // `runStart` is the first scalar of the matched run; `runStartValue` is
  // the hexadecimal value that first scalar stands for.
  let runStart: UInt32
  let runStartValue: UInt32
  switch codePoint {
  case 0x0030...0x0039:
    // DIGIT ZERO..DIGIT NINE
    runStart = 0x0030
    runStartValue = 0
  case 0x0041...0x0046:
    // LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER F
    runStart = 0x0041
    runStartValue = 10
  case 0x0061...0x0066:
    // LATIN SMALL LETTER A..LATIN SMALL LETTER F
    runStart = 0x0061
    runStartValue = 10
  case 0xFF10...0xFF19:
    // FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
    runStart = 0xFF10
    runStartValue = 0
  case 0xFF21...0xFF26:
    // FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER F
    runStart = 0xFF21
    runStartValue = 10
  case 0xFF41...0xFF46:
    // FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER F
    runStart = 0xFF41
    runStartValue = 10
  default:
    return nil
  }
  return Int((codePoint &+ runStartValue) &- runStart)
}
144+
145+
/// Whether this Character is a letter.
///
/// The answer is the `isAlphabetic` property of the Character's first
/// Unicode scalar.
///
/// Examples:
///   * "A" (U+0041 LATIN CAPITAL LETTER A)
///   * "é" (U+0065 LATIN SMALL LETTER E, U+0301 COMBINING ACUTE ACCENT)
///   * "ϴ" (U+03F4 GREEK CAPITAL THETA SYMBOL)
///   * "ڈ" (U+0688 ARABIC LETTER DDAL)
///   * "日" (U+65E5 CJK UNIFIED IDEOGRAPH-65E5)
///   * "ᚨ" (U+16A8 RUNIC LETTER ANSUZ A)
///
public var isLetter: Bool {
  let leadingProperties = _firstScalar.properties
  return leadingProperties.isAlphabetic
}
158+
159+
/// Returns the uppercase conversion of this Character.
///
/// Examples:
///   * "é" (U+0065 LATIN SMALL LETTER E, U+0301 COMBINING ACUTE ACCENT)
///     => "É" (U+0045 LATIN CAPITAL LETTER E, U+0301 COMBINING ACUTE ACCENT)
///   * "и" (U+0438 CYRILLIC SMALL LETTER I)
///     => "И" (U+0418 CYRILLIC CAPITAL LETTER I)
///   * "π" (U+03C0 GREEK SMALL LETTER PI)
///     => "Π" (U+03A0 GREEK CAPITAL LETTER PI)
///   * "ß" (U+00DF LATIN SMALL LETTER SHARP S)
///     => "SS" (U+0053 LATIN CAPITAL LETTER S, U+0053 LATIN CAPITAL LETTER S)
///
/// Note: The result is a String because case conversion can produce more
/// than one Character.
public func uppercased() -> String {
  let text = String(self)
  return text.uppercased()
}
174+
175+
/// Returns the lowercase conversion of this Character.
///
/// Examples:
///   * "É" (U+0045 LATIN CAPITAL LETTER E, U+0301 COMBINING ACUTE ACCENT)
///     => "é" (U+0065 LATIN SMALL LETTER E, U+0301 COMBINING ACUTE ACCENT)
///   * "И" (U+0418 CYRILLIC CAPITAL LETTER I)
///     => "и" (U+0438 CYRILLIC SMALL LETTER I)
///   * "Π" (U+03A0 GREEK CAPITAL LETTER PI)
///     => "π" (U+03C0 GREEK SMALL LETTER PI)
///
/// Note: The result is a String because case conversion can produce more
/// than one Character.
public func lowercased() -> String {
  let text = String(self)
  return text.lowercased()
}
188+
189+
/// Whether converting this Character to uppercase leaves it unchanged.
@usableFromInline
internal var _isUppercased: Bool {
  let original = String(self)
  let converted = self.uppercased()
  return original == converted
}
191+
/// Whether converting this Character to lowercase leaves it unchanged.
@usableFromInline
internal var _isLowercased: Bool {
  let original = String(self)
  let converted = self.lowercased()
  return original == converted
}
193+
194+
/// Whether this Character is considered uppercase.
///
/// Uppercase Characters vary under case-conversion to lowercase, but not when
/// converted to uppercase.
///
/// Examples:
///   * "É" (U+0045 LATIN CAPITAL LETTER E, U+0301 COMBINING ACUTE ACCENT)
///   * "И" (U+0418 CYRILLIC CAPITAL LETTER I)
///   * "Π" (U+03A0 GREEK CAPITAL LETTER PI)
///
@inlinable
public var isUppercase: Bool {
  // Fast path: a lone scalar whose own `isUppercase` property is set.
  let isLoneUppercaseScalar =
    _isSingleScalar && _firstScalar.properties.isUppercase
  if _fastPath(isLoneUppercaseScalar) {
    return true
  }
  // Otherwise: unchanged by uppercasing, and cased at all.
  guard _isUppercased else { return false }
  return isCased
}
211+
212+
/// Whether this Character is considered lowercase.
///
/// Lowercase Characters vary under case-conversion to uppercase, but not when
/// converted to lowercase.
///
/// Examples:
///   * "é" (U+0065 LATIN SMALL LETTER E, U+0301 COMBINING ACUTE ACCENT)
///   * "и" (U+0438 CYRILLIC SMALL LETTER I)
///   * "π" (U+03C0 GREEK SMALL LETTER PI)
///
@inlinable
public var isLowercase: Bool {
  // Fast path: a lone scalar whose own `isLowercase` property is set.
  let isLoneLowercaseScalar =
    _isSingleScalar && _firstScalar.properties.isLowercase
  if _fastPath(isLoneLowercaseScalar) {
    return true
  }
  // Otherwise: unchanged by lowercasing, and cased at all.
  guard _isLowercased else { return false }
  return isCased
}
229+
230+
/// Whether this Character changes under any form of case conversion.
@inlinable
public var isCased: Bool {
  // Fast path: a lone scalar whose own `isCased` property is set.
  let isLoneCasedScalar = _isSingleScalar && _firstScalar.properties.isCased
  if _fastPath(isLoneCasedScalar) {
    return true
  }
  // Cased unless both conversions leave the Character untouched.
  return !(_isUppercased && _isLowercased)
}
238+
239+
/// Whether this Character represents a symbol.
///
/// Decided by the general category of the Character's first Unicode scalar.
///
/// Examples:
///   * "®" (U+00AE REGISTERED SIGN)
///   * "⌹" (U+2339 APL FUNCTIONAL SYMBOL QUAD DIVIDE)
///   * "⡆" (U+2846 BRAILLE PATTERN DOTS-237)
///
public var isSymbol: Bool {
  let category = _firstScalar.properties.generalCategory
  return category._isSymbol
}
249+
250+
/// Whether this Character represents a symbol used in mathematical formulas.
///
/// The answer is the `isMath` property of the Character's first Unicode
/// scalar.
///
/// Examples:
///   * "+" (U+002B PLUS SIGN)
///   * "∫" (U+222B INTEGRAL)
///   * "ϰ" (U+03F0 GREEK KAPPA SYMBOL)
///
/// Note: This is not a strict subset of isSymbol. This includes characters
/// used both as letters and commonly in mathematical formulas. For example,
/// "ϰ" (U+03F0 GREEK KAPPA SYMBOL) is considered both a mathematical symbol
/// and a letter.
///
public var isMathSymbol: Bool {
  let leadingProperties = _firstScalar.properties
  return leadingProperties.isMath
}
265+
266+
/// Whether this Character represents a currency symbol.
///
/// True when the general category of the Character's first Unicode scalar is
/// `.currencySymbol`.
///
/// Examples:
///   * "$" (U+0024 DOLLAR SIGN)
///   * "¥" (U+00A5 YEN SIGN)
///   * "€" (U+20AC EURO SIGN)
///
public var isCurrencySymbol: Bool {
  let category = _firstScalar.properties.generalCategory
  if case .currencySymbol = category {
    return true
  }
  return false
}
276+
277+
/// Whether this Character represents punctuation.
///
/// Decided by the general category of the Character's first Unicode scalar.
///
/// Examples:
///   * "!" (U+0021 EXCLAMATION MARK)
///   * "؟" (U+061F ARABIC QUESTION MARK)
///   * "…" (U+2026 HORIZONTAL ELLIPSIS)
///   * "—" (U+2014 EM DASH)
///   * "“" (U+201C LEFT DOUBLE QUOTATION MARK)
///
public var isPunctuation: Bool {
  let category = _firstScalar.properties.generalCategory
  return category._isPunctuation
}
289+
}

stdlib/public/core/GroupInfo.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"ASCII.swift",
88
"CString.swift",
99
"Character.swift",
10+
"CharacterProperties.swift",
1011
"ICU.swift",
1112
"NormalizedCodeUnitIterator.swift",
1213
"SmallString.swift",

stdlib/public/core/UnicodeScalarProperties.swift

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1050,6 +1050,26 @@ extension Unicode {
10501050
}
10511051
}
10521052

1053+
// Internal helpers that group related general categories.
extension Unicode.GeneralCategory {
  /// Whether this category is one of the symbol categories: math, currency,
  /// modifier, or other symbol.
  internal var _isSymbol: Bool {
    switch self {
    case .mathSymbol: return true
    case .currencySymbol: return true
    case .modifierSymbol: return true
    case .otherSymbol: return true
    default: return false
    }
  }

  /// Whether this category is one of the punctuation categories: connector,
  /// dash, open, close, initial, final, or other punctuation.
  internal var _isPunctuation: Bool {
    switch self {
    case .connectorPunctuation: return true
    case .dashPunctuation: return true
    case .openPunctuation: return true
    case .closePunctuation: return true
    case .initialPunctuation: return true
    case .finalPunctuation: return true
    case .otherPunctuation: return true
    default: return false
    }
  }
}
1072+
10531073
extension Unicode.Scalar.Properties {
10541074

10551075
/// The general category (most usual classification) of the scalar.

0 commit comments

Comments
 (0)