Skip to content

Commit e2dc4fb

Browse files
committed
Implement SE-0221: Character Properties
Provide convenience properties on Character.
1 parent 9544a00 commit e2dc4fb

File tree

6 files changed

+580
-0
lines changed

6 files changed

+580
-0
lines changed

stdlib/public/core/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -170,6 +170,7 @@ set(SWIFTLIB_ESSENTIAL
170170
UnicodeEncoding.swift
171171
UnicodeParser.swift
172172
UnicodeScalarProperties.swift
173+
CharacterProperties.swift # ORDER DEPENDENCY: UnicodeScalarProperties.swift
173174
Unmanaged.swift
174175
UnmanagedOpaqueString.swift
175176
UnmanagedString.swift
Lines changed: 293 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,293 @@
1+
//===----------------------------------------------------------------------===//
2+
//
3+
// This source file is part of the Swift.org open source project
4+
//
5+
// Copyright (c) 2014 - 2018 Apple Inc. and the Swift project authors
6+
// Licensed under Apache License v2.0 with Runtime Library Exception
7+
//
8+
// See https://swift.org/LICENSE.txt for license information
9+
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
10+
//
11+
//===----------------------------------------------------------------------===//
12+
13+
extension Character {
  /// The first Unicode scalar of this Character.
  ///
  /// Force-unwraps `first`, so this assumes a Character always holds at least
  /// one scalar.
  @inlinable
  internal var _firstScalar: Unicode.Scalar {
    return unicodeScalars.first!
  }

  /// Whether this Character is made up of exactly one Unicode scalar.
  @inlinable
  internal var _isSingleScalar: Bool {
    return unicodeScalars.count == 1
  }

  /// The CR-LF Character, "\r\n".
  @inlinable
  internal static var _crlf: Character { return "\r\n" }

  /// The LF Character, "\n".
  @inlinable
  internal static var _lf: Character { return "\n" }

  /// Whether this Character is ASCII.
  ///
  /// Equivalent to `asciiValue != nil`, so "\r\n" (CR-LF) counts as ASCII.
  @inlinable
  public var isASCII: Bool {
    return asciiValue != nil
  }

  /// Returns the ASCII encoding value of this Character, if ASCII.
  ///
  /// A Character is ASCII when it is a single scalar whose value is below
  /// 0x80; any other Character yields nil, with the one exception noted below.
  ///
  /// Note: "\r\n" (CR-LF) is normalized to "\n" (LF), which will return 0x0A
  @inlinable
  public var asciiValue: UInt8? {
    // CR-LF is two scalars, so it has to be special-cased ahead of the
    // single-scalar check below.
    if _slowPath(self == ._crlf) { return 0x000A /* LINE FEED (LF) */ }
    if _slowPath(!_isSingleScalar || _firstScalar.value >= 0x80) { return nil }
    return UInt8(_firstScalar.value)
  }

  /// Whether this Character represents whitespace, including newlines.
  ///
  /// Only the first scalar of the Character is inspected.
  ///
  /// Examples:
  ///   * "\t" (U+0009 CHARACTER TABULATION)
  ///   * " " (U+0020 SPACE)
  ///   * U+2029 PARAGRAPH SEPARATOR
  ///   * U+3000 IDEOGRAPHIC SPACE
  ///
  public var isWhitespace: Bool {
    return _firstScalar.properties.isWhitespace
  }

  /// Whether this Character represents a newline.
  ///
  /// Only the first scalar of the Character is inspected; "\r\n" is matched
  /// through its leading CR.
  ///
  /// Examples:
  ///   * "\n" (U+000A): LINE FEED (LF)
  ///   * U+000B: LINE TABULATION (VT)
  ///   * U+000C: FORM FEED (FF)
  ///   * "\r" (U+000D): CARRIAGE RETURN (CR)
  ///   * "\r\n" (U+000D U+000A): CR-LF
  ///   * U+0085: NEXT LINE (NEL)
  ///   * U+2028: LINE SEPARATOR
  ///   * U+2029: PARAGRAPH SEPARATOR
  ///
  @inlinable
  public var isNewline: Bool {
    switch _firstScalar.value {
      case 0x000A...0x000D /* LF ... CR */: return true
      case 0x0085 /* NEXT LINE (NEL) */: return true
      case 0x2028 /* LINE SEPARATOR */: return true
      case 0x2029 /* PARAGRAPH SEPARATOR */: return true
      default: return false
    }
  }

  /// Whether this Character represents a number.
  ///
  /// True when the first scalar of the Character has a numeric type; later
  /// scalars are not inspected.
  ///
  /// Examples:
  ///   * "7" (U+0037 DIGIT SEVEN)
  ///   * "⅚" (U+215A VULGAR FRACTION FIVE SIXTHS)
  ///   * "㊈" (U+3288 CIRCLED IDEOGRAPH NINE)
  ///   * "𝟠" (U+1D7E0 MATHEMATICAL DOUBLE-STRUCK DIGIT EIGHT)
  ///   * "๒" (U+0E52 THAI DIGIT TWO)
  ///
  public var isNumber: Bool {
    return _firstScalar.properties.numericType != nil
  }

  /// Whether this Character represents a whole number. See
  /// `Character.wholeNumberValue`
  @inlinable
  public var isWholeNumber: Bool {
    return wholeNumberValue != nil
  }

  /// If this Character is a whole number, return the value it represents, else
  /// nil.
  ///
  /// Only single-scalar Characters are considered. The scalar's numeric value
  /// is converted with `Int(exactly:)`, so a value that cannot be represented
  /// exactly as an `Int` yields nil.
  ///
  /// Examples:
  ///   * "1" (U+0031 DIGIT ONE) => 1
  ///   * "५" (U+096B DEVANAGARI DIGIT FIVE) => 5
  ///   * "๙" (U+0E59 THAI DIGIT NINE) => 9
  ///   * "万" (U+4E07 CJK UNIFIED IDEOGRAPH-4E07) => 10_000
  ///
  /// Note: Returns nil on 32-bit platforms if the result would overflow `Int`.
  public var wholeNumberValue: Int? {
    guard _isSingleScalar else { return nil }
    guard let value = _firstScalar.properties.numericValue else { return nil }
    return Int(exactly: value)
  }

  /// Whether this Character represents a hexadecimal digit.
  ///
  /// Hexadecimal digits include 0-9, Latin letters a-f and A-F, and their
  /// fullwidth compatibility forms. To get their value, see
  /// `Character.hexDigitValue`
  @inlinable
  public var isHexDigit: Bool {
    return hexDigitValue != nil
  }

  /// If this Character is a hexadecimal digit, returns the value it represents,
  /// else nil.
  ///
  /// Only single-scalar Characters are considered. Digits map to 0...9 and the
  /// letters A-F / a-f map to 10...15; the fullwidth forms map to the same
  /// values as their ASCII counterparts.
  public var hexDigitValue: Int? {
    guard _isSingleScalar else { return nil }
    let value = _firstScalar.value
    switch value {
      // DIGIT ZERO..DIGIT NINE
      case 0x0030...0x0039: return Int(value &- 0x0030)
      // LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER F
      case 0x0041...0x0046: return Int((value &+ 10) &- 0x0041)
      // LATIN SMALL LETTER A..LATIN SMALL LETTER F
      case 0x0061...0x0066: return Int((value &+ 10) &- 0x0061)
      // FULLWIDTH DIGIT ZERO..FULLWIDTH DIGIT NINE
      case 0xFF10...0xFF19: return Int(value &- 0xFF10)
      // FULLWIDTH LATIN CAPITAL LETTER A..FULLWIDTH LATIN CAPITAL LETTER F
      case 0xFF21...0xFF26: return Int((value &+ 10) &- 0xFF21)
      // FULLWIDTH LATIN SMALL LETTER A..FULLWIDTH LATIN SMALL LETTER F
      case 0xFF41...0xFF46: return Int((value &+ 10) &- 0xFF41)

      default: return nil
    }
  }

  /// Whether this Character is a letter.
  ///
  /// True when the first scalar of the Character is alphabetic; later scalars
  /// (such as combining marks) are not inspected.
  ///
  /// Examples:
  ///   * "A" (U+0041 LATIN CAPITAL LETTER A)
  ///   * "é" (U+0065 LATIN SMALL LETTER E, U+0301 COMBINING ACUTE ACCENT)
  ///   * "ϴ" (U+03F4 GREEK CAPITAL THETA SYMBOL)
  ///   * "ڈ" (U+0688 ARABIC LETTER DDAL)
  ///   * "日" (U+65E5 CJK UNIFIED IDEOGRAPH-65E5)
  ///   * "ᚨ" (U+16A8 RUNIC LETTER ANSUZ A)
  ///
  public var isLetter: Bool {
    return _firstScalar.properties.isAlphabetic
  }

  /// Perform case conversion to uppercase
  ///
  /// Examples:
  ///   * "é" (U+0065 LATIN SMALL LETTER E, U+0301 COMBINING ACUTE ACCENT)
  ///     => "É" (U+0045 LATIN CAPITAL LETTER E, U+0301 COMBINING ACUTE ACCENT)
  ///   * "и" (U+0438 CYRILLIC SMALL LETTER I)
  ///     => "И" (U+0418 CYRILLIC CAPITAL LETTER I)
  ///   * "π" (U+03C0 GREEK SMALL LETTER PI)
  ///     => "Π" (U+03A0 GREEK CAPITAL LETTER PI)
  ///   * "ß" (U+00DF LATIN SMALL LETTER SHARP S)
  ///     => "SS" (U+0053 LATIN CAPITAL LETTER S, U+0053 LATIN CAPITAL LETTER S)
  ///
  /// Note: Returns a String as case conversion can result in multiple
  /// Characters.
  public func uppercased() -> String { return String(self).uppercased() }

  /// Perform case conversion to lowercase
  ///
  /// Examples:
  ///   * "É" (U+0045 LATIN CAPITAL LETTER E, U+0301 COMBINING ACUTE ACCENT)
  ///     => "é" (U+0065 LATIN SMALL LETTER E, U+0301 COMBINING ACUTE ACCENT)
  ///   * "И" (U+0418 CYRILLIC CAPITAL LETTER I)
  ///     => "и" (U+0438 CYRILLIC SMALL LETTER I)
  ///   * "Π" (U+03A0 GREEK CAPITAL LETTER PI)
  ///     => "π" (U+03C0 GREEK SMALL LETTER PI)
  ///
  /// Note: Returns a String as case conversion can result in multiple
  /// Characters.
  public func lowercased() -> String { return String(self).lowercased() }

  /// Whether converting this Character to uppercase leaves it unchanged.
  @usableFromInline
  internal var _isUppercased: Bool { return String(self) == self.uppercased() }

  /// Whether converting this Character to lowercase leaves it unchanged.
  @usableFromInline
  internal var _isLowercased: Bool { return String(self) == self.lowercased() }

  /// Whether this Character is considered uppercase.
  ///
  /// Uppercase Characters vary under case-conversion to lowercase, but not when
  /// converted to uppercase.
  ///
  /// Examples:
  ///   * "É" (U+0045 LATIN CAPITAL LETTER E, U+0301 COMBINING ACUTE ACCENT)
  ///   * "И" (U+0418 CYRILLIC CAPITAL LETTER I)
  ///   * "Π" (U+03A0 GREEK CAPITAL LETTER PI)
  ///
  @inlinable
  public var isUppercase: Bool {
    // Fast path: a single scalar that carries the uppercase property.
    if _fastPath(_isSingleScalar && _firstScalar.properties.isUppercase) {
      return true
    }
    // Otherwise fall back to comparing against the case-converted strings.
    return _isUppercased && isCased
  }

  /// Whether this Character is considered lowercase.
  ///
  /// Lowercase Characters vary under case-conversion to uppercase, but not when
  /// converted to lowercase.
  ///
  /// Examples:
  ///   * "é" (U+0065 LATIN SMALL LETTER E, U+0301 COMBINING ACUTE ACCENT)
  ///   * "и" (U+0438 CYRILLIC SMALL LETTER I)
  ///   * "π" (U+03C0 GREEK SMALL LETTER PI)
  ///
  @inlinable
  public var isLowercase: Bool {
    // Fast path: a single scalar that carries the lowercase property.
    if _fastPath(_isSingleScalar && _firstScalar.properties.isLowercase) {
      return true
    }
    // Otherwise fall back to comparing against the case-converted strings.
    return _isLowercased && isCased
  }

  /// Whether this Character changes under any form of case conversion.
  @inlinable
  public var isCased: Bool {
    // Fast path: a single scalar that carries the cased property.
    if _fastPath(_isSingleScalar && _firstScalar.properties.isCased) {
      return true
    }
    // Cased if either conversion produces something different.
    return !_isUppercased || !_isLowercased
  }

  /// Whether this Character represents a symbol
  ///
  /// Decided by the general category of the first scalar only.
  ///
  /// Examples:
  ///   * "®" (U+00AE REGISTERED SIGN)
  ///   * "⌹" (U+2339 APL FUNCTIONAL SYMBOL QUAD DIVIDE)
  ///   * "⡆" (U+2846 BRAILLE PATTERN DOTS-237)
  ///
  public var isSymbol: Bool {
    return _firstScalar.properties.generalCategory._isSymbol
  }

  /// Whether this Character represents a symbol used in mathematical formulas
  ///
  /// Decided by the math property of the first scalar only.
  ///
  /// Examples:
  ///   * "+" (U+002B PLUS SIGN)
  ///   * "∫" (U+222B INTEGRAL)
  ///   * "ϰ" (U+03F0 GREEK KAPPA SYMBOL)
  ///
  /// Note: This is not a strict subset of isSymbol. This includes characters
  /// used both as letters and commonly in mathematical formulas. For example,
  /// "ϰ" (U+03F0 GREEK KAPPA SYMBOL) is considered both a mathematical symbol
  /// and a letter.
  ///
  public var isMathSymbol: Bool {
    return _firstScalar.properties.isMath
  }

  /// Whether this Character represents a currency symbol
  ///
  /// Decided by the general category of the first scalar only.
  ///
  /// Examples:
  ///   * "$" (U+0024 DOLLAR SIGN)
  ///   * "¥" (U+00A5 YEN SIGN)
  ///   * "€" (U+20AC EURO SIGN)
  ///
  public var isCurrencySymbol: Bool {
    return _firstScalar.properties.generalCategory == .currencySymbol
  }

  /// Whether this Character represents punctuation
  ///
  /// Decided by the general category of the first scalar only.
  ///
  /// Examples:
  ///   * "!" (U+0021 EXCLAMATION MARK)
  ///   * "؟" (U+061F ARABIC QUESTION MARK)
  ///   * "…" (U+2026 HORIZONTAL ELLIPSIS)
  ///   * "—" (U+2014 EM DASH)
  ///   * "“" (U+201C LEFT DOUBLE QUOTATION MARK)
  ///
  public var isPunctuation: Bool {
    return _firstScalar.properties.generalCategory._isPunctuation
  }
}

stdlib/public/core/GroupInfo.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
"ASCII.swift",
88
"CString.swift",
99
"Character.swift",
10+
"CharacterProperties.swift",
1011
"CharacterUnicodeScalars.swift",
1112
"ICU.swift",
1213
"NormalizedCodeUnitIterator.swift",

stdlib/public/core/UnicodeScalarProperties.swift

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1066,6 +1066,26 @@ extension Unicode {
10661066
}
10671067
}
10681068

1069+
// Internal helpers
1070+
extension Unicode.GeneralCategory {
  /// Whether this general category is one of the symbol categories: math,
  /// currency, modifier, or other symbol.
  internal var _isSymbol: Bool {
    switch self {
      case .mathSymbol, .currencySymbol, .modifierSymbol, .otherSymbol:
        return true
      default: return false
    }
  }

  /// Whether this general category is one of the punctuation categories:
  /// connector, dash, open, close, initial, final, or other punctuation.
  internal var _isPunctuation: Bool {
    switch self {
      case .connectorPunctuation, .dashPunctuation, .openPunctuation,
           .closePunctuation, .initialPunctuation, .finalPunctuation,
           .otherPunctuation:
        return true
      default: return false
    }
  }
}
1088+
10691089
extension Unicode.Scalar.Properties {
10701090

10711091
/// The general category (most usual classification) of the scalar.

0 commit comments

Comments
 (0)