Skip to content

Commit 8f27bdd

Browse files
Add char::to_titlecase()
1 parent f0dc6dd commit 8f27bdd

File tree

5 files changed

+288
-36
lines changed

5 files changed

+288
-36
lines changed

library/core/src/char/methods.rs

Lines changed: 102 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1134,14 +1134,112 @@ impl char {
11341134
/// // convert into themselves.
11351135
/// assert_eq!('山'.to_lowercase().to_string(), "山");
11361136
/// ```
1137-
#[must_use = "this returns the lowercase character as a new iterator, \
1137+
#[must_use = "this returns the lowercased character as a new iterator, \
11381138
without modifying the original"]
11391139
#[stable(feature = "rust1", since = "1.0.0")]
11401140
#[inline]
11411141
pub fn to_lowercase(self) -> ToLowercase {
11421142
ToLowercase(CaseMappingIter::new(conversions::to_lower(self)))
11431143
}
11441144

1145+
/// Returns an iterator that yields the titlecase mapping of this `char` as one or more
1146+
/// `char`s.
1147+
///
1148+
/// If this `char` does not have a titlecase mapping, the iterator yields the same `char`.
1149+
///
1150+
/// If this `char` has a one-to-one titlecase mapping given by the [Unicode Character
1151+
/// Database][ucd] [`UnicodeData.txt`], the iterator yields that `char`.
1152+
///
1153+
/// [ucd]: https://www.unicode.org/reports/tr44/
1154+
/// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
1155+
///
1156+
/// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields
1157+
/// the `char`(s) given by [`SpecialCasing.txt`].
1158+
///
1159+
/// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt
1160+
///
1161+
/// This operation performs an unconditional mapping without tailoring. That is, the conversion
1162+
/// is independent of context and language.
1163+
///
1164+
/// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in
1165+
/// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion.
1166+
///
1167+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
1168+
///
1169+
/// # Examples
1170+
///
1171+
/// As an iterator:
1172+
///
1173+
/// ```
1174+
/// #![feature(titlecase)]
1175+
/// for c in 'ß'.to_titlecase() {
1176+
/// print!("{c}");
1177+
/// }
1178+
/// println!();
1179+
/// ```
1180+
///
1181+
/// Using `println!` directly:
1182+
///
1183+
/// ```
1184+
/// #![feature(titlecase)]
1185+
/// println!("{}", 'ß'.to_titlecase());
1186+
/// ```
1187+
///
1188+
/// Both are equivalent to:
1189+
///
1190+
/// ```
1191+
/// #![feature(titlecase)]
1192+
/// println!("Ss");
1193+
/// ```
1194+
///
1195+
/// Using [`to_string`](../std/string/trait.ToString.html#tymethod.to_string):
1196+
///
1197+
/// ```
1198+
/// #![feature(titlecase)]
1199+
/// assert_eq!('c'.to_titlecase().to_string(), "C");
1200+
///
1201+
/// // Sometimes the result is more than one character:
1202+
/// assert_eq!('ß'.to_titlecase().to_string(), "Ss");
1203+
///
1204+
/// // Characters that do not have separate cased forms
1205+
/// // convert into themselves.
1206+
/// assert_eq!('山'.to_titlecase().to_string(), "山");
1207+
/// ```
1208+
///
1209+
/// # Note on locale
1210+
///
1211+
/// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
1212+
///
1213+
/// * 'Dotless': I / ı, sometimes written ï
1214+
/// * 'Dotted': İ / i
1215+
///
1216+
/// Note that the lowercase dotted 'i' is the same as the Latin. Therefore:
1217+
///
1218+
/// ```
1219+
/// #![feature(titlecase)]
1220+
/// let upper_i = 'i'.to_titlecase().to_string();
1221+
/// ```
1222+
///
1223+
/// The value of `upper_i` here relies on the language of the text: if we're
1224+
/// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should
1225+
/// be `"İ"`. `to_titlecase()` does not take this into account, and so:
1226+
///
1227+
/// ```
1228+
/// #![feature(titlecase)]
1229+
/// let upper_i = 'i'.to_titlecase().to_string();
1230+
///
1231+
/// assert_eq!(upper_i, "I");
1232+
/// ```
1233+
///
1234+
/// holds across languages.
1235+
#[must_use = "this returns the titlecased character as a new iterator, \
1236+
without modifying the original"]
1237+
#[unstable(feature = "titlecase", issue = "none")]
1238+
#[inline]
1239+
pub fn to_titlecase(self) -> ToTitlecase {
1240+
ToTitlecase(CaseMappingIter::new(conversions::to_title(self)))
1241+
}
1242+
11451243
/// Returns an iterator that yields the uppercase mapping of this `char` as one or more
11461244
/// `char`s.
11471245
///
@@ -1204,7 +1302,7 @@ impl char {
12041302
///
12051303
/// # Note on locale
12061304
///
1207-
/// In Turkish, the equivalent of 'i' in Latin has five forms instead of two:
1305+
/// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
12081306
///
12091307
/// * 'Dotless': I / ı, sometimes written ï
12101308
/// * 'Dotted': İ / i
@@ -1216,7 +1314,7 @@ impl char {
12161314
/// ```
12171315
///
12181316
/// The value of `upper_i` here relies on the language of the text: if we're
1219-
/// in `en-US`, it should be `"I"`, but if we're in `tr_TR`, it should
1317+
/// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should
12201318
/// be `"İ"`. `to_uppercase()` does not take this into account, and so:
12211319
///
12221320
/// ```
@@ -1226,7 +1324,7 @@ impl char {
12261324
/// ```
12271325
///
12281326
/// holds across languages.
1229-
#[must_use = "this returns the uppercase character as a new iterator, \
1327+
#[must_use = "this returns the uppercased character as a new iterator, \
12301328
without modifying the original"]
12311329
#[stable(feature = "rust1", since = "1.0.0")]
12321330
#[inline]

library/core/src/char/mod.rs

Lines changed: 55 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -407,6 +407,54 @@ impl FusedIterator for ToLowercase {}
407407
#[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")]
408408
impl ExactSizeIterator for ToLowercase {}
409409

410+
#[stable(feature = "char_struct_display", since = "1.16.0")]
411+
impl fmt::Display for ToLowercase {
412+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
413+
fmt::Display::fmt(&self.0, f)
414+
}
415+
}
416+
417+
/// Returns an iterator that yields the titlecase equivalent of a `char`.
418+
///
419+
/// This `struct` is created by the [`to_titlecase`] method on [`char`]. See
420+
/// its documentation for more.
421+
///
422+
/// [`to_titlecase`]: char::to_titlecase
423+
#[unstable(feature = "titlecase", issue = "none")]
424+
#[derive(Debug, Clone)]
425+
pub struct ToTitlecase(CaseMappingIter);
426+
427+
#[unstable(feature = "titlecase", issue = "none")]
428+
impl Iterator for ToTitlecase {
429+
type Item = char;
430+
fn next(&mut self) -> Option<char> {
431+
self.0.next()
432+
}
433+
fn size_hint(&self) -> (usize, Option<usize>) {
434+
self.0.size_hint()
435+
}
436+
}
437+
438+
#[unstable(feature = "titlecase", issue = "none")]
439+
impl DoubleEndedIterator for ToTitlecase {
440+
fn next_back(&mut self) -> Option<char> {
441+
self.0.next_back()
442+
}
443+
}
444+
445+
#[unstable(feature = "titlecase", issue = "none")]
446+
impl FusedIterator for ToTitlecase {}
447+
448+
#[unstable(feature = "titlecase", issue = "none")]
449+
impl ExactSizeIterator for ToTitlecase {}
450+
451+
#[unstable(feature = "titlecase", issue = "none")]
452+
impl fmt::Display for ToTitlecase {
453+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
454+
fmt::Display::fmt(&self.0, f)
455+
}
456+
}
457+
410458
/// Returns an iterator that yields the uppercase equivalent of a `char`.
411459
///
412460
/// This `struct` is created by the [`to_uppercase`] method on [`char`]. See
@@ -441,6 +489,13 @@ impl FusedIterator for ToUppercase {}
441489
#[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")]
442490
impl ExactSizeIterator for ToUppercase {}
443491

492+
#[stable(feature = "char_struct_display", since = "1.16.0")]
493+
impl fmt::Display for ToUppercase {
494+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
495+
fmt::Display::fmt(&self.0, f)
496+
}
497+
}
498+
444499
#[derive(Debug, Clone)]
445500
enum CaseMappingIter {
446501
Three(char, char, char),
@@ -532,20 +587,6 @@ impl fmt::Display for CaseMappingIter {
532587
}
533588
}
534589

535-
#[stable(feature = "char_struct_display", since = "1.16.0")]
536-
impl fmt::Display for ToLowercase {
537-
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
538-
fmt::Display::fmt(&self.0, f)
539-
}
540-
}
541-
542-
#[stable(feature = "char_struct_display", since = "1.16.0")]
543-
impl fmt::Display for ToUppercase {
544-
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
545-
fmt::Display::fmt(&self.0, f)
546-
}
547-
}
548-
549590
/// The error type returned when a checked char conversion fails.
550591
#[stable(feature = "u8_from_char", since = "1.59.0")]
551592
#[derive(Debug, Copy, Clone, PartialEq, Eq)]

library/core/src/unicode/unicode_data.rs

Lines changed: 70 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -616,7 +616,24 @@ pub mod conversions {
616616
}
617617
}
618618

619-
static LOWERCASE_TABLE: &[(char, u32)] = &[
619+
pub fn to_title(c: char) -> [char; 3] {
620+
if c.is_ascii() {
621+
[(c as u8).to_ascii_uppercase() as char, '\0', '\0']
622+
} else {
623+
TITLECASE_TABLE
624+
.binary_search_by(|&(key, _)| key.cmp(&c))
625+
.map(|i| {
626+
let u = TITLECASE_TABLE[i].1;
627+
char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
628+
// SAFETY: Index comes from statically generated table
629+
unsafe { *TITLECASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
630+
})
631+
})
632+
.unwrap_or(to_upper(c))
633+
}
634+
}
635+
636+
static LOWERCASE_TABLE: [(char, u32); 1407] = [
620637
('\u{c0}', 224), ('\u{c1}', 225), ('\u{c2}', 226), ('\u{c3}', 227), ('\u{c4}', 228),
621638
('\u{c5}', 229), ('\u{c6}', 230), ('\u{c7}', 231), ('\u{c8}', 232), ('\u{c9}', 233),
622639
('\u{ca}', 234), ('\u{cb}', 235), ('\u{cc}', 236), ('\u{cd}', 237), ('\u{ce}', 238),
@@ -959,11 +976,11 @@ pub mod conversions {
959976
('\u{1e920}', 125250), ('\u{1e921}', 125251),
960977
];
961978

962-
static LOWERCASE_TABLE_MULTI: &[[char; 3]] = &[
979+
static LOWERCASE_TABLE_MULTI: [[char; 3]; 1] = [
963980
['i', '\u{307}', '\u{0}'],
964981
];
965982

966-
static UPPERCASE_TABLE: &[(char, u32)] = &[
983+
static UPPERCASE_TABLE: [(char, u32); 1499] = [
967984
('\u{b5}', 924), ('\u{df}', 4194304), ('\u{e0}', 192), ('\u{e1}', 193), ('\u{e2}', 194),
968985
('\u{e3}', 195), ('\u{e4}', 196), ('\u{e5}', 197), ('\u{e6}', 198), ('\u{e7}', 199),
969986
('\u{e8}', 200), ('\u{e9}', 201), ('\u{ea}', 202), ('\u{eb}', 203), ('\u{ec}', 204),
@@ -1330,7 +1347,7 @@ pub mod conversions {
13301347
('\u{1e942}', 125216), ('\u{1e943}', 125217),
13311348
];
13321349

1333-
static UPPERCASE_TABLE_MULTI: &[[char; 3]] = &[
1350+
static UPPERCASE_TABLE_MULTI: [[char; 3]; 102] = [
13341351
['S', 'S', '\u{0}'], ['\u{2bc}', 'N', '\u{0}'], ['J', '\u{30c}', '\u{0}'],
13351352
['\u{399}', '\u{308}', '\u{301}'], ['\u{3a5}', '\u{308}', '\u{301}'],
13361353
['\u{535}', '\u{552}', '\u{0}'], ['H', '\u{331}', '\u{0}'], ['T', '\u{308}', '\u{0}'],
@@ -1380,4 +1397,53 @@ pub mod conversions {
13801397
['\u{544}', '\u{53b}', '\u{0}'], ['\u{54e}', '\u{546}', '\u{0}'],
13811398
['\u{544}', '\u{53d}', '\u{0}'],
13821399
];
1400+
1401+
static TITLECASE_TABLE: [(char, u32); 135] = [
1402+
('\u{df}', 4194304), ('\u{1c4}', 453), ('\u{1c5}', 453), ('\u{1c6}', 453),
1403+
('\u{1c7}', 456), ('\u{1c8}', 456), ('\u{1c9}', 456), ('\u{1ca}', 459), ('\u{1cb}', 459),
1404+
('\u{1cc}', 459), ('\u{1f1}', 498), ('\u{1f2}', 498), ('\u{1f3}', 498),
1405+
('\u{587}', 4194305), ('\u{10d0}', 4304), ('\u{10d1}', 4305), ('\u{10d2}', 4306),
1406+
('\u{10d3}', 4307), ('\u{10d4}', 4308), ('\u{10d5}', 4309), ('\u{10d6}', 4310),
1407+
('\u{10d7}', 4311), ('\u{10d8}', 4312), ('\u{10d9}', 4313), ('\u{10da}', 4314),
1408+
('\u{10db}', 4315), ('\u{10dc}', 4316), ('\u{10dd}', 4317), ('\u{10de}', 4318),
1409+
('\u{10df}', 4319), ('\u{10e0}', 4320), ('\u{10e1}', 4321), ('\u{10e2}', 4322),
1410+
('\u{10e3}', 4323), ('\u{10e4}', 4324), ('\u{10e5}', 4325), ('\u{10e6}', 4326),
1411+
('\u{10e7}', 4327), ('\u{10e8}', 4328), ('\u{10e9}', 4329), ('\u{10ea}', 4330),
1412+
('\u{10eb}', 4331), ('\u{10ec}', 4332), ('\u{10ed}', 4333), ('\u{10ee}', 4334),
1413+
('\u{10ef}', 4335), ('\u{10f0}', 4336), ('\u{10f1}', 4337), ('\u{10f2}', 4338),
1414+
('\u{10f3}', 4339), ('\u{10f4}', 4340), ('\u{10f5}', 4341), ('\u{10f6}', 4342),
1415+
('\u{10f7}', 4343), ('\u{10f8}', 4344), ('\u{10f9}', 4345), ('\u{10fa}', 4346),
1416+
('\u{10fd}', 4349), ('\u{10fe}', 4350), ('\u{10ff}', 4351), ('\u{1f80}', 8072),
1417+
('\u{1f81}', 8073), ('\u{1f82}', 8074), ('\u{1f83}', 8075), ('\u{1f84}', 8076),
1418+
('\u{1f85}', 8077), ('\u{1f86}', 8078), ('\u{1f87}', 8079), ('\u{1f88}', 8072),
1419+
('\u{1f89}', 8073), ('\u{1f8a}', 8074), ('\u{1f8b}', 8075), ('\u{1f8c}', 8076),
1420+
('\u{1f8d}', 8077), ('\u{1f8e}', 8078), ('\u{1f8f}', 8079), ('\u{1f90}', 8088),
1421+
('\u{1f91}', 8089), ('\u{1f92}', 8090), ('\u{1f93}', 8091), ('\u{1f94}', 8092),
1422+
('\u{1f95}', 8093), ('\u{1f96}', 8094), ('\u{1f97}', 8095), ('\u{1f98}', 8088),
1423+
('\u{1f99}', 8089), ('\u{1f9a}', 8090), ('\u{1f9b}', 8091), ('\u{1f9c}', 8092),
1424+
('\u{1f9d}', 8093), ('\u{1f9e}', 8094), ('\u{1f9f}', 8095), ('\u{1fa0}', 8104),
1425+
('\u{1fa1}', 8105), ('\u{1fa2}', 8106), ('\u{1fa3}', 8107), ('\u{1fa4}', 8108),
1426+
('\u{1fa5}', 8109), ('\u{1fa6}', 8110), ('\u{1fa7}', 8111), ('\u{1fa8}', 8104),
1427+
('\u{1fa9}', 8105), ('\u{1faa}', 8106), ('\u{1fab}', 8107), ('\u{1fac}', 8108),
1428+
('\u{1fad}', 8109), ('\u{1fae}', 8110), ('\u{1faf}', 8111), ('\u{1fb2}', 4194306),
1429+
('\u{1fb3}', 8124), ('\u{1fb4}', 4194307), ('\u{1fb7}', 4194308), ('\u{1fbc}', 8124),
1430+
('\u{1fc2}', 4194309), ('\u{1fc3}', 8140), ('\u{1fc4}', 4194310), ('\u{1fc7}', 4194311),
1431+
('\u{1fcc}', 8140), ('\u{1ff2}', 4194312), ('\u{1ff3}', 8188), ('\u{1ff4}', 4194313),
1432+
('\u{1ff7}', 4194314), ('\u{1ffc}', 8188), ('\u{fb00}', 4194315), ('\u{fb01}', 4194316),
1433+
('\u{fb02}', 4194317), ('\u{fb03}', 4194318), ('\u{fb04}', 4194319), ('\u{fb05}', 4194320),
1434+
('\u{fb06}', 4194321), ('\u{fb13}', 4194322), ('\u{fb14}', 4194323), ('\u{fb15}', 4194324),
1435+
('\u{fb16}', 4194325), ('\u{fb17}', 4194326),
1436+
];
1437+
1438+
static TITLECASE_TABLE_MULTI: [[char; 3]; 23] = [
1439+
['S', 's', '\u{0}'], ['\u{535}', '\u{582}', '\u{0}'], ['\u{1fba}', '\u{345}', '\u{0}'],
1440+
['\u{386}', '\u{345}', '\u{0}'], ['\u{391}', '\u{342}', '\u{345}'],
1441+
['\u{1fca}', '\u{345}', '\u{0}'], ['\u{389}', '\u{345}', '\u{0}'],
1442+
['\u{397}', '\u{342}', '\u{345}'], ['\u{1ffa}', '\u{345}', '\u{0}'],
1443+
['\u{38f}', '\u{345}', '\u{0}'], ['\u{3a9}', '\u{342}', '\u{345}'], ['F', 'f', '\u{0}'],
1444+
['F', 'i', '\u{0}'], ['F', 'l', '\u{0}'], ['F', 'f', 'i'], ['F', 'f', 'l'],
1445+
['S', 't', '\u{0}'], ['S', 't', '\u{0}'], ['\u{544}', '\u{576}', '\u{0}'],
1446+
['\u{544}', '\u{565}', '\u{0}'], ['\u{544}', '\u{56b}', '\u{0}'],
1447+
['\u{54e}', '\u{576}', '\u{0}'], ['\u{544}', '\u{56d}', '\u{0}'],
1448+
];
13831449
}

src/tools/unicode-table-generator/src/case_mapping.rs

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ pub(crate) fn generate_case_mapping(data: &UnicodeData) -> String {
1717
file.push_str(&generate_tables("LOWER", &data.to_lower));
1818
file.push_str("\n\n");
1919
file.push_str(&generate_tables("UPPER", &data.to_upper));
20+
file.push_str("\n\n");
21+
file.push_str(&generate_tables("TITLE", &data.to_title));
2022
file
2123
}
2224

@@ -48,13 +50,25 @@ fn generate_tables(case: &str, data: &BTreeMap<u32, (u32, u32, u32)>) -> String
4850

4951
let mut tables = String::new();
5052

51-
write!(tables, "static {}CASE_TABLE: &[(char, u32)] = &[{}];", case, fmt_list(mappings))
52-
.unwrap();
53+
write!(
54+
tables,
55+
"static {}CASE_TABLE: [(char, u32); {}] = [{}];",
56+
case,
57+
mappings.len(),
58+
fmt_list(mappings)
59+
)
60+
.unwrap();
5361

5462
tables.push_str("\n\n");
5563

56-
write!(tables, "static {}CASE_TABLE_MULTI: &[[char; 3]] = &[{}];", case, fmt_list(multis))
57-
.unwrap();
64+
write!(
65+
tables,
66+
"static {}CASE_TABLE_MULTI: [[char; 3]; {}] = [{}];",
67+
case,
68+
multis.len(),
69+
fmt_list(multis)
70+
)
71+
.unwrap();
5872

5973
tables
6074
}
@@ -101,4 +115,21 @@ pub fn to_upper(c: char) -> [char; 3] {
101115
.unwrap_or([c, '\0', '\0'])
102116
}
103117
}
118+
119+
pub fn to_title(c: char) -> [char; 3] {
120+
if c.is_ascii() {
121+
[(c as u8).to_ascii_uppercase() as char, '\0', '\0']
122+
} else {
123+
TITLECASE_TABLE
124+
.binary_search_by(|&(key, _)| key.cmp(&c))
125+
.map(|i| {
126+
let u = TITLECASE_TABLE[i].1;
127+
char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
128+
// SAFETY: Index comes from statically generated table
129+
unsafe { *TITLECASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
130+
})
131+
})
132+
.unwrap_or(to_upper(c))
133+
}
134+
}
104135
";

0 commit comments

Comments
 (0)