Skip to content

Commit d0058f5

Browse files
Add char::to_titlecase()
1 parent 373e272 commit d0058f5

File tree

5 files changed

+281
-37
lines changed

5 files changed

+281
-37
lines changed

library/core/src/char/methods.rs

Lines changed: 102 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1134,14 +1134,112 @@ impl char {
11341134
/// // convert into themselves.
11351135
/// assert_eq!('山'.to_lowercase().to_string(), "山");
11361136
/// ```
1137-
#[must_use = "this returns the lowercase character as a new iterator, \
1137+
#[must_use = "this returns the lowercased character as a new iterator, \
11381138
without modifying the original"]
11391139
#[stable(feature = "rust1", since = "1.0.0")]
11401140
#[inline]
11411141
pub fn to_lowercase(self) -> ToLowercase {
11421142
ToLowercase(CaseMappingIter::new(conversions::to_lower(self)))
11431143
}
11441144

1145+
/// Returns an iterator that yields the titlecase mapping of this `char` as one or more
1146+
/// `char`s.
1147+
///
1148+
/// If this `char` does not have a titlecase mapping, the iterator yields the same `char`.
1149+
///
1150+
/// If this `char` has a one-to-one titlecase mapping given by the [Unicode Character
1151+
/// Database][ucd] [`UnicodeData.txt`], the iterator yields that `char`.
1152+
///
1153+
/// [ucd]: https://www.unicode.org/reports/tr44/
1154+
/// [`UnicodeData.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
1155+
///
1156+
/// If this `char` requires special considerations (e.g. multiple `char`s) the iterator yields
1157+
/// the `char`(s) given by [`SpecialCasing.txt`].
1158+
///
1159+
/// [`SpecialCasing.txt`]: https://www.unicode.org/Public/UCD/latest/ucd/SpecialCasing.txt
1160+
///
1161+
/// This operation performs an unconditional mapping without tailoring. That is, the conversion
1162+
/// is independent of context and language.
1163+
///
1164+
/// In the [Unicode Standard], Chapter 4 (Character Properties) discusses case mapping in
1165+
/// general and Chapter 3 (Conformance) discusses the default algorithm for case conversion.
1166+
///
1167+
/// [Unicode Standard]: https://www.unicode.org/versions/latest/
1168+
///
1169+
/// # Examples
1170+
///
1171+
/// As an iterator:
1172+
///
1173+
/// ```
1174+
/// #![feature(titlecase)]
1175+
/// for c in 'ß'.to_titlecase() {
1176+
/// print!("{c}");
1177+
/// }
1178+
/// println!();
1179+
/// ```
1180+
///
1181+
/// Using `println!` directly:
1182+
///
1183+
/// ```
1184+
/// #![feature(titlecase)]
1185+
/// println!("{}", 'ß'.to_titlecase());
1186+
/// ```
1187+
///
1188+
/// Both are equivalent to:
1189+
///
1190+
/// ```
1191+
/// #![feature(titlecase)]
1192+
/// println!("Ss");
1193+
/// ```
1194+
///
1195+
/// Using [`to_string`](../std/string/trait.ToString.html#tymethod.to_string):
1196+
///
1197+
/// ```
1198+
/// #![feature(titlecase)]
1199+
/// assert_eq!('c'.to_titlecase().to_string(), "C");
1200+
///
1201+
/// // Sometimes the result is more than one character:
1202+
/// assert_eq!('ß'.to_titlecase().to_string(), "Ss");
1203+
///
1204+
/// // Characters that do not have separate cased forms
1205+
/// // convert into themselves.
1206+
/// assert_eq!('山'.to_titlecase().to_string(), "山");
1207+
/// ```
1208+
///
1209+
/// # Note on locale
1210+
///
1211+
/// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
1212+
///
1213+
/// * 'Dotless': I / ı, sometimes written ï
1214+
/// * 'Dotted': İ / i
1215+
///
1216+
/// Note that the lowercase dotted 'i' is the same as the Latin. Therefore:
1217+
///
1218+
/// ```
1219+
/// #![feature(titlecase)]
1220+
/// let upper_i = 'i'.to_titlecase().to_string();
1221+
/// ```
1222+
///
1223+
/// The value of `upper_i` here relies on the language of the text: if we're
1224+
/// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should
1225+
/// be `"İ"`. `to_titlecase()` does not take this into account, and so:
1226+
///
1227+
/// ```
1228+
/// #![feature(titlecase)]
1229+
/// let upper_i = 'i'.to_titlecase().to_string();
1230+
///
1231+
/// assert_eq!(upper_i, "I");
1232+
/// ```
1233+
///
1234+
/// holds across languages.
1235+
#[must_use = "this returns the titlecased character as a new iterator, \
1236+
without modifying the original"]
1237+
#[unstable(feature = "titlecase", issue = "none")]
1238+
#[inline]
1239+
pub fn to_titlecase(self) -> ToTitlecase {
1240+
ToTitlecase(CaseMappingIter::new(conversions::to_title(self)))
1241+
}
1242+
11451243
/// Returns an iterator that yields the uppercase mapping of this `char` as one or more
11461244
/// `char`s.
11471245
///
@@ -1204,7 +1302,7 @@ impl char {
12041302
///
12051303
/// # Note on locale
12061304
///
1207-
/// In Turkish, the equivalent of 'i' in Latin has five forms instead of two:
1305+
/// In Turkish and Azeri, the equivalent of 'i' in Latin has five forms instead of two:
12081306
///
12091307
/// * 'Dotless': I / ı, sometimes written ï
12101308
/// * 'Dotted': İ / i
@@ -1216,7 +1314,7 @@ impl char {
12161314
/// ```
12171315
///
12181316
/// The value of `upper_i` here relies on the language of the text: if we're
1219-
/// in `en-US`, it should be `"I"`, but if we're in `tr_TR`, it should
1317+
/// in `en-US`, it should be `"I"`, but if we're in `tr-TR` or `az-AZ`, it should
12201318
/// be `"İ"`. `to_uppercase()` does not take this into account, and so:
12211319
///
12221320
/// ```
@@ -1226,7 +1324,7 @@ impl char {
12261324
/// ```
12271325
///
12281326
/// holds across languages.
1229-
#[must_use = "this returns the uppercase character as a new iterator, \
1327+
#[must_use = "this returns the uppercased character as a new iterator, \
12301328
without modifying the original"]
12311329
#[stable(feature = "rust1", since = "1.0.0")]
12321330
#[inline]

library/core/src/char/mod.rs

Lines changed: 48 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -374,13 +374,21 @@ impl fmt::Display for EscapeDebug {
374374
}
375375

376376
macro_rules! casemappingiter_impls {
377-
($(#[$attr:meta])* $ITER_NAME:ident) => {
377+
(
378+
#[$stab:meta]
379+
#[$dendstab:meta]
380+
#[$fusedstab:meta]
381+
#[$exactstab:meta]
382+
#[$displaystab:meta]
383+
$(#[$attr:meta])*
384+
$ITER_NAME:ident
385+
) => {
378386
$(#[$attr])*
379-
#[stable(feature = "rust1", since = "1.0.0")]
387+
#[$stab]
380388
#[derive(Debug, Clone)]
381389
pub struct $ITER_NAME(CaseMappingIter);
382390

383-
#[stable(feature = "rust1", since = "1.0.0")]
391+
#[$stab]
384392
impl Iterator for $ITER_NAME {
385393
type Item = char;
386394
fn next(&mut self) -> Option<char> {
@@ -416,7 +424,7 @@ macro_rules! casemappingiter_impls {
416424
}
417425
}
418426

419-
#[stable(feature = "case_mapping_double_ended", since = "1.59.0")]
427+
#[$dendstab]
420428
impl DoubleEndedIterator for $ITER_NAME {
421429
fn next_back(&mut self) -> Option<char> {
422430
self.0.next_back()
@@ -434,10 +442,10 @@ macro_rules! casemappingiter_impls {
434442
}
435443
}
436444

437-
#[stable(feature = "fused", since = "1.26.0")]
445+
#[$fusedstab]
438446
impl FusedIterator for $ITER_NAME {}
439447

440-
#[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")]
448+
#[$exactstab]
441449
impl ExactSizeIterator for $ITER_NAME {
442450
fn len(&self) -> usize {
443451
self.0.len()
@@ -464,7 +472,7 @@ macro_rules! casemappingiter_impls {
464472
#[unstable(feature = "std_internals", issue = "none")]
465473
unsafe impl TrustedRandomAccess for $ITER_NAME {}
466474

467-
#[stable(feature = "char_struct_display", since = "1.16.0")]
475+
#[$displaystab]
468476
impl fmt::Display for $ITER_NAME {
469477
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
470478
fmt::Display::fmt(&self.0, f)
@@ -474,23 +482,48 @@ macro_rules! casemappingiter_impls {
474482
}
475483

476484
casemappingiter_impls! {
477-
/// Returns an iterator that yields the lowercase equivalent of a `char`.
485+
#[stable(feature = "rust1", since = "1.0.0")]
486+
#[stable(feature = "case_mapping_double_ended", since = "1.59.0")]
487+
#[stable(feature = "fused", since = "1.26.0")]
488+
#[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")]
489+
#[stable(feature = "char_struct_display", since = "1.16.0")]
490+
/// Returns an iterator that yields the uppercase equivalent of a `char`.
478491
///
479-
/// This `struct` is created by the [`to_lowercase`] method on [`char`]. See
492+
/// This `struct` is created by the [`to_uppercase`] method on [`char`]. See
480493
/// its documentation for more.
481494
///
482-
/// [`to_lowercase`]: char::to_lowercase
483-
ToLowercase
495+
/// [`to_uppercase`]: char::to_uppercase
496+
ToUppercase
484497
}
485498

486499
casemappingiter_impls! {
487-
/// Returns an iterator that yields the uppercase equivalent of a `char`.
500+
#[unstable(feature = "titlecase", issue = "none")]
501+
#[unstable(feature = "titlecase", issue = "none")]
502+
#[unstable(feature = "titlecase", issue = "none")]
503+
#[unstable(feature = "titlecase", issue = "none")]
504+
#[unstable(feature = "titlecase", issue = "none")]
505+
/// Returns an iterator that yields the titlecase equivalent of a `char`.
488506
///
489-
/// This `struct` is created by the [`to_uppercase`] method on [`char`]. See
507+
/// This `struct` is created by the [`to_titlecase`] method on [`char`]. See
490508
/// its documentation for more.
491509
///
492-
/// [`to_uppercase`]: char::to_uppercase
493-
ToUppercase
510+
/// [`to_titlecase`]: char::to_titlecase
511+
ToTitlecase
512+
}
513+
514+
casemappingiter_impls! {
515+
#[stable(feature = "rust1", since = "1.0.0")]
516+
#[stable(feature = "case_mapping_double_ended", since = "1.59.0")]
517+
#[stable(feature = "fused", since = "1.26.0")]
518+
#[stable(feature = "exact_size_case_mapping_iter", since = "1.35.0")]
519+
#[stable(feature = "char_struct_display", since = "1.16.0")]
520+
/// Returns an iterator that yields the lowercase equivalent of a `char`.
521+
///
522+
/// This `struct` is created by the [`to_lowercase`] method on [`char`]. See
523+
/// its documentation for more.
524+
///
525+
/// [`to_lowercase`]: char::to_lowercase
526+
ToLowercase
494527
}
495528

496529
#[derive(Debug, Clone)]

library/core/src/unicode/unicode_data.rs

Lines changed: 70 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -616,7 +616,24 @@ pub mod conversions {
616616
}
617617
}
618618

619-
static LOWERCASE_TABLE: &[(char, u32)] = &[
619+
pub fn to_title(c: char) -> [char; 3] {
620+
if c.is_ascii() {
621+
[(c as u8).to_ascii_uppercase() as char, '\0', '\0']
622+
} else {
623+
TITLECASE_TABLE
624+
.binary_search_by(|&(key, _)| key.cmp(&c))
625+
.map(|i| {
626+
let u = TITLECASE_TABLE[i].1;
627+
char::from_u32(u).map(|c| [c, '\0', '\0']).unwrap_or_else(|| {
628+
// SAFETY: Index comes from statically generated table
629+
unsafe { *TITLECASE_TABLE_MULTI.get_unchecked((u & (INDEX_MASK - 1)) as usize) }
630+
})
631+
})
632+
.unwrap_or(to_upper(c))
633+
}
634+
}
635+
636+
static LOWERCASE_TABLE: [(char, u32); 1407] = [
620637
('\u{c0}', 224), ('\u{c1}', 225), ('\u{c2}', 226), ('\u{c3}', 227), ('\u{c4}', 228),
621638
('\u{c5}', 229), ('\u{c6}', 230), ('\u{c7}', 231), ('\u{c8}', 232), ('\u{c9}', 233),
622639
('\u{ca}', 234), ('\u{cb}', 235), ('\u{cc}', 236), ('\u{cd}', 237), ('\u{ce}', 238),
@@ -959,11 +976,11 @@ pub mod conversions {
959976
('\u{1e920}', 125250), ('\u{1e921}', 125251),
960977
];
961978

962-
static LOWERCASE_TABLE_MULTI: &[[char; 3]] = &[
979+
static LOWERCASE_TABLE_MULTI: [[char; 3]; 1] = [
963980
['i', '\u{307}', '\u{0}'],
964981
];
965982

966-
static UPPERCASE_TABLE: &[(char, u32)] = &[
983+
static UPPERCASE_TABLE: [(char, u32); 1499] = [
967984
('\u{b5}', 924), ('\u{df}', 4194304), ('\u{e0}', 192), ('\u{e1}', 193), ('\u{e2}', 194),
968985
('\u{e3}', 195), ('\u{e4}', 196), ('\u{e5}', 197), ('\u{e6}', 198), ('\u{e7}', 199),
969986
('\u{e8}', 200), ('\u{e9}', 201), ('\u{ea}', 202), ('\u{eb}', 203), ('\u{ec}', 204),
@@ -1330,7 +1347,7 @@ pub mod conversions {
13301347
('\u{1e942}', 125216), ('\u{1e943}', 125217),
13311348
];
13321349

1333-
static UPPERCASE_TABLE_MULTI: &[[char; 3]] = &[
1350+
static UPPERCASE_TABLE_MULTI: [[char; 3]; 102] = [
13341351
['S', 'S', '\u{0}'], ['\u{2bc}', 'N', '\u{0}'], ['J', '\u{30c}', '\u{0}'],
13351352
['\u{399}', '\u{308}', '\u{301}'], ['\u{3a5}', '\u{308}', '\u{301}'],
13361353
['\u{535}', '\u{552}', '\u{0}'], ['H', '\u{331}', '\u{0}'], ['T', '\u{308}', '\u{0}'],
@@ -1380,4 +1397,53 @@ pub mod conversions {
13801397
['\u{544}', '\u{53b}', '\u{0}'], ['\u{54e}', '\u{546}', '\u{0}'],
13811398
['\u{544}', '\u{53d}', '\u{0}'],
13821399
];
1400+
1401+
static TITLECASE_TABLE: [(char, u32); 135] = [
1402+
('\u{df}', 4194304), ('\u{1c4}', 453), ('\u{1c5}', 453), ('\u{1c6}', 453),
1403+
('\u{1c7}', 456), ('\u{1c8}', 456), ('\u{1c9}', 456), ('\u{1ca}', 459), ('\u{1cb}', 459),
1404+
('\u{1cc}', 459), ('\u{1f1}', 498), ('\u{1f2}', 498), ('\u{1f3}', 498),
1405+
('\u{587}', 4194305), ('\u{10d0}', 4304), ('\u{10d1}', 4305), ('\u{10d2}', 4306),
1406+
('\u{10d3}', 4307), ('\u{10d4}', 4308), ('\u{10d5}', 4309), ('\u{10d6}', 4310),
1407+
('\u{10d7}', 4311), ('\u{10d8}', 4312), ('\u{10d9}', 4313), ('\u{10da}', 4314),
1408+
('\u{10db}', 4315), ('\u{10dc}', 4316), ('\u{10dd}', 4317), ('\u{10de}', 4318),
1409+
('\u{10df}', 4319), ('\u{10e0}', 4320), ('\u{10e1}', 4321), ('\u{10e2}', 4322),
1410+
('\u{10e3}', 4323), ('\u{10e4}', 4324), ('\u{10e5}', 4325), ('\u{10e6}', 4326),
1411+
('\u{10e7}', 4327), ('\u{10e8}', 4328), ('\u{10e9}', 4329), ('\u{10ea}', 4330),
1412+
('\u{10eb}', 4331), ('\u{10ec}', 4332), ('\u{10ed}', 4333), ('\u{10ee}', 4334),
1413+
('\u{10ef}', 4335), ('\u{10f0}', 4336), ('\u{10f1}', 4337), ('\u{10f2}', 4338),
1414+
('\u{10f3}', 4339), ('\u{10f4}', 4340), ('\u{10f5}', 4341), ('\u{10f6}', 4342),
1415+
('\u{10f7}', 4343), ('\u{10f8}', 4344), ('\u{10f9}', 4345), ('\u{10fa}', 4346),
1416+
('\u{10fd}', 4349), ('\u{10fe}', 4350), ('\u{10ff}', 4351), ('\u{1f80}', 8072),
1417+
('\u{1f81}', 8073), ('\u{1f82}', 8074), ('\u{1f83}', 8075), ('\u{1f84}', 8076),
1418+
('\u{1f85}', 8077), ('\u{1f86}', 8078), ('\u{1f87}', 8079), ('\u{1f88}', 8072),
1419+
('\u{1f89}', 8073), ('\u{1f8a}', 8074), ('\u{1f8b}', 8075), ('\u{1f8c}', 8076),
1420+
('\u{1f8d}', 8077), ('\u{1f8e}', 8078), ('\u{1f8f}', 8079), ('\u{1f90}', 8088),
1421+
('\u{1f91}', 8089), ('\u{1f92}', 8090), ('\u{1f93}', 8091), ('\u{1f94}', 8092),
1422+
('\u{1f95}', 8093), ('\u{1f96}', 8094), ('\u{1f97}', 8095), ('\u{1f98}', 8088),
1423+
('\u{1f99}', 8089), ('\u{1f9a}', 8090), ('\u{1f9b}', 8091), ('\u{1f9c}', 8092),
1424+
('\u{1f9d}', 8093), ('\u{1f9e}', 8094), ('\u{1f9f}', 8095), ('\u{1fa0}', 8104),
1425+
('\u{1fa1}', 8105), ('\u{1fa2}', 8106), ('\u{1fa3}', 8107), ('\u{1fa4}', 8108),
1426+
('\u{1fa5}', 8109), ('\u{1fa6}', 8110), ('\u{1fa7}', 8111), ('\u{1fa8}', 8104),
1427+
('\u{1fa9}', 8105), ('\u{1faa}', 8106), ('\u{1fab}', 8107), ('\u{1fac}', 8108),
1428+
('\u{1fad}', 8109), ('\u{1fae}', 8110), ('\u{1faf}', 8111), ('\u{1fb2}', 4194306),
1429+
('\u{1fb3}', 8124), ('\u{1fb4}', 4194307), ('\u{1fb7}', 4194308), ('\u{1fbc}', 8124),
1430+
('\u{1fc2}', 4194309), ('\u{1fc3}', 8140), ('\u{1fc4}', 4194310), ('\u{1fc7}', 4194311),
1431+
('\u{1fcc}', 8140), ('\u{1ff2}', 4194312), ('\u{1ff3}', 8188), ('\u{1ff4}', 4194313),
1432+
('\u{1ff7}', 4194314), ('\u{1ffc}', 8188), ('\u{fb00}', 4194315), ('\u{fb01}', 4194316),
1433+
('\u{fb02}', 4194317), ('\u{fb03}', 4194318), ('\u{fb04}', 4194319), ('\u{fb05}', 4194320),
1434+
('\u{fb06}', 4194321), ('\u{fb13}', 4194322), ('\u{fb14}', 4194323), ('\u{fb15}', 4194324),
1435+
('\u{fb16}', 4194325), ('\u{fb17}', 4194326),
1436+
];
1437+
1438+
static TITLECASE_TABLE_MULTI: [[char; 3]; 23] = [
1439+
['S', 's', '\u{0}'], ['\u{535}', '\u{582}', '\u{0}'], ['\u{1fba}', '\u{345}', '\u{0}'],
1440+
['\u{386}', '\u{345}', '\u{0}'], ['\u{391}', '\u{342}', '\u{345}'],
1441+
['\u{1fca}', '\u{345}', '\u{0}'], ['\u{389}', '\u{345}', '\u{0}'],
1442+
['\u{397}', '\u{342}', '\u{345}'], ['\u{1ffa}', '\u{345}', '\u{0}'],
1443+
['\u{38f}', '\u{345}', '\u{0}'], ['\u{3a9}', '\u{342}', '\u{345}'], ['F', 'f', '\u{0}'],
1444+
['F', 'i', '\u{0}'], ['F', 'l', '\u{0}'], ['F', 'f', 'i'], ['F', 'f', 'l'],
1445+
['S', 't', '\u{0}'], ['S', 't', '\u{0}'], ['\u{544}', '\u{576}', '\u{0}'],
1446+
['\u{544}', '\u{565}', '\u{0}'], ['\u{544}', '\u{56b}', '\u{0}'],
1447+
['\u{54e}', '\u{576}', '\u{0}'], ['\u{544}', '\u{56d}', '\u{0}'],
1448+
];
13831449
}

0 commit comments

Comments
 (0)