Skip to content

Commit 3d720c6

Browse files
committed
Add support for performing NFD and NFKD on strings
1 parent 2675f3e commit 3d720c6

File tree

1 file changed

+143
-0
lines changed

1 file changed

+143
-0
lines changed

src/libstd/str.rs

Lines changed: 143 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,97 @@ impl<'self> Iterator<&'self str> for StrSplitIterator<'self> {
450450
}
451451
}
452452

453+
// Helper functions used for Unicode normalization
454+
fn canonical_sort(comb: &mut [(char, u8)]) {
455+
use iterator::range;
456+
use tuple::CopyableTuple;
457+
458+
let len = comb.len();
459+
for i in range(0, len) {
460+
let mut swapped = false;
461+
for j in range(1, len-i) {
462+
let classA = comb[j-1].second();
463+
let classB = comb[j].second();
464+
if classA != 0 && classB != 0 && classA > classB {
465+
comb.swap(j-1, j);
466+
swapped = true;
467+
}
468+
}
469+
if !swapped { break; }
470+
}
471+
}
472+
473+
#[deriving(Clone)]
474+
enum NormalizationForm {
475+
NFD,
476+
NFKD
477+
}
478+
479+
/// External iterator for a string's normalization's characters.
480+
/// Use with the `std::iterator` module.
481+
#[deriving(Clone)]
482+
struct NormalizationIterator<'self> {
483+
priv kind: NormalizationForm,
484+
priv index: uint,
485+
priv string: &'self str,
486+
priv buffer: ~[(char, u8)],
487+
priv sorted: bool
488+
}
489+
490+
impl<'self> Iterator<char> for NormalizationIterator<'self> {
491+
#[inline]
492+
fn next(&mut self) -> Option<char> {
493+
use unicode::decompose::canonical_combining_class;
494+
495+
match self.buffer.head_opt() {
496+
Some(&(c, 0)) => {
497+
self.sorted = false;
498+
self.buffer.shift();
499+
return Some(c);
500+
}
501+
Some(&(c, _)) if self.sorted => {
502+
self.buffer.shift();
503+
return Some(c);
504+
}
505+
_ => self.sorted = false
506+
}
507+
508+
let decomposer = match self.kind {
509+
NFD => char::decompose_canonical,
510+
NFKD => char::decompose_compatible
511+
};
512+
513+
while !self.sorted && self.index < self.string.len() {
514+
let CharRange {ch, next} = self.string.char_range_at(self.index);
515+
self.index = next;
516+
do decomposer(ch) |d| {
517+
let class = canonical_combining_class(d);
518+
if class == 0 && !self.sorted {
519+
canonical_sort(self.buffer);
520+
self.sorted = true;
521+
}
522+
self.buffer.push((d, class));
523+
}
524+
}
525+
526+
if !self.sorted {
527+
canonical_sort(self.buffer);
528+
self.sorted = true;
529+
}
530+
531+
match self.buffer.shift_opt() {
532+
Some((c, 0)) => {
533+
self.sorted = false;
534+
Some(c)
535+
}
536+
Some((c, _)) => Some(c),
537+
None => None
538+
}
539+
}
540+
541+
fn size_hint(&self) -> (uint, Option<uint>) { (self.string.len(), None) }
542+
}
543+
453544
/// Replace all occurrences of one string with another
454545
///
455546
/// # Arguments
@@ -1128,6 +1219,8 @@ pub trait StrSlice<'self> {
11281219
fn line_iter(&self) -> CharSplitIterator<'self, char>;
11291220
fn any_line_iter(&self) -> AnyLineIterator<'self>;
11301221
fn word_iter(&self) -> WordIterator<'self>;
1222+
fn nfd_iter(&self) -> NormalizationIterator<'self>;
1223+
fn nfkd_iter(&self) -> NormalizationIterator<'self>;
11311224
fn ends_with(&self, needle: &str) -> bool;
11321225
fn is_whitespace(&self) -> bool;
11331226
fn is_alphanumeric(&self) -> bool;
@@ -1343,6 +1436,28 @@ impl<'self> StrSlice<'self> for &'self str {
13431436
self.split_iter(char::is_whitespace).filter(|s| !s.is_empty())
13441437
}
13451438
1439+
/// Returns the string in Unicode Normalization Form D (canonical decomposition)
1440+
fn nfd_iter(&self) -> NormalizationIterator<'self> {
1441+
NormalizationIterator {
1442+
index: 0,
1443+
string: *self,
1444+
buffer: ~[],
1445+
sorted: false,
1446+
kind: NFD
1447+
}
1448+
}
1449+
1450+
/// Returns the string in Unicode Normalization Form KD (compatibility decomposition)
1451+
fn nfkd_iter(&self) -> NormalizationIterator<'self> {
1452+
NormalizationIterator {
1453+
index: 0,
1454+
string: *self,
1455+
buffer: ~[],
1456+
sorted: false,
1457+
kind: NFKD
1458+
}
1459+
}
1460+
13461461
/// Returns true if the string contains only whitespace
13471462
///
13481463
/// Whitespace characters are determined by `char::is_whitespace`
@@ -3217,6 +3332,34 @@ mod tests {
32173332
assert_eq!(words, ~["Märy", "häd", "ä", "little", "lämb", "Little", "lämb"])
32183333
}
32193334

3335+
#[test]
3336+
fn test_nfd_iter() {
3337+
assert_eq!("abc".nfd_iter().collect::<~str>(), ~"abc");
3338+
assert_eq!("\u1e0b\u01c4".nfd_iter().collect::<~str>(), ~"d\u0307\u01c4");
3339+
assert_eq!("\u2026".nfd_iter().collect::<~str>(), ~"\u2026");
3340+
assert_eq!("\u2126".nfd_iter().collect::<~str>(), ~"\u03a9");
3341+
assert_eq!("\u1e0b\u0323".nfd_iter().collect::<~str>(), ~"d\u0323\u0307");
3342+
assert_eq!("\u1e0d\u0307".nfd_iter().collect::<~str>(), ~"d\u0323\u0307");
3343+
assert_eq!("a\u0301".nfd_iter().collect::<~str>(), ~"a\u0301");
3344+
assert_eq!("\u0301a".nfd_iter().collect::<~str>(), ~"\u0301a");
3345+
assert_eq!("\ud4db".nfd_iter().collect::<~str>(), ~"\u1111\u1171\u11b6");
3346+
assert_eq!("\uac1c".nfd_iter().collect::<~str>(), ~"\u1100\u1162");
3347+
}
3348+
3349+
#[test]
3350+
fn test_nfkd_iter() {
3351+
assert_eq!("abc".nfkd_iter().collect::<~str>(), ~"abc");
3352+
assert_eq!("\u1e0b\u01c4".nfkd_iter().collect::<~str>(), ~"d\u0307DZ\u030c");
3353+
assert_eq!("\u2026".nfkd_iter().collect::<~str>(), ~"...");
3354+
assert_eq!("\u2126".nfkd_iter().collect::<~str>(), ~"\u03a9");
3355+
assert_eq!("\u1e0b\u0323".nfkd_iter().collect::<~str>(), ~"d\u0323\u0307");
3356+
assert_eq!("\u1e0d\u0307".nfkd_iter().collect::<~str>(), ~"d\u0323\u0307");
3357+
assert_eq!("a\u0301".nfkd_iter().collect::<~str>(), ~"a\u0301");
3358+
assert_eq!("\u0301a".nfkd_iter().collect::<~str>(), ~"\u0301a");
3359+
assert_eq!("\ud4db".nfkd_iter().collect::<~str>(), ~"\u1111\u1171\u11b6");
3360+
assert_eq!("\uac1c".nfkd_iter().collect::<~str>(), ~"\u1100\u1162");
3361+
}
3362+
32203363
#[test]
32213364
fn test_line_iter() {
32223365
let data = "\nMäry häd ä little lämb\n\nLittle lämb\n";

0 commit comments

Comments
 (0)