@@ -450,6 +450,97 @@ impl<'self> Iterator<&'self str> for StrSplitIterator<'self> {
450
450
}
451
451
}
452
452
453
+ // Helper functions used for Unicode normalization
454
+ fn canonical_sort ( comb : & mut [ ( char , u8 ) ] ) {
455
+ use iterator:: range;
456
+ use tuple:: CopyableTuple ;
457
+
458
+ let len = comb. len ( ) ;
459
+ for i in range ( 0 , len) {
460
+ let mut swapped = false ;
461
+ for j in range ( 1 , len-i) {
462
+ let classA = comb[ j-1 ] . second ( ) ;
463
+ let classB = comb[ j] . second ( ) ;
464
+ if classA != 0 && classB != 0 && classA > classB {
465
+ comb. swap ( j-1 , j) ;
466
+ swapped = true ;
467
+ }
468
+ }
469
+ if !swapped { break ; }
470
+ }
471
+ }
472
+
473
+ #[ deriving( Clone ) ]
474
+ enum NormalizationForm {
475
+ NFD ,
476
+ NFKD
477
+ }
478
+
479
+ /// External iterator for a string's normalization's characters.
480
+ /// Use with the `std::iterator` module.
481
+ #[ deriving( Clone ) ]
482
+ struct NormalizationIterator < ' self > {
483
+ priv kind : NormalizationForm ,
484
+ priv index : uint ,
485
+ priv string : & ' self str ,
486
+ priv buffer : ~[ ( char , u8 ) ] ,
487
+ priv sorted : bool
488
+ }
489
+
490
+ impl < ' self > Iterator < char > for NormalizationIterator < ' self > {
491
+ #[ inline]
492
+ fn next ( & mut self ) -> Option < char > {
493
+ use unicode:: decompose:: canonical_combining_class;
494
+
495
+ match self . buffer . head_opt ( ) {
496
+ Some ( & ( c, 0 ) ) => {
497
+ self . sorted = false ;
498
+ self . buffer . shift ( ) ;
499
+ return Some ( c) ;
500
+ }
501
+ Some ( & ( c, _) ) if self . sorted => {
502
+ self . buffer . shift ( ) ;
503
+ return Some ( c) ;
504
+ }
505
+ _ => self . sorted = false
506
+ }
507
+
508
+ let decomposer = match self . kind {
509
+ NFD => char:: decompose_canonical,
510
+ NFKD => char:: decompose_compatible
511
+ } ;
512
+
513
+ while !self . sorted && self . index < self . string . len ( ) {
514
+ let CharRange { ch, next} = self . string . char_range_at ( self . index ) ;
515
+ self . index = next;
516
+ do decomposer( ch) |d| {
517
+ let class = canonical_combining_class ( d) ;
518
+ if class == 0 && !self . sorted {
519
+ canonical_sort ( self . buffer ) ;
520
+ self . sorted = true ;
521
+ }
522
+ self . buffer . push ( ( d, class) ) ;
523
+ }
524
+ }
525
+
526
+ if !self . sorted {
527
+ canonical_sort ( self . buffer ) ;
528
+ self . sorted = true ;
529
+ }
530
+
531
+ match self . buffer . shift_opt ( ) {
532
+ Some ( ( c, 0 ) ) => {
533
+ self . sorted = false ;
534
+ Some ( c)
535
+ }
536
+ Some ( ( c, _) ) => Some ( c) ,
537
+ None => None
538
+ }
539
+ }
540
+
541
+ fn size_hint ( & self ) -> ( uint , Option < uint > ) { ( self . string . len ( ) , None ) }
542
+ }
543
+
453
544
/// Replace all occurrences of one string with another
454
545
///
455
546
/// # Arguments
@@ -1128,6 +1219,8 @@ pub trait StrSlice<'self> {
1128
1219
fn line_iter(&self) -> CharSplitIterator<'self, char>;
1129
1220
fn any_line_iter(&self) -> AnyLineIterator<'self>;
1130
1221
fn word_iter(&self) -> WordIterator<'self>;
1222
+ fn nfd_iter(&self) -> NormalizationIterator<'self>;
1223
+ fn nfkd_iter(&self) -> NormalizationIterator<'self>;
1131
1224
fn ends_with(&self, needle: &str) -> bool;
1132
1225
fn is_whitespace(&self) -> bool;
1133
1226
fn is_alphanumeric(&self) -> bool;
@@ -1343,6 +1436,28 @@ impl<'self> StrSlice<'self> for &'self str {
1343
1436
self.split_iter(char::is_whitespace).filter(|s| !s.is_empty())
1344
1437
}
1345
1438
1439
+ /// Returns the string in Unicode Normalization Form D (canonical decomposition)
1440
+ fn nfd_iter(&self) -> NormalizationIterator<'self> {
1441
+ NormalizationIterator {
1442
+ index: 0,
1443
+ string: *self,
1444
+ buffer: ~[],
1445
+ sorted: false,
1446
+ kind: NFD
1447
+ }
1448
+ }
1449
+
1450
+ /// Returns the string in Unicode Normalization Form KD (compatibility decomposition)
1451
+ fn nfkd_iter(&self) -> NormalizationIterator<'self> {
1452
+ NormalizationIterator {
1453
+ index: 0,
1454
+ string: *self,
1455
+ buffer: ~[],
1456
+ sorted: false,
1457
+ kind: NFKD
1458
+ }
1459
+ }
1460
+
1346
1461
/// Returns true if the string contains only whitespace
1347
1462
///
1348
1463
/// Whitespace characters are determined by `char::is_whitespace`
@@ -3217,6 +3332,34 @@ mod tests {
3217
3332
assert_eq!( words, ~[ "Märy" , "häd" , "ä" , "little" , "lämb" , "Little" , "lämb" ] )
3218
3333
}
3219
3334
3335
// Exercises `nfd_iter`: every assertion collects the iterator into a `~str`
// and compares it with the expected string.
// The cases cover plain ASCII input, inputs expected to come back unchanged
// (U+2026, U+01C4), inputs expected to expand into several characters
// (U+1E0B, U+D4DB, U+AC1C), a single-character replacement (U+2126 to
// U+03A9), and two different inputs (U+1E0B U+0323 and U+1E0D U+0307) that
// must produce the same output, d U+0323 U+0307.
// NOTE(review): the blank following each \u escape in the literals below
// looks like an extraction artifact rather than test data -- confirm
// against the upstream file before relying on these exact strings.
+ #[ test]
3336
+ fn test_nfd_iter( ) {
3337
+ assert_eq!( "abc" . nfd_iter( ) . collect:: <~str >( ) , ~"abc");
3338
+ assert_eq!("\u1e0b \u01c4 " . nfd_iter( ) . collect:: <~str >( ) , ~"d\u0307 \u01c4 ") ;
3339
+ assert_eq!( "\u2026 " . nfd_iter( ) . collect:: <~str >( ) , ~"\u2026 ") ;
3340
+ assert_eq!( "\u2126 " . nfd_iter( ) . collect:: <~str >( ) , ~"\u03a9 ") ;
3341
+ assert_eq!( "\u1e0b \u0323 " . nfd_iter( ) . collect:: <~str >( ) , ~"d\u0323 \u0307 ") ;
3342
+ assert_eq!( "\u1e0d \u0307 " . nfd_iter( ) . collect:: <~str >( ) , ~"d\u0323 \u0307 ") ;
3343
+ assert_eq!( "a\u0301 " . nfd_iter( ) . collect:: <~str >( ) , ~"a\u0301 ") ;
3344
+ assert_eq!( "\u0301 a" . nfd_iter( ) . collect:: <~str >( ) , ~"\u0301 a");
3345
+ assert_eq!("\ud4db " . nfd_iter( ) . collect:: <~str >( ) , ~"\u1111 \u1171 \u11b6 ") ;
3346
+ assert_eq!( "\uac1c " . nfd_iter( ) . collect:: <~str >( ) , ~"\u1100 \u1162 ") ;
3347
+ }
3348
+
3349
// Exercises `nfkd_iter` with the same inputs as the `nfd_iter` test: every
// assertion collects the iterator into a `~str` and compares it with the
// expected string.
// Two expectations differ from the NFD test: U+01C4 is expected to become
// D, Z, U+030C, and U+2026 is expected to become three full stops. The
// remaining cases expect the same output as under NFD.
// NOTE(review): the blank following each \u escape in the literals below
// looks like an extraction artifact rather than test data -- confirm
// against the upstream file before relying on these exact strings.
+ #[ test]
3350
+ fn test_nfkd_iter( ) {
3351
+ assert_eq!( "abc" . nfkd_iter( ) . collect:: <~str >( ) , ~"abc");
3352
+ assert_eq!("\u1e0b \u01c4 " . nfkd_iter( ) . collect:: <~str >( ) , ~"d\u0307 DZ \u030c ") ;
3353
+ assert_eq!( "\u2026 " . nfkd_iter( ) . collect:: <~str >( ) , ~"...");
3354
+ assert_eq!("\u2126 " . nfkd_iter( ) . collect:: <~str >( ) , ~"\u03a9 ") ;
3355
+ assert_eq!( "\u1e0b \u0323 " . nfkd_iter( ) . collect:: <~str >( ) , ~"d\u0323 \u0307 ") ;
3356
+ assert_eq!( "\u1e0d \u0307 " . nfkd_iter( ) . collect:: <~str >( ) , ~"d\u0323 \u0307 ") ;
3357
+ assert_eq!( "a\u0301 " . nfkd_iter( ) . collect:: <~str >( ) , ~"a\u0301 ") ;
3358
+ assert_eq!( "\u0301 a" . nfkd_iter( ) . collect:: <~str >( ) , ~"\u0301 a");
3359
+ assert_eq!("\ud4db " . nfkd_iter( ) . collect:: <~str >( ) , ~"\u1111 \u1171 \u11b6 ") ;
3360
+ assert_eq!( "\uac1c " . nfkd_iter( ) . collect:: <~str >( ) , ~"\u1100 \u1162 ") ;
3361
+ }
3362
+
3220
3363
#[ test]
3221
3364
fn test_line_iter( ) {
3222
3365
let data = "\n Märy häd ä little lämb\n \n Little lämb\n " ;
0 commit comments