51
51
'Cc' : ['C' ], 'Cf' : ['C' ], 'Cs' : ['C' ], 'Co' : ['C' ], 'Cn' : ['C' ],
52
52
}
53
53
54
-
55
- # Grapheme cluster data
56
- # taken from UAX29, http://www.unicode.org/reports/tr29/
57
- # these code points are excluded from the Control category
58
- # NOTE: CR and LF are also technically excluded, but for
59
- # the sake of convenience we leave them in the Control group
60
- # and manually check them in the appropriate place. This is
61
- # still compliant with the implementation requirements.
62
- grapheme_control_exceptions = set ([0x200c , 0x200d ])
63
-
64
- # the Regional_Indicator category
65
- grapheme_regional_indicator = [(0x1f1e6 , 0x1f1ff )]
66
-
67
- # "The following ... are specifically excluded" from the SpacingMark category
68
- # http://www.unicode.org/reports/tr29/#SpacingMark
69
- grapheme_spacingmark_exceptions = [(0x102b , 0x102c ), (0x1038 , 0x1038 ),
70
- (0x1062 , 0x1064 ), (0x1067 , 0x106d ), (0x1083 , 0x1083 ), (0x1087 , 0x108c ),
71
- (0x108f , 0x108f ), (0x109a , 0x109c ), (0x19b0 , 0x19b4 ), (0x19b8 , 0x19b9 ),
72
- (0x19bb , 0x19c0 ), (0x19c8 , 0x19c9 ), (0x1a61 , 0x1a61 ), (0x1a63 , 0x1a64 ),
73
- (0xaa7b , 0xaa7b ), (0xaa7d , 0xaa7d )]
74
-
75
- # these are included in the SpacingMark category
76
- grapheme_spacingmark_extra = set ([0xe33 , 0xeb3 ])
77
-
78
54
def fetch (f ):
79
55
if not os .path .exists (f ):
80
56
os .system ("curl -O http://www.unicode.org/Public/UNIDATA/%s"
@@ -133,7 +109,7 @@ def load_unicode_data(f):
133
109
canon_decomp [code ] = seq
134
110
135
111
# place letter in categories as appropriate
136
- for cat in [gencat , "Assigned" ] + expanded_categories .get (gencat , []):
112
+ for cat in [gencat ] + expanded_categories .get (gencat , []):
137
113
if cat not in gencats :
138
114
gencats [cat ] = []
139
115
gencats [cat ].append (code )
@@ -144,12 +120,6 @@ def load_unicode_data(f):
144
120
combines [combine ] = []
145
121
combines [combine ].append (code )
146
122
147
- # generate Not_Assigned from Assigned
148
- gencats ["Cn" ] = gen_unassigned (gencats ["Assigned" ])
149
- # Assigned is not a real category
150
- del (gencats ["Assigned" ])
151
- # Other contains Not_Assigned
152
- gencats ["C" ].extend (gencats ["Cn" ])
153
123
gencats = group_cats (gencats )
154
124
combines = to_combines (group_cats (combines ))
155
125
@@ -185,11 +155,6 @@ def ungroup_cat(cat):
185
155
lo += 1
186
156
return cat_out
187
157
188
- def gen_unassigned (assigned ):
189
- assigned = set (assigned )
190
- return ([i for i in range (0 , 0xd800 ) if i not in assigned ] +
191
- [i for i in range (0xe000 , 0x110000 ) if i not in assigned ])
192
-
193
158
def to_combines (combs ):
194
159
combs_out = []
195
160
for comb in combs :
@@ -385,45 +350,6 @@ def emit_conversions_module(f, lowerupper, upperlower):
385
350
sorted (lowerupper .iteritems (), key = operator .itemgetter (0 )), is_pub = False )
386
351
f .write ("}\n \n " )
387
352
388
- def emit_grapheme_module (f , grapheme_table , grapheme_cats ):
389
- f .write ("""pub mod grapheme {
390
- use core::option::{Some, None};
391
- use core::slice::ImmutableVector;
392
-
393
- #[allow(non_camel_case_types)]
394
- #[deriving(Clone)]
395
- pub enum GraphemeCat {
396
- """ )
397
- for cat in grapheme_cats + ["Any" ]:
398
- f .write (" GC_" + cat + ",\n " )
399
- f .write (""" }
400
-
401
- fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat {
402
- use core::cmp::{Equal, Less, Greater};
403
- match r.bsearch(|&(lo, hi, _)| {
404
- if lo <= c && c <= hi { Equal }
405
- else if hi < c { Less }
406
- else { Greater }
407
- }) {
408
- Some(idx) => {
409
- let (_, _, cat) = r[idx];
410
- cat
411
- }
412
- None => GC_Any
413
- }
414
- }
415
-
416
- pub fn grapheme_category(c: char) -> GraphemeCat {
417
- bsearch_range_value_table(c, grapheme_cat_table)
418
- }
419
-
420
- """ )
421
-
422
- emit_table (f , "grapheme_cat_table" , grapheme_table , "&'static [(char, char, GraphemeCat)]" ,
423
- pfun = lambda x : "(%s,%s,GC_%s)" % (escape_char (x [0 ]), escape_char (x [1 ]), x [2 ]),
424
- is_pub = False )
425
- f .write ("}\n " )
426
-
427
353
def emit_charwidth_module (f , width_table ):
428
354
f .write ("pub mod charwidth {\n " )
429
355
f .write (" use core::option::{Option, Some, None};\n " )
@@ -462,7 +388,7 @@ def emit_charwidth_module(f, width_table):
462
388
f .write (" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n " )
463
389
emit_table (f , "charwidth_table" , width_table , "&'static [(char, char, u8, u8)]" , is_pub = False ,
464
390
pfun = lambda x : "(%s,%s,%s,%s)" % (escape_char (x [0 ]), escape_char (x [1 ]), x [2 ], x [3 ]))
465
- f .write ("}\n \n " )
391
+ f .write ("}\n " )
466
392
467
393
def emit_norm_module (f , canon , compat , combine ):
468
394
canon_keys = canon .keys ()
@@ -547,8 +473,6 @@ def remove_from_wtable(wtable, val):
547
473
wtable_out .extend (wtable )
548
474
return wtable_out
549
475
550
-
551
-
552
476
def optimize_width_table (wtable ):
553
477
wtable_out = []
554
478
w_this = wtable .pop (0 )
@@ -563,7 +487,7 @@ def optimize_width_table(wtable):
563
487
return wtable_out
564
488
565
489
if __name__ == "__main__" :
566
- r = "tables .rs"
490
+ r = "unicode .rs"
567
491
if os .path .exists (r ):
568
492
os .remove (r )
569
493
with open (r , "w" ) as rf :
@@ -574,18 +498,12 @@ def optimize_width_table(wtable):
574
498
(canon_decomp , compat_decomp , gencats , combines ,
575
499
lowerupper , upperlower ) = load_unicode_data ("UnicodeData.txt" )
576
500
want_derived = ["XID_Start" , "XID_Continue" , "Alphabetic" , "Lowercase" , "Uppercase" ]
577
- other_derived = ["Default_Ignorable_Code_Point" , "Grapheme_Extend" ]
501
+ other_derived = ["Default_Ignorable_Code_Point" ]
578
502
derived = load_properties ("DerivedCoreProperties.txt" , want_derived + other_derived )
579
503
scripts = load_properties ("Scripts.txt" , [])
580
504
props = load_properties ("PropList.txt" ,
581
505
["White_Space" , "Join_Control" , "Noncharacter_Code_Point" ])
582
506
583
- # grapheme cluster category from DerivedCoreProperties
584
- # the rest are defined below
585
- grapheme_cats = {}
586
- grapheme_cats ["Extend" ] = derived ["Grapheme_Extend" ]
587
- del (derived ["Grapheme_Extend" ])
588
-
589
507
# bsearch_range_table is used in all the property modules below
590
508
emit_bsearch_range_table (rf )
591
509
@@ -615,7 +533,7 @@ def optimize_width_table(wtable):
615
533
emit_norm_module (rf , canon_decomp , compat_decomp , combines )
616
534
emit_conversions_module (rf , lowerupper , upperlower )
617
535
618
- ### character width module
536
+ # character width module
619
537
width_table = []
620
538
for zwcat in ["Me" , "Mn" , "Cf" ]:
621
539
width_table .extend (map (lambda (lo , hi ): (lo , hi , 0 , 0 ), gencats [zwcat ]))
@@ -637,40 +555,3 @@ def optimize_width_table(wtable):
637
555
# optimize the width table by collapsing adjacent entities when possible
638
556
width_table = optimize_width_table (width_table )
639
557
emit_charwidth_module (rf , width_table )
640
-
641
- ### grapheme cluster module
642
- # from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
643
- # Hangul syllable categories
644
- want_hangul = ["L" , "V" , "T" , "LV" , "LVT" ]
645
- grapheme_cats .update (load_properties ("HangulSyllableType.txt" , want_hangul ))
646
-
647
- # Control
648
- # This category also includes Cs (surrogate codepoints), but Rust's `char`s are
649
- # Unicode Scalar Values only, and surrogates are thus invalid `char`s.
650
- grapheme_cats ["Control" ] = set ()
651
- for cat in ["Zl" , "Zp" , "Cc" , "Cf" ]:
652
- grapheme_cats ["Control" ] |= set (ungroup_cat (gencats [cat ]))
653
- grapheme_cats ["Control" ] = group_cat (list (
654
- grapheme_cats ["Control" ]
655
- - grapheme_control_exceptions
656
- | (set (ungroup_cat (gencats ["Cn" ]))
657
- & set (ungroup_cat (derived ["Default_Ignorable_Code_Point" ])))))
658
-
659
- # Regional Indicator
660
- grapheme_cats ["RegionalIndicator" ] = grapheme_regional_indicator
661
-
662
- # Prepend - "Currently there are no characters with this value"
663
- # (from UAX#29, Unicode 7.0)
664
-
665
- # SpacingMark
666
- grapheme_cats ["SpacingMark" ] = group_cat (list (
667
- set (ungroup_cat (gencats ["Mc" ]))
668
- - set (ungroup_cat (grapheme_cats ["Extend" ]))
669
- | grapheme_spacingmark_extra
670
- - set (ungroup_cat (grapheme_spacingmark_exceptions ))))
671
-
672
- grapheme_table = []
673
- for cat in grapheme_cats :
674
- grapheme_table .extend ([(x , y , cat ) for (x , y ) in grapheme_cats [cat ]])
675
- grapheme_table .sort (key = lambda w : w [0 ])
676
- emit_grapheme_module (rf , grapheme_table , grapheme_cats .keys ())
0 commit comments