Skip to content

Commit 1e521c3

Browse files
committed
---
yaml --- r: 153463 b: refs/heads/try2 c: 154ca08 h: refs/heads/master i: 153461: aeef75e 153459: 91c38a6 153455: 09f6c44 v: v3
1 parent 39eb80b commit 1e521c3

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+638
-2321
lines changed

[refs]

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ refs/heads/snap-stage3: 78a7676898d9f80ab540c6df5d4c9ce35bb50463
55
refs/heads/try: 519addf6277dbafccbb4159db4b710c37eaa2ec5
66
refs/tags/release-0.1: 1f5c5126e96c79d22cb7862f75304136e204f105
77
refs/heads/ndm: f3868061cd7988080c30d6d5bf352a5a5fe2460b
8-
refs/heads/try2: 6c35d513cea468b30759b4f78becf28f11a123c0
8+
refs/heads/try2: 154ca0838868ca08a8aae20f6af245e2b970a3de
99
refs/heads/dist-snap: ba4081a5a8573875fed17545846f6f6902c8ba8d
1010
refs/tags/release-0.2: c870d2dffb391e14efb05aa27898f1f6333a9596
1111
refs/tags/release-0.3: b5f0d0f648d9a6153664837026ba1be43d3e2503

branches/try2/src/etc/unicode.py

Lines changed: 5 additions & 124 deletions
Original file line numberDiff line numberDiff line change
@@ -51,30 +51,6 @@
5151
'Cc': ['C'], 'Cf': ['C'], 'Cs': ['C'], 'Co': ['C'], 'Cn': ['C'],
5252
}
5353

54-
55-
# Grapheme cluster data
56-
# taken from UAX29, http://www.unicode.org/reports/tr29/
57-
# these code points are excluded from the Control category
58-
# NOTE: CR and LF are also technically excluded, but for
59-
# the sake of convenience we leave them in the Control group
60-
# and manually check them in the appropriate place. This is
61-
# still compliant with the implementation requirements.
62-
grapheme_control_exceptions = set([0x200c, 0x200d])
63-
64-
# the Regional_Indicator category
65-
grapheme_regional_indicator = [(0x1f1e6, 0x1f1ff)]
66-
67-
# "The following ... are specifically excluded" from the SpacingMark category
68-
# http://www.unicode.org/reports/tr29/#SpacingMark
69-
grapheme_spacingmark_exceptions = [(0x102b, 0x102c), (0x1038, 0x1038),
70-
(0x1062, 0x1064), (0x1067, 0x106d), (0x1083, 0x1083), (0x1087, 0x108c),
71-
(0x108f, 0x108f), (0x109a, 0x109c), (0x19b0, 0x19b4), (0x19b8, 0x19b9),
72-
(0x19bb, 0x19c0), (0x19c8, 0x19c9), (0x1a61, 0x1a61), (0x1a63, 0x1a64),
73-
(0xaa7b, 0xaa7b), (0xaa7d, 0xaa7d)]
74-
75-
# these are included in the SpacingMark category
76-
grapheme_spacingmark_extra = set([0xe33, 0xeb3])
77-
7854
def fetch(f):
7955
if not os.path.exists(f):
8056
os.system("curl -O http://www.unicode.org/Public/UNIDATA/%s"
@@ -133,7 +109,7 @@ def load_unicode_data(f):
133109
canon_decomp[code] = seq
134110

135111
# place letter in categories as appropriate
136-
for cat in [gencat, "Assigned"] + expanded_categories.get(gencat, []):
112+
for cat in [gencat] + expanded_categories.get(gencat, []):
137113
if cat not in gencats:
138114
gencats[cat] = []
139115
gencats[cat].append(code)
@@ -144,12 +120,6 @@ def load_unicode_data(f):
144120
combines[combine] = []
145121
combines[combine].append(code)
146122

147-
# generate Not_Assigned from Assigned
148-
gencats["Cn"] = gen_unassigned(gencats["Assigned"])
149-
# Assigned is not a real category
150-
del(gencats["Assigned"])
151-
# Other contains Not_Assigned
152-
gencats["C"].extend(gencats["Cn"])
153123
gencats = group_cats(gencats)
154124
combines = to_combines(group_cats(combines))
155125

@@ -185,11 +155,6 @@ def ungroup_cat(cat):
185155
lo += 1
186156
return cat_out
187157

188-
def gen_unassigned(assigned):
189-
assigned = set(assigned)
190-
return ([i for i in range(0, 0xd800) if i not in assigned] +
191-
[i for i in range(0xe000, 0x110000) if i not in assigned])
192-
193158
def to_combines(combs):
194159
combs_out = []
195160
for comb in combs:
@@ -385,45 +350,6 @@ def emit_conversions_module(f, lowerupper, upperlower):
385350
sorted(lowerupper.iteritems(), key=operator.itemgetter(0)), is_pub=False)
386351
f.write("}\n\n")
387352

388-
def emit_grapheme_module(f, grapheme_table, grapheme_cats):
389-
f.write("""pub mod grapheme {
390-
use core::option::{Some, None};
391-
use core::slice::ImmutableVector;
392-
393-
#[allow(non_camel_case_types)]
394-
#[deriving(Clone)]
395-
pub enum GraphemeCat {
396-
""")
397-
for cat in grapheme_cats + ["Any"]:
398-
f.write(" GC_" + cat + ",\n")
399-
f.write(""" }
400-
401-
fn bsearch_range_value_table(c: char, r: &'static [(char, char, GraphemeCat)]) -> GraphemeCat {
402-
use core::cmp::{Equal, Less, Greater};
403-
match r.bsearch(|&(lo, hi, _)| {
404-
if lo <= c && c <= hi { Equal }
405-
else if hi < c { Less }
406-
else { Greater }
407-
}) {
408-
Some(idx) => {
409-
let (_, _, cat) = r[idx];
410-
cat
411-
}
412-
None => GC_Any
413-
}
414-
}
415-
416-
pub fn grapheme_category(c: char) -> GraphemeCat {
417-
bsearch_range_value_table(c, grapheme_cat_table)
418-
}
419-
420-
""")
421-
422-
emit_table(f, "grapheme_cat_table", grapheme_table, "&'static [(char, char, GraphemeCat)]",
423-
pfun=lambda x: "(%s,%s,GC_%s)" % (escape_char(x[0]), escape_char(x[1]), x[2]),
424-
is_pub=False)
425-
f.write("}\n")
426-
427353
def emit_charwidth_module(f, width_table):
428354
f.write("pub mod charwidth {\n")
429355
f.write(" use core::option::{Option, Some, None};\n")
@@ -462,7 +388,7 @@ def emit_charwidth_module(f, width_table):
462388
f.write(" // http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c\n")
463389
emit_table(f, "charwidth_table", width_table, "&'static [(char, char, u8, u8)]", is_pub=False,
464390
pfun=lambda x: "(%s,%s,%s,%s)" % (escape_char(x[0]), escape_char(x[1]), x[2], x[3]))
465-
f.write("}\n\n")
391+
f.write("}\n")
466392

467393
def emit_norm_module(f, canon, compat, combine):
468394
canon_keys = canon.keys()
@@ -547,8 +473,6 @@ def remove_from_wtable(wtable, val):
547473
wtable_out.extend(wtable)
548474
return wtable_out
549475

550-
551-
552476
def optimize_width_table(wtable):
553477
wtable_out = []
554478
w_this = wtable.pop(0)
@@ -563,7 +487,7 @@ def optimize_width_table(wtable):
563487
return wtable_out
564488

565489
if __name__ == "__main__":
566-
r = "tables.rs"
490+
r = "unicode.rs"
567491
if os.path.exists(r):
568492
os.remove(r)
569493
with open(r, "w") as rf:
@@ -574,18 +498,12 @@ def optimize_width_table(wtable):
574498
(canon_decomp, compat_decomp, gencats, combines,
575499
lowerupper, upperlower) = load_unicode_data("UnicodeData.txt")
576500
want_derived = ["XID_Start", "XID_Continue", "Alphabetic", "Lowercase", "Uppercase"]
577-
other_derived = ["Default_Ignorable_Code_Point", "Grapheme_Extend"]
501+
other_derived = ["Default_Ignorable_Code_Point"]
578502
derived = load_properties("DerivedCoreProperties.txt", want_derived + other_derived)
579503
scripts = load_properties("Scripts.txt", [])
580504
props = load_properties("PropList.txt",
581505
["White_Space", "Join_Control", "Noncharacter_Code_Point"])
582506

583-
# grapheme cluster category from DerivedCoreProperties
584-
# the rest are defined below
585-
grapheme_cats = {}
586-
grapheme_cats["Extend"] = derived["Grapheme_Extend"]
587-
del(derived["Grapheme_Extend"])
588-
589507
# bsearch_range_table is used in all the property modules below
590508
emit_bsearch_range_table(rf)
591509

@@ -615,7 +533,7 @@ def optimize_width_table(wtable):
615533
emit_norm_module(rf, canon_decomp, compat_decomp, combines)
616534
emit_conversions_module(rf, lowerupper, upperlower)
617535

618-
### character width module
536+
# character width module
619537
width_table = []
620538
for zwcat in ["Me", "Mn", "Cf"]:
621539
width_table.extend(map(lambda (lo, hi): (lo, hi, 0, 0), gencats[zwcat]))
@@ -637,40 +555,3 @@ def optimize_width_table(wtable):
637555
# optimize the width table by collapsing adjacent entities when possible
638556
width_table = optimize_width_table(width_table)
639557
emit_charwidth_module(rf, width_table)
640-
641-
### grapheme cluster module
642-
# from http://www.unicode.org/reports/tr29/#Grapheme_Cluster_Break_Property_Values
643-
# Hangul syllable categories
644-
want_hangul = ["L", "V", "T", "LV", "LVT"]
645-
grapheme_cats.update(load_properties("HangulSyllableType.txt", want_hangul))
646-
647-
# Control
648-
# This category also includes Cs (surrogate codepoints), but Rust's `char`s are
649-
# Unicode Scalar Values only, and surrogates are thus invalid `char`s.
650-
grapheme_cats["Control"] = set()
651-
for cat in ["Zl", "Zp", "Cc", "Cf"]:
652-
grapheme_cats["Control"] |= set(ungroup_cat(gencats[cat]))
653-
grapheme_cats["Control"] = group_cat(list(
654-
grapheme_cats["Control"]
655-
- grapheme_control_exceptions
656-
| (set(ungroup_cat(gencats["Cn"]))
657-
& set(ungroup_cat(derived["Default_Ignorable_Code_Point"])))))
658-
659-
# Regional Indicator
660-
grapheme_cats["RegionalIndicator"] = grapheme_regional_indicator
661-
662-
# Prepend - "Currently there are no characters with this value"
663-
# (from UAX#29, Unicode 7.0)
664-
665-
# SpacingMark
666-
grapheme_cats["SpacingMark"] = group_cat(list(
667-
set(ungroup_cat(gencats["Mc"]))
668-
- set(ungroup_cat(grapheme_cats["Extend"]))
669-
| grapheme_spacingmark_extra
670-
- set(ungroup_cat(grapheme_spacingmark_exceptions))))
671-
672-
grapheme_table = []
673-
for cat in grapheme_cats:
674-
grapheme_table.extend([(x, y, cat) for (x, y) in grapheme_cats[cat]])
675-
grapheme_table.sort(key=lambda w: w[0])
676-
emit_grapheme_module(rf, grapheme_table, grapheme_cats.keys())

0 commit comments

Comments
 (0)