Skip to content

Commit bb459bf

Browse files
committed
Rollup merge of rust-lang#23000 - Florob:unicode-FL, r=brson
This handles the ranges contained in UnicodeData.txt. Counterintuitively, this actually makes the tables shorter.
2 parents 478c396 + c9e2de4 commit bb459bf

File tree

2 files changed

+285
-285
lines changed

2 files changed

+285
-285
lines changed

src/etc/unicode.py

Lines changed: 21 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -84,8 +84,8 @@ def fetch(f):
8484
sys.stderr.write("cannot load %s" % f)
8585
exit(1)
8686

87-
def is_valid_unicode(n):
88-
return 0 <= n <= 0xD7FF or 0xE000 <= n <= 0x10FFFF
87+
def is_surrogate(n):
88+
return 0xD800 <= n <= 0xDFFF
8989

9090
def load_unicode_data(f):
9191
fetch(f)
@@ -96,19 +96,28 @@ def load_unicode_data(f):
9696
canon_decomp = {}
9797
compat_decomp = {}
9898

99+
udict = {};
100+
range_start = -1;
99101
for line in fileinput.input(f):
100-
fields = line.split(";")
101-
if len(fields) != 15:
102+
data = line.split(';');
103+
if len(data) != 15:
102104
continue
103-
[code, name, gencat, combine, bidi,
104-
decomp, deci, digit, num, mirror,
105-
old, iso, upcase, lowcase, titlecase ] = fields
106-
107-
code_org = code
108-
code = int(code, 16)
109-
110-
if not is_valid_unicode(code):
105+
cp = int(data[0], 16);
106+
if is_surrogate(cp):
111107
continue
108+
if range_start >= 0:
109+
for i in xrange(range_start, cp):
110+
udict[i] = data;
111+
range_start = -1;
112+
if data[1].endswith(", First>"):
113+
range_start = cp;
114+
continue;
115+
udict[cp] = data;
116+
117+
for code in udict:
118+
[code_org, name, gencat, combine, bidi,
119+
decomp, deci, digit, num, mirror,
120+
old, iso, upcase, lowcase, titlecase ] = udict[code];
112121

113122
# generate char to char direct common and simple conversions
114123
# uppercase to lowercase

0 commit comments

Comments
 (0)