Skip to content

Commit 70ca1fe

Browse files
author
jaime-m-p
committed
Clean gen-unicode-data.py
1 parent 7761f8e commit 70ca1fe

File tree

1 file changed

+4
-54
lines changed

1 file changed

+4
-54
lines changed

scripts/gen-unicode-data.py

Lines changed: 4 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,4 @@
11
import regex
2-
import unicodedata
3-
4-
5-
if False:
6-
7-
# This code is equivalent to: cpt.to_bytes(4, "little")
8-
def cpt_to_utf8_str(cpt):
    """Pack the integer *cpt* into four bytes, least-significant byte first.

    Despite the name, the result is not UTF-8: it is the 32-bit little-endian
    layout of the integer, i.e. the same bytes as ``cpt.to_bytes(4, "little")``
    for ``0 <= cpt <= 0xFFFFFFFF``.

    Args:
        cpt: Integer code point to pack.

    Returns:
        A ``bytes`` object of length 4.

    Raises:
        ValueError: If *cpt* is negative or does not fit in 32 bits (raised by
            ``bytes()`` when an element falls outside ``range(256)``).
    """
    # The general four-byte form already produces zero high bytes for small
    # values, so no per-magnitude branching is needed.  The top byte is left
    # unmasked on purpose: out-of-range input then fails loudly instead of
    # being silently truncated.
    return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
17-
18-
# This code is equivalent to: regex_expr_compiled.match(chr(codepoint))
19-
def is_match(codepoint, regex_expr):
    """Return True if *regex_expr* matches the character with code *codepoint*.

    The code point is packed with ``cpt_to_utf8_str`` and decoded back into a
    one-character string, which is then passed to ``regex_expr.match``.

    Args:
        codepoint: Integer code point to test.
        regex_expr: Object exposing ``match(str)`` (e.g. a compiled pattern).

    Returns:
        True when ``regex_expr.match`` returns a match object; False when it
        returns None or when the code point cannot be turned into a string.
    """
    try:
        # Decode as explicit little-endian UTF-32.  The generic 'utf-32' codec
        # interprets the bytes FF FE 00 00 (code point 0xFEFF) as a byte-order
        # mark and strips them, so U+FEFF used to be tested as an empty string.
        char = cpt_to_utf8_str(codepoint).decode('utf-32-le')
    except ValueError:
        # Covers UnicodeDecodeError (a ValueError subclass) for code points
        # that are not decodable, and ValueError from packing an integer that
        # does not fit in four bytes.
        return False
    # Kept outside the try block so that genuine regex errors are not hidden.
    return regex_expr.match(char) is not None
25-
26-
# Verify previous statements, using chr() and ord()
27-
# Exhaustive self-check over every code point below 0x110000: the packed bytes
# must equal int.to_bytes(4, "little"), and decoding them must give chr().
for codepoint in range(0x110000):
    temp = cpt_to_utf8_str(codepoint)
    assert temp == codepoint.to_bytes(4, "little")
    try:
        char = temp.decode('utf-32')
    except UnicodeDecodeError:
        # Not every code point can be decoded; those are simply skipped.
        continue
    if codepoint == 0xFEFF:  # BOM
        # The packed bytes are FF FE 00 00, which the generic 'utf-32' codec
        # consumes as a little-endian byte-order mark, leaving no characters.
        assert char == ""
        char = "\uFEFF"
    assert char == chr(codepoint)
    assert ord(char) == codepoint
392

403

414
def get_matches(regex_expr):
@@ -63,13 +26,11 @@ def get_matches(regex_expr):
6326
def print_cat(mode, cat, ranges):
    """Print a C++ table definition for the category *cat* to stdout.

    Args:
        mode: Selects the C++ container that is declared:
            ``"range"`` -> ``std::vector<std::pair<...>>`` named
            ``unicode_ranges_<cat>``;
            ``"range_value"`` -> ``std::vector<std::tuple<...>>`` named
            ``unicode_ranges_<cat>``;
            ``"map"`` -> ``std::map<uint32_t, uint32_t>`` named
            ``unicode_map_<cat>``.
        cat: Suffix used in the generated variable name.
        ranges: Sized sequence of tuples of integers; each tuple becomes one
            brace initializer with its values formatted as ``0x%08X``.

    Raises:
        ValueError: If *mode* is not one of the values listed above.  Without
            this check the initializer body would be printed with no
            declaration in front of it.
    """
    headers = {
        "range": "const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{",
        "range_value": "const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_{} = {{",
        "map": "const std::map<uint32_t, uint32_t> unicode_map_{} = {{",
    }
    if mode not in headers:
        raise ValueError("unknown print_cat mode: {!r}".format(mode))
    print(headers[mode].format(cat))

    last = len(ranges) - 1
    for i, values in enumerate(ranges):
        # Four initializers per output line; the final one always ends its line.
        end = ",\n" if (i % 4 == 3 or i == last) else ", "
        print("{" + ", ".join("0x%08X" % value for value in values) + "}", end=end)
    print("};")
    print("")
@@ -93,22 +54,11 @@ def print_cat(mode, cat, ranges):
9354
lower = ord(char.lower()[0])
9455
upper = ord(char.upper()[0])
9556
if codepoint != lower:
96-
map_lowercase.append((codepoint,lower))
57+
map_lowercase.append((codepoint, lower))
9758
if codepoint != upper:
98-
map_uppercase.append((codepoint,upper))
59+
map_uppercase.append((codepoint, upper))
9960
print_cat("map", "lowercase", map_lowercase)
10061
print_cat("map", "uppercase", map_uppercase)
10162

10263

103-
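# TODO: this is wrong. NOTE(review): presumably because only the first code point of the NFD form is kept ([0]) and all sources of one target are collapsed into a single (min, max) range -- confirm.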
104-
# inv_map_nfd = {}
105-
# for codepoint in range(0x110000):
106-
# char = chr(codepoint)
107-
# norm = ord(unicodedata.normalize('NFD', char)[0])
108-
# if codepoint != norm:
109-
# a, b = inv_map_nfd.get(norm, (codepoint, codepoint))
110-
# inv_map_nfd[norm] = (min(a, codepoint), max(b, codepoint))
111-
# nfd_ranges = [ (a, b, nfd) for nfd,(a,b) in inv_map_nfd.items() ]
112-
# nfd_ranges = list(sorted(nfd_ranges))
113-
# del inv_map_nfd
114-
# print_cat("range_value", "nfd", nfd_ranges)
64+
# TODO: generate unicode_map_nfd

0 commit comments

Comments
 (0)