Skip to content

Commit cd7c728

Browse files
committed
unicode : regenerate unicode tables
1 parent 3275e60 commit cd7c728

File tree

4 files changed

+488
-398
lines changed

4 files changed

+488
-398
lines changed

scripts/gen-unicode-data.py

Lines changed: 63 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,63 @@
1+
import regex
2+
import struct
3+
4+
def cpt_to_utf8_str(cpt):
    """Pack an integer code point into 4 bytes, least-significant byte first.

    Despite the name, the result is not UTF-8: it is the fixed-width,
    little-endian 32-bit layout (the byte form used by UTF-32LE).

    Args:
        cpt: Non-negative integer that fits in 32 bits.

    Returns:
        A ``bytes`` object of length 4.

    Raises:
        ValueError: If ``cpt`` is negative or does not fit in 32 bits.
    """
    # The three low bytes are masked; the top byte is deliberately left
    # unmasked so that an out-of-range value is rejected by bytes().
    low_bytes = [(cpt >> shift) & 0xFF for shift in (0, 8, 16)]
    return bytes(low_bytes + [cpt >> 24])
13+
14+
def is_match(codepoint, regex_expr):
    """Report whether the single character ``codepoint`` matches ``regex_expr``.

    Args:
        codepoint: Integer code point to test.
        regex_expr: Pattern passed unchanged to ``regex.match``.

    Returns:
        True if the pattern matches the one-character string for
        ``codepoint``; False if it does not match or if the code point
        cannot be decoded into a string (for example surrogates or values
        outside the Unicode range).

    Raises:
        Whatever ``regex.match`` raises for an invalid pattern. These are no
        longer swallowed, so a malformed pattern fails loudly instead of
        silently producing an empty table.
    """
    try:
        # Decode with an explicit byte order. The plain 'utf-32' codec
        # interprets FF FE 00 00 (the bytes produced for U+FEFF) as a
        # byte-order mark and strips it, yielding an empty string, and it
        # falls back to the host byte order when no BOM is present.
        char = cpt_to_utf8_str(codepoint).decode('utf-32-le')
    except ValueError:
        # UnicodeDecodeError is a ValueError subclass; this also covers the
        # ValueError raised when the value does not fit in four bytes.
        return False
    return regex.match(regex_expr, char) is not None
20+
21+
def get_matches(regex_expr):
    """Collect the runs of consecutive code points that match ``regex_expr``.

    Every code point in ``range(0x110000)`` is tested with ``is_match``.

    Args:
        regex_expr: Pattern forwarded to ``is_match`` for each code point.

    Returns:
        A list of ``(first, last)`` tuples with inclusive bounds, in
        ascending order, one per maximal run of matching code points.
    """
    limit = 0x110000
    runs = []
    run_start = None  # first code point of the run currently being extended

    for cp in range(limit):
        matched = is_match(cp, regex_expr)
        if matched and run_start is None:
            run_start = cp
        elif not matched and run_start is not None:
            # The run ended on the previous code point.
            runs.append((run_start, cp - 1))
            run_start = None

    # A run still open here extends to the last code point scanned.
    if run_start is not None:
        runs.append((run_start, limit - 1))

    return runs
39+
40+
def print_cat(cat, ranges):
    """Print a C++ ``std::vector`` definition holding ``ranges`` to stdout.

    The vector is named ``unicode_ranges_<cat>``. Entries are written four
    per line, separated by a single space, each followed by a comma. A blank
    line is printed after the closing ``};``.

    Args:
        cat: Suffix appended to ``unicode_ranges_`` to form the variable name.
        ranges: Iterable of ``(start, end)`` integer pairs.
    """
    per_row = 4

    print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat))

    entries = ["{{0x{:08X}, 0x{:08X}}},".format(lo, hi) for lo, hi in ranges]
    for offset in range(0, len(entries), per_row):
        print(" ".join(entries[offset:offset + per_row]))

    print("};")
    print("")
55+
56+
# Emit one C++ range table per Unicode property pattern, in this fixed order.
for table_name, pattern in (
    ("number", r'\p{N}'),
    ("letter", r'\p{L}'),
    ("whitespace", r'\p{Z}'),
    ("accent_mark", r'\p{M}'),
    ("punctuation", r'\p{P}'),
    ("symbol", r'\p{S}'),
    ("control", r'\p{C}'),
):
    print_cat(table_name, get_matches(pattern))
63+

0 commit comments

Comments
 (0)