1
1
import regex
2
- import struct
2
+
3
3
4
4
def cpt_to_utf8_str (cpt ):
5
5
if cpt <= 0xFF :
@@ -11,13 +11,15 @@ def cpt_to_utf8_str(cpt):
11
11
else :
12
12
return bytes ([cpt & 0xFF , (cpt >> 8 ) & 0xFF , (cpt >> 16 ) & 0xFF , cpt >> 24 ])
13
13
14
+
14
15
def is_match(codepoint, regex_expr):
    """Report whether a single code point matches ``regex_expr``.

    The code point is converted to a one-character string by decoding the
    byte buffer returned by ``cpt_to_utf8_str`` and is then tested with
    ``regex.match``.

    Args:
        codepoint: Integer code point to test.
        regex_expr: Pattern passed unchanged to ``regex.match``.

    Returns:
        True if ``regex.match`` finds a match on the decoded character,
        False if it does not or if any exception is raised while decoding
        or matching.
    """
    try:
        # Decode with an explicit byte order. The plain 'utf-32' codec looks
        # for a BOM: the little-endian buffer for U+FEFF (FF FE 00 00) would
        # be consumed as a BOM and decode to '', so that code point could
        # never match, and without a BOM the codec falls back to the
        # platform's native byte order. This assumes cpt_to_utf8_str always
        # emits the little-endian layout seen in its 4-byte branch.
        char = cpt_to_utf8_str(codepoint).decode('utf-32-le')
        return regex.match(regex_expr, char) is not None
    except Exception:
        # Deliberate best-effort: a value that cannot be decoded or matched
        # is reported as "no match" instead of aborting the caller.
        return False
20
21
22
+
21
23
def get_matches (regex_expr ):
22
24
unicode_ranges = []
23
25
current_range = None
@@ -37,6 +39,7 @@ def get_matches(regex_expr):
37
39
38
40
return unicode_ranges
39
41
42
+
40
43
def print_cat (cat , ranges ):
41
44
print ("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{" .format (cat ))
42
45
cnt = 0
@@ -53,11 +56,11 @@ def print_cat(cat, ranges):
53
56
print ("};" )
54
57
print ("" )
55
58
59
+
56
60
# Emit one range table per category, in this fixed order. Each entry pairs
# the name handed to print_cat with the pattern handed to get_matches.
for category, pattern in (
    ("number", r'\p{N}'),
    ("letter", r'\p{L}'),
    ("whitespace", r'\p{Z}'),
    ("accent_mark", r'\p{M}'),
    ("punctuation", r'\p{P}'),
    ("symbol", r'\p{S}'),
    ("control", r'\p{C}'),
):
    print_cat(category, get_matches(pattern))
63
-
0 commit comments