1
1
import regex
2
2
3
3
4
def cpt_to_utf8_str(cpt):
    """Pack an integer code point into four little-endian bytes (UTF-32LE).

    Despite the name, the result is not UTF-8: it is the integer laid out
    least-significant byte first and zero-padded to exactly four bytes.

    Args:
        cpt: Integer in the range ``0 .. 0xFFFFFFFF``.

    Returns:
        A ``bytes`` object of length 4.

    Raises:
        ValueError: If ``cpt`` does not fit in four unsigned bytes.
    """
    if not 0 <= cpt <= 0xFFFFFFFF:
        raise ValueError("code point out of 32-bit range: {!r}".format(cpt))
    # One stdlib call covers every magnitude; no per-size branching is needed.
    return int.to_bytes(cpt, 4, "little")
13
-
14
-
15
def is_match(codepoint, regex_expr):
    """Return True when the single character ``codepoint`` matches ``regex_expr``.

    Args:
        codepoint: Integer code point to test.
        regex_expr: Pattern passed to ``regex.match``.

    Returns:
        True if the pattern matches at the start of the one-character string,
        False if it does not match or if any step raises.
    """
    try:
        # Name the byte order explicitly. The generic 'utf-32' codec treats the
        # bytes FF FE 00 00 (U+FEFF packed little-endian) as a byte-order mark
        # and decodes them to an empty string, so U+FEFF could never match; it
        # also falls back to the host byte order when no BOM is present.
        char = cpt_to_utf8_str(codepoint).decode('utf-32-le')
        return regex.match(regex_expr, char) is not None
    except Exception:
        # Best-effort: values that cannot be decoded into a character (for
        # example lone surrogates, which the UTF-32 codec rejects) count as
        # "no match" instead of aborting the scan.
        return False
21
-
22
-
23
4
def get_matches (regex_expr ):
5
+ regex_expr_compiled = regex .compile (regex_expr )
24
6
unicode_ranges = []
25
7
current_range = None
26
8
27
9
for codepoint in range (0x110000 ):
28
- if is_match (codepoint , regex_expr ):
10
+ char = chr (codepoint )
11
+ if regex_expr_compiled .match (char ):
29
12
if current_range is None :
30
13
current_range = [codepoint , codepoint ]
31
14
else :
@@ -40,27 +23,42 @@ def get_matches(regex_expr):
40
23
return unicode_ranges
41
24
42
25
43
def print_cat(mode, cat, ranges):
    """Print one C++ table definition to stdout.

    Args:
        mode: ``"range"`` emits a ``std::vector<std::pair<uint32_t, uint32_t>>``
            named ``unicode_ranges_<cat>``; ``"map"`` emits a
            ``std::map<uint32_t, uint32_t>`` named ``unicode_map_<cat>``.
        cat: Suffix used in the generated C++ identifier.
        ranges: Sized sequence of integer tuples; each tuple becomes one
            brace-enclosed entry with its values formatted as ``0x%08X``.

    Raises:
        ValueError: If ``mode`` is neither ``"range"`` nor ``"map"``. Without
            this check the entries and the closing ``};`` would be printed
            with no declaration in front of them.
    """
    headers = {
        "range": "const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{",
        "map": "const std::map<uint32_t, uint32_t> unicode_map_{} = {{",
    }
    if mode not in headers:
        raise ValueError("unknown mode {!r}; expected 'range' or 'map'".format(mode))

    print(headers[mode].format(cat))  # noqa: NP100
    last = len(ranges) - 1
    for i, values in enumerate(ranges):
        # Four entries per output line; the final entry always ends its line.
        end = ",\n " if (i % 4 == 3 or i == last) else ", "
        entry = ", ".join("0x%08X" % value for value in values)
        print("{" + entry + "}", end=end)  # noqa: NP100
    print("};")  # noqa: NP100
    print("")  # noqa: NP100
58
37
59
38
60
# Unicode general categories, each emitted as a table of code point ranges.
# Every call uses the three-argument print_cat(mode, cat, ranges) form.
for name, pattern in (
    ("number", r'\p{N}'),
    ("letter", r'\p{L}'),
    ("separator", r'\p{Z}'),
    ("accent_mark", r'\p{M}'),
    ("punctuation", r'\p{P}'),
    ("symbol", r'\p{S}'),
    ("control", r'\p{C}'),
):
    print_cat("range", name, get_matches(pattern))

# Whitespace comes from the regex \s class, not from a general category.
print_cat("range", "whitespace", get_matches(r'\s'))


# Case-conversion tables: one (codepoint, converted) pair for every code point
# whose lower/upper form differs from the code point itself.
map_lowercase = []
map_uppercase = []
for codepoint in range(0x110000):
    char = chr(codepoint)
    # NOTE(review): only the first character of the converted string is kept,
    # so a multi-character result is truncated (e.g. 'ß'.upper() == 'SS' is
    # recorded as 'S') - confirm this is what the table consumers expect.
    lower = ord(char.lower()[0])
    upper = ord(char.upper()[0])
    if codepoint != lower:
        map_lowercase.append((codepoint, lower))
    if codepoint != upper:
        map_uppercase.append((codepoint, upper))
print_cat("map", "lowercase", map_lowercase)
print_cat("map", "uppercase", map_uppercase)
62
+
63
+
64
+ # TODO: generate unicode_map_nfd
0 commit comments